Data Processor

This file is used to format, generate, filter, and split our data.

The raw data is a CSV file represented as a list of tuples
containing the x and y coordinates for each "stroke/drag" across
the screen. It also contains additional metadata that will be
filtered out, leaving only the data and its label.
e.g. [[[x1, ...], [y1, ...]], ...]

1. Formatting Data to Images
The first step is to convert the data into images.
While images are a more data-intensive way of representing the data,
some of our algorithms, such as CNNs (which use kernel filters),
perform well with images.

2. Generating New Data
The second step is to generate new data from the originals.
This new data will have randomized spatial transformations
(affine transformations) applied to it.

3. Filtering Data
The third step is to filter the image data to make the model
more robust. This is done by blurring the image and applying
a Canny edge detector to it. This removes noise and
makes the image clearer.

4. Split Data
The fourth step is to split the data into training and
testing sets for the model. This is done by splitting the data
into 80% training and 20% testing.

Note: This file is not meant to be run once to prepare the data.
We will first train our model using the raw data, then train it
using the images generated from the raw data, and
compare the results.

In [None]:
#1. Formatting Data to Images



In [None]:
#2. Generating Images
# class to generate new doodle images from given images
import cv2
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img

class Image_Generator:
    """Create augmented copies of images using random affine transforms.

    Attributes (stored exactly as passed to the constructor):
        src_image_path: a single image file (for ``generate_image``) or a
            directory of image files (for ``generate_images``).
        gen_image_path: directory the generated images are written to.
        gen_image_name: filename prefix used for the generated images.
        num_images: number of generated copies to write per source image.
    """

    def __init__(self, src_image_path, gen_image_path, gen_image_name, num_images):
        self.src_image_path = src_image_path
        self.gen_image_path = gen_image_path
        self.gen_image_name = gen_image_name
        self.num_images = num_images

    def affine_transform(self, image, save_prefix=None):
        """Write ``num_images`` randomly transformed copies of ``image``.

        Args:
            image: an image accepted by ``img_to_array`` (e.g. the result
                of ``load_image``).
            save_prefix: filename prefix for the written files; defaults to
                ``self.gen_image_name``.

        Returns:
            None. The copies are saved as PNG files in ``gen_image_path``.
        """
        image = img_to_array(image)
        # flow() works on batches, so add a leading batch axis of size 1.
        image = image.reshape((1,) + image.shape)
        datagen = ImageDataGenerator(
            rotation_range=40,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest')
        if save_prefix is None:
            save_prefix = self.gen_image_name
        # Saving fails if the target directory is missing, so create it.
        os.makedirs(self.gen_image_path, exist_ok=True)
        # 'png' replaces the original 'npy', which is not an image format
        # that save_to_dir can write.
        flow = datagen.flow(
            image,
            batch_size=1,
            save_to_dir=self.gen_image_path,
            save_prefix=save_prefix,
            save_format='png')
        # Every batch pulled from flow() writes one file. zip() asks range()
        # first, so exactly num_images batches are pulled; the former
        # "i > num_images" check wrote one extra image.
        for _ in zip(range(self.num_images), flow):
            pass

    def load_image(self, image_path=None):
        """Load and return one image.

        Args:
            image_path: file to load; defaults to ``self.src_image_path``.
        """
        if image_path is None:
            image_path = self.src_image_path
        return load_img(image_path)

    def generate_image(self, image_path=None):
        """Generate the augmented copies for a single source image.

        Args:
            image_path: file to augment; defaults to ``self.src_image_path``.
                When given, the file's base name is appended to the save
                prefix so copies of different sources get distinct names.
        """
        if image_path is None:
            self.affine_transform(self.load_image())
            return
        stem = os.path.splitext(os.path.basename(image_path))[0]
        # NOTE(review): Keras is understood to append only a batch index and
        # a small random number to the prefix, so a shared prefix risks
        # overwriting files across sources — confirm against the Keras docs.
        prefix = '{}_{}'.format(self.gen_image_name, stem)
        self.affine_transform(self.load_image(image_path), prefix)

    def generate_images(self):
        """Generate augmented copies for every file in ``src_image_path``.

        Sub-directories are skipped. Previously the per-file path was built
        but never used, so each iteration tried to load the directory itself.
        """
        for entry in sorted(os.listdir(self.src_image_path)):
            image_path = os.path.join(self.src_image_path, entry)
            if not os.path.isfile(image_path):
                continue
            self.generate_image(image_path)

# Run the generator: read from data/raw, write to data/generated, and
# request 10 generated copies using the 'gen_image' filename prefix.
source, destination = 'data/raw', 'data/generated'
name, num_copies = 'gen_image', 10

image_generator = Image_Generator(
    src_image_path=source,
    gen_image_path=destination,
    gen_image_name=name,
    num_images=num_copies,
)
image_generator.generate_images()

In [None]:
#3. filter images
# class to filter images
import numpy as np
import cv2
import os
import sys
import matplotlib.pyplot as plt

class Image_Filter:
    """Blur images and reduce them to their Canny edges.

    Attributes:
        image_path: a single image file (for ``filter_image``) or a
            directory of image files (for ``filter_images``).
        destination_path: directory the filtered images are written to.
    """

    def __init__(self, image_path = None, destination_path = None):
        self.image_path = image_path
        self.destination_path = destination_path

    def set_image_path(self, source_path, destination_path):
        """Point the filter at a new source and destination. Returns None."""
        self.image_path = source_path
        self.destination_path = destination_path

    def filter_image(self, image_path=None):
        """Return the edge map of one image.

        The image is converted to grayscale, smoothed with a 5x5 Gaussian
        blur, and passed through a Canny edge detector (thresholds 50/150).

        Args:
            image_path: file to filter; defaults to ``self.image_path``.

        Raises:
            ValueError: if the file cannot be read as an image.
        """
        if image_path is None:
            image_path = self.image_path
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with a
            # clear message instead of an opaque error inside cvtColor.
            raise ValueError('could not read image: {}'.format(image_path))
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5,5), 0)
        return cv2.Canny(blurred, 50, 150)

    def filter_images(self):
        """Filter every file in ``image_path`` into ``destination_path``.

        Each output keeps the source file's name. Sub-directories are
        skipped. Previously every iteration filtered ``self.image_path``
        (the directory) instead of the file being visited.

        Raises:
            ValueError: if either path is unset, or a file is not an image.
            OSError: if a filtered image cannot be written.
        """
        if self.image_path is None or self.destination_path is None:
            raise ValueError('image_path and destination_path must be set')
        os.makedirs(self.destination_path, exist_ok=True)
        for entry in sorted(os.listdir(self.image_path)):
            source = os.path.join(self.image_path, entry)
            if not os.path.isfile(source):
                continue
            target = os.path.join(self.destination_path, entry)
            filtered_image = self.filter_image(source)
            # cv2.imwrite reports some failures only through its return value.
            if not cv2.imwrite(target, filtered_image):
                raise OSError('could not write image: {}'.format(target))

# Filter the raw images and then the generated images; both sets are
# written to the same 'data/processed' folder.
raw_img_path, gen_img_path = 'data/raw', 'data/generated'
destination = 'data/processed'
image_filter = Image_Filter()
for folder in (raw_img_path, gen_img_path):
    image_filter.set_image_path(folder, destination)
    image_filter.filter_images()

In [None]:
#4. Split Data
# function to organize data into training and testing
def split_Data(src_data_path, train_data_path, test_data_path, train_fraction=0.8):
    """Randomly distribute ``.npy`` files into training and testing folders.

    Walks ``src_data_path`` recursively. Each ``.npy`` array is loaded,
    flattened to one dimension and saved under the same file name in
    ``<train_data_path>/<folder>`` or ``<test_data_path>/<folder>``, where
    ``<folder>`` is the name of the directory the file was found in.

    The split is decided per file with an independent random draw, so the
    train/test ratio is approximate rather than exact.

    The stray ``self`` parameter was removed: this is a plain function and
    is called with three positional arguments.

    Args:
        src_data_path: root directory to search for ``.npy`` files.
        train_data_path: root directory for the training files.
        test_data_path: root directory for the testing files.
        train_fraction: probability that a file goes to training
            (default 0.8, i.e. roughly 80% train / 20% test).

    Returns:
        None
    """
    for root, _dirs, files in os.walk(src_data_path):
        # Same value as root.split("/")[-1] on POSIX, but also correct
        # where the path separator is not "/".
        folder = os.path.basename(root)
        for file_name in files:
            if not file_name.endswith(".npy"):
                continue
            # .npy is NumPy's array format; cv2.imread cannot decode it and
            # returns None, so load and save with NumPy instead.
            data = np.load(os.path.join(root, file_name)).flatten()
            if np.random.rand() < train_fraction:
                target_root = train_data_path
            else:
                target_root = test_data_path
            target_dir = os.path.join(target_root, folder)
            os.makedirs(target_dir, exist_ok=True)
            np.save(os.path.join(target_dir, file_name), data)

# Split the data under `source` into the train and test folders.
# NOTE(review): the filtering step in this file writes to 'data/processed';
# confirm that 'data/preprocessed/processed' is the intended input folder.
train, test = 'data/train', 'data/test'
source = 'data/preprocessed/processed'
split_Data(source, train, test)