### Data Augmentation

As we have a fairly small dataset we have to perform data augmentation in order to create more images to train the NN.

In [2]:
from keras.preprocessing.image import ImageDataGenerator,img_to_array
import cv2
import time
from os import listdir

import warnings
warnings.filterwarnings('ignore')

In [3]:
def augment_data(**args):
    """
    Arguments:
        file_dir: A string representing the directory where images that we want to augment are found.
        n_generated_samples: A string representing the number of generated samples using the given image.
        save_to_dir: A string representing the directory in which the generated images will be saved.
    """
    datagen = ImageDataGenerator(rotation_range=10,
                                 width_shift_range=0.1,
                                 height_shift_range=0.1,
                                 shear_range=0.15,
                                 zoom_range=0.1,
                                 channel_shift_range = 10,
                                 horizontal_flip=True)
    
#  iterate over all the images in the folders
#  and perform augmentation
    directory = args.get('file_dir')
    generated_samples = args.get('n_generated_samples')
    save_to_dir = args.get('save_to_dir')
    
    for file in listdir(directory):
#       load the image
        read_path = directory + '\\' + file
        image = cv2.imread( read_path )
        image = img_to_array(image)
        image = image.reshape((1,)+image.shape)
        save_prefix = 'aug_'+file[:-4]
        i=0
        for batch in datagen.flow(x=image,
                                  batch_size=1,
                                  save_to_dir= save_to_dir,
                                  save_prefix=save_prefix,
                                  save_format='jpeg'):
            i+= 1
            if i > generated_samples:
                break

Remember that 61% of the data (155 images) are tumorous. And, 39% of the data (98 images) are non-tumorous. So, in order to balance the data we can generate 9 new images for every image that belongs to 'no' class and 6 images for every image that belongs the 'yes' class.

In [4]:
start_time = time.time()
# Augment data for images with yes label
augment_data(file_dir = "Dataset\yes",
             n_generated_samples = 6,
             save_to_dir = r"Dataset\augmented_data\yes" )
end_time = time.time()

elapsed_time  = end_time - start_time
print('Elapsed time in seconds :{}'.format(elapsed_time))

Elapsed time in seconds :218.35673666000366


In [5]:
start_time = time.time()
augment_data(file_dir = r"Dataset\no",
             n_generated_samples = 9,
             save_to_dir = r"Dataset\augmented_data\no" )

end_time = time.time()

elapsed_time  = end_time - start_time
print('Elapsed time in seconds :{}'.format(elapsed_time))

Elapsed time in seconds :131.82588696479797


In [6]:
# Data summary
def data_summary():
    total_yes_files = len(listdir(r"Dataset\augmented_data\yes"))
    total_no_files = len(listdir(r"Dataset\augmented_data\no"))
    
    total_files = total_yes_files + total_no_files
    percentage_yes = ( total_yes_files / total_files ) * 100
    percentage_no = ( total_no_files / total_files ) * 100
    
    print("Total Examples : {}".format(total_files))
    print('Percentage of yes examples : {:,.2f} %'.format(percentage_yes) )
    print('Percentage of no examples : {:,.2f} %'.format(percentage_no) )

In [7]:
data_summary()

Total Examples : 2065
Percentage of yes examples : 52.54 %
Percentage of no examples : 47.46 %
