# Data Augmentation

We have 155 positives and 98 negatives. This means the dataset is skewed. Plus it is a small dataset as well. To fix both of the issues of imbalance and insufficient data we apply data augmentation

In [2]:
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
from os import listdir
import time
%matplotlib inline 

In [3]:
def hmsstring(seconds_elapsed):
    h=int(seconds_elapsed/3600)
    m=int((seconds_elapsed % (60*60))/60)
    s=seconds_elapsed % 60
    return f"{h}:{m}:{round(s,1)}"

In [4]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
    """
    Arguments:
        file_dir: A string representing the directory where images that we want to augment are found.
        n_generated_samples: A string representing the number of generated samples using the given image.
        save_to_dir: A string representing the directory in which the generated images will be saved.
    """
    
    
    data_gen = ImageDataGenerator(rotation_range=10, 
                                  width_shift_range=0.1, 
                                  height_shift_range=0.1, 
                                  shear_range=0.1, 
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True, 
                                  vertical_flip=True, 
                                  fill_mode='nearest'
                                 )

    
    for filename in listdir(file_dir):
        # load the image
        image = cv2.imread(file_dir + '\\' + filename)
        # reshape the image
        image = image.reshape((1,)+image.shape)
        # prefix of the names for the generated sampels.
        save_prefix = 'aug_' + filename[:-4]
        # generate 'n_generated_samples' sample images
        i=0
        for batch in data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir, 
                                           save_prefix=save_prefix, save_format='jpg'):
            i += 1
            if i > n_generated_samples:
                break

In order to balance the data we can generate 16 new images for every image that belongs to 'no' class and 10 images for every image that belongs the 'yes' class.

In [6]:
import os
start_time=time.time()
augmented_data_path='augmented data/'
yes='/yes/'
no='/no/'
path1=os.path.join(augmented_data_path,'yes')
path2=os.path.join(augmented_data_path,'no')
os.makedirs(path1)
os.makedirs(path2)
augment_data(file_dir='yes/', n_generated_samples=10, save_to_dir='augmented data/yes')
augment_data(file_dir='no/', n_generated_samples=15, save_to_dir='augmented data/no')
end_time = time.time()
execution_time = (end_time - start_time)
print(f"Elapsed time: {hmsstring(execution_time)}")

Elapsed time: 0:2:51.0


In [7]:
def data_summary(main_path):
    
    yes_path = main_path+'yes'
    no_path = main_path+'no'
        
    # number of files (images) that are in the the folder named 'yes' that represent tumorous (positive) examples
    m_pos = len(listdir(yes_path))
    # number of files (images) that are in the the folder named 'no' that represent non-tumorous (negative) examples
    m_neg = len(listdir(no_path))
    # number of all examples
    m = (m_pos+m_neg)
    
    pos_prec = (m_pos* 100.0)/ m
    neg_prec = (m_neg* 100.0)/ m
    
    print(f"Number of examples: {m}")
    print(f"Percentage of positive examples: {pos_prec}%, number of pos examples: {m_pos}") 
    print(f"Percentage of negative examples: {neg_prec}%, number of neg examples: {m_neg}") 

In [8]:
data_summary(augmented_data_path)

Number of examples: 3273
Percentage of positive examples: 52.092881148793154%, number of pos examples: 1705
Percentage of negative examples: 47.907118851206846%, number of neg examples: 1568
