# Data Augmentation

**About the data:** <br>
The dataset contains 2 folders: yes and no which contains 253 Brain MRI Images. The folder yes contains 155 Brain MRI Images that are tumorous and the folder no contains 98 Brain MRI Images that are non-tumorous. You can find [here](https://www.kaggle.com/navoneel/brain-mri-images-for-brain-tumor-detection).

Since this is a small dataset, I used data augmentation in order to create more images.

In [4]:

from keras.preprocessing.image import ImageDataGenerator
import cv2
import imutils
import matplotlib.pyplot as plt
from os import listdir
import time    

%matplotlib inline

Using TensorFlow backend.


In [0]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m}:{round(s,1)}"

In [0]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
    
    data_gen = ImageDataGenerator(rotation_range=10, 
                                  width_shift_range=0.1, 
                                  height_shift_range=0.1, 
                                  shear_range=0.1, 
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True, 
                                  vertical_flip=True, 
                                  fill_mode='nearest'
                                 )

    
    for filename in listdir(file_dir):
        print(file_dir+"/"+filename)
        # load the image
        image = cv2.imread(file_dir + '/' + filename)
        # reshape the image
        image = image.reshape((1,)+image.shape)
        # prefix of the names for the generated sampels.
        save_prefix = 'aug_' + filename[:-4]
        # generate 'n_generated_samples' sample images
        i=0
        for batch in data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir, 
                                           save_prefix=save_prefix, save_format='jpg'):
            i += 1
            if i > n_generated_samples:
                break

In [18]:
start_time = time.time()

augmented_data_path = '/content/drive/My Drive/MRI_Data/augmented_data/'
yes_path="/content/drive/My Drive/MRI_Data/yes"
no_path="/content/drive/My Drive/MRI_Data/no"

# augment data for the examples with label equal to 'yes' representing tumurous examples
augment_data(file_dir=yes_path, n_generated_samples=6, save_to_dir=augmented_data_path+'yes')
# augment data for the examples with label equal to 'no' representing non-tumurous examples
augment_data(file_dir=no_path, n_generated_samples=9, save_to_dir=augmented_data_path+'no')

end_time = time.time()
execution_time = (end_time - start_time)
print(f"Elapsed time: {hms_string(execution_time)}")

/content/drive/My Drive/MRI_Data/yes/Y148.JPG
/content/drive/My Drive/MRI_Data/yes/Y153.jpg
/content/drive/My Drive/MRI_Data/yes/Y15.jpg
/content/drive/My Drive/MRI_Data/yes/Y154.jpg
/content/drive/My Drive/MRI_Data/yes/Y14.jpg
/content/drive/My Drive/MRI_Data/yes/Y146.JPG
/content/drive/My Drive/MRI_Data/yes/Y147.JPG
/content/drive/My Drive/MRI_Data/yes/Y120.JPG
/content/drive/My Drive/MRI_Data/yes/Y13.jpg
/content/drive/My Drive/MRI_Data/yes/Y12.jpg
/content/drive/My Drive/MRI_Data/yes/Y113.JPG
/content/drive/My Drive/MRI_Data/yes/Y114.JPG
/content/drive/My Drive/MRI_Data/yes/Y116.JPG
/content/drive/My Drive/MRI_Data/yes/Y115.JPG
/content/drive/My Drive/MRI_Data/yes/Y117.JPG
/content/drive/My Drive/MRI_Data/yes/Y11.jpg
/content/drive/My Drive/MRI_Data/yes/Y111.JPG
/content/drive/My Drive/MRI_Data/yes/Y112.JPG
/content/drive/My Drive/MRI_Data/yes/Y109.JPG
/content/drive/My Drive/MRI_Data/yes/Y108.jpg
/content/drive/My Drive/MRI_Data/yes/Y105.jpg
/content/drive/My Drive/MRI_Data/yes/Y1

In [0]:
def data_summary(main_path):
    
    yes_path = main_path+'yes'
    no_path = main_path+'no'
        
    # number of files (images) that are in the the folder named 'yes' that represent tumorous (positive) examples
    m_pos = len(listdir(yes_path))
    # number of files (images) that are in the the folder named 'no' that represent non-tumorous (negative) examples
    m_neg = len(listdir(no_path))
    # number of all examples
    m = (m_pos+m_neg)
    
    pos_prec = (m_pos* 100.0)/ m
    neg_prec = (m_neg* 100.0)/ m
    
    print(f"Number of examples: {m}")
    print(f"Percentage of positive examples: {pos_prec}%, number of pos examples: {m_pos}") 
    print(f"Percentage of negative examples: {neg_prec}%, number of neg examples: {m_neg}") 

In [20]:
data_summary(augmented_data_path)

Number of examples: 2065
Percentage of positive examples: 52.54237288135593%, number of pos examples: 1085
Percentage of negative examples: 47.45762711864407%, number of neg examples: 980


we will now use this augmented data to train the model.
