# The data we have is very small with just  so I'm going to use data augmentation

In data augmentation i'm going to increase the number of images in the dataset by,

* Randomly cropping
* Zooming
* Horizontal flipping
* Vertical flipping 
* Height, weight shifting

All of these transformations will be applied and the image will be saved as new one.

### Importing all the necessary modules

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import os
from PIL import Image
import numpy as np

### Let's look at the data before we augment it

In [10]:
DATADIR = os.getcwd() + '/brain_tumor_dataset'
subdirs = (os.listdir(DATADIR))
pathstodir = [os.path.join(DATADIR, x) for x in subdirs]

def data_summary(ptd):
    yes_images = len([x for x in os.listdir(ptd[1])])
    no_images = len([x for x in os.listdir(ptd[0])])
    total = yes_images + no_images
    print("Total number of images: ", yes_images + no_images)
    print("Number of yes(tumour is malignant) images: ", yes_images)
    print("Number of no(tumour is benign) images: ", no_images)
    print()
    print(r"Percentage of yes(malignant) images: {:.2f}".format(100 * yes_images/total))
    print(r"Percentage of no(benign) images: {:.2f}".format(100 * no_images/total))

data_summary(pathstodir)

Total number of images:  331
Number of yes(tumour is malignant) images:  186
Number of no(tumour is benign) images:  145

Percentage of yes(malignant) images: 56.19
Percentage of no(benign) images: 43.81


 As you can see there's also instability in data i.e. the number of 'yes' images is more than the number of 'no' images by almost 16%. So Let's fix that as well.

### Using Keras built in ImageDataGenerator for data augmentation

In [37]:
def aug_data(file_dir, n_generated_samples, save_dir):
    
    datagen = ImageDataGenerator(
        rotation_range=10, 
        width_shift_range=0.1, 
        height_shift_range=0.1, 
        shear_range=0.1, 
        brightness_range=(0.3, 1.0),
        horizontal_flip=True, 
        vertical_flip=True, 
        fill_mode='nearest'
    )
    
    for img_file in os.listdir(file_dir):
        # load the image and converting it into greyscale
        image = Image.open(os.path.join(file_dir, img_file)).convert('L')
        # The shape of all the images need to be same, hence resizing
        image = image.resize((240,240))
        # reshaping the image as (1,240,240,1) because this will be convinient for cnn later
        image = np.array(image).reshape((1,)+ (240,240,1))
        
        i=0
        for batch in datagen.flow(x=image, batch_size=1, save_to_dir=save_dir, 
                                           save_prefix='aug', save_format='jpg'):
            i += 1
            if i > n_generated_samples:
                break

In [38]:
apath = DATADIR + '/augmented_data/'
aug_data(pathstodir[1], 7, apath + 'yes')
aug_data(pathstodir[0], 9, apath + 'no')

### After Data augmentation

In [47]:
tempdir = [os.path.join(apath, x) for x in ['yes', 'no']]
data_summary(tempdir[::-1])

Total number of images:  2738
Number of yes(tumour is malignant) images:  1384
Number of no(tumour is benign) images:  1354

Percentage of yes(malignant) images: 50.55
Percentage of no(benign) images: 49.45


### Great now the dataset is increased and dataset is also balanced!