In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
 %cd /content/gdrive/My Drive/Cornell/2020-2021/Biomedical ML Final Project - Melted Paper/

/content/gdrive/.shortcut-targets-by-id/13OShW0589KdYJRWaN9GrhtF2vtwdrUUW/Biomedical ML Final Project - Melted Paper


# Augment Dataset
### Perform random number of image transformations to balance the classes

In [3]:
import os, time, cv2, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import skimage as sk
import skimage.transform

In [4]:
augment_path = os.path.join('Datasets', 'final-dataset-augment')
normal_images_path = os.path.join('Datasets', 'final-dataset', 'N')
benign_images_path = os.path.join('Datasets', 'final-dataset', 'B')
malignant_images_path = os.path.join('Datasets', 'final-dataset', 'M')

In [5]:
normal_images = os.listdir(normal_images_path)
benign_images = os.listdir(benign_images_path)
malignant_images = os.listdir(malignant_images_path)

In [6]:
images, labels = [], []

In [7]:
for img in normal_images: 
  labels.append('N')
  image = cv2.imread(os.path.join(normal_images_path, img), cv2.IMREAD_GRAYSCALE)
  image = image.reshape((image.shape[0], image.shape[1], 1))
  images.append(image)

In [8]:
for img in benign_images: 
  labels.append('B')
  image = cv2.imread(os.path.join(benign_images_path, img), cv2.IMREAD_GRAYSCALE)
  image = image.reshape((image.shape[0], image.shape[1], 1))
  images.append(image)

In [9]:
for img in malignant_images: 
  labels.append('M')
  image = cv2.imread(os.path.join(malignant_images_path, img), cv2.IMREAD_GRAYSCALE)
  image = image.reshape((image.shape[0], image.shape[1], 1))
  images.append(image)

In [10]:
le = LabelEncoder()
le_labels = to_categorical(le.fit_transform(labels))

In [11]:
def get_class_count(labels):
  '''
  Get the number of images in each class. 
  '''
  num_classes = len(labels[0])
  counts = np.zeros(num_classes)
  for label in labels:
    for i in range(num_classes):
      counts[i] += label[i]
  return counts.tolist()

In [12]:
def random_shearing(img):
  tf = sk.transform.AffineTransform(shear=random.uniform(-0.3, 0.3))
  return sk.transform.warp(img, tf, order=1, preserve_range=True, mode='wrap')

def random_noise(img):
  return sk.util.random_noise(img)

def random_rotation(img):
  return sk.transform.rotate(img, random.uniform(-30, 30))

def horizontal_flip(img):
  return img[:, ::-1]

In [13]:
transformation_functions = {
  'shear': random_shearing,
  'rotate': random_rotation,
  'noise': random_noise,
  'horizontal_flip': horizontal_flip,
}

In [14]:
def transform_images(img, transforms: dict):
  '''
  Perform a random number of image transformations.
  '''
  num_transformations = random.randint(0, len(transforms))
  transformed_image = img
  for i in range(num_transformations):
    key = random.choice(list(transforms))
    transformed_image = transforms[key](img)

  return transformed_image

In [15]:
def generate_more_images(images, labels, transformation_functions):
  '''
  Determine the number of images needed in each class for balance, 
  then transform images to augment. 
  '''
  more_images = images
  more_labels = labels

  class_balance = get_class_count(labels)
  img_to_add = [max(class_balance) - i for i in class_balance]
  
  for i in range(len(img_to_add)):
    if int(img_to_add[i]) == 0:
      continue
    label = np.zeros(len(img_to_add))
    label[i] = 1
    class_label_indices = [i for i, x in enumerate(labels) if np.array_equal(x, label)]
    class_images = [images[i] for i in class_label_indices]

    for k in range(int(img_to_add[i])):
      transformed_image = transform_images(class_images[k % len(class_images)], transformation_functions)
      transformed_image = transformed_image.reshape(1, transformed_image.shape[0], transformed_image.shape[1], 1)
  
      more_images = np.append(more_images, transformed_image, axis=0)
      more_labels = np.append(more_labels, label.reshape(1, len(label)), axis=0)
  return more_images, more_labels

In [16]:
augmented_images, augmented_labels = generate_more_images(images, le_labels, transformation_functions)

In [17]:
print('Original number of images: {}'.format(len(augmented_images)))
print('Augmented number of images: {}'.format(len(images)))

Original number of images: 3948
Augmented number of images: 2616


In [None]:
for i in range(len(augmented_images)): 
  path = os.path.join(augment_path, augmented_labels[i], 'image' + str(i) + '.jpg')
  cv2.imwrite(path, augmented_images[i])