# 1- Preprocessing

* Read original data
* Filter and only keep labels D, C, M, H (8 original labels reduced to 4 labels, for training efficiency and resource limitations)
* Splitting 10% held-out data for testing
* Create Train Set A (~1600 samples for each label, fulfilled by repeating samples)

This notebook produces:
* `test_imgs.zip`: zip file containing iamges from 90%/10% train/test split of original data
* `test_labels.csv`: filename/label for images in test_imgs.zip

* `train_imgs.zip`: zip file of training images from 90%/10% train/test split
    * unbalanced, has class imbalance problem
* `train_labels.csv`: filename/label for images in train_imgs.zip

* `train_rep_imgs.zip`: zip file of training set of images
    * balanced classes through sample repetition
* `train_rep_labels.csv`: filename/label for images in train_rep_imgs.zip


--Notebook was run on Google Colab--

In [1]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
!unzip "/content/drive/MyDrive/DS4440_Project/data/original/preprocessed_images.zip"

In [3]:
# Imports
import pandas as pd

import os
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [5]:
odir_df = pd.read_csv('/content/drive/MyDrive/DS4440_Project/data/original/odir_labels.csv')

'D': "Diabetes",
'C': "Cataract",
'H': "Hypertension",
'M': "Pathological Myopia"

In [6]:
# Filter to 4 classes
labels = ['D', 'C', 'M', 'H']
odir_df = odir_df[odir_df['labels'].isin(labels)]

In [7]:
# Sample Distribution Check
x = list(odir_df.labels.unique())
y_before = [odir_df['labels'].value_counts()[label] for label in x]

for cla in zip(x, y_before):
    print(cla[0], cla[1])

D 1608
O 708
M 232
H 128


In [8]:
# Split to train and test, 10% holdout in test
X_train, X_test, y_train, y_test = train_test_split(list(odir_df.filename), list(odir_df.labels), test_size=0.1, random_state=42)

In [9]:
train_df = pd.DataFrame({'filename': X_train, 'labels': y_train})
# Set aside and save test df, 10% holdout data
test_df = pd.DataFrame({'filename': X_test, 'labels': y_test})

In [10]:
# trainA_df to store filename:label for repeated dataset
trainA_df = train_df.copy()

In [16]:
rep_folder = "train_rep_imgs"
train_folder = "train_imgs"
test_folder = "test_imgs"
os.makedirs(rep_folder, exist_ok=True)
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [31]:
# import initial test images, pre-repetition of samples
for i, row in tqdm(train_df.iterrows()):
  img = row['filename']
  from_path = f'/content/preprocessed_images/{img}'
  to_path = f'/content/{train_folder}/{img}'
  !cp $from_path $to_path

0it [00:00, ?it/s]

In [12]:
# copy over test images to test img folder
for i, row in tqdm(test_df.iterrows()):
  img = row['filename']
  from_path = f'/content/preprocessed_images/{img}'
  to_path = f'/content/{test_folder}/{img}'
  !cp $from_path $to_path

0it [00:00, ?it/s]

In [13]:
# import initial test images, pre-repetition of samples
for i, row in tqdm(trainA_df.iterrows()):
  img = row['filename']
  from_path = f'/content/{train_folder}/{img}'
  to_path = f'/content/{rep_folder}/{img}'
  !cp $from_path $to_path

0it [00:00, ?it/s]

In [14]:
rep_filenames = []
rep_labels = []
goal_samples=1600
BATCH_SIZE=30

for label in tqdm(labels):
 
  cur_samples = trainA_df['labels'].value_counts()[label]

  to_generate = goal_samples - cur_samples
  num_batches = to_generate // BATCH_SIZE

  label_df = trainA_df[trainA_df['labels']==label].reset_index(drop=True).copy()
  i = 0
  j = 30

  if to_generate < BATCH_SIZE: # if goal is met, no need to generate
    continue
  
  pref = f"REP_"
  for batch in range(num_batches):
    # get images of the current label
    samples_to_add = label_df.loc[i:j].copy()
    for ind, row in samples_to_add.iterrows():
      filename = row['filename']
      # craft new filename
      new_filename = f"{pref}{batch}_{filename}"

      # copy image over to new folder under new filename
      from_path = f'/content/train_imgs/{filename}'
      to_path = f'/content/{rep_folder}/{new_filename}'
      !cp $from_path $to_path

      rep_filenames.append(new_filename)
      rep_labels.append(label) # append letter label

    # update count and index
    i += 30
    j += 30
    if i < len(label_df) and j > len(label_df):
      j = len(label_df)
    if i > len(label_df):
      # update variable for filename
      i = 0
      j = 30



  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Add augmented labels to old df
# create df for augmented filename and labels
rep_dict = {'filename': rep_filenames, 'labels': rep_labels}
rep_df = pd.DataFrame(rep_dict)

# merge the two dataframes containing aug and original data
trainA_df = pd.concat([trainA_df, rep_df])

In [18]:
# Sample Distribution Check
def get_distribution(data_df):
  x = list(data_df.labels.unique())
  y_after = [data_df['labels'].value_counts()[label] for label in x]

  for cla in zip(x, y_after):
      print(cla[0], cla[1])

In [32]:
print("original train set")
get_distribution(train_df)

print("\ntest  train set")
get_distribution(test_df)

print("\nrepetition train set")
get_distribution(trainA_df)

original train set
D 1450
O 628
M 218
H 112

test  train set
D 158
O 80
H 16
M 14

repetition train set
D 1605
O 1617
M 1630
H 1607


In [20]:
# Export new CSV
train_df.to_csv('/content/train_labels.csv', index=False)
test_df.to_csv('/content/test_labels.csv', index=False)
trainA_df.to_csv('/content/train_rep_labels.csv', index=False)

In [34]:
# check that the original image were moved
test_imgs = os.listdir('/content/test_imgs')
train_imgs = os.listdir('/content/train_imgs')
trainA_imgs = os.listdir('/content/train_rep_imgs')

In [36]:
for i, row in test_df.iterrows():
  img = row['filename']
  if img not in test_imgs:
    print(f"{img} not copied")

for i, row in train_df.iterrows():
  img = row['filename']
  if img not in train_imgs:
    print(f"{img} not copied")

for i, row in trainA_df.iterrows():
  img = row['filename']
  if img not in trainA_imgs:
    print(f"{img} not copied")

In [37]:
!zip -r /content/test_imgs.zip /content/test_imgs
!zip -r /content/train_imgs.zip /content/train_imgs
!zip -r /content/train_rep_imgs.zip /content/train_rep_imgs

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/train_rep_imgs/REP_1_946_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_8_1056_left.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_6_2048_left.jpg (deflated 1%)
  adding: content/train_rep_imgs/223_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/4288_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/4407_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/1534_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_5_831_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/520_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_2_1310_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/487_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_11_95_right.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_1_1377_left.jpg (deflated 1%)
  adding: content/train_rep_imgs/REP_8_323_left.jpg (deflated 1%)
  adding: content/train_rep_im

In [None]:
# Download New Data
from google.colab import files
files.download("/content/test_imgs.zip")
files.download("/content/test_labels.csv")

files.download("/content/train_imgs.zip")
files.download("/content/train_labels.csv")

files.download("/content/train_rep_imgs.zip")
files.download("/content/train_rep_labels.csv")