<a href="https://colab.research.google.com/github/robert-pineau/CIND-860-Capstone/blob/main/CIND860_build_numpy_batches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import os
import glob
import re
import random
import pandas as pd
import numpy as np
import math
import random
import cv2

In [None]:
#CSV database with one row per image (read below for the image_id and cancer columns).
dataset_file = "/mnt/wd/CIND860/train.csv"

#Root directory of the png images; its sub-directories are globbed for
#*.png files and the numpy batches are written underneath it.
image_dir = "/mnt/wd/CIND860/database/square_cc_images"

In [None]:
#Matches "<use>/<patient-id>_<image-id>[_<n>].png".
#Group 5 is the patient-id and group 6 is the image-id.
_PNG_NAME_PATTERN = re.compile(r"((train)|(test)|(validate))\/(\d+)\_(\d+)(\_(\d+)){0,1}\.png")


def create_loadable_data(indexes, df, cnn_use: str, final_dir: str, batch_size: int = 12) -> None:
  """Convert png images into fixed-size numpy batch files.

  For every full batch n, three files are written into final_dir:
    {cnn_use}_data_X_{n}.npy  images, shape (batch_size, 224, 224, 3), float64,
                              each image min-max rescaled to the range 0-1
    {cnn_use}_data_Y_{n}.npy  value of the 'cancer' column for each image
    {cnn_use}_data_Z_{n}.npy  image-id of each image, so individual images
                              can be traced back after the fact

  Images left over after the last full batch are not written.

  Args:
    indexes: sequence of png paths named "<use>/<patient-id>_<image-id>[_<n>].png".
    df: DataFrame with an 'image_id' column and a 'cancer' column
      (assumed numeric, as Y is stored as float64).
    cnn_use: prefix for the output file names (e.g. "train", "validate", "test").
    final_dir: output directory; created if it does not exist.
    batch_size: number of images per batch file.

  Raises:
    ValueError: batch_size is smaller than 1, a file name does not match the
      expected pattern, or an image-id does not occur exactly once in df.
    FileNotFoundError: an image could not be read by cv2.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be at least 1, got {batch_size}")

  #np.save does not create missing directories.
  os.makedirs(final_dir, exist_ok=True)

  #Calculate how many full batches there are in total.
  data_len = len(indexes) // batch_size

  #Pull the two needed columns out once, instead of once per image.
  #Plain arrays also make the lookup below purely positional, so it does
  #not depend on what kind of index df carries.
  image_id_column = df['image_id'].to_numpy()
  cancer_column = df['cancer'].to_numpy()

  #Cycle through each batch
  for n in range(data_len):

    #Allocate the whole batch up front; appending image by image would
    #copy the complete batch array again for every image added.
    #float64 is kept so the files on disk keep the dtype they always had.
    X = np.zeros([batch_size,224,224,3])
    Y = []
    Z = []

    start = n*batch_size

    #Cycle through each image for this batch
    for slot, png_name in enumerate(indexes[start:start+batch_size]):
      png_name = str(png_name)

      #Extract the image-id from the image name.
      results = _PNG_NAME_PATTERN.search(png_name)
      if results is None:
        raise ValueError(f"Unexpected image file name: {png_name}")
      image_id = int(results[6])

      #Read image into a numpy array using cv2.
      #cv2.imread signals failure by returning None rather than raising.
      img = cv2.imread(png_name)
      if img is None:
        raise FileNotFoundError(f"Could not read image: {png_name}")

      #Resize image as 224x224
      img = cv2.resize(img, (224, 224))

      #convert images to floats, and rescale from 0-1
      img = cv2.normalize(img, None, 0, 1.0, cv2.NORM_MINMAX, dtype=cv2.CV_32F)

      #Extract class variable (cancer) for this image.
      matches = np.flatnonzero(image_id_column == image_id)
      if len(matches) != 1:
        raise ValueError(
          f"image_id {image_id} occurs {len(matches)} times in the dataset, expected exactly 1")

      X[slot] = img
      Y.append(cancer_column[matches[0]])
      Z.append(image_id)

    #Save the three arrays into a file, one for each variable(X,Y,Z) and each batch.
    np.save(f"{final_dir}/{cnn_use}_data_X_{n}",X)
    np.save(f"{final_dir}/{cnn_use}_data_Y_{n}",np.asarray(Y, dtype=np.float64))
    np.save(f"{final_dir}/{cnn_use}_data_Z_{n}",np.asarray(Z, dtype=np.float64))


In [None]:
#Load the per-image/per-patient details from the database csv file.
#(comma is the default separator of read_csv)
df = pd.read_csv(dataset_file)

In [None]:
#Find every png file one directory level below image_dir and sort each
#file name into the list matching its use (train, validate or test).
#The use is taken from the name of the directory that holds the file.
glob_string = f"{image_dir}/*/*.png"
image_list = glob.glob(glob_string)
random.shuffle(image_list)

#Plain python lists while collecting: appending to a list is cheap, whereas
#np.append copies the complete array for every single file name added.
names_by_use = {"train": [], "validate": [], "test": []}
skipped = 0

#Cycle through every image
for png_name in image_list:
  results = re.search(r"((train)|(test)|(validate))\/(\d+)\_(\d+)(\_(\d+)){0,1}\.png", png_name)
  if results is None:
    #Not a "<use>/<patient-id>_<image-id>.png" style name; leave it out
    #rather than failing on the missing match.
    skipped += 1
    continue

  #assign image name into proper category list
  names_by_use[results[1]].append(png_name)

if skipped:
  print(f"Skipped {skipped} png file(s) with an unexpected name")

#Final shuffle within each category before splitting them up into batches
random.shuffle(names_by_use["train"])
random.shuffle(names_by_use["validate"])
random.shuffle(names_by_use["test"])

#Always convert to numpy arrays, so .shape is available even for a
#category that ended up without any images.
train_ids = np.array(names_by_use["train"], dtype=str)
validate_ids = np.array(names_by_use["validate"], dtype=str)
test_ids = np.array(names_by_use["test"], dtype=str)

print(train_ids.shape)
print(validate_ids.shape)
print(test_ids.shape)

In [None]:
final_dir = f"{image_dir}/numpy"

#np.save does not create missing directories, so make sure the output
#directory is there before any batch is written.
os.makedirs(final_dir, exist_ok=True)

#Take each list(train, validate, test), and create numpy files for each variable, and batch.
create_loadable_data(train_ids, df, "train", final_dir, batch_size=12)
create_loadable_data(validate_ids, df, "validate", final_dir, batch_size=12)
create_loadable_data(test_ids, df, "test", final_dir, batch_size=12)

#Tar and compress(gzip) the numpy files for transfer.
#This is needed, as the CNN training runs significantly faster when loading numpy files
#from the google colab /tmp directory, than it does when running from a
#mounted google drive.
#
#However, the file transfer is 12x faster if the files are compressed before transfer.
#(GZIP provides a 25x compression on numpy image data for the train dataset, 28GB vs 1.1GB)
#
#All three archives are attempted before any failure is reported, so one
#failing split does not prevent the other archives from being built.
failed_archives = []
for split_name in ("train", "validate", "test"):
  status = os.system(f"tar -cpzvf {final_dir}/{split_name}.tgz {final_dir}/{split_name}*.npy")
  if status != 0:
    failed_archives.append(split_name)

if failed_archives:
  raise RuntimeError(f"tar failed for: {', '.join(failed_archives)}")