<a href="https://colab.research.google.com/github/robert-pineau/CIND-860-Capstone/blob/main/CIND860_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CIND860: W24: Advanced Data Analytics Project
Robert M. Pineau
941-049-371

In [1]:
import sys
import os
import glob
import re

from google.colab import drive

import pandas as pd
import numpy as np
import math
import random
import cv2
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import keras
from keras import layers
from keras.layers import Dropout
from keras import models
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout

# Report which GPUs TensorFlow can see (CUDA_VISIBLE_DEVICES above restricts it to device 0).
device_names = tf.config.list_physical_devices('GPU')
print(device_names)

# Only claim a GPU was found when the list is non-empty.  A missing GPU is
# reported as a warning rather than raised, so the notebook can still be run.
if device_names:
  print('Found GPU at: {}'.format(device_names))
else:
  print("WARNING: GPU device not found")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Found GPU at: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
#Data Generator, this is used to provide to the training(fit) method
#one batch of data at a time.  This is needed because all of the data cannot fit into memory at the same
#time, so it needs to be loaded by the batch.
#
#This uses a custom class for this, with expected standard methods __init__, __len__, and __getitem__
#The __init__ method simply sets some internal values such as batch size, overall length, etc.
#
#The __len__ method returns the number of batches.
#The __getitem__ method loads the appropriate batch from file: X (the image) and Y (the class variable),
#and returns it to the fit method.
class DataGenerator(keras.utils.Sequence):
  """Serve pre-batched numpy files to Keras one batch at a time.

  Batch ``i`` is read from two files inside ``numpy_dir``:
  ``{cnn_use}_data_X_{i}.npy`` and ``{cnn_use}_data_Y_{i}.npy``.

  Args:
    cnn_use: file-name prefix selecting the data split
      (the notebook uses "train", "validate" and "test").
    numpy_dir: directory that holds the .npy batch files.
    total_size: total number of samples across all batch files of the split.
    batch_size: number of samples stored per batch file; must be positive.
    **kwargs: forwarded unchanged to the base-class constructor.

  Raises:
    ValueError: if ``batch_size`` is not positive.
  """

  def __init__(self, cnn_use, numpy_dir, total_size, batch_size=12, **kwargs):
    # Newer Keras releases expect Sequence subclasses to run the base constructor.
    super().__init__(**kwargs)
    if batch_size <= 0:
      raise ValueError(f"batch_size must be positive, got {batch_size}")
    self.cnn_use = cnn_use
    self.total_size = total_size
    self.batch_size = batch_size
    self.numpy_dir = numpy_dir

  def __len__(self):
    """Return the number of complete batches available."""
    # Integer floor division: a trailing partial batch (if any) is not counted.
    return int(self.total_size // self.batch_size)

  def __getitem__(self, subset_index):
    """Load and return the (X, Y) arrays stored for batch ``subset_index``."""
    X = np.load(f"{self.numpy_dir}/{self.cnn_use}_data_X_{subset_index}.npy")
    Y = np.load(f"{self.numpy_dir}/{self.cnn_use}_data_Y_{subset_index}.npy")
    return X, Y



#These two functions work very similarly to __getitem__ above,
#but are not part of the DataGenerator class.  They are used to
#extract the test and validate data itself in order to perform
#manual checks on the algorithm's performance (using predict instead of evaluate).

def get_test_X_Y_Z(numpy_dir, subset_index):
    """Load one batch of the test split from ``numpy_dir``.

    Reads the ``test_data_X``, ``test_data_Y`` and ``test_data_Z`` files whose
    name ends in ``_{subset_index}.npy`` and returns the three arrays as an
    ``(X, Y, Z)`` tuple.
    """
    batch_files = (
        f"{numpy_dir}/test_data_X_{subset_index}.npy",
        f"{numpy_dir}/test_data_Y_{subset_index}.npy",
        f"{numpy_dir}/test_data_Z_{subset_index}.npy",
    )
    return tuple(np.load(batch_file) for batch_file in batch_files)

def get_val_X_Y_Z(numpy_dir, subset_index):
    """Load one batch of the validation split from ``numpy_dir``.

    Reads the ``validate_data_X``, ``validate_data_Y`` and ``validate_data_Z``
    files whose name ends in ``_{subset_index}.npy`` and returns the three
    arrays as an ``(X, Y, Z)`` tuple.
    """
    batch_files = (
        f"{numpy_dir}/validate_data_X_{subset_index}.npy",
        f"{numpy_dir}/validate_data_Y_{subset_index}.npy",
        f"{numpy_dir}/validate_data_Z_{subset_index}.npy",
    )
    return tuple(np.load(batch_file) for batch_file in batch_files)

In [3]:
#Previously all training, validate, and test data was loaded from image files and saved in numpy format in batches of 12.
#This grabs those numpy files from my google drive and copies them to the runtime drive for this session.
#Doing it this way, the DataGenerator is able to load the data from the local machine, which saves time during training, validation, and testing.
!date
drive.mount('/content/drive',force_remount=True)
remote_image_dir = "/content/drive/MyDrive/Colab Notebooks/square_cc_images"
remote_numpy_dir = f"{remote_image_dir}/numpy"
glob_string = f"{remote_numpy_dir}/*.npy"
local_numpy_dir = "/tmp/rpineau_numpy"
os.system(f"mkdir {local_numpy_dir}")


#numpy_list = glob.glob(os.path.join("", glob_string))
#i=0
#for numpy_name in numpy_list:
#  results = re.search(r"((train)|(test)|(validate))\_data_((X)|(Y))\_(\d+).npy", numpy_name)
#  os.system(f"cp \"{numpy_name}\" \"{local_numpy_dir}/{results[1]}_data_{results[5]}_{results[8]}.npy\"")
#  if i%20 == 0:
#    print(f"Copied {i} numpy files of {len(numpy_list)}")
#  i = i+1

#Had to abandon the PYTHON way above, due to it taking over an hour to transfer all the numpy files.
#however, even worse on many attempts it froze, and did not work.
#After tar & gzip the files, the below shell commands are more reliable, and much quicker.
#(13 minutes versus over an hour)
#Due to the fact gzip compression achieves a nearly 90% compression ratio.

!cp /content/drive/MyDrive/Colab\ Notebooks/square_cc_images/numpy/train.tgz /tmp/rpineau_numpy/train.tgz
!cp /content/drive/MyDrive/Colab\ Notebooks/square_cc_images/numpy/validate.tgz /tmp/rpineau_numpy/validate.tgz
!cp /content/drive/MyDrive/Colab\ Notebooks/square_cc_images/numpy/test.tgz /tmp/rpineau_numpy/test.tgz

!cd /tmp/rpineau_numpy && /usr/bin/tar -zxpvf train.tgz
!cd /tmp/rpineau_numpy && /usr/bin/tar -zxpvf validate.tgz
!cd /tmp/rpineau_numpy && /usr/bin/tar -zxpvf test.tgz


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
train_data_X_1414.npy
train_data_X_1415.npy
train_data_X_1416.npy
train_data_X_1417.npy
train_data_X_1418.npy
train_data_X_1419.npy
train_data_X_141.npy
train_data_X_1420.npy
train_data_X_1421.npy
train_data_X_1422.npy
train_data_X_1423.npy
train_data_X_1424.npy
train_data_X_1425.npy
train_data_X_1426.npy
train_data_X_1427.npy
train_data_X_1428.npy
train_data_X_1429.npy
train_data_X_142.npy
train_data_X_1430.npy
train_data_X_1431.npy
train_data_X_1432.npy
train_data_X_1433.npy
train_data_X_1434.npy
train_data_X_1435.npy
train_data_X_1436.npy
train_data_X_1437.npy
train_data_X_1438.npy
train_data_X_1439.npy
train_data_X_143.npy
train_data_X_1440.npy
train_data_X_1441.npy
train_data_X_1442.npy
train_data_X_1443.npy
train_data_X_1444.npy
train_data_X_1445.npy
train_data_X_1446.npy
train_data_X_1447.npy
train_data_X_1448.npy
train_data_X_1449.npy
train_data_X_144.npy
train_data_X_1450.npy
train_data_X_1451.npy
train_data_X_14

In [4]:
#Need to count the entries contained in all the numpy files.
#Z contains the original image_id.
#Since Z is easy to load, only count Z.

local_numpy_dir = "/tmp/rpineau_numpy"
these_counts = {}

# For every split, add up the lengths of all of its Z arrays.
for split_name in ("train", "validate", "test"):
  glob_string = f"{local_numpy_dir}/{split_name}_data_Z_*.npy"
  these_counts[split_name] = sum(
      len(np.load(z_path)) for z_path in glob.glob(glob_string)
  )

print(f"{these_counts}")

{'train': 21408, 'validate': 216, 'test': 216}


In [5]:
#Initialize the train, validate, and test  DataGenerators(as explained above in the class definition)
# One DataGenerator per split, created in train / validate / test order,
# each reading from the local numpy directory with 12 samples per batch.
training_generator, validation_generator, test_generator = [
    DataGenerator(split_name, local_numpy_dir, these_counts[split_name], batch_size=12)
    for split_name in ("train", "validate", "test")
]

Define a CNN model from scratch:

In [7]:
#This model is very similar to AlexNet (figured using a made in Toronto solution as a starting point was a good idea!)
def _add_conv_bn(model, filters, kernel_size, strides=1, padding='same', **conv_kwargs):
  """Append a ReLU Conv2D (he_normal init) followed by BatchNormalization to ``model``."""
  model.add(Conv2D(filters, kernel_size=kernel_size, strides=strides,
                   padding=padding, activation='relu',
                   kernel_initializer='he_normal', **conv_kwargs))
  model.add(layers.BatchNormalization())


def _add_max_pool(model):
  """Append the 3x3, stride-2, 'valid' max-pooling layer used between conv stages."""
  model.add(MaxPooling2D(pool_size=(3,3), strides=(2,2),
                         padding='valid', data_format=None))


cnn_model = models.Sequential()

# Convolutional feature extractor; the first layer fixes the 224x224x3 input.
_add_conv_bn(cnn_model, 96, (11,11), strides=4, padding='valid',
             input_shape=(224, 224, 3))
_add_max_pool(cnn_model)

_add_conv_bn(cnn_model, 256, (5,5))
_add_max_pool(cnn_model)

for conv_filters in (384, 384, 256, 256):
  _add_conv_bn(cnn_model, conv_filters, (3,3))
_add_max_pool(cnn_model)

# Fully connected head: every hidden layer except the last is followed by 50% dropout.
cnn_model.add(Flatten())
for dense_units in (8192, 4096, 1024, 512, 256):
  cnn_model.add(Dense(dense_units, activation='relu'))
  cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(128, activation='relu'))

# binary classification, using a final layer with 1 output and sigmoid activation
cnn_model.add(layers.Dense(1, activation='sigmoid'))

optimizer = keras.optimizers.Adam(learning_rate=0.00001)
cnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 54, 54, 96)        34944     
                                                                 
 batch_normalization (Batch  (None, 54, 54, 96)        384       
 Normalization)                                                  
                                                                 
 max_pooling2d (MaxPooling2  (None, 26, 26, 96)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 26, 26, 256)       614656    
                                                                 
 batch_normalization_1 (Bat  (None, 26, 26, 256)       1024      
 chNormalization)                                                
                                                        

In [8]:
# Train for 50 epochs on the training batches, passing the validation batches as validation data.
cnn_history = cnn_model.fit(
    training_generator,
    validation_data=validation_generator,
    epochs=50,
)

# Evaluate on the test batches; the model was compiled with a single 'accuracy'
# metric, so evaluate() yields the loss followed by the accuracy.
test_loss, test_acc = cnn_model.evaluate(test_generator)
print(f"Test Loss: {np.round(test_loss,3)} Test Accuracy: {np.round(test_acc*100,3)}%")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 3.05 Test Accuracy: 55.556%
