# Data Preparation
This section imports the required libraries and loads the train and test datasets from Google Drive. Beforehand, "train.csv" and "test.csv" were uploaded to a "BT4012" folder on my Google Drive.

In [2]:
# import the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow.keras import layers, models
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense,BatchNormalization
from keras.callbacks import EarlyStopping
from keras.layers import AveragePooling2D
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

In [3]:
# mount google drive into the colab runtime at /content/gdrive
# (google.colab is only importable inside Google Colab)

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# import the train dataset from google drive
# use the absolute path under the mount point (/content/gdrive) so the read
# does not depend on the notebook's current working directory

train = pd.read_csv("/content/gdrive/My Drive/BT4012/train.csv")

In [5]:
# preview the first 5 rows of the train dataset

train.head(n=5)

Unnamed: 0,r0c0,r0c1,r0c2,r0c3,r0c4,r0c5,r0c6,r0c7,r0c8,r0c9,r0c10,r0c11,r0c12,r0c13,r0c14,r0c15,r0c16,r0c17,r0c18,r0c19,r1c0,r1c1,r1c2,r1c3,r1c4,r1c5,r1c6,r1c7,r1c8,r1c9,r1c10,r1c11,r1c12,r1c13,r1c14,r1c15,r1c16,r1c17,r1c18,r1c19,...,r18c1,r18c2,r18c3,r18c4,r18c5,r18c6,r18c7,r18c8,r18c9,r18c10,r18c11,r18c12,r18c13,r18c14,r18c15,r18c16,r18c17,r18c18,r18c19,r19c0,r19c1,r19c2,r19c3,r19c4,r19c5,r19c6,r19c7,r19c8,r19c9,r19c10,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19,label
0,1,1,1,1,28,43,52,255,255,255,255,255,80,39,1,1,1,1,1,1,1,1,1,45,196,255,255,255,255,255,255,255,255,241,86,56,1,1,1,1,...,1,1,1,1,67,93,255,255,255,255,255,255,255,219,86,86,34,1,1,1,1,1,1,1,1,2,80,191,43,43,191,255,52,34,1,1,1,1,1,0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
2,1,1,128,255,255,255,255,255,255,255,255,255,255,255,255,255,255,128,1,1,171,171,213,255,255,255,255,255,255,255,255,255,255,255,255,255,255,213,171,171,...,171,171,171,171,171,171,213,255,255,255,255,255,255,255,255,255,213,171,171,1,1,1,1,1,1,1,128,255,255,255,255,255,255,255,255,255,128,1,1,0
3,53,54,61,91,141,172,197,223,233,246,248,249,244,231,205,174,118,87,43,33,29,33,51,98,179,216,246,253,255,255,255,254,253,248,226,200,151,123,66,53,...,25,60,87,134,161,184,223,237,247,248,248,246,241,222,198,156,132,69,55,2,2,2,33,87,116,140,160,168,180,181,184,185,187,169,147,106,82,34,23,1
4,46,46,46,46,36,36,41,41,41,41,55,55,81,81,157,157,203,203,203,203,33,33,44,44,38,38,52,52,63,63,114,114,157,157,198,198,215,215,195,195,...,32,35,35,20,20,37,37,39,39,43,43,95,95,142,142,186,186,212,212,30,30,44,44,28,28,52,52,33,33,38,38,65,65,95,95,149,149,205,205,0


In [6]:
# summarise the train dataset: number of rows, number of columns, dtypes and memory usage

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72139 entries, 0 to 72138
Columns: 401 entries, r0c0 to label
dtypes: int64(401)
memory usage: 220.7 MB


In [7]:
# import the test dataset from google drive
# use the absolute path under the mount point (/content/gdrive) so the read
# does not depend on the notebook's current working directory

test = pd.read_csv("/content/gdrive/My Drive/BT4012/test.csv")

In [8]:
# preview the first 5 rows of the test dataset

test.head(n=5)

Unnamed: 0,Id,r0c0,r0c1,r0c2,r0c3,r0c4,r0c5,r0c6,r0c7,r0c8,r0c9,r0c10,r0c11,r0c12,r0c13,r0c14,r0c15,r0c16,r0c17,r0c18,r0c19,r1c0,r1c1,r1c2,r1c3,r1c4,r1c5,r1c6,r1c7,r1c8,r1c9,r1c10,r1c11,r1c12,r1c13,r1c14,r1c15,r1c16,r1c17,r1c18,...,r18c0,r18c1,r18c2,r18c3,r18c4,r18c5,r18c6,r18c7,r18c8,r18c9,r18c10,r18c11,r18c12,r18c13,r18c14,r18c15,r18c16,r18c17,r18c18,r18c19,r19c0,r19c1,r19c2,r19c3,r19c4,r19c5,r19c6,r19c7,r19c8,r19c9,r19c10,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19
0,0,65,100,169,107,76,194,190,183,177,174,191,192,194,196,197,160,145,115,63,37,185,193,211,201,196,225,226,227,196,181,233,236,242,212,197,232,213,174,104,...,60,70,90,105,113,214,206,191,127,95,37,34,29,23,20,17,22,33,27,24,45,60,90,150,180,191,160,99,33,1,29,29,29,38,43,6,14,29,17,12
1,1,1,1,1,171,255,255,255,255,255,255,255,255,255,255,255,142,1,1,1,1,76,171,171,227,255,255,255,255,255,255,255,255,255,255,255,218,171,171,57,...,171,199,255,199,171,171,171,171,171,171,171,171,170,170,171,171,171,171,171,171,1,86,255,86,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,2,1,1,1,1,39,59,238,255,255,145,1,1,1,1,1,1,1,1,1,1,1,1,1,1,167,255,255,255,220,78,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1,1,1,114,250,255,255,199,54,1,1,1,1,1,1,1,1,1,1,1,1,1,1,49,76,255,187,59,24,1,1
3,3,1,1,1,1,1,1,63,63,255,255,255,255,255,255,1,1,1,1,1,1,1,1,1,1,1,1,221,221,255,255,255,255,255,255,1,1,1,1,1,...,1,1,1,1,2,2,1,1,2,2,247,247,255,255,255,255,183,183,1,1,1,1,1,1,1,1,1,1,1,1,207,207,255,255,255,255,183,183,1,1
4,4,1,1,1,1,1,255,255,255,255,255,255,255,128,1,1,1,1,1,1,1,146,146,146,146,146,255,255,255,255,255,255,255,201,146,146,146,146,73,1,...,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255


In [9]:
# summarise the test dataset: number of rows, number of columns, dtypes and memory usage

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30917 entries, 0 to 30916
Columns: 401 entries, Id to r19c19
dtypes: int64(401)
memory usage: 94.6 MB


# Data Preprocessing
This section includes separating train.csv into features and label, dropping the Id column from test.csv so that only features remain, scaling the train and test features, splitting the training data into training and validation sets, converting the dataframes into NumPy arrays, and reshaping the feature arrays into the image shape the model expects.

In [10]:
# split the train dataset into the feature columns (X_train) and the target column (y_train)

y_train = train["label"]
X_train = train.drop(columns="label")

In [11]:
# scale the features by dividing by 255 (assumes 8-bit pixel intensities; the
# preview above shows values up to 255)
# the scaled frame is derived from the raw `train` dataframe rather than
# dividing X_train in place, so re-running this cell always gives the same
# result instead of dividing by 255 a second time

X_train = train.loc[:, train.columns != "label"] / 255

In [12]:
# eyeball check: the first 5 rows should now hold the scaled values

X_train.head(n=5)

Unnamed: 0,r0c0,r0c1,r0c2,r0c3,r0c4,r0c5,r0c6,r0c7,r0c8,r0c9,r0c10,r0c11,r0c12,r0c13,r0c14,r0c15,r0c16,r0c17,r0c18,r0c19,r1c0,r1c1,r1c2,r1c3,r1c4,r1c5,r1c6,r1c7,r1c8,r1c9,r1c10,r1c11,r1c12,r1c13,r1c14,r1c15,r1c16,r1c17,r1c18,r1c19,...,r18c0,r18c1,r18c2,r18c3,r18c4,r18c5,r18c6,r18c7,r18c8,r18c9,r18c10,r18c11,r18c12,r18c13,r18c14,r18c15,r18c16,r18c17,r18c18,r18c19,r19c0,r19c1,r19c2,r19c3,r19c4,r19c5,r19c6,r19c7,r19c8,r19c9,r19c10,r19c11,r19c12,r19c13,r19c14,r19c15,r19c16,r19c17,r19c18,r19c19
0,0.003922,0.003922,0.003922,0.003922,0.109804,0.168627,0.203922,1.0,1.0,1.0,1.0,1.0,0.313725,0.152941,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.176471,0.768627,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.945098,0.337255,0.219608,0.003922,0.003922,0.003922,0.003922,...,0.003922,0.003922,0.003922,0.003922,0.003922,0.262745,0.364706,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.858824,0.337255,0.337255,0.133333,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.007843,0.313725,0.74902,0.168627,0.168627,0.74902,1.0,0.203922,0.133333,0.003922,0.003922,0.003922,0.003922,0.003922
1,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,...,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922
2,0.003922,0.003922,0.501961,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.501961,0.003922,0.003922,0.670588,0.670588,0.835294,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.835294,0.670588,0.670588,...,0.670588,0.670588,0.670588,0.670588,0.670588,0.670588,0.670588,0.835294,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.835294,0.670588,0.670588,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.501961,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.501961,0.003922,0.003922
3,0.207843,0.211765,0.239216,0.356863,0.552941,0.67451,0.772549,0.87451,0.913725,0.964706,0.972549,0.976471,0.956863,0.905882,0.803922,0.682353,0.462745,0.341176,0.168627,0.129412,0.113725,0.129412,0.2,0.384314,0.701961,0.847059,0.964706,0.992157,1.0,1.0,1.0,0.996078,0.992157,0.972549,0.886275,0.784314,0.592157,0.482353,0.258824,0.207843,...,0.066667,0.098039,0.235294,0.341176,0.52549,0.631373,0.721569,0.87451,0.929412,0.968627,0.972549,0.972549,0.964706,0.945098,0.870588,0.776471,0.611765,0.517647,0.270588,0.215686,0.007843,0.007843,0.007843,0.129412,0.341176,0.454902,0.54902,0.627451,0.658824,0.705882,0.709804,0.721569,0.72549,0.733333,0.662745,0.576471,0.415686,0.321569,0.133333,0.090196
4,0.180392,0.180392,0.180392,0.180392,0.141176,0.141176,0.160784,0.160784,0.160784,0.160784,0.215686,0.215686,0.317647,0.317647,0.615686,0.615686,0.796078,0.796078,0.796078,0.796078,0.129412,0.129412,0.172549,0.172549,0.14902,0.14902,0.203922,0.203922,0.247059,0.247059,0.447059,0.447059,0.615686,0.615686,0.776471,0.776471,0.843137,0.843137,0.764706,0.764706,...,0.12549,0.12549,0.137255,0.137255,0.078431,0.078431,0.145098,0.145098,0.152941,0.152941,0.168627,0.168627,0.372549,0.372549,0.556863,0.556863,0.729412,0.729412,0.831373,0.831373,0.117647,0.117647,0.172549,0.172549,0.109804,0.109804,0.203922,0.203922,0.129412,0.129412,0.14902,0.14902,0.254902,0.254902,0.372549,0.372549,0.584314,0.584314,0.803922,0.803922


In [13]:
# hold out 20% of the labelled data for validation
# X, y             -> used for training
# X_valid, y_valid -> used for validation
# random_state is fixed so the shuffled split is reproducible

X, X_valid, y, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

In [14]:
# turn each of the four split results from the previous cell into a numpy array

X, X_valid, y, y_valid = (
    np.array(part) for part in (X, X_valid, y, y_valid)
)

In [15]:
# reshape X and X_valid into (samples, 20, 20, 1) so they match the model's
# input_shape of (20, 20, 1)
# -1 lets numpy infer the number of samples, so this cell keeps working if the
# dataset size, the validation fraction or the random seed changes

X = X.reshape(-1, 20, 20, 1)
X_valid = X_valid.reshape(-1, 20, 20, 1)

In [16]:
# keep only the feature columns of the test dataset by discarding "Id"
# errors="ignore" keeps the cell re-runnable once "Id" has already been removed

test = test.drop(columns="Id", errors="ignore")

In [17]:
# scale the test features the same way as the train features: divide by 255
# NOTE: run this cell only once per load of test.csv, a second run divides by 255 again

test = test / 255

In [18]:
# copy the scaled test features into a numpy array named X_test

X_test = test.to_numpy(copy=True)

In [19]:
# reshape X_test into (samples, 20, 20, 1) so it matches the model's
# input_shape of (20, 20, 1)
# -1 lets numpy infer the number of samples instead of hard-coding the row
# count of one particular test.csv

X_test = X_test.reshape(-1, 20, 20, 1)

# Model Building
This section includes creating the CNN model, creating the early stopping callback, training and validating the model (using train and validation datasets and early stopping), and predicting the output of the test dataset

In [20]:
# CNN for binary classification of 20x20 single-channel images.
# The layers are listed in the order they are applied.
# NOTE(review): the layers come from `keras` while the optimizer comes from
# `tf.keras`; confirm both resolve to the same Keras implementation in the
# runtime being used.
model = Sequential([
    # block 1: convolution -> ReLU -> max pooling -> batch normalization
    Conv2D(96, (3, 3), padding="same", input_shape=(20, 20, 1)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),

    # block 2: convolution -> ReLU -> max pooling -> batch normalization
    Conv2D(128, (3, 3), padding="same"),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),

    # dropout on the extracted feature maps
    # NOTE(review): the argument is the fraction of units dropped, so 0.8
    # drops 80% of them - confirm this is intended
    Dropout(0.8),

    # classifier head: flatten, then two dense ReLU layers each followed by dropout
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),

    # output layer: a single sigmoid unit giving the predicted probability
    Dense(1, activation='sigmoid', name='preds'),
])

# compile with binary cross-entropy loss and the Adam optimizer, tracking accuracy
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# early stopping callback: stop training once the monitored validation loss
# has not improved by at least min_delta for `patience` consecutive epochs,
# then restore the weights from the best epoch

early_stopping_monitor = EarlyStopping(
    monitor='val_loss',  # the Keras default, spelled out for readability
    mode='auto',
    min_delta=0.0001,
    patience=10,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

In [None]:
# fit the model for up to 100 epochs in batches of 50, validating on
# (X_valid, y_valid) each epoch and letting the early stopping callback end training

model.fit(
    x=X,
    y=y,
    batch_size=50,
    epochs=100,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping_monitor],
)

In [None]:
# run the trained model on the test images; the sigmoid output layer yields
# one value per test row

res = model.predict(x=X_test)

# Finalization
This section includes converting the result into a pandas dataframe and writing the result to a csv file

In [None]:
# convert the predictions into a pandas dataframe with a single "Predicted"
# column, then add an "Id" column that mirrors the row index

res_df = (
    pd.DataFrame(res, columns=['Predicted'])
    .assign(Id=lambda frame: frame.index)
)

In [None]:
# write the result dataframe to result.csv in the current working directory
# index = False leaves the dataframe's row index out of the file
res_df.to_csv("result.csv", index = False)