### Kaggle competition_ overfitters
Sanaz Kaviani, sanaz.kaviani@umontreal.ca, matricule: 2111567
Mersede Mokri, mersede.mokri@umontreal.ca, matricule: 2111556
Hamed Naseri, hamed.naseri@polymtl.ca, matricule: 2051414


### In the following part, the required libraries are imported, and the preprocessing function is defined.

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import accuracy_score
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # used to Disable Tensorflow debugging informatio
import tensorflow as tf
from collections import Counter
import matplotlib.pyplot as plt
from skimage import measure, morphology
import keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.optimizers import SGD
from keras.layers import BatchNormalization
from keras.layers import LeakyReLU
from keras.preprocessing.image import ImageDataGenerator
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Pre-Processing 


In [None]:
""" Pre-Processing """
def _largest_foreground_component(labelImg):
    """Return ``(label, pixelCount)`` of the biggest non-background component.

    ``labelImg`` is an integer label image in which 0 marks the background.
    Returns ``None`` when the image contains no foreground pixel at all.
    """
    # most_common() is sorted by decreasing pixel count, so the first entry
    # whose label is not 0 is the largest foreground component.
    for label, pixelCount in Counter(labelImg.flatten()).most_common():
        if label != 0:
            return label, pixelCount
    return None


def _crop_half_extent(bboxSpan, filteredImgSize):
    """Return half the side of the crop window for a bounding-box side.

    The window is 1.5x the bounding-box side, rounded up to an even number and
    limited so that the full window always fits into ``filteredImgSize``.
    """
    extent = int(min(1.5*bboxSpan, filteredImgSize))
    extent += extent % 2
    # For an odd filteredImgSize the even rounding above could exceed the
    # output canvas by one pixel; cap the half extent to stay inside it.
    return min(extent // 2, filteredImgSize // 2)


def PreProcessing_1(dataSample, filteredImgSize, binarizingTH):
    """Isolate the main object of every image and centre it on a fixed canvas.

    For each example the image is thresholded, its largest connected
    foreground component is kept, a window around that component's bounding
    box is cropped and pasted in the middle of a
    ``filteredImgSize x filteredImgSize`` canvas, small leftover specks are
    removed, and the result is divided by its maximum.

    Parameters
    ----------
    dataSample : array indexed as ``dataSample[i, 1]``
        Only column 1 is read; it must hold the flattened pixels of a square
        image. The input is not modified.
    filteredImgSize : int
        Side length of the output images.
    binarizingTH : number
        Pixels strictly greater than this value count as foreground.

    Returns
    -------
    numpy.ndarray of shape ``(len(dataSample), filteredImgSize**2)``
        One flattened, max-normalised image per row. Rows stay all-zero for
        images in which no foreground could be found.
    """
    dataSize = dataSample.shape[0]               # number of examples
    featureSize = filteredImgSize**2
    processedImageSet = np.zeros((dataSize, featureSize))
    if dataSize == 0:
        # Nothing to inspect: dataSample[0] below would raise on empty input.
        return processedImageSet

    NumPixels = dataSample[0][1].shape[0]        # imageLength * imageWidth
    originalImgSize = int(np.sqrt(NumPixels))    # side of the square image
    filteredImgCenter = int(filteredImgSize/2)   # centre of the output canvas

    for i in range(dataSize):
        if (i+1)%1000 == 0:
            print(i , ' of data have been processed')

        # Work on a copy so that saturating at 255 does not alter the caller's data.
        originalImg = np.array(dataSample[i,1]).reshape(originalImgSize, originalImgSize)
        originalImg[originalImg > 255] = 255

        # ---- keep only the largest connected foreground component ----
        origImgSegments = measure.label(originalImg > binarizingTH, background=0)
        largest = _largest_foreground_component(origImgSegments)
        if largest is None:
            continue                             # blank image: row stays zero
        filterMask = origImgSegments == largest[0]
        filteredImg = filterMask * originalImg

        # ---- crop a window around the component's bounding box ----
        minRow, minCol, maxRow, maxCol = measure.regionprops(filterMask.astype(int))[0].bbox
        centerRow = int(minRow + (maxRow - minRow)/2)
        centerCol = int(minCol + (maxCol - minCol)/2)
        halfX = _crop_half_extent(maxRow - minRow, filteredImgSize)
        halfY = _crop_half_extent(maxCol - minCol, filteredImgSize)
        # Slice ends are exclusive, so the upper clamp is originalImgSize itself;
        # clamping to originalImgSize-1 would drop the last row/column.
        grabbedImg = filteredImg[max(0, centerRow - halfX):min(originalImgSize, centerRow + halfX),
                                 max(0, centerCol - halfY):min(originalImgSize, centerCol + halfY)]

        # ---- paste the crop in the middle of the output canvas ----
        tmpXsize, tmpYsize = grabbedImg.shape
        tmpImg = np.zeros((filteredImgSize, filteredImgSize))
        rowStart = filteredImgCenter - int(tmpXsize/2)
        colStart = filteredImgCenter - int(tmpYsize/2)
        tmpImg[rowStart:rowStart + tmpXsize, colStart:colStart + tmpYsize] = grabbedImg

        # ---- drop small specks but never the main component ----
        foreground = tmpImg > binarizingTH
        largest = _largest_foreground_component(measure.label(foreground, background=0))
        if largest is None:
            continue                             # crop window missed the object
        # The size limit stays below the main component's pixel count so that
        # the component itself always survives the clean-up.
        finalMask = morphology.remove_small_objects(foreground, min(40, largest[1]-1), connectivity=2)
        tmpImg = tmpImg*finalMask

        peak = np.max(tmpImg)
        if peak > 0:                             # avoid 0/0 -> NaN features
            processedImageSet[i,:] = (tmpImg / peak).flatten()
    return processedImageSet

Mounted at /content/drive


## Loading Data

In [None]:
# Enable on-demand GPU memory growth. Guarded so that the cell also runs on a
# CPU-only runtime, where the device list is empty and [0] would raise.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)


# ---- Load train and test data ----
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load .npy
# files that come from a trusted source.
dataTrain = np.load('/content/drive/MyDrive/train_images.npy', encoding='latin1', allow_pickle=True)
dataTest  = np.load('/content/drive/MyDrive/test_images.npy',  encoding = 'latin1', allow_pickle=True)

# ---- Sizes of the train and test sets ----
trainSize = dataTrain.shape[0]               # number of training examples
testSize  = dataTest.shape[0]                # number of testing examples

# ---- Load the training labels and convert the words to numeric labels ----
myCSV = np.genfromtxt('/content/drive/MyDrive/train_labels.csv', delimiter=',', dtype='str')
trainLabelWords = myCSV[1:, 1]                 # label words; row 0 is skipped, column 1 is read
uniqueLabelWords = np.unique(trainLabelWords)  # sorted unique label words
NumberLabel = uniqueLabelWords.shape[0]
if trainLabelWords.shape[0] != trainSize:
    raise ValueError('train_labels.csv has %d labels but train_images.npy has %d images'
                     % (trainLabelWords.shape[0], trainSize))

trainLabel = np.zeros((trainSize, 1))          # numeric labels, one row per image
refLabel = np.zeros((NumberLabel, 2))
# A label's numeric id is its position in the sorted list of unique words.
for i, labelWord in enumerate(uniqueLabelWords):
    trainLabel[trainLabelWords == labelWord, 0] = i

    

### In the following part, the parameters are initialized and the preprocessing is performed to enhance the model's accuracy.

In [None]:
# ---- Parameter initialization ----
filteredImgSize = 40
binarizingTH = 8  # grey-level threshold for the first segmentation

processedImgTrain1 = PreProcessing_1(dataTrain, filteredImgSize, binarizingTH)
processedImgTest1 = PreProcessing_1(dataTest, filteredImgSize, binarizingTH)

# Second threshold, range (0, 1): turns the preprocessed images into 0/1 masks.
binarizingTH = 0.01
featMatrixTrain = (processedImgTrain1 > binarizingTH).astype('float32')
featMatrixTest = (processedImgTest1 > binarizingTH).astype('float32')


# fix random seed for reproducibility
seed = 7
np.random.seed(seed)


# ---- Sizes of the train and test sets and of the feature vectors ----
trainSize = featMatrixTrain.shape[0]               # number of training examples
testSize  = featMatrixTest.shape[0]                # number of testing examples
featureSize = featMatrixTrain.shape[1]             # number of features


# ---- Random 90% / 10% train / validation split ----
nTrain = int(0.9*trainSize)
idxTrainValid = np.random.choice(trainSize, [trainSize,1],replace = False)

t_train = trainLabel[idxTrainValid[:nTrain],0]
t_valid = trainLabel[idxTrainValid[nTrain:],0]
X_train = featMatrixTrain[idxTrainValid[:nTrain,0],:]
X_valid = featMatrixTrain[idxTrainValid[nTrain:,0],:]
X_test = featMatrixTest

# Reshape the flat feature rows to (examples, 1, height, width) images.
X_train = X_train.reshape(X_train.shape[0], 1, filteredImgSize, filteredImgSize)
X_valid = X_valid.reshape(X_valid.shape[0], 1, filteredImgSize, filteredImgSize)
X_test = X_test.reshape(X_test.shape[0], 1, filteredImgSize, filteredImgSize)

# One-hot encode the targets. num_classes is passed explicitly: without it the
# width is inferred from the largest label present, so a split that happens to
# miss the last class would produce a narrower matrix than the other split.
y_train = to_categorical(t_train, num_classes=NumberLabel)
y_valid = to_categorical(t_valid, num_classes=NumberLabel)

num_classes = y_valid.shape[1]

999  of data have been processed
1999  of data have been processed
2999  of data have been processed
3999  of data have been processed
4999  of data have been processed
5999  of data have been processed
6999  of data have been processed
7999  of data have been processed
8999  of data have been processed
9999  of data have been processed
999  of data have been processed
1999  of data have been processed
2999  of data have been processed
3999  of data have been processed
4999  of data have been processed
5999  of data have been processed
6999  of data have been processed
7999  of data have been processed
8999  of data have been processed
9999  of data have been processed


## Convolutional Neural Network is defined in the following part.

In [None]:
def CNN_model(filteredImgSize=40, num_classes=31):
    """Build and compile the CNN classifier.

    Architecture: two blocks of (Conv-Conv-BatchNorm-MaxPool-Dropout) with 32
    and 64 filters, followed by three 1024-unit dense layers (each with batch
    normalisation and dropout) and a softmax output.

    Parameters
    ----------
    filteredImgSize : int
        Side length of the square single-channel input images. The input
        shape is ``(1, filteredImgSize, filteredImgSize)``, i.e. the channel
        comes first, so the Keras image data format must be
        ``'channels_first'`` when this function is called.
    num_classes : int
        Number of units of the softmax output layer (default 31).

    Returns
    -------
    A compiled ``Sequential`` model using categorical cross-entropy, the Adam
    optimizer (learning rate 0.0004) and the accuracy metric.
    """
    # After a convolution, batch normalisation has to run over the channel
    # axis, which is axis 1 for channels-first feature maps and the last axis
    # otherwise (BatchNormalization defaults to the last axis).
    channel_axis = 1 if keras.backend.image_data_format() == 'channels_first' else -1

    model = Sequential([
        # First two convolutional layers
        Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same',
               input_shape=(1, filteredImgSize, filteredImgSize)),
        Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(axis=channel_axis),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),                      # regularization

        # Second two convolutional layers
        Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
        Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(axis=channel_axis),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),                      # regularization

        Flatten(),

        # Fully connected layers
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    # `learning_rate` replaces the deprecated `lr` keyword of the optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=0.0004),
                  metrics=['accuracy'])
    return model

# CNN model is run, and its accuracy for training and validation data is reported for each epoch.

In [None]:
from keras import backend as K
# The images are laid out as (channel, height, width); this must be set before
# the model and the data generators are created.
K.set_image_data_format('channels_first')

modelBest = CNN_model(filteredImgSize)

batch_size = 35
epoch_aug1 = 700

# Random augmentation is applied to the training images only.
gen = ImageDataGenerator(rotation_range=15, width_shift_range=0.1, shear_range=0.3,
                         height_shift_range=0.1, zoom_range=0.1)
batches = gen.flow(X_train, y_train, batch_size=batch_size)

# Validation images go through a generator without any transformation, so the
# per-epoch validation metrics are measured on the unmodified images.
val_batches = ImageDataGenerator().flow(X_valid, y_valid, batch_size=batch_size, shuffle=False)

# Model.fit accepts generators directly; fit_generator is deprecated.
results = modelBest.fit(batches, steps_per_epoch=X_train.shape[0] // batch_size, epochs=epoch_aug1,
                        validation_data=val_batches, validation_steps=X_valid.shape[0] // batch_size)


# Accuracy per epoch for the training and validation data.
plt.figure()
plt.plot(results.history['accuracy'], label="Train")
plt.plot(results.history['val_accuracy'], label="Validation")
plt.legend(fontsize=25)
plt.grid(True)
plt.show()


# Final error of the CNN on the (non-augmented) training and validation data.
scores = modelBest.evaluate(X_train, y_train, verbose=1)
print("Large CNN Train Error: %.2f%%" % (100-scores[1]*100))
scores = modelBest.evaluate(X_valid, y_valid, verbose=1)
print("Large CNN Valid Error: %.2f%%" % (100-scores[1]*100))


# Predict the class of every test image.
y_test_CNN = modelBest.predict(X_test)
t_test_CNN = np.argmax(y_test_CNN, axis=1)

# Column 0 holds the example id, column 1 the predicted label word.
testLabelCNN = np.zeros((testSize, 2)).astype('str')

# Map the predicted class indices back to the class names.
for i in range(NumberLabel):
    testLabelCNN[t_test_CNN == i, 1] = uniqueLabelWords[i]

# Ids run over the actual number of test examples, not a hard-coded count.
testLabelCNN[:, 0] = range(testSize)

# Save the predicted labels to a csv file for upload to the Kaggle website.
test = pd.DataFrame(testLabelCNN, columns=['Id', 'Category']).set_index('Id')
test.to_csv('7810.csv')
