In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import libraries

import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import cv2 as cv
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow.keras 
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras import layers 
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.callbacks import ModelCheckpoint, Callback, EarlyStopping


In [None]:
# Load the Kaggle "digit-recognizer" training and test tables.
# train_df: column 0 is `label`; the remaining columns are the pixel values of one image per row.
# test_df: pixel columns only (every column of a row is reshaped into the image later on).
train_df = pd.read_csv('../input/digit-recognizer/train.csv')
test_df = pd.read_csv('../input/digit-recognizer/test.csv')
train_df.head()

## Exploratory Data Analysis:

Here, we will quickly check whether class imbalances exist within our training dataset. 

In [None]:
# Number of training images per digit label (index = label, value = count).
num_instances = train_df.groupby('label').size()

# Bar positions and bar heights are both taken from `num_instances`, so a label
# can never be paired with another label's count.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(num_instances.index, num_instances.values)
ax.set_title('Number of labels within the training set', fontweight='bold')
ax.set_xlabel('labels')
ax.set_ylabel('instances')
ax.set_xticks(num_instances.index)  # one tick per label instead of every other one
plt.show()  # render the figure without echoing the last call's Text(...) repr

The above graph tells us that the data is not as imbalanced as most datasets. However, a couple of labels (e.g., 4 and 5) appear to be under-represented, while a few labels (e.g., 1, 3, and 7) are abundant. Overall, the imbalances are not too dramatic, so we will not include any class balancing methods in our pipeline.

## Prepare the training data: 

Here, each row of the dataframes stores one image as 28*28 = 784 integer-valued pixel columns, so we have to reshape every row from (1,28x28) into (28,28,1) and convert the data to floating point.

In [None]:
%%time
samples, columns = train_df.shape

# Build both tensors in one vectorized step instead of indexing the DataFrame
# row by row. The values, shapes and float64 dtype match what the per-row loop
# produced.
# X: pixel columns 1..columns-1 of every row, reshaped to (samples, 28, 28, 1).
X = train_df.iloc[:, 1:columns].to_numpy(dtype='float64').reshape(samples, 28, 28, 1)
# y_true: column 0 (the label of each row) as a (samples, 1) column vector.
y_true = train_df.iloc[:, 0].to_numpy(dtype='float64').reshape(samples, 1)

One hot encode the output labels ... 

In [None]:
values = train_df.label
# integer encode: map every distinct label to an index 0..n_classes-1
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

print('The original output labels', values)
# binary encode
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Keeping the default (sparse) output and densifying it with
# `.toarray()` works on old and new versions alike.
onehot_encoder = OneHotEncoder()

integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# The matching loss is "categorical cross entropy" and the network's output layer
# needs one neuron per class, i.e. `classes` neurons (10 for the digits 0-9).
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
samples, classes = onehot_encoded.shape
print("Number of vectors:", samples, "\nNumber of neurons / length of vector:", classes)

In [None]:
y = onehot_encoded # one-hot ground-truth matrix; row i is the target vector for image X[i]

In [None]:
%%time
# normalize the input features
def standard_norm(img):
    """Standardize one image to zero mean and unit standard deviation.

    Parameters
    ----------
    img : array-like
        Pixel intensities of a single image (any shape).

    Returns
    -------
    numpy.ndarray
        ``(img - mean(img)) / std(img)``. A constant image (std == 0) is
        returned as all zeros rather than the NaN that 0/0 would produce.
    """
    centered = img - np.mean(img)
    std = np.std(img)
    # A blank/constant image has zero spread; dividing would fill it with NaN
    # and silently poison training or prediction.
    if std == 0:
        return centered
    return centered / std

# Standardize every training image individually.
# The tensor is sized from X itself rather than the global `samples`, which a later
# cell rebinds to the test-set size; the cell therefore stays correct when re-run
# after the test data has been loaded.
norm_X = np.zeros(X.shape)
for sample in tqdm(range(X.shape[0])):
    norm_X[sample] = standard_norm(X[sample]).reshape(28, 28, 1)
    

We successfully normalized our image pixels from [0,255] to [-I_lower, I_upper], where I denotes the pixel intensity value. Each image is standardized separately, so the normalized pixels of every image have a mean of 0 and a standard deviation of 1.

## Baseline model:

Here, we will construct a Convolutional Neural Network with batch normalization layers after each conv. In addition, we included a GlobalAveragePooling, so that we can implement a Class Activation Map into our pipeline. Important to note, CAMs require a Global Average Pooling before the output layer, while Grad-CAMs can be implemented on any Convolutional Neural Network architecture. 

In [None]:
def METRICS():
    """Return a fresh list of the metrics tracked during training.

    The list holds the string ``'accuracy'`` plus newly created ``Precision``,
    ``Recall`` and ``AUC`` metric objects named ``'precision'``, ``'recall'``
    and ``'AUC'``.
    """
    return [
        'accuracy',
        Precision(name='precision'),
        Recall(name='recall'),
        AUC(name='AUC'),
    ]


# Baseline CNN: three Conv -> BatchNorm stages (max-pooling after the first two),
# then global average pooling and a softmax output layer.
model = Sequential()
model.add(layers.Input(shape=(28, 28, 1)))
model.add(layers.Conv2D(32, (3,3), padding = 'same', activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D(pool_size = (2, 2)))
model.add(layers.Conv2D(64, (3,3), padding = 'same', activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling2D(pool_size = (2, 2)))
model.add(layers.Conv2D(128, (3,3), padding = 'same', activation='relu'))
model.add(layers.BatchNormalization())

# Keep GlobalAveragePooling2D directly before the output layer: the CAM code reads
# model.layers[-3] as the last feature-map layer and model.layers[-1] for the
# class weights.
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(classes,activation='softmax', name = 'output_layer'))
# `lr` is a deprecated alias that current Keras optimizers reject;
# `learning_rate` is the supported keyword.
model.compile(optimizer = Adam(learning_rate = 0.00100005134),
              metrics = METRICS(),
              loss = 'categorical_crossentropy')
model.summary()

In [None]:
# functions to help split our data and train our model ...

def split_data(X,Y):
    """Split features and targets into training and validation parts.

    20% of the samples are held out for validation, and the shuffle is seeded
    with 42 so the split is reproducible. Returns whatever ``train_test_split``
    returns for ``(X, Y)``: X_train, X_val, y_train, y_val, in that order.
    """
    split_options = {'test_size': 0.2, 'random_state': 42}
    return train_test_split(X, Y, **split_options)

def train_model(model, X, Y, epochs, bs):
    """Train ``model`` on an 80/20 train/validation split of ``(X, Y)``.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place.
    X, Y : array-like
        Input images and their one-hot targets; split by ``split_data``.
    epochs : int
        Maximum number of epochs.
    bs : int
        Mini-batch size used for both training and validation.

    Returns
    -------
    (History, model)
        The Keras training history and the same (now trained) model.

    Notes
    -----
    * ``bs`` is passed to ``fit`` as ``batch_size`` and Keras derives the number
      of steps itself. Previously ``bs`` only fed a hand-computed
      ``n // bs + 1`` step count, which asks for one batch too many whenever
      ``n`` is an exact multiple of ``bs``, while the real batch size silently
      stayed at the Keras default.
    * The early-stopping callback is registered with ``fit``; it used to be
      created and then dropped. Training therefore ends once ``val_loss`` has
      not improved for 5 consecutive epochs.
    """
    X_train, X_val, y_train, y_val = split_data(X, Y)

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

    train_history = model.fit(
        X_train, y_train,
        batch_size=bs,
        validation_data=(X_val, y_val),
        epochs=epochs,
        shuffle=True,
        callbacks=[es],
    )
    return train_history, model


In [None]:
# train model .. 
# chosen hyperparameters
epochs = 20  # upper bound on training epochs
bs = 32      # mini-batch size
training_output = train_model(model, norm_X, y, epochs, bs)
train_hist, final_model = training_output

## Load Test Data:

Before implementing CAMs, we will load the test images, which we use to demonstrate our neural network's ability to focus on the features that matter for accurately classifying digits.  

In [None]:
%%time

samples, columns = test_df.shape

# Every column of test_df is a pixel, so the whole table reshapes in one step to
# (samples, 28, 28, 1). This gives the same values and float64 dtype as the
# per-row loop.
X_test = test_df.to_numpy(dtype='float64').reshape(samples, 28, 28, 1)

# Standardize each test image individually, exactly like the training images.
X_norm_test = np.zeros(X_test.shape)
for sample in tqdm(range(samples)):
    X_norm_test[sample] = standard_norm(X_test[sample])

## Class Activation Maps:

If you are interested in more detail about CAMs, the following link takes you to the authors' website: [Learning Deep Features for Discriminative Localization](http://cnnlocalization.csail.mit.edu/). Here, I will create an object that builds a sub-model of our model trained on the MNIST dataset; it takes images of size 28x28 as input and outputs 128 feature maps of size 7x7. These feature maps are the output of the convolutional block just before the Global Average Pooling layer. For the image of interest, we first let the originally trained model predict the label, so that we can extract, from the weight tensor of the final dense output layer, the weight vector that corresponds to the predicted class. This weight vector has a size of (128,1). Next, we generate the 128 feature maps for the image of interest and compute the dot product of these feature maps with the weight vector that corresponds to the model's predicted output neuron. So the dot product is the following: (7,7,128)$\cdot$(128,1) = (7,7,1). This output of size (7,7,1) is the class activation map, which we then resize to (28,28,1) and overlay on the original input image. Finally, by overlaying the activation map on the image, we can see which features of the image the neural network focuses on in order to predict the label. 

In [None]:
"""
    Implementing class activation maps for architectures with Global Average Pooling 2D before the final dense layer 
"""
class MNIST_CAM:
    """Class activation maps (CAMs) for CNNs ending in GlobalAveragePooling2D -> Dense.

    The instance remembers the spatial size of a reference image. Every image
    passed in later is reshaped to that size before prediction, and the
    activation map is resized back to it.

    Parameters
    ----------
    img : numpy.ndarray
        Reference image of shape (height, width, channels).
    """

    def __init__(self, img):
        # Array shape is (rows, cols, channels) == (height, width, channels).
        # The names used to be swapped, which only worked for square images.
        self.resize_height, self.resize_width, _ = img.shape

    # zero-center normalization
    def standard_norm(self, img):
        """Return ``img`` standardized to zero mean and unit standard deviation.

        A constant image (std == 0) is returned as all zeros instead of NaN.
        """
        centered = img - np.mean(img)
        std = np.std(img)
        if std == 0:
            return centered
        return centered / std

    def _as_batch(self, img):
        """Reshape one image into a batch of one: (1, height, width, channels)."""
        return np.asarray(img).reshape(1, self.resize_height, self.resize_width, -1)

    def feature_model(self, model):
        """Build a sub-model mapping the model input to its last feature maps.

        ``model.layers[-3]`` is used as the feature-map layer, i.e. the layer
        that feeds the pooling layer sitting directly before the final layer.
        """
        return Model(inputs = model.layers[0].input, outputs = model.layers[-3].output)

    def weight_tensor(self, model):
        """Return the kernel (first weight array) of the model's final layer."""
        final_outputs = model.layers[-1]
        return final_outputs.get_weights()[0]

    # output prediction class of the image of interest
    def predict_class(self, model, X):
        """Return the arg-max class index for the first sample of batch ``X``."""
        prob_vec = model.predict(X)
        return np.argmax(prob_vec[0])

    # generate class activation maps (CAMs)
    def generate_CAM(self, model, img):
        """Compute the class activation map of ``img`` for its predicted class.

        Returns
        -------
        (numpy.ndarray, int)
            The activation map resized to (height, width) of the reference
            image, and the predicted class index.
        """
        batch = self._as_batch(self.standard_norm(img))
        feature_map = self.feature_model(model).predict(batch)
        label = self.predict_class(model, batch)
        # Weight every feature map by the final-layer weights of the predicted class.
        class_weights = self.weight_tensor(model)[:, label]
        CAM = feature_map.dot(class_weights)[0,:,:]
        # cv.resize expects its target size as (width, height).
        return cv.resize(CAM,
                         (self.resize_width, self.resize_height),
                         interpolation = cv.INTER_CUBIC), label

    # generate probability vector
    def generate_probvec(self, model, img):
        """Return the model's probability vector for the standardized ``img``."""
        return model.predict(self._as_batch(self.standard_norm(img)))

In [None]:
# example image: overlay the class activation map on one raw test digit
img = X_test[102]
CAM_generator = MNIST_CAM(img)
activation_map, label = CAM_generator.generate_CAM(final_model, img)

plt.imshow(img.reshape(28,28), cmap='gray')        # the digit itself
plt.imshow(activation_map, cmap='jet', alpha=0.3)  # semi-transparent heat map on top
plt.title(f"Predicted Class: {label}")
plt.show()

In [None]:
# Here is an interactive loop that asks if you want to continue to generate random input digit images 
#through the CAM_generator ...


# Generate and plot class activation map along with the original image ... 
while True:
    # randrange excludes its upper bound, so the index is always valid
    # (randint(0, n) could return n itself).
    sample = random.randrange(len(X_test))
    # Draw from the test images, the same array the index range comes from.
    img = X_test[sample]
    CAM_generator = MNIST_CAM(img)
    plt.imshow(img.reshape(28,28), cmap='gray')
    activation_map, label = CAM_generator.generate_CAM(final_model, img) # generate activation map and output label
    plt.imshow(activation_map,'jet', alpha = 0.3)
    plt.title("Predicted Class: " + str(label))
    plt.show()
    try:
        request = input("Next Image? (y/n)")
    except (EOFError, NotImplementedError):
        # No interactive stdin is available (e.g. a headless "Run All"); the kernel
        # then raises a NotImplementedError subclass. Stop after the first image
        # instead of crashing the run.
        break
    # Any answer starting with "n" (case-insensitive) ends the loop.
    if request.strip().lower().startswith('n'):
        break

## Test Set Submission: 

In [None]:
# Sanity check: probability vector for the first normalized test image,
# passed to the model as a batch of one.
first_test_batch = X_norm_test[:1]
final_model.predict(first_test_batch)

In [None]:
# Predict the whole test set in one batched call instead of one `predict` call
# per image, which repeats Keras' per-call overhead for every sample.
test_probabilities = final_model.predict(X_norm_test)

# Predicted digit per image as a (n_images, 1) integer column.
y_test = np.argmax(test_probabilities, axis=1).reshape(-1, 1)
# Submission ids are 1-based and follow the row order of the test set.
# The count comes from the predicted array itself, not from the global `samples`.
test_Ids = np.arange(1, X_norm_test.shape[0] + 1).reshape(-1, 1)

In [None]:
# Build the submission table with integer columns from the start.
# Casting through `sub_df.iloc[:, :] = ...astype('int')` assigns in place, which on
# recent pandas can keep float columns, so the CSV would contain values such as
# "1.0" instead of "1".
sub_df = pd.DataFrame({
    'ImageId': np.asarray(test_Ids).ravel().astype(int),
    'Label': np.asarray(y_test).ravel().astype(int),
})
sub_df.head()

In [None]:
# Write the submission table (columns ImageId, Label) to the current directory,
# leaving out the DataFrame index column.
sub_df.to_csv('sample_submission.csv', index=False)