# PyTorch CNN for Digit Recognition

This kernel presents a simple CNN approach for classifying the digit images provided in this dataset.

First, we import the relevant libraries and modules and load the data into two dataframes.

In [None]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import LinearSegmentedColormap

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms

# Data Loading
# Both competition CSV files live in the same input directory
DATA_DIR = "../input/digit-recognizer"
traindf = pd.read_csv(DATA_DIR + "/train.csv")
testdf = pd.read_csv(DATA_DIR + "/test.csv")

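Then, we rename the columns to more readable labels, extract the labels and pixel values into NumPy arrays, and reshape every flattened image into a 28x28 array.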

In [None]:
def _prettify_columns(df):
    """Rename the raw CSV columns of `df` in place.

    'label' becomes 'Digit' and every 'pixelN' column becomes 'Pixel N+1'.
    The mapping is built from the frame's own columns and applied with a
    single `rename` call, so a frame without a 'label' column is handled
    without relying on the width of another frame.
    """
    mapping = {'label': 'Digit'}
    for col in df.columns:
        if isinstance(col, str) and col.startswith('pixel') and col[5:].isdigit():
            mapping[col] = 'Pixel ' + str(int(col[5:]) + 1)
    df.rename(columns=mapping, inplace=True)


_prettify_columns(traindf)
_prettify_columns(testdf)

# Load the required data into numpy arrays
y = traindf['Digit'].to_numpy().astype(int)
# Select the features by dropping the label by name instead of by position
X = traindf.drop(columns='Digit').to_numpy()
X_final_test = testdf.to_numpy()
print('All data have been loaded into numpy arrays.')

# Every row holds one flattened image; restore the 28x28 pixel grid
X = X.reshape(X.shape[0], 28, 28)
X_final_test = X_final_test.reshape(X_final_test.shape[0], 28, 28)

At this point we define a custom dataset class. The data are split into training/validation/testing subsets, each of which is wrapped in this class and loaded into a PyTorch DataLoader.

In [None]:
class CustomDatasetNN(Dataset):
    """Dataset pairing image arrays with their labels.

    Each item is returned as a normalised float32 tensor of shape
    (channels, height, width) together with the matching label.

    Integer-typed images are divided by 255 before normalisation.
    torchvision's `ToTensor` only rescales `uint8` arrays, so integer pixels
    of any other dtype (e.g. the int64 values produced by pandas) were
    previously left in the 0..255 range while being normalised with
    statistics expressed on the [0, 1] scale. Floating-point images are
    not rescaled.
    """

    # Normalisation statistics, expressed on the [0, 1] pixel scale
    _MEAN = 0.1307
    _STD = 0.3081

    def __init__(self, feats, labels):
        """Store the image array `feats` and the aligned `labels`."""
        self.feats = feats
        self.labels = labels

    def __len__(self):
        """Return the number of images."""
        return len(self.feats)

    def __getitem__(self, item):
        """Return `(image_tensor, label)` for index `item`."""
        pixels = np.asarray(self.feats[item])
        img = torch.as_tensor(pixels, dtype=torch.float32)

        if img.dim() == 2:
            # (H, W) -> (1, H, W): add the channel axis
            img = img.unsqueeze(0)
        elif img.dim() == 3:
            # (H, W, C) -> (C, H, W), the layout ToTensor produced
            img = img.permute(2, 0, 1).contiguous()

        if np.issubdtype(pixels.dtype, np.integer):
            # Bring 0..255 integer pixels onto the [0, 1] scale
            img = img / 255.0

        img = (img - self._MEAN) / self._STD
        return img, self.labels[item]

batch_size = 300

model_debug = True # Set to False when the code is ready to be deployed for the final predictions
if model_debug:
    # Take 60% for training
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    # Split the remaining 40%: 62.5% of it (25% of all data) for testing and
    # 37.5% of it (15% of all data) for validation
    X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.375, random_state=42, stratify=y_rest)

    test_data = CustomDatasetNN(X_test,y_test)
    # Evaluation data does not need reshuffling
    test_loader = DataLoader(test_data,batch_size=batch_size,shuffle=False)
else:
    # Seeded like the debug split so that runs are reproducible
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_data = CustomDatasetNN(X_train,y_train)
val_data = CustomDatasetNN(X_val,y_val)
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
# A fixed batch composition keeps the validation loss comparable across epochs
val_loader = DataLoader(val_data,batch_size=batch_size,shuffle=False)

y_final_test = np.zeros(X_final_test.shape[0], dtype=np.int64) # Dummy labels
final_data = CustomDatasetNN(X_final_test,y_final_test)
# Use the training batch size for the final predictions as well: the model's
# BatchNorm layers are built with track_running_stats=False and therefore
# normalise with the statistics of the current batch even in eval mode, so
# single-image batches would be normalised very differently from training.
# Order is preserved (shuffle=False) so predictions line up with the image ids.
final_loader = DataLoader(final_data,batch_size=batch_size,shuffle=False)

Before moving on to the CNN model, we provide some auxiliary functions useful for some visualizations during the model's debugging, where we use our own test set.

In [None]:
# Auxiliary Functions

import seaborn as sns
sns.set(style = "darkgrid") # Personal preference

def CustomCmap(from_rgb,to_rgb):
    """Return a colormap that blends linearly from `from_rgb` to `to_rgb`.

    Both arguments are sequences of exactly three components (red, green,
    blue). The resulting colormap is named 'custom_cmap'.
    """
    r_lo, g_lo, b_lo = from_rgb
    r_hi, g_hi, b_hi = to_rgb

    endpoints = {'red': (r_lo, r_hi), 'green': (g_lo, g_hi), 'blue': (b_lo, b_hi)}

    # One segment per channel: the start value at position 0, the end value at 1
    cdict = {channel: ((0, lo, lo), (1, hi, hi))
             for channel, (lo, hi) in endpoints.items()}

    return LinearSegmentedColormap('custom_cmap', cdict)

# Colours used by the plots, as RGB triples with components in [0, 1]
mycol = (72/255, 99/255, 147/255)
mycomplcol = (129/255, 143/255, 163/255)
_white = (1.0, 1.0, 1.0)

# White -> main colour, and the reversed variant
mycmap = CustomCmap(_white, mycol)
mycmap_r = CustomCmap(mycol, _white)

def plot_cm(cfmatrix,title,classes):
    """Draw a confusion matrix, save it as '<title>.pdf' and show it.

    Args:
        cfmatrix: 2-D array of integer counts (cells are formatted with 'd').
        title: plot title, also used as the stem of the saved file name.
        classes: tick labels, one per class.
    """
    fig, ax = plt.subplots(1,1)

    image = ax.imshow(cfmatrix, interpolation='nearest', cmap=mycmap)

    # Colour bar placed in a narrow axis attached to the right of the matrix
    cbar_ax = make_axes_locatable(ax).append_axes("right", size="5%", pad=.2)
    plt.colorbar(image, cax=cbar_ax)

    ax.set_title(title,fontsize=14)
    ticks = np.arange(len(classes))
    ax.set_xticks(ticks)
    ax.set_xticklabels(classes, rotation=90)
    ax.set_yticks(ticks)
    ax.set_yticklabels(classes)

    # Cells above half of the maximum get white text, the others black text
    half_max = cfmatrix.max() / 2.
    for row in range(cfmatrix.shape[0]):
        for col in range(cfmatrix.shape[1]):
            count = cfmatrix[row, col]
            text_color = "white" if count > half_max else "black"
            ax.text(col, row, format(count, 'd'), horizontalalignment="center", color=text_color)

    ax.set_ylabel('True label',fontsize=14)
    ax.set_xlabel('Predicted label',fontsize=14)

    plt.savefig(title+'.pdf', bbox_inches='tight')
    plt.show()

The following corresponds to the CNN setup.

In [None]:
class CNNBackbone(nn.Module):
    """Stack of convolutional blocks followed by fully connected layers.

    Every convolutional block is Conv2d (stride 1, padding 1) -> ReLU ->
    optional BatchNorm2d -> MaxPool2d. The output of the last block is
    flattened and passed through the linear layers; every linear layer but
    the last one is followed by ReLU and Dropout.
    """

    def __init__(self, input_height, input_width, conv_channels, kernels, maxpools, lin_channels, dropout, batchnorm):
        """
        Args:
            input_height (int):
                image height in pixels
            input_width (int):
                image width in pixels
            conv_channels (list):
                contains the input and output channels for each
                convolutional layer, i.e. conv_channels[i] and
                conv_channels[i+1] for the i-th layer. Needs
                len(kernels)+1 elements.
            kernels (list):
                contains the kernel sizes to be considered per
                convolution. Its length defines the number of
                convolutional layers.
            maxpools (list):
                contains the MaxPool2d kernel sizes to be considered
                per convolution. Needs len(kernels) elements.
            lin_channels (list):
                contains the output channels for each linear layer
                following the convolutions, therefore using a total of
                len(lin_channels) linear layers.
                Note that the last element must be equal to the number
                of classes to be determined.
            dropout (float):
                dropout probability, 0 <= dropout <= 1
            batchnorm (bool):
                boolean parameter to control whether batch normalization
                is applied or not.
        """
        super(CNNBackbone, self).__init__()
        self.num_conv_layers = len(kernels)
        self.num_lin_layers = len(lin_channels)
        self.batchnorm = batchnorm

        seq = []
        for i in range(self.num_conv_layers):
            seq.append(nn.Conv2d(in_channels=conv_channels[i],
                                 out_channels=conv_channels[i+1],
                                 kernel_size=kernels[i], stride=1, padding=1))
            seq.append(nn.ReLU())
            if self.batchnorm:
                # NOTE: with track_running_stats=False the layer keeps no
                # running statistics and normalises with the statistics of
                # the current batch in eval mode too, so predictions depend
                # on the composition of the batch they are computed in.
                seq.append(nn.BatchNorm2d(num_features=conv_channels[i+1],track_running_stats=False))
            seq.append(nn.MaxPool2d(kernel_size=maxpools[i]))

        # Flatten the output of the final convolution layer
        seq.append(nn.Flatten())

        # Calculation of first linear layer dimensions
        # A dummy single-image batch is passed through the convolutional part
        # so that the flattened size is determined automatically. Zeros are
        # used instead of uninitialised memory, and autograd is switched off
        # because only the output size is needed.
        with torch.no_grad():
            probe = torch.zeros(1, conv_channels[0], input_height, input_width)
            first_lin = nn.Sequential(*seq)(probe).size(-1)

        # The first linear layer consumes the flattened convolutional output;
        # every later one consumes the output of its predecessor. This also
        # covers the case of a single linear layer, which was previously
        # wired with lin_channels[-1] as its input size.
        in_features = first_lin
        for i, out_features in enumerate(lin_channels):
            seq.append(nn.Linear(in_features, out_features))
            if i < self.num_lin_layers - 1:
                # No activation/dropout after the output layer
                seq.append(nn.ReLU())
                seq.append(nn.Dropout(dropout))
            in_features = out_features

        self.fitter = nn.Sequential(*seq)

    def forward(self, x):
        """CNN forward
        Args:
            x (torch.Tensor):
                [B, C, H, W] Batch size x channels x height x width,
                with C = conv_channels[0]
        Returns:
            torch.Tensor: [B, O] Batch size x output size, where O is
            lin_channels[-1] (or the flattened convolutional size when
            no linear layer is configured)
        """
        out = self.fitter(x)
        return out
    
def load_backbone_from_checkpoint(model, checkpoint_path):
    """Load the state dict stored at `checkpoint_path` into `model` in place.

    The checkpoint tensors are mapped onto the device of the model's first
    parameter (CPU when the model has no parameters), so a checkpoint that
    was written from another device can still be loaded.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    model.load_state_dict(torch.load(checkpoint_path, map_location=target_device))
    
# adapted code from this repository: https://github.com/Bjarten/early-stopping-pytorch
class EarlyStopping:
    """Stop training once the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the
    model. Whenever the loss improves, the model's state dict is saved to
    `path`; after `patience` consecutive calls without improvement the
    `early_stop` flag is set.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): number of consecutive non-improving calls
                after which `early_stop` is set.
            verbose (bool): if True, report every checkpoint that is saved.
            delta (float): amount added to the best score when deciding
                whether a new score counts as an improvement.
            path (str): file the checkpoint is written to.
            trace_func (callable): function used to emit messages.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` instead of the `np.Inf` alias, which NumPy 2.0 removed
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record `val_loss` and update the checkpoint or the patience counter."""
        # Scores are negated losses, so a higher score is better
        score = -val_loss

        if self.best_score is None:
            # First call: always keep a checkpoint
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'Validation loss increase spotted. Early stopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

def training_loop(model, train_dataloader, optimizer, device="cuda"):
    """Run one training epoch and return the mean of the batch losses.

    The loss is computed with the module-level `loss_function`. Inputs are
    cast to float and labels to long before being moved to `device`.
    """
    model.train()
    epoch_losses = []

    for inputs, targets in train_dataloader:
        inputs = inputs.float().to(device)
        targets = targets.type(torch.LongTensor).to(device)

        # Gradients accumulate by default, so reset them for every batch
        optimizer.zero_grad()

        # Forward pass and loss
        loss = loss_function(model(inputs), targets)

        # Backward pass and weight update
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.data.item())

    return np.mean(epoch_losses)


def validation_loop(model, val_dataloader, device="cuda"):
    """Compute the mean batch loss of `model` over `val_dataloader`.

    The model is put in eval mode and the loss is computed with the
    module-level `loss_function`. Gradient tracking is disabled inside the
    function, so it no longer relies on the caller wrapping the call in
    `torch.no_grad()`.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for x_batch, y_batch in val_dataloader:
            # Move to device
            x_batch = x_batch.float().to(device)
            y_batch = y_batch.type(torch.LongTensor).to(device)

            yhat = model(x_batch) # No unpacking occurs in CNNs

            loss = loss_function(yhat, y_batch)

            batch_losses.append(loss.item())

    val_loss = np.mean(batch_losses)

    return val_loss


def train(model, train_dataloader, val_dataloader, optimizer, epochs, device="cuda", patience=-1, verbose_ct=100):
    """Train `model`, optionally with early stopping, and save it to 'CNN.pt'.

    Args:
        model: network to train (updated in place).
        train_dataloader: batches used by `training_loop`.
        val_dataloader: batches used by `validation_loop`.
        optimizer: optimizer passed to `training_loop`.
        epochs (int): maximum number of epochs.
        device: device the batches are moved to.
        patience (int): early-stopping patience; -1 disables early stopping.
        verbose_ct (int): the losses are printed every `verbose_ct` epochs;
            0 disables these messages.

    Returns:
        (train_losses, val_losses): lists with one mean loss per epoch run.

    When early stopping is enabled, the weights with the best validation
    loss are restored from 'checkpoint.pt' before the model is saved. This
    happens both when the patience limit is reached and when the epochs
    run out, so 'CNN.pt' never holds weights worse than the checkpoint.
    """
    train_losses = []
    val_losses = []
    print(f"Initiating CNN training.")
    model_path = 'CNN.pt'
    checkpoint_path = 'checkpoint.pt'

    early_stopping = None
    if patience != -1:
        early_stopping = EarlyStopping(patience=patience, verbose=False, path=checkpoint_path)

    for epoch in range(epochs):

        # Training loop
        train_loss = training_loop(model, train_dataloader, optimizer, device)
        train_losses.append(train_loss)

        # Validation loop
        with torch.no_grad():
            val_loss = validation_loop(model, val_dataloader, device)
        val_losses.append(val_loss)

        if early_stopping is not None:
            early_stopping(val_loss, model)

            if early_stopping.early_stop:
                print("Patience limit reached. Early stopping and going back to last checkpoint.")
                break

        # `verbose_ct` of 0 means "no progress messages" instead of a
        # ZeroDivisionError
        if verbose_ct and epoch % verbose_ct == 0:
            print(f"[{epoch+1}/{epochs}] Training loss: {train_loss:.4f}\t Validation loss: {val_loss:.4f}.")

    # A checkpoint exists as soon as one epoch has been evaluated
    if early_stopping is not None and early_stopping.best_score is not None:
        load_backbone_from_checkpoint(model,checkpoint_path)

    torch.save(model.state_dict(), model_path)

    print(f"CNN training finished.\n")

    return train_losses, val_losses
    
def evaluate(model, test_dataloader, device="cuda"):
    """Predict a class for every sample served by `test_dataloader`.

    The model is switched to eval mode and run without gradient tracking.

    Returns:
        (predictions, labels): two lists holding one numpy array per batch,
        the first with the index of the largest model output of each
        sample, the second with the labels cast to integers.
    """
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for x_batch, y_batch in test_dataloader:
            inputs = x_batch.float().to(device)
            targets = y_batch.type(torch.LongTensor).to(device)

            scores = model(inputs)

            # The predicted class is the position of the largest output
            predicted = scores.argmax(dim=1)

            predictions.append(predicted.cpu().numpy())
            labels.append(targets.cpu().numpy())

    return predictions, labels

# Small code to plot losses after training
def plot_losses(train_losses,val_losses,title):
    """Plot both loss curves, save the figure to the path `title` and show it."""
    curves = (
        (train_losses, "Training loss", mycol),
        (val_losses, "Validation loss", mycomplcol),
    )
    for series, name, colour in curves:
        plt.plot(series, label=name, color=colour)

    plt.legend(loc='best')
    plt.ylabel('Mean Loss')
    plt.xlabel('Epochs')
    plt.title("Loss graph during the process of training the CNN.")
    # `title` is used as the output file name, not as the figure title
    plt.savefig(title, bbox_inches='tight')
    plt.show()

# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Finally, we run the model in order to make predictions.

In [None]:
# Architecture
input_height, input_width = 28, 28
conv_channels = [1,4,16,32,64]
kernels = [3,3,3,3]
maxpools = [2,2,2,2]
lin_channels = [128,64,10]
dropout = 0.2

# Optimisation
learning_rate = 0.0001
weight_decay = 1e-6
patience = 10
verbose_ct = 1
epochs = 2500

model = CNNBackbone(
    input_height=input_height,
    input_width=input_width,
    conv_channels=conv_channels,
    kernels=kernels,
    maxpools=maxpools,
    lin_channels=lin_channels,
    dropout=dropout,
    batchnorm=True,
).to(device)
# Module-level name: the training and validation loops look it up globally
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Train the model
t_losses, v_losses = train(
    model, train_loader, val_loader, optimizer, epochs,
    device=device, patience=patience, verbose_ct=verbose_ct,
)

# Plot the loss diagram
plot_losses(t_losses, v_losses, 'CNN_Training_Loss.pdf')

if model_debug:
    # Evaluate the model on the held-out test split
    predictions, labels = evaluate(model, test_loader, device=device)

    # `evaluate` returns one array per batch; merge them into flat vectors
    y_true = np.concatenate(labels)
    y_pred = np.concatenate(predictions)

    print(classification_report(y_true, y_pred))
else:
    # Final Predictions (the returned labels are the dummy ones and are ignored)
    predictions, labels = evaluate(model, final_loader, device=device)

    y_pred = np.concatenate(predictions)
    # Image ids start at 1
    x_idx = np.arange(1, X_final_test.shape[0] + 1)

    final_df = pd.DataFrame({'ImageId':x_idx, 'Label':y_pred})
    final_df.to_csv('submission.csv', index=False)
    print('Submission file is ready.')