In [None]:
import torch
from torch import nn

from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Folder that contains the csv files read in the following cells
data_path = '/kaggle/input/digit-recognizer/'

## Reading and processing the dataset

In [None]:
# Load the train and test csv files into data frames
train_data, test_data = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
# show the first rows of the training frame
train_data.head()

In [None]:
# Split the labelled data into train and validation sets.
# Column 0 of the frame is the label, the remaining columns are the pixels.
X_train, Y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values

# A fixed seed makes the split (and every number reported afterwards)
# reproducible under "Restart & Run All"; stratifying keeps the class
# proportions the same in both splits.
SPLIT_SEED = 42
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size = .25,
    random_state = SPLIT_SEED,
    stratify = Y_train,
)
x_test = test_data.values # we don't have targets in the test data

# plot a few random samples from the training split with their labels
fig, axs = plt.subplots(1, 3)
idxs = np.random.randint(0, x_train.shape[0], 3)
for ax, idx in zip(axs, idxs):
    ax.set_title(y_train[idx])
    ax.imshow(x_train[idx].reshape(28, 28), cmap = 'gray')
plt.show()

In [None]:
# we will use these constants later on
SEED = 42
# seed torch so weight initialisation and DataLoader shuffling are reproducible
torch.manual_seed(SEED)

BATCH_SIZE = 256
LR = 0.001
N_EPOCHS = 50
# Count the classes on the raw numpy label array: `y_train` is replaced by a
# torch tensor in a later cell, and `set()` over a tensor hashes elements by
# identity, so re-running this cell would otherwise yield a wrong count.
N_CLASSES = len(np.unique(Y_train)) # 10

In [None]:
# Standardize the pixel values.
# The statistics come from the training split only and are reused for the
# validation and test sets, so that no information from validation/test data
# leaks into the preprocessing.
train_mean = x_train.mean()
train_std = x_train.std()

# standardize each split and convert it to a float32 torch tensor in one go
x_train, x_val, x_test = (
    torch.from_numpy((split - train_mean) / train_std).float()
    for split in (x_train, x_val, x_test)
)
# the targets only need the numpy -> torch conversion
y_train, y_val = torch.from_numpy(y_train), torch.from_numpy(y_val)

# Wrap the tensors in datasets and batch them; only the training data is shuffled
train_ds, val_ds = TensorDataset(x_train, y_train), TensorDataset(x_val, y_val)
train_loader = DataLoader(train_ds, shuffle = True, batch_size = BATCH_SIZE)
val_loader = DataLoader(val_ds, shuffle = False, batch_size = BATCH_SIZE)

## Defining the model
I like to add a few helper methods to my model classes. It keeps the code more structured, and the methods can be reused as snippets later.

In [None]:
class LinearModel(nn.Module):
    """
    Fully connected classifier: a stack of Linear->ReLU blocks followed by
    a final Linear layer that produces one score per class.

    params:
        n_input - number of input features per sample
        hidden_nodes - width of each hidden layer; a single int or any
                       sequence of ints (list, tuple, ...)
        n_output - number of output nodes (classes)

    `self.linear[i]` is the i-th Linear->ReLU block, `self.classifier`
    is the output layer.
    """
    def __init__(self, n_input: int = 784, hidden_nodes = [128], n_output: int = N_CLASSES):
        super().__init__()
        if isinstance(hidden_nodes, int):
            hidden_nodes = [hidden_nodes]
        # Copy into a fresh list: this accepts tuples as well and guarantees
        # that the (shared) default argument is never modified.
        # num_nodes = [n_input, h_1, ..., h_k, n_output]
        num_nodes = [n_input] + list(hidden_nodes) + [n_output]

        # One Linear->ReLU block per hidden layer; block i maps
        # num_nodes[i] -> num_nodes[i+1], so the last block ends with
        # num_nodes[-2] output nodes.
        blocks = [
            nn.Sequential(nn.Linear(num_nodes[i], num_nodes[i+1]), nn.ReLU())
            for i in range(len(num_nodes) - 2)
        ]
        self.linear = nn.Sequential(*blocks)
        # the classifier maps the last hidden width to `n_output`
        self.classifier = nn.Linear(num_nodes[-2], num_nodes[-1])

    def configure(self, optimizer, loss_fn):
        """ Store the optimizer and loss function used by `train_step` and `val_step` """
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def forward(self, x):
        """ Return the raw class scores for a batch `x` """
        x = self.linear(x) # output shape: Nx(n_hidden)
        outputs = self.classifier(x) # output shape: Nx(n_output)
        return outputs

    def train_step(self, x_batch, y_batch):
        """ Run one optimization step on a batch and return the loss as a float """
        # feed data to the network (calling the module instead of `forward`
        # directly keeps registered hooks working)
        out = self(x_batch)
        # calculate the loss
        loss = self.loss_fn(out, y_batch)
        # backward propagation
        loss.backward()
        self.optimizer.step()
        # zero out the gradients when we're done, they will accumulate if we don't
        self.optimizer.zero_grad()
        return loss.item()

    def val_step(self, x_batch, y_batch):
        """ Compute the loss on a batch without tracking gradients; returns a float """
        with torch.no_grad():
            out = self(x_batch)
            loss = self.loss_fn(out, y_batch)
        return loss.item()

    def predict(self, x):
        """ Return the index of the highest score for every sample in `x` """
        # predictions are never back-propagated, so skip building the graph
        with torch.no_grad():
            model_outputs = self(x)
        return model_outputs.argmax(1)

In [None]:
# Sanity check: build a throw-away model only to print its layer structure
model = LinearModel(hidden_nodes = [256,256])
print(model)
# remove the name again; the model that gets trained is created in a later cell
del model

In [None]:
def train(model, train_loader, validation_loader, n_epochs = 20):
    """
    Train `model` for `n_epochs` epochs and validate it after every epoch.

    params:
        model - object providing `train_step(x, y)` and `val_step(x, y)`
                (both returning the batch loss as a float) plus
                `train()` / `eval()` to switch modes
        train_loader - iterable of (x, y) training batches
        validation_loader - iterable of (x, y) validation batches
        n_epochs - number of passes over `train_loader`

    returns:
        (train_losses, val_losses) - two numpy arrays of length `n_epochs`
        holding the average batch loss per epoch; an entry is NaN when the
        corresponding loader produced no batch.
    """
    train_losses = np.zeros(n_epochs)
    val_losses = np.zeros(n_epochs)
    for i in range(n_epochs):
        # make sure we are in training mode, whatever state the caller left the model in
        model.train()
        train_batch_loss, n_train_batches = 0.0, 0
        for x, y in train_loader:
            # the step returns the mean loss of the batch, so we sum the
            # batch means here and divide by the number of batches below
            train_batch_loss += model.train_step(x, y)
            n_train_batches += 1

        model.eval()
        val_batch_loss, n_val_batches = 0.0, 0
        # iterate the loader that was passed in, not a global one
        for x, y in validation_loader:
            val_batch_loss += model.val_step(x, y)
            n_val_batches += 1
        model.train()

        # average over the batches seen in each phase
        train_losses[i] = train_batch_loss / n_train_batches if n_train_batches else float('nan')
        val_losses[i] = val_batch_loss / n_val_batches if n_val_batches else float('nan')

        if (i+1)%10==0: # we will log the average losses per epoch
            print(f"Epoch {i+1}/{n_epochs} | Avg. training loss: {train_losses[i]:.4f} | Avg. validation loss: {val_losses[i]:.4f}")
    return train_losses, val_losses

In [None]:
# Build the network that is actually going to be trained
model = LinearModel(hidden_nodes = [256, 256])
print(model)

# Adam optimizer + cross-entropy loss, handed to the model through `configure`
optimizer = torch.optim.Adam(model.parameters(), lr = LR)
loss_fn = nn.CrossEntropyLoss()
model.configure(optimizer, loss_fn)

# run the training loop; both arrays hold one average loss per epoch
train_losses, val_losses = train(model, train_loader, val_loader, n_epochs = N_EPOCHS)

# draw both loss curves in one figure
for curve, curve_name in ((train_losses, 'training loss'), (val_losses, 'validation loss')):
    plt.plot(curve, label = curve_name)
plt.title('Losses')
plt.legend()
plt.show()

## Can we visualize the model?

Well, we generally think of neural nets as black boxes, but we can try to interpret what the model learned by looking at its activations. With a CNN we would look at the learned filters; with a linear model we can still try to understand it by plotting the output of each hidden layer.

In [None]:
# I'm using this utility function to calculate
# the number of columns of activation matrices
# in the `plot_activations`
def mid_multiplier(num):
    """
    Returns the middle element of the sorted list of proper divisors of
    `num` (every divisor from 1 up to, but not including, `num` itself).

    The result always divides `num`, so `num` values can be laid out as a
    (num // result) x result grid. Examples: 256 -> 16, 128 -> 8, 7 -> 1.

    1 has no proper divisor; 1 is returned in that case so that a single
    value can still be shown as a 1x1 grid.

    raises:
        ValueError - if `num` is smaller than 1
    """
    if num < 1:
        raise ValueError("num must be a positive integer, got %r" % (num,))
    divisors = [d for d in range(1, num) if num % d == 0]
    if not divisors: # only happens for num == 1
        return 1
    return divisors[len(divisors) // 2]
    
def plot_activations(model: nn.Module, data: np.ndarray) -> None:
    """
    Plots the given model's activations layer by layer

    params:
        model - linear torch model (with the same structure as defined above):
                `model.linear` must be an indexable stack of layers
        data - numpy array that contains the raw (not standardized) samples,
               one flattened 28x28 image per row; a single 1-D sample is
               accepted as well

    The first row of the figure shows the input images, every following
    row shows the output of one block of `model.linear` as a heatmap.
    One column is drawn per sample.

    The data is standardized with the module level `train_mean` and
    `train_std`, exactly like the training data.

    The model is switched to eval mode while plotting and put back into
    the mode it was in before the call.

    raises:
        ValueError - if `data` contains no sample
    """
    data = np.atleast_2d(data)
    n_samples = data.shape[0]
    if n_samples == 0:
        raise ValueError("data must contain at least one sample")

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # process the samples like we did before training and
            # pass them to the model as a batch tensor
            standardized = (data - train_mean) / train_std
            inputs = torch.from_numpy(standardized.astype('float32'))

            num_layers = len(model.linear)
            # plus 1 more row for the samples; squeeze=False keeps `axs`
            # two dimensional even for a single sample
            fig, axs = plt.subplots(
                num_layers + 1,
                n_samples,
                figsize = (4 * n_samples, (num_layers + 1) * 10),
                squeeze = False,
            )
            # plot the samples at first row
            axs[0][0].set_ylabel("Inputs", fontsize = 48)
            for j in range(n_samples):
                axs[0][j].imshow(data[j].reshape(28, 28), cmap = 'gray')

            # the nth layer consumes the output of the (n-1)th layer, so we
            # pass the batch through the layers one by one and plot each
            # intermediate output
            output = inputs
            for i in range(num_layers):
                output = model.linear[i](output)
                n_node = output.shape[1] # output.shape: BATCHx(out_nodes)
                # grid layout of the heatmap, identical for every sample of the layer
                n_cols = int(mid_multiplier(n_node))
                n_rows = n_node // n_cols
                axs[i+1][0].set_ylabel("Layer %d"%i, fontsize = 48)
                for j in range(n_samples):
                    # plot the layer outputs (activations)
                    axs[i+1][j].imshow(output[j].view(n_rows, n_cols).numpy())

            plt.show()
    finally:
        # restore the previous mode even if plotting failed
        model.train(was_training)

In [None]:
# pick one random sample from each class
# `groupby` yields (group name, group frame) pairs; the index of a group
# frame holds the row labels of all samples of that class
sample_indices = [
    np.random.choice(group.index)
    for _, group in train_data.groupby('label')
]

# we can finally plot the model activations as heatmaps
# the picked values are index labels, so select the rows by label with `.loc`
samples = train_data.loc[sample_indices].values
labels = samples[:, 0]   # first column holds the label of each picked sample
inputs = samples[:, 1:]  # remaining columns hold the pixel values
plot_activations(model, inputs)


## Evaluation 

In [None]:
# make predictions using validation data
# (no gradients are needed for evaluation)
model.eval()
with torch.no_grad():
    preds = model.predict(x_val)
model.train()
preds = preds.tolist()

# let's take a look of the model's performance:
# classification_report expects the true labels first and the predictions
# second; swapping them exchanges precision and recall in the report
print(classification_report(y_val.numpy(), preds))

In [None]:
# make predictions using test data and prepare the submission dataframe
model.eval()
# the whole test set goes through the model at once, so don't track gradients
with torch.no_grad():
    preds = model.predict(x_test)
preds = preds.tolist()

# reuse the sample submission as a template and overwrite its label column
submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
submission['Label'] = preds
submission.to_csv('submission.csv', index = False)

In [None]:
# read the written file back to display the submission content
pd.read_csv('submission.csv')