# 1. Library Imports

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

# 2. Data Loading and Preliminary Processing

In [None]:
#Read the Kannada-MNIST training set and preview its first rows
data = pd.read_csv(filepath_or_buffer = '/kaggle/input/Kannada-MNIST/train.csv')
data.head(n = 5)

In [None]:
#Get summary statistics of the training data set (pandas `describe`)
data.describe()

As we can see, each row of our dataset is composed of 784 features, equivalent to a 28x28-pixel image. Additionally, our target, *label*, assumes 10 different values. Let us now separate features and target in order to proceed with the data processing.

In [None]:
#Target is the 'label' column; features are every remaining column
y_data = data['label']
x_data = data.drop(columns = ['label'])

First of all, we should take a look at our data as more than just a bunch of pixels, to get a sense of what we are working with. We can do that simply by printing some of the images.

In [None]:
#Show four randomly drawn training images in a 2x2 grid, each titled with its label
fig, axs = plt.subplots(nrows = 2, ncols = 2)
fig.tight_layout(pad = 3.0)
for ax in axs.flat: #Row-major order: (0,0), (0,1), (1,0), (1,1)
    img = np.random.randint(0, len(x_data))
    pixels = x_data.loc[img].values.reshape(28, 28)
    ax.imshow(pixels)
    ax.set_title(y_data[img])

# 3. Feature Engineering

We now take a look at our dataset and try to find ways of improving how it is going to help us classify digits. From the previous section, two aspects of this data have grabbed our attention:

* The values for each pixel basically range from 0 to 255; 
* The image data is presented in a one-dimensional format.

The first topic is fairly common. It is widely known that we need to scale our features in order to achieve better and faster models. Additionally, the second point is a very interesting one. It is possible to treat each image as a 1D array of features and to obtain fairly accurate results with this approach, as seen in previous versions of this notebook. However, if we want to take advantage of the CNN capabilities, it is better to represent the images in their original two-dimensional format. Hence, our first step is to apply this transformation.

In [None]:
#Define function to scale the dataset
def scale_data(data, scaler, opt = 0):
    """Scale `data` with `scaler` and return the result.

    opt == 0        -> fit the scaler on `data` and transform it (`fit_transform`).
    any other value -> only transform, using the scaler as it is (`transform`).
    """
    apply = scaler.fit_transform if opt == 0 else scaler.transform
    return apply(data)

In [None]:
#Fit a min-max scaler on the training features and scale them
my_scaler = MinMaxScaler()
scaled_x_data = scale_data(data = x_data, scaler = my_scaler, opt = 0)

#Compare the first three rows before and after scaling
print('Original dataset: \n', x_data.values[:3], '\n\n')
print('Scaled dataset: \n', scaled_x_data[:3])

In [None]:
#Convert data entries from 1D (784 values) to 2D (28x28) with a single reshape
scaled_x_data = scaled_x_data.reshape(len(scaled_x_data), 28, 28)
print('Sample converted image: \n')
print(scaled_x_data[0])

This concludes the feature engineering section of this notebook. We now proceed to constructing and training our predictive model.

# 4. Data Modeling

In this section we use the PyTorch library to construct a CNN classification model. First we are going to construct the neural network topology and then try to optimize it while training it with our data.

## 4.1. Data Preparation

In [None]:
#Selecting device: use the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Device: ', device)

When using the *torch* package, we are working with its own data type: tensors. Then, the first thing we are going to do is to adjust our data to this format. Additionally, it is important to take advantage of the tools provided by this library. One of them is the *DataLoader* module, which allows for better management of the data packages on the training procedure. Here, we are also creating a dataloader for our dataset.

Additionally, the *test.csv* file contains the data used in the final scoring of the model. So, if we want to really understand how our model is going to behave in production, we have to separate our data set into two parts as follows:

* Train data: 75%;
* Test data: 25%.

In [None]:
#Split data into train (75%) and test (25%) sets, keeping the label proportions in both
x_train, x_test, y_train, y_test = train_test_split(scaled_x_data, y_data,
                                                    test_size = 0.25,
                                                    stratify = y_data,
                                                    random_state = 42) #Fixed seed: same split on every run

print('Train size: %d \n Test size: %d' %(len(x_train), len(x_test)))

In [None]:
#Define model hyperparameters
param = dict(
    num_jobs = 2,      #Passed as num_workers to the DataLoaders
    batch_size = 128,  #Samples per batch in the DataLoaders
    num_epochs = 100,  #Overwritten in the training cell before the loop runs
)

In [None]:
#Define class to load the dataset as a tensor
class MnistData(Dataset):
    """Dataset that serves (image, label) pairs as float32 tensors.

    x_data: indexable collection of numpy arrays, each reshapeable to (1, 28, 28).
    y_data: indexable collection of numpy arrays holding the label of each image.
    """

    def __init__(self, x_data, y_data):
        self.x_data, self.y_data = x_data, y_data

    def __len__(self):
        #Number of samples equals the number of images
        return len(self.x_data)

    def __getitem__(self, index):
        #Add a leading channel axis so each image has shape (1, 28, 28)
        image = np.reshape(self.x_data[index].astype(np.float32), (1, 28, 28))
        label = self.y_data[index].astype(np.float32)
        return torch.from_numpy(image), torch.from_numpy(label)

In [None]:
#Wrap both splits as datasets; labels are reshaped into a column so each item is a 1-element array
train_data = MnistData(x_train, y_train.values.reshape(-1, 1))
test_data = MnistData(x_test, y_test.values.reshape(-1, 1))

In [None]:
#Create DataLoaders; both splits share the same batching options
#NOTE(review): the test loader is shuffled too, which evaluation does not need
loader_options = {
    'batch_size': param['batch_size'],
    'shuffle': True,
    'num_workers': param['num_jobs'],
}
train_loader = DataLoader(train_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

## 4.2. Neural Network Topology Construction 

In [None]:
class MnistClassification(nn.Module):
    """Convolutional classifier for single-channel 28x28 images.

    Three conv -> batch-norm -> ReLU -> max-pool stages are followed by six
    linear -> batch-norm -> ReLU -> dropout stages and a final linear layer.

    Parameters
    ----------
    input_dim : stored on the instance as `input_dim`; not used to size any layer.
    output_dim : number of classes, i.e. width of the output layer.
    """
    
    def __init__(self, input_dim, output_dim):
        #Initialize parent class
        super(MnistClassification, self).__init__()
        self.input_dim = input_dim
        
        #Convolutional feature extractor
        #Spatial size for a 28x28 input: 28 -> 25 -> 23 -> 21 -> 19 -> 17 -> 8
        self.conv_01 = nn.Conv2d(in_channels = 1, out_channels = 16, kernel_size = 6, stride = 1, padding = 1)
        self.conv_bn_01 = nn.BatchNorm2d(num_features = 16)
        self.pool_01 = nn.MaxPool2d(kernel_size = 3, stride = 1)
        
        self.conv_02 = nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size = 5, stride = 1, padding = 1)
        self.conv_bn_02 = nn.BatchNorm2d(num_features = 32)
        self.pool_02 = nn.MaxPool2d(kernel_size = 3, stride = 1)
        
        self.conv_03 = nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 5, stride = 1, padding = 1)
        self.conv_bn_03 = nn.BatchNorm2d(num_features = 64)
        self.pool_03 = nn.MaxPool2d(kernel_size = 3, stride = 2)
        
        #Linear and normalization layers (4096 = 64 channels * 8 * 8 after the last pooling)
        self.norm_01 = nn.BatchNorm1d(num_features = 2048)
        self.layer_01 = nn.Linear(in_features = 4096, out_features = 2048)
        self.norm_02 = nn.BatchNorm1d(num_features = 1024)
        self.layer_02 = nn.Linear(in_features = 2048, out_features = 1024)
        self.norm_03 = nn.BatchNorm1d(num_features = 512)
        self.layer_03 = nn.Linear(in_features = 1024, out_features = 512)
        self.norm_04 = nn.BatchNorm1d(num_features = 256)
        self.layer_04 = nn.Linear(in_features = 512, out_features = 256)
        self.norm_05 = nn.BatchNorm1d(num_features = 128)
        self.layer_05 = nn.Linear(in_features = 256, out_features = 128)
        self.norm_06 = nn.BatchNorm1d(num_features = 64)
        self.layer_06 = nn.Linear(in_features = 128, out_features = 64)
        self.output_layer = nn.Linear(in_features = 64, out_features = output_dim)
        
        #Activation and dropout layers
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        #Kept for callers that want probabilities: model.softmax(model(x)).
        #It is deliberately NOT applied in forward (see there).
        self.softmax = nn.Softmax(dim = 1)
            
    def forward(self, x_data):
        """Return raw class scores (logits) of shape (batch, output_dim).

        x_data: tensor of shape (batch, 1, 28, 28).

        No softmax is applied: nn.CrossEntropyLoss applies log-softmax itself,
        so feeding it probabilities would squash the gradients. The argmax of
        the logits is the same as the argmax of the softmax probabilities.
        """
        
        #Move the input to wherever the model's parameters live
        x_data = x_data.to(self.conv_01.weight.device)
        
        #Convolutional stages
        response = self.relu(self.conv_bn_01(self.conv_01(x_data)))
        response = self.pool_01(response)
        response = self.relu(self.conv_bn_02(self.conv_02(response)))
        response = self.pool_02(response)
        response = self.relu(self.conv_bn_03(self.conv_03(response)))
        response = self.pool_03(response)
        
        #Flatten the feature maps to one 4096-long vector per sample
        response = response.reshape(len(x_data), 4096)
        
        #Fully connected stages
        response = self.dropout(self.relu(self.norm_01(self.layer_01(response))))
        response = self.dropout(self.relu(self.norm_02(self.layer_02(response))))
        response = self.dropout(self.relu(self.norm_03(self.layer_03(response))))
        response = self.dropout(self.relu(self.norm_04(self.layer_04(response))))
        response = self.dropout(self.relu(self.norm_05(self.layer_05(response))))
        response = self.dropout(self.relu(self.norm_06(self.layer_06(response))))
        
        return self.output_layer(response)

In [None]:
#Create CNN instance
param.update({
    'input_dim': x_data.shape[1],           #Number of pixel features per image
    'output_dim': len(y_data.unique()),     #Number of distinct labels
    'num_layers': 3,                        #Stored only; not read by the model constructor
})

model = MnistClassification(param['input_dim'], param['output_dim'])
model = model.to(device)

In [None]:
#Define loss function and optimizer
param.update(learning_rate = 1e-4, weight_decay = 5e-3)

loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.to(device)

optimizer = optim.Adam(model.parameters(),
                       lr = param['learning_rate'],
                       weight_decay = param['weight_decay'])

## 4.3. Training Setup Construction

In [None]:
#Define training model function
def train_model(estimator, train_data, epoch):
    """Train `estimator` for one pass over `train_data`.

    Parameters
    ----------
    estimator : the model to train; it is called on each batch of inputs.
    train_data : iterable of (x_batch, y_batch) pairs, y_batch of shape (batch, 1).
    epoch : zero-based epoch index; it sets the learning rate and the printed epoch number.

    Returns
    -------
    [mean batch loss, mean batch accuracy] for this epoch.

    Reads the notebook globals `param`, `device` and `loss_function`.
    """
    
    #Toggle training mode
    estimator.train()
    
    #The learning rate depends only on the epoch (divided by 10 every 10 epochs,
    #starting at learning_rate / 10), so the optimizer is built once per call.
    #Rebuilding it for every batch would throw away Adam's running moment
    #estimates after each step.
    #NOTE: the optimizer state still starts fresh on each call (i.e. each epoch).
    optimizer = optim.Adam(params = estimator.parameters(),
                           lr = param['learning_rate'] * (1/(10**((epoch/10) + 1))),
                           weight_decay = param['weight_decay'])
    
    epoch_error = []
    epoch_accuracy = []
    
    for x_batch, y_batch in train_data:
        
        #Forward process
        x_batch = x_batch.to(device)
        #Labels arrive as float tensors of shape (batch, 1); the loss needs int64 of shape (batch,)
        targets = torch.squeeze(y_batch, 1).long().to(device)
        batch_response = estimator(x_batch)
        
        #Compute error
        batch_error = loss_function(batch_response, targets)
        epoch_error.append(batch_error.item())
        
        batch_accuracy = accuracy_score(targets.cpu().numpy(),
                                        np.argmax(batch_response.detach().cpu().numpy(), 1))
        epoch_accuracy.append(batch_accuracy)
        
        #Backward process
        optimizer.zero_grad()
        batch_error.backward()
        optimizer.step()
        
    epoch_error = np.asarray(epoch_error)
    epoch_accuracy = np.asarray(epoch_accuracy)
    
    print('Epoch %d TRAIN error: %.4f +/- %.4f / accuracy: %.4f' %(epoch+1, epoch_error.mean(), 
                                                                   epoch_error.std(), epoch_accuracy.mean()))
    
    return [epoch_error.mean(), epoch_accuracy.mean()] 

In [None]:
#Define testing model function
def test_model(estimator, test_data, epoch):
    """Evaluate `estimator` on `test_data` without updating its weights.

    Parameters
    ----------
    estimator : the model to evaluate; it is called on each batch of inputs.
    test_data : iterable of (x_batch, y_batch) pairs, y_batch of shape (batch, 1).
    epoch : zero-based epoch index, used only in the printed message.

    Returns
    -------
    [mean batch loss, mean batch accuracy] over `test_data`.

    Reads the notebook globals `device` and `loss_function`.
    """
    
    #Switch to evaluation mode
    estimator.eval()
    
    epoch_error = []
    epoch_accuracy = []
    
    #No gradients are needed for evaluation
    with torch.no_grad():
        for x_batch, y_batch in test_data:
            #Forward process
            x_batch = x_batch.to(device)
            #Labels arrive as float tensors of shape (batch, 1); the loss needs int64 of shape (batch,)
            targets = torch.squeeze(y_batch, 1).long().to(device)
            batch_response = estimator(x_batch)

            #Compute error
            batch_error = loss_function(batch_response, targets)
            epoch_error.append(batch_error.item())
            
            batch_accuracy = accuracy_score(targets.cpu().numpy(),
                                            np.argmax(batch_response.cpu().numpy(), 1))
            epoch_accuracy.append(batch_accuracy)

    epoch_error = np.asarray(epoch_error)
    epoch_accuracy = np.asarray(epoch_accuracy)
    
    print('Epoch %d TEST error: %.4f +/- %.4f / accuracy: %.4f' %(epoch+1, epoch_error.mean(), 
                                                                  epoch_error.std(), epoch_accuracy.mean()))

    return [epoch_error.mean(), epoch_accuracy.mean()]

## 4.4. Train Model

In [None]:
#Train model: one training pass followed by one evaluation pass per epoch
param['num_epochs'] = 50

#Each entry is the [loss, accuracy] pair returned for one epoch
train_error, test_error = [], []

for epoch in range(param['num_epochs']):
    train_metrics = train_model(model, train_loader, epoch)
    train_error.append(train_metrics)
    test_metrics = test_model(model, test_loader, epoch)
    test_error.append(test_metrics)
    print('-----------------------------------------------------------')

In [None]:
#Plot loss function profile (first element of each per-epoch [loss, accuracy] pair)
epochs = list(range(param['num_epochs']))
train_loss = [metrics[0] for metrics in train_error]
test_loss = [metrics[0] for metrics in test_error]

plt.plot(epochs, train_loss)
plt.plot(epochs, test_loss)
plt.legend(['Train', 'Test'])
plt.xlabel('Epochs')
plt.ylabel('Loss')

In [None]:
#Plot accuracy profile (second element of each per-epoch [loss, accuracy] pair)
epochs = list(range(param['num_epochs']))
train_accuracy = [metrics[1] for metrics in train_error]
test_accuracy = [metrics[1] for metrics in test_error]

plt.plot(epochs, train_accuracy)
plt.plot(epochs, test_accuracy)
plt.legend(['Train', 'Test'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')

## 4.5. Predict Test Data Target

In [None]:
#Load the competition test data
#NOTE: this rebinds `test_data`, which earlier held the MnistData hold-out set passed to `test_loader`;
#re-running the DataLoader cell after this one would wrap this DataFrame instead.
test_data = pd.read_csv('/kaggle/input/Kannada-MNIST/test.csv')

In [None]:
#Adjust test data: keep the ids aside, scale the pixels with the already fitted scaler
test_data_id = test_data['id']

test_features = test_data.drop(columns = 'id')
scaled_test_data = scale_data(test_features, my_scaler, 1)
#Reshape every row to (1, 28, 28): one channel, 28x28 pixels
scaled_test_data = scaled_test_data.reshape(len(scaled_test_data), 1, 28, 28)

In [None]:
#Predict the label of every competition test image
model.eval() #Disable dropout and use batch-norm running statistics, whatever mode earlier cells left
with torch.no_grad(): #Inference only: do not build the autograd graph
    test_tensor = torch.from_numpy(scaled_test_data.astype(np.float32))
    #Call the module itself rather than .forward so registered hooks also run
    pred = np.argmax(model(test_tensor).cpu().numpy(), 1)

In [None]:
#Assemble the submission table: one row per test id with its predicted label
submission_data = pd.DataFrame({'id': test_data_id.values, 'label': pred})
print(submission_data.head())

In [None]:
#Write the submission (columns: id, label) to disk without the DataFrame index
submission_data.to_csv('submission.csv', index = False)