### This solution was made without pretrained models or additional datasets

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision as thv
import random
from tqdm.notebook import tqdm
import copy
import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
from IPython.display import FileLink

Here we fix all the sources of randomness, so every restart gives the same results.

In [None]:
def setSeed(x):
    """Seed every random number generator used in this notebook with ``x``.

    The generators are seeded in this order: Python's ``random``, NumPy,
    PyTorch, and PyTorch's CUDA generator. cuDNN is then switched to
    deterministic mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_rng in seeders:
        seed_rng(x)
    torch.backends.cudnn.deterministic = True


setSeed(2)
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

Copying the .csv files to the `pandas.DataFrame`<br>
<br>
Links: [`pandas.DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

In [None]:
# Read the training table (its first column is the target) and the test table,
# which contains pixel columns only.
mnist_train = pd.read_csv("../input/digit-recognizer/train.csv")
mnist_test = pd.read_csv("../input/digit-recognizer/test.csv")
# Last expression of the cell: the notebook renders the training DataFrame.
mnist_train

We can implement `torch.utils.data.Dataset` with our data type, so we can use an automatic `DataLoader`.<br>
It's not necessary, because you can do a batch split manually, but still, it's good to use.<br>
<br>
Links: [`unsqueeze`](https://pytorch.org/docs/stable/generated/torch.unsqueeze.html), [datasets and dataloader tutorial](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """In-memory image dataset that yields ``(image, target)`` pairs.

    Args:
        images: image tensor without a channel axis. A channel axis is
            inserted at position 1 and the values are divided by 255, which
            maps 0..255 pixel data to 0..1.
        targets: per-image answers, indexed in step with ``images``.
        transforms: optional callable applied to a single image every time
            it is fetched; ``None`` (the default) returns images unchanged.
    """

    def __init__(self, images, targets, transforms=None):
        self.targets = targets
        # Add the channel dimension and normalise the pixel values.
        self.images = images.unsqueeze(1) / 255
        self.transforms = transforms

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, i):
        image = self.images[i]
        # Compare with None explicitly: a transform that defines __len__
        # (for example a container module) may be falsy yet still has to run.
        if self.transforms is not None:
            image = self.transforms(image)
        return image, self.targets[i]

Now we need to extract the data from the `pandas.DataFrame` into `torch.Tensor` objects.<br>
`torch.utils.data.random_split` returns two `torch.utils.data.Subset` objects, which we will use for training and validation.<br>
Note that `subset.dataset` still holds the **whole** dataset as a `pandas.DataFrame`.<br>
`subset.dataset.iloc[subset.indices]` returns a dataframe that contains only the selected rows.<br>
In this dataframe, the first column is our target, and the other columns hold the pixel information.

In [None]:
def extractData(subset):
    """Convert a subset of the raw table into image and target tensors.

    Args:
        subset: object exposing ``dataset`` (a DataFrame whose first column
            is the target and whose remaining 784 columns are pixels) and
            ``indices`` (the selected row positions).

    Returns:
        ``(x, y)``: float images shaped ``(-1, 28, 28)`` and long targets.
    """
    # Materialise the selected rows once as a NumPy array.
    rows = subset.dataset.iloc[subset.indices].values
    # Column 0 is the answer, every other column is one pixel of the image.
    labels = torch.from_numpy(rows[:, 0]).to(torch.long)
    pixels = torch.from_numpy(rows[:, 1:]).to(torch.float).reshape(-1, 28, 28)
    return pixels, labels

def splitData(dataframe):
    """Randomly split ``dataframe`` into training and validation tensors.

    One sixth of the rows (rounded down) is held out for validation.

    Returns:
        ``((x_train, y_train), (x_val, y_val))`` as produced by
        ``extractData`` for each part.
    """
    holdout_size = len(dataframe) // 6
    part_sizes = [len(dataframe) - holdout_size, holdout_size]
    train_subset, val_subset = torch.utils.data.random_split(dataframe, part_sizes)

    train_tensors = extractData(train_subset)
    val_tensors = extractData(val_subset)
    return train_tensors, val_tensors

train_tensors, val_tensors = splitData(mnist_train)
x_train, y_train = train_tensors
x_val, y_val = val_tensors
# The test table has no answers, so every column is pixel data.
x_test = torch.Tensor(mnist_test.values).reshape(-1, 28, 28)
# Show the image / target shapes of the training and validation parts.
for features, labels in ((x_train, y_train), (x_val, y_val)):
    print(features.shape, labels.shape)

Let's look at the first image and its target from the train subset<br>
`.item()` returns a value of the tensor with one element

In [None]:
# Draw the first training image and use its target as the x-axis label.
first_image, first_target = x_train[0], y_train[0]
plt.imshow(first_image)
plt.xlabel(first_target.item())
plt.show()

35000 images for the training are good, but we can do better. Let's do some *data augmentation*.<br>
We will use `torchvision.transforms` to do so. First, we will randomly rotate the image by -10..10 degrees.<br>
After that, we will apply a tiny random blur to it.<br>
This will increase the overall amount of pictures that our net will see.<br>
<br>
Links: [`torchvision.transforms`](https://pytorch.org/vision/stable/transforms.html)

In [None]:
# Augmentation pipeline: a random rotation followed by a random Gaussian blur.
transforms = torch.nn.Sequential(
    # Rotation by a random angle (argument 10; the notebook text describes it as -10..10 degrees).
    # `resample=0` is a deliberate workaround: without this deprecated argument
    # scripted_transforms crashes in this kernel. It should be removed in newer versions.
    thv.transforms.RandomRotation(10, resample=0),
    # Blur with kernel size 3; sigma is given as the range (0.0001, 0.3).
    thv.transforms.GaussianBlur(3, sigma=(0.0001, 0.3)),
)

# Compile the pipeline with TorchScript; the author does this to speed up the transformation.
scripted_transforms = torch.jit.script(transforms)

Let's see what these transforms do.

In [None]:
# Transform only the image that is displayed. The original call pushed the
# whole training tensor through the pipeline just to show its first image.
# The one-image slice keeps the leading dimension, so the transforms still
# receive a 3-D tensor.
augmented_first = scripted_transforms(x_train[:1])[0]
plt.imshow(augmented_first)
plt.xlabel(y_train[0].item())
plt.show()

Creating train set and validation set

In [None]:
# Validation images are used as they are; only the training set is augmented.
valset = Dataset(x_val, y_val)
trainset = Dataset(x_train, y_train, transforms=scripted_transforms)

Creating data loaders with our datasets. Train loader will automatically give us shuffled batches.

In [None]:
# Both loaders use batches of 200 items and two workers to parallelize loading.
# Only the training loader shuffles its data.
loader_options = dict(batch_size=200, num_workers=2)
train_loader = DataLoader(trainset, shuffle=True, **loader_options)
val_loader = DataLoader(valset, shuffle=False, **loader_options)

Now we need to create the model. I will use a convolutional neural network (CNN)<br>
It can be easily understood on the LeNet5 example (check links).<br>
Apart from that, we will use some more specific layers:
* `nn.BatchNorm` will normalize the data from the previous layers. It will speed up the training of our net.
* `nn.Dropout(0.25)` will turn off each neuron from the previous layer with a 25% chance. It will help us with the overfitting.

Notice that I don't use softmax at the end, because I don't need to have probabilities for each class. It will reduce unnecessary computations.<br>
<br>
Links: [LeNet5](https://towardsdatascience.com/understanding-and-implementing-lenet-5-cnn-architecture-deep-learning-a2d531ebc342), [BatchNorm and Dropout explanation](https://towardsdatascience.com/batch-normalization-and-dropout-in-neural-networks-explained-with-pytorch-47d7a8459bcd), [BatchNorm docs](https://pytorch.org/docs/stable/nn.html#normalization-layers), [Dropout docs](https://pytorch.org/docs/stable/nn.html#dropout-layers)

In [None]:
class Net(torch.nn.Module):
    """Convolutional classifier that maps an image batch to 10 class scores.

    Two convolutional stages (each ending in 2x2 max pooling and channel
    dropout) are followed by two fully connected stages. The first linear
    layer expects ``7 * 7 * 64`` flattened features. The output is the raw
    scores; no softmax is applied.
    """

    def __init__(self):
        super().__init__()

        def conv3x3(in_channels):
            # 3x3 convolution to 64 channels; padding=1 keeps height and width.
            return nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)

        def norm2d():
            # Batch normalisation over 64 channels (eps=1e-05, momentum=0.1 are the defaults).
            return nn.BatchNorm2d(64)

        # Layers are created in the same order as they are applied.
        self.conv1 = nn.Sequential(
            conv3x3(1),
            norm2d(),
            conv3x3(64),
            norm2d(),
            nn.ReLU(),
            conv3x3(64),
            norm2d(),
            conv3x3(64),
            norm2d(),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(0.25),
        )

        self.conv2 = nn.Sequential(
            conv3x3(64),
            norm2d(),
            nn.ReLU(),
            conv3x3(64),
            norm2d(),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout2d(0.25),
        )

        self.fc1 = nn.Sequential(
            nn.Dropout(0.25),
            nn.Linear(7 * 7 * 64, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.25),
        )

        self.fc2 = nn.Sequential(
            nn.Dropout(0.25),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        features = self.conv2(self.conv1(x))
        # Collapse channels, height and width into one feature axis per image.
        flat = features.view(features.size(0), -1)
        return self.fc2(self.fc1(flat))

# Build the network, then move it to the selected device (the GPU if available).
net = Net()
net = net.to(device)

`CrossEntropy` is a common loss function for classification tasks.<br>
`RMSprop` is not that easy to understand, so I'll just leave a link. But still, you can use other optimizers, maybe they will improve the quality.<br>
`ReduceLROnPlateau` will reduce the optimizer's learning rate during the training process.<br>
When the validation loss doesn't change for the `patience` amount of epochs, the learning rate multiplies by the `factor`. The `threshold` sets the needed delta between losses.<br>
<br>
Links: [`CrossEntropy docs`](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss), [`RMSprop explanation`](https://towardsdatascience.com/understanding-rmsprop-faster-neural-network-learning-62e116fcf29a), [`optimizators and schedulers in PyTorch docs`](https://pytorch.org/docs/stable/optim.html), 

In [None]:
# Loss function for the classification task.
loss = nn.CrossEntropyLoss()
# RMSprop over all parameters of the net.
optimizer = torch.optim.RMSprop(net.parameters(), lr=1e-3, alpha=0.99)
# Multiply the learning rate by `factor` once the monitored loss has not
# changed for `patience` epochs; verbose=True prints learning rate changes.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.1,
    patience=3,
    threshold=0.0001,
    verbose=True,
)

Now we need to create a training function, which will return us loss and accuracy histories. Also, it will return the final net, and the best loss and accuracy net.<br>
Calling `net.eval()` disables the dropout layers and makes the batch normalization layers use their running statistics instead of the statistics of the current batch. It's very important, don't forget about it!

In [None]:
def train_model(net, loss, optimizer, scheduler, num_epochs):
    """Train ``net`` and keep copies of the models that validated best.

    Every epoch runs a training pass over the module-level ``train_loader``
    followed by a validation pass over the module-level ``val_loader``.
    Batches are moved to the module-level ``device``.

    Args:
        net: model to train; it is updated in place.
        loss: callable ``loss(predictions, targets)`` returning a scalar tensor.
        optimizer: optimizer that updates the parameters of ``net``.
        scheduler: scheduler whose ``step`` is called with the validation
            loss of each epoch.
        num_epochs: number of train + validation epochs to run.

    Returns:
        Tuple ``(best_loss_net, best_acc_net, net, loss_hist, acc_hist)``:
        deep copies of the model at the epoch with the lowest validation loss
        and at the epoch with the highest validation accuracy (both ``None``
        when ``num_epochs`` is 0), the model after the last epoch, and the
        per-epoch loss / accuracy histories keyed by ``'train'`` and ``'val'``.
    """
    loss_hist = {'train': [], 'val': []}
    acc_hist = {'train': [], 'val': []}

    # The best-so-far trackers must live outside both loops. Resetting them
    # for every phase would make the "best" models simply the last epoch's.
    best_loss = float('inf')
    best_acc = float('-inf')
    best_loss_net = None
    best_acc_net = None

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            training = phase == 'train'

            # choose dataloader and net mode based on phase
            if training:
                dataloader = train_loader
                net.train()
            else:
                dataloader = val_loader
                net.eval()

            # these accumulate the per-batch loss and accuracy of this phase
            running_loss = 0.
            running_acc = 0.

            # tqdm provides the progress bar for the loop
            with tqdm(dataloader, unit='batch') as tepoch:
                # print the current epoch and phase
                tepoch.set_description(f'Epoch {epoch + 1}/{num_epochs}, {phase:5} phase')

                # iterate the dataloader (one iteration - one batch)
                for images, targets in tepoch:
                    # transfer all data to gpu if possible
                    images, targets = images.to(device), targets.to(device)

                    # PyTorch accumulates gradients, so reset them for every batch.
                    optimizer.zero_grad()

                    # Gradients are only needed in the training phase.
                    with torch.set_grad_enabled(training):
                        preds = net(images)
                        loss_value = loss(preds, targets)
                        # the class with the maximum score is the answer
                        preds_class = preds.argmax(dim=1)
                        if training:
                            loss_value.backward()
                            optimizer.step()

                    running_loss += loss_value.item()
                    # correct predictions become 1, others 0; the mean is the batch accuracy
                    running_acc += (preds_class == targets.data).float().mean().item()

                # mean of the per-batch values over the number of batches
                epoch_loss = running_loss / len(dataloader)
                epoch_acc = running_acc / len(dataloader)

                # update our history
                loss_hist[phase].append(epoch_loss)
                acc_hist[phase].append(epoch_acc)

                if not training:
                    # make a scheduler's step based on the epoch's validation loss
                    scheduler.step(epoch_loss)

                    # remember the best models as independent copies of the network
                    if epoch_loss < best_loss:
                        best_loss = epoch_loss
                        best_loss_net = copy.deepcopy(net)

                    if epoch_acc > best_acc:
                        best_acc = epoch_acc
                        best_acc_net = copy.deepcopy(net)

                # print epoch's accuracy and loss
                tepoch.set_postfix(loss=f'{epoch_loss:.5f}', accuracy=f'{epoch_acc:.5f}')

    print(best_acc, best_loss)
    return best_loss_net, best_acc_net, net, loss_hist, acc_hist

In [None]:
# Train for 40 epochs, then unpack the best models, the final net and both histories.
training_result = train_model(net, loss, optimizer, scheduler, num_epochs=40)
best_loss_net, best_acc_net, net, loss_hist, acc_hist = training_result

Now let's print loss and accuracy histories

In [None]:
# One curve per phase: accuracy recorded for every epoch.
plt.figure(figsize=(14, 7))
for phase, values in acc_hist.items():
    plt.plot(values, label=phase)
plt.title('Accuracy')
plt.xlabel('Epoch', fontsize=14)
plt.legend(loc='upper left')
plt.grid()

In [None]:
# One curve per phase: loss recorded for every epoch.
plt.figure(figsize=(14, 7))
for phase, values in loss_hist.items():
    plt.plot(values, label=phase)
plt.title('Loss')
plt.xlabel('Epoch', fontsize=14)
plt.legend(loc='upper left')
plt.grid()

As you can see on the plots, we don't really face overfitting. But still, this net doesn't give us the best results.<br>
You can try different parameters, optimizers, schedulers, etc., which might improve accuracy.

Let's see which pictures this net recognizes incorrectly.

In [None]:
# Keep using the best-accuracy model. The original line assigned
# best_loss_net here, which silently replaced the best-accuracy model for
# this cell and for the later submission cell.
best_acc_net = best_acc_net.to(device)
best_acc_net.eval()

# Collect per-batch predictions and join them once at the end, instead of
# re-allocating a growing array for every batch.
batch_predictions = []
with torch.no_grad():
    for images, _ in val_loader:
        preds = best_acc_net(images.to(device))
        # only a cpu tensor can be converted to a numpy array
        batch_predictions.append(preds.argmax(dim=1).cpu().numpy())

if batch_predictions:
    preds_class = np.concatenate(batch_predictions)
else:
    preds_class = np.array([], dtype=int)

# boolean mask: True where the prediction differs from the answer
errors = (y_val.numpy() != preds_class)

def draw_errors(x_val, y_val, preds_class, errors):
    """Show up to six misclassified images with their predicted and true labels.

    Args:
        x_val: images, indexable by the boolean mask ``errors``.
        y_val: true labels aligned with ``x_val``.
        preds_class: predicted labels aligned with ``x_val``.
        errors: boolean mask that is true where the prediction is wrong.

    Nothing is drawn when there are no errors.
    """
    # keep only the misclassified items
    x_val, y_val, preds_class = x_val[errors], y_val[errors], preds_class[errors]
    # Never ask for more columns than there are errors to show.
    cols = min(6, len(preds_class))
    if cols == 0:
        return
    # squeeze=False always returns a 2-D array of axes, even for one column.
    fig, ax = plt.subplots(1, cols, sharex=True, squeeze=False)
    fig.set_size_inches(20, 10)
    for col, axis in enumerate(ax[0]):
        axis.imshow(x_val[col])
        axis.set_title(f'Predicted  {preds_class[col]}\nTrue: {y_val[col]}')


draw_errors(x_val, y_val, preds_class, errors)

Now we need to create dataloader for the test data. Same as for the train, but only with images.

In [None]:
class TestDataset(torch.utils.data.Dataset):
    """In-memory dataset of unlabeled images, yielding one image per index.

    Args:
        images: image tensor without a channel axis. A channel axis is
            inserted at position 1 and the values are divided by 255.

    The images stay on the device of the ``images`` argument. The dataset no
    longer reads the global ``device``; the prediction loop moves each batch
    to the target device itself.
    """

    def __init__(self, images):
        self.images = images.unsqueeze(1) / 255

    def __len__(self):
        return len(self.images)

    def __getitem__(self, i):
        return self.images[i]
    
    
# Keep the original order of the test images: predictions are matched to ids by position.
testset = TestDataset(x_test)
test_loader = DataLoader(
    testset,
    batch_size=100,
    num_workers=0,
    shuffle=False,
)

Calculating predictions on the best accuracy model

In [None]:
# Make sure dropout and batch normalization are in inference mode, even if
# this cell is run without the validation cell before it.
best_acc_net.eval()

# Collect per-batch predictions and join them once at the end, instead of
# re-allocating a growing array for every batch.
batch_predictions = []
with torch.no_grad():
    for images in test_loader:
        preds = best_acc_net(images.to(device))
        # only a cpu tensor can be converted to a numpy array
        batch_predictions.append(preds.argmax(dim=1).cpu().numpy())

if batch_predictions:
    preds_class = np.concatenate(batch_predictions)
else:
    preds_class = np.array([], dtype=int)

Creating dataframe like sample_submission.csv

In [None]:
# The sample submission numbers its ids from 1, so build the index that way
# and name it 'ImageId'; the predictions go into the 'Label' column.
image_ids = pd.Index(np.arange(1, len(preds_class) + 1), name='ImageId')
df = pd.DataFrame({'Label': preds_class}, index=image_ids)
df

Now we need to create the file and download it

In [None]:
# save dataframe to csv
df.to_csv('submission.csv')
# generate download link
FileLink(r'submission.csv')