In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root_dir, file_name) for file_name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load Training Dataset

In [None]:
import numpy as np
import os


data_root = '/kaggle/input'

# Parse the whole training CSV (header row skipped) into one float32 matrix.
train_set = np.loadtxt(
    os.path.join(data_root, 'digit-recognizer/train.csv'),
    delimiter=',',
    skiprows=1,
    dtype=np.float32,
)
print(train_set.shape)

# Column 0 becomes `labels`, kept 2-D with shape (N, 1); every remaining
# column goes to `images`.
labels = train_set[:, :1]
images = train_set[:, 1:]
print(labels.shape)
print(images.shape)

### Visualize Training Samples

In [None]:
import matplotlib.pyplot as plt

# Draw an h x w grid of randomly chosen training samples with their labels.
num_train_set, _ = labels.shape

# Shuffled row indices; the first h * w of them are the samples that get shown.
idxs = np.arange(num_train_set)
np.random.shuffle(idxs)

h, w = 10, 10

fig, ax = plt.subplots(h, w, figsize=(20, 20))
for i in range(h):
    for j in range(w):
        # Row-major flat position: the row stride is the number of columns (w).
        # Using h here only works while the grid happens to be square.
        idx = idxs[i * w + j]
        number = labels[idx, 0].astype(np.uint8)
        ax[i, j].set_title(f'label: {number}')
        # Each image row is reshaped to 28 x 28 for display.
        ax[i, j].imshow(images[idx, :].reshape((28, 28)), cmap='gray')
        ax[i, j].axis('off')

plt.show()

### Define the operations for CNN

In [None]:
import torch.nn as nn

import collections

# Lookup table used by the blocks below: value of the `activation` option -> module class.
_activations = dict(
    relu=nn.ReLU,
    relu6=nn.ReLU6,
    leaky_relu=nn.LeakyReLU,
)


class BaseBlock(nn.Module):
    """Common parent of the layer blocks defined in this notebook.

    Subclasses are expected to assign an ``nn.Sequential`` to ``self._layer``;
    the constructor here only declares the attribute's type and does not
    create it.
    """

    def __init__(self):
        super().__init__()
        # Type declaration only: nothing is bound until a subclass assigns it.
        self._layer: nn.Sequential

    def forward(self, x):
        """Return the result of passing ``x`` through ``self._layer``."""
        pipeline = self._layer
        return pipeline(x)


class DenseBlock(BaseBlock):
    """Fully connected layer, optionally followed by an activation.

    Args:
        shape: two-element sequence ``(in_dims, out_dims)`` for ``nn.Linear``.
        **params: optional settings read with ``params.get``:
            activation: key into ``_activations``; when set, that module
                (built with ``inplace=True``) is appended under the same name.
            w_init: callable applied to the linear layer's weight.
            b_init: callable applied to the linear layer's bias.
    """

    def __init__(self, shape, **params):
        super().__init__()
        in_dims, out_dims = shape

        # The linear layer is always the first stage and is registered as 'dense'.
        dense = nn.Linear(in_dims, out_dims)
        stages = collections.OrderedDict(dense=dense)

        act_name = params.get('activation')
        if act_name:
            stages[act_name] = _activations[act_name](inplace=True)

        self._layer = nn.Sequential(stages)

        # Optional custom initialisers: weight first, then bias.
        for option, tensor in (('w_init', dense.weight), ('b_init', dense.bias)):
            initializer = params.get(option, None)
            if initializer:
                initializer(tensor)


class Conv2DBlock(BaseBlock):
    """2-D convolution with optional batch norm, activation and max pooling.

    Stages are registered in this fixed order, each only when enabled:
    'conv' -> 'bn' -> <activation name> -> 'max_pool'.

    Args:
        shape: four-element sequence ``(h, w, in_channels, out_channels)``;
            ``(h, w)`` is the kernel size.
        stride: stride passed to ``nn.Conv2d``.
        padding: padding passed to ``nn.Conv2d`` (default ``'same'``).
        **params: optional settings read with ``params.get``:
            batch_norm: truthy to add ``nn.BatchNorm2d(out_channels)``.
            activation: key into ``_activations`` (built with ``inplace=True``).
            max_pool: truthy to add ``nn.MaxPool2d``.
            max_pool_size: pooling kernel size (default 2).
            max_pool_stride: pooling stride (defaults to the kernel size).
            w_init: callable applied to the convolution's weight.
            b_init: callable applied to the convolution's bias.
    """

    def __init__(self, shape, stride, padding='same', **params):
        super().__init__()

        h, w, in_channels, out_channels = shape
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=(h, w), stride=stride, padding=padding)
        stages = collections.OrderedDict(conv=conv)

        if params.get('batch_norm'):
            stages['bn'] = nn.BatchNorm2d(out_channels)

        act_name = params.get('activation')
        if act_name:
            stages[act_name] = _activations[act_name](inplace=True)

        if params.get('max_pool'):
            pool_size = params.get('max_pool_size', 2)
            pool_stride = params.get('max_pool_stride', pool_size)
            stages['max_pool'] = nn.MaxPool2d(kernel_size=pool_size, stride=pool_stride)

        self._layer = nn.Sequential(stages)

        # Optional custom initialisers: weight first, then bias.
        for option, tensor in (('w_init', conv.weight), ('b_init', conv.bias)):
            initializer = params.get(option, None)
            if initializer:
                initializer(tensor)

### Define the model

|    layer    | Input  |   Conv 1   |   Conv 2   |   Conv 3   |   Conv 4   |   Dense 5    |  Dense 6   |  Dense 7   |  Dense 8   |
| :---------: | ------ | :--------: | :--------: | :--------: | :--------: | :----------: | :--------: | :--------: | :--------: |
|  channels   | 1      |     16     |     32     |     64     |    128     |     512      |     256    |      64    |     10     |
| weight size | -      |   3 x 3    |   3 x 3    |   3 x 3    |   3 x 3    |  1152 x 512  |  512 x 256 |  256 x 64  |   64 x 10  |
|   pooling   | -      | 2 x 2 - s2 | 2 x 2 - s2 | 2 x 2 - s2 |     -      |      -       |      -     |      -     |      -     |
|   padding   | -      |    same    |    same    |    same    |    same    |      -       |      -     |      -     |      -     |
|   dropout   | -      |     -      |     -      |     -      |     -      |      -       |     0.5    |      -     |      -     |
| activation  | linear |    ReLU    |    ReLU    |    ReLU    |    ReLU    |     ReLU     |     ReLU   |     ReLU   |   Linear   |

In [None]:
import torch.nn as nn
import torch


class Network(nn.Module):
    """CNN classifier: four 3x3 convolution blocks followed by four dense blocks.

    Keyword Args:
        classes: width of the final dense layer (default 10).
        channels: number of input channels of the first convolution (default 1).
        dropout_rate: ``p`` of the dropout layer placed after the second dense
            block (default 0.9).
    """

    def __init__(self, **params):
        super().__init__()

        self.classes = params.get('classes', 10)
        self.channels = params.get('channels', 1)

        def conv3x3(c_in, c_out, pool):
            # Every convolution stage shares kernel, stride, padding and activation.
            return Conv2DBlock(
                shape=[3, 3, c_in, c_out], stride=1, padding='same', activation='relu', max_pool=pool
            )

        # Layers are created in forward order, so module indices inside the
        # Sequential (and therefore state_dict keys) follow this list.
        stages = [
            conv3x3(self.channels, 16, True),
            conv3x3(16, 32, True),
            conv3x3(32, 64, True),
            conv3x3(64, 128, False),
            nn.Flatten(),

            # The first dense layer takes a fixed 1152 flattened features.
            DenseBlock(shape=[1152, 512], activation='relu'),
            DenseBlock(shape=[512, 256], activation='relu'),
            nn.Dropout(p=params.get('dropout_rate', 0.9)),
            DenseBlock(shape=[256, 64], activation='relu'),
            # No activation on the last block.
            DenseBlock(shape=[64, self.classes]),
        ]
        self._layer = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the whole layer stack and return its output."""
        stack = self._layer
        return stack(x)

#### Dimension Test

In [None]:
# Smoke test: push one training image through a freshly built network and
# check that the output has one row with one score per class.
net = Network()
sample_input = torch.tensor(images[0, :].reshape(1, 1, 28, 28))
with torch.no_grad():  # shape check only, no autograd graph needed
    output = net(sample_input)
print(output.shape)
assert tuple(output.shape) == (1, net.classes), f'unexpected output shape: {tuple(output.shape)}'

### Model Wrapper

In [None]:

class Model(object):
    """Bundles a ``Network`` with its loss, SGD optimizer and optional LR schedule.

    Keyword Args:
        classes, channels, dropout_rate: forwarded to ``Network``
            (defaults 10, 1, 0.9).
        lr: initial learning rate (default 1e-4).
        lr_step: milestones for ``MultiStepLR``; only used when ``lr_decay`` is set.
        lr_decay: ``gamma`` for ``MultiStepLR``; a falsy value disables the scheduler.
        momentum: SGD momentum (default 0.9).
        weight_decay: SGD weight decay (default 0).
        grad_clip: gradient-norm threshold; a falsy value disables clipping.
        adjustable_grad_clip: when true, the threshold is ``grad_clip`` divided
            by the optimizer's current learning rate.
    """

    def __init__(self, **params):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.net = Network(
            classes=params.get('classes', 10),
            channels=params.get('channels', 1),
            dropout_rate=params.get('dropout_rate', 0.9)
        )
        self.net.to(self.device)

        self.lr = params.get('lr', 1e-4)
        self.lr_step = params.get('lr_step', None)
        self.lr_decay = params.get('lr_decay', None)
        self.lr_scheduler = None

        self.momentum = params.get('momentum', 0.9)
        self.weight_decay = params.get('weight_decay', 0)

        self.grad_clip = params.get('grad_clip', None)
        self.adjustable_grad_clip = params.get('adjustable_grad_clip', False)

        self.criterion = torch.nn.CrossEntropyLoss(reduction='mean')
        self.optimizer = torch.optim.SGD(
            self.net.parameters(),
            lr=self.lr,
            momentum=self.momentum,
            weight_decay=self.weight_decay
        )

        # The scheduler is stepped by the caller, not by optimize().
        if self.lr_decay:
            self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer=self.optimizer,
                milestones=self.lr_step,
                gamma=self.lr_decay
            )

    def optimize(self, x, y):
        """Run one optimisation step on the batch ``(x, y)``.

        Both tensors are moved to ``self.device``. The caller is responsible
        for putting ``self.net`` in training mode.

        Returns:
            The batch loss as a Python float.
        """
        p = self.net(x.to(self.device))
        loss = self.criterion(p, y.to(self.device))

        self.optimizer.zero_grad()
        loss.backward()

        if self.grad_clip:
            max_norm = self.grad_clip
            if self.adjustable_grad_clip:
                # Divide by the learning rate the optimizer is using right now.
                # self.lr only stores the initial value, so using it would
                # leave the threshold fixed while the scheduler decays the LR.
                max_norm = self.grad_clip / self.optimizer.param_groups[0]['lr']
            torch.nn.utils.clip_grad_norm_(self.net.parameters(), max_norm)

        self.optimizer.step()
        return loss.item()

    @torch.no_grad()
    def inference(self, x):
        """Return the network output for ``x`` (moved to ``self.device``) without gradients.

        The train/eval mode of ``self.net`` is left as the caller set it.
        """
        return self.net(x.to(self.device))

    def save(self, path):
        """Write the network's ``state_dict`` to ``path``."""
        torch.save(self.net.state_dict(), path)

    def load(self, path):
        """Load a ``state_dict`` from ``path`` and switch the network to eval mode."""
        # map_location lets a checkpoint saved on a GPU be restored when this
        # session runs on CPU (or on a different device).
        self.net.load_state_dict(torch.load(path, map_location=self.device))
        self.net.eval()

### Data Loader

In [None]:
from PIL import Image


class Dataset(torch.utils.data.Dataset):
    """Train or hold-out split of ``<path>/<name>/train.csv``.

    The CSV is read with its header row skipped; column 0 holds the label and
    the remaining columns hold the pixels, reshaped to 28 x 28 per sample.
    The first ``ratio`` fraction of rows (in file order) is the training
    split; the remaining rows are the hold-out split.

    Args:
        path: root directory that contains the ``name`` folder.
        name: dataset folder name.
        ratio: fraction of rows assigned to the training split.
        is_train: ``True`` selects the training split, ``False`` the rest.
        transform: optional callable applied to each image after it has been
            converted to a PIL image.
    """

    def __init__(self, path, name='digit-recognizer', ratio=0.9, is_train=False, transform=None):
        self.is_train = is_train
        self.name = name
        self.ratio = ratio

        self.images = []
        self.labels = []

        self.transform = transform
        self._load_data(path)

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx``; ``transform`` is applied to the image if set."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        _image = self.images[idx]
        _label = self.labels[idx]

        if self.transform:
            # Transforms receive a PIL image built from the array.
            _image = Image.fromarray(_image)
            _image = self.transform(_image)

        return _image, _label

    def _load_data(self, path):
        """Read the CSV and keep only the rows that belong to the selected split."""
        # ndmin=2 keeps a single-row file two-dimensional so the slicing below still works.
        data = np.loadtxt(
            os.path.join(path, f'{self.name}/train.csv'),
            delimiter=',', skiprows=1, dtype=np.float32, ndmin=2
        )
        num_train = int(data.shape[0] * self.ratio)

        # Training split: rows [0, num_train); hold-out split: rows [num_train, end).
        rows = slice(None, num_train) if self.is_train else slice(num_train, None)

        self.labels = data[rows, 0]
        self.images = data[rows, 1:].reshape(-1, 28, 28)

In [None]:

@torch.no_grad()
def validation(m, ds):
    """Compute the top-1 accuracy (in percent) of ``m`` over the batches in ``ds``.

    Args:
        m: object exposing ``net`` (switched to eval mode here), ``device``
            and ``inference(images)``.
        ds: iterable of ``(images, labels)`` batches.

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when ``ds`` yields no
        samples (instead of raising ``ZeroDivisionError``).
    """
    num_data = 0
    corrects = 0

    m.net.eval()
    for images, labels in ds:
        # Softmax does not change which class scores highest, so the argmax is
        # taken directly on the network output.
        predictions = m.inference(images).argmax(dim=1)
        labels = labels.to(device=m.device, dtype=torch.long)

        num_data += labels.size(0)
        corrects += (predictions == labels).sum().item()

    if num_data == 0:
        return 0.0
    return 100 * corrects / num_data

### Random Seed

In [None]:
import random

def set_random_seed(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        random_seed: integer seed applied to every generator.
    """
    # Python and NumPy generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch: default generator, current CUDA device, then every CUDA device.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(random_seed)

    # Turn off cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Fix all RNG seeds before the data loaders and the model are created in the cells below.
set_random_seed(12321)

### Hyper Parameter settings for training

In [None]:
# Hyper-parameters for the training run; the cells below read them by key.
config = dict(
    # Run / dataset identification.
    model_name="digit-recognizer",
    dataset="digit-recognizer",
    # Network shape.
    classes=10,
    channels=1,
    # Optimisation schedule.
    batch_size=256,
    epochs=50,
    momentum=0.9,
    lr=1e-2,
    lr_step=[50],
    lr_decay=0.5,
    # Regularisation.
    weight_decay=1e-8,
    dropout_rate=5e-1,
    # Gradient clipping (None disables it).
    grad_clip=None,
    adjustable_grad_clip=False,
)

### Data Loader

#### Data Augmentation
- Random Shift (Zero Padding + Random Crop)
- Random Rotation

In [None]:
import torchvision

print('Load Training Data')
# Augmentation for training: pad by 2 px, rotate by up to 2 degrees, then crop
# a random 28 x 28 window before converting to a tensor.
_train_transform = torchvision.transforms.Compose([
    torchvision.transforms.Pad(padding=2),
    torchvision.transforms.RandomRotation(degrees=2),
    torchvision.transforms.RandomCrop(size=28),
    torchvision.transforms.ToTensor(),
])
_train_set = Dataset(
    path=data_root,
    name='digit-recognizer',
    ratio=0.9,
    is_train=True,
    transform=_train_transform,
)
train_set = torch.utils.data.DataLoader(
    _train_set,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=2,
)

print('Load Test Data')
# The hold-out split is only converted to tensors, with no augmentation.
_test_transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
_test_set = Dataset(
    path=data_root,
    name='digit-recognizer',
    ratio=0.9,
    is_train=False,
    transform=_test_transform,
)
test_set = torch.utils.data.DataLoader(
    _test_set,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=1,
)


### Training

In [None]:
os.makedirs('/kaggle/model', exist_ok=True)

# Build the model wrapper from the subset of config entries it understands.
m = Model(**{
    key: config[key]
    for key in (
        'classes', 'channels', 'lr', 'lr_step', 'lr_decay', 'momentum',
        'weight_decay', 'dropout_rate', 'grad_clip', 'adjustable_grad_clip',
    )
})

epochs = config['epochs']
lr = config['lr']

# Per-epoch mean training loss and hold-out accuracy.
history = {'loss': [], 'accuracy': []}

for epoch in range(epochs):

    # One pass over the training loader, collecting the loss of every batch.
    m.net.train()
    _loss = []
    for images, labels in train_set:
        _loss.append(m.optimize(images, labels.type(torch.LongTensor)))

    if m.lr_scheduler:
        # Read the rate used during this epoch before advancing the schedule.
        lr = m.lr_scheduler.get_last_lr()[0]
        m.lr_scheduler.step()

    accuracy = validation(m, test_set)
    mean_loss = np.mean(_loss)

    print(f'Epoch: {epoch + 1:03d}/{epochs:03d} | loss={mean_loss:.8f} | lr={lr:.5f} | accuracy={accuracy:.2f}')

    history['loss'].append(mean_loss)
    history['accuracy'].append(accuracy)

    # One checkpoint per epoch, numbered from 001.
    m.save(f'/kaggle/model/model-{epoch + 1:03d}.pth')

### Visualize the training history
- Training Loss
- Validation Accuracy

In [None]:
# Training loss (left axis, blue) and hold-out accuracy (right axis, red) per epoch.
loss = history['loss']  # on training set
accuracy = history['accuracy']  # on test set
epochs = np.arange(len(loss))

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()  # second y-axis sharing the epoch axis

ax1.set_xlabel('Epoch')
ax1.set_ylabel('loss', color='blue')
ax2.set_ylabel('accuracy', color='red')

[plot1] = ax1.plot(epochs, loss, marker='.', c='blue', label='loss')
[plot2] = ax2.plot(epochs, accuracy, marker='.', c='red', label='accuracy')
plt.legend([plot1, plot2], ['loss', 'accuracy'], loc='upper right')

plt.grid()
plt.show()

### Early Stopping (Select the Best Checkpoint)

In [None]:
import glob


# Re-evaluate every saved checkpoint and keep the one with the highest accuracy.
model_history = glob.glob('/kaggle/model/*.pth')
# Sort by file name so the checkpoints are visited in epoch order.
model_history = sorted(model_history, key=os.path.basename)

best = {
    'epoch': 0,
    'accuracy': 0,
    'path': ''
}

# Count from 1 so the reported epoch matches the number in the checkpoint file
# name and in the training log (both are 1-based).
for i, model_path in enumerate(model_history, start=1):
    m.load(model_path)
    accuracy = validation(m, test_set)
    # '>=' means that on a tie the later checkpoint wins.
    if accuracy >= best['accuracy']:
        best['epoch'] = i
        best['accuracy'] = accuracy
        best['path'] = model_path
        print(f'Best accuracy at epoch={i} with {accuracy:.2f}%')

best_epoch = best['epoch']
best_accuracy = best['accuracy']
best_path = best['path']

print(f'Final model is epoch={best_epoch} with accuracy={best_accuracy:.2f}%')
print(f'Path={best_path}')

### Confusion Matrix with Best Model

In [None]:
from sklearn import metrics

@torch.no_grad()
def confusion_matrix(_m, ds):
    """Build the confusion matrix of ``_m`` over the batches in ``ds``.

    Args:
        _m: object exposing ``net`` (switched to eval mode here) and
            ``inference(images)``.
        ds: iterable of ``(images, labels)`` batches.

    Returns:
        The result of ``metrics.confusion_matrix(ground_truth, predictions)``.
    """
    _m.net.eval()

    predicted, actual = [], []
    for images, labels in ds:
        probabilities = torch.softmax(_m.inference(images), dim=1)
        predicted.extend(probabilities.argmax(dim=1).cpu().tolist())
        actual.extend(labels.type(torch.LongTensor).cpu().tolist())

    return metrics.confusion_matrix(actual, predicted)

In [None]:
import seaborn as sns


# Confusion matrix of the best checkpoint on the hold-out split, drawn as an annotated heatmap.
m.load(best_path)
cmat = confusion_matrix(m, test_set)

sns.reset_defaults()
ax = sns.heatmap(cmat, annot=True, fmt='d', cbar=False)

# Label both axes with the digits 0-9, unrotated.
ax.set_xticklabels(list(range(10)), rotation=0)
ax.set_yticklabels(list(range(10)), rotation=0)

ax.set_xlabel('prediction', fontsize=12)
ax.set_ylabel('label', fontsize=12)
plt.show()

### Error Analysis

In [None]:
@torch.no_grad()
def error_analysis(_m, ds):
    """Collect every sample in ``ds`` that ``_m`` classifies incorrectly.

    Args:
        _m: object exposing ``net`` (switched to eval mode here) and
            ``inference(images)``.
        ds: iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(images, predictions, ground_truth)`` of three Python lists
        holding, for each misclassified sample, the image (nested lists), the
        predicted class and the true class.
    """
    _pred = []
    _gt = []
    _images = []

    _m.net.eval()
    for images, labels in ds:
        # Bring the predictions back to the CPU so the failure mask lives on
        # the same device as the tensors it indexes. Only the model passed in
        # as `_m` is used, never a global model object.
        predictions = _m.inference(images).argmax(dim=1).cpu()
        labels = labels.type(torch.LongTensor)
        failures = predictions != labels

        _pred += predictions[failures].tolist()
        _gt += labels[failures].tolist()
        _images += images.cpu()[failures].tolist()

    return _images, _pred, _gt

In [None]:
# Show up to h * w misclassified hold-out samples with their true and predicted labels.
images, preds, gts = error_analysis(m, test_set)

num_failures = len(gts)
print(num_failures)

h, w = 3, 6

fig, ax = plt.subplots(h, w, figsize=(20, 10))

# Walk the grid in row-major order; panels beyond the last failure stay blank.
for idx, panel in enumerate(ax.flat):
    if idx < num_failures:
        img = np.array(images[idx]).reshape(28, 28)
        panel.set_title(f'GT: {gts[idx]} | Pred: {preds[idx]}')
        panel.imshow(img, cmap='gray')
    panel.axis('off')

plt.show()

### Make a submission with best model

In [None]:
# Read the competition test CSV (header row skipped) into a float32 matrix.
data = np.loadtxt(
    '/kaggle/input/digit-recognizer/test.csv',
    delimiter=',',
    skiprows=1,
    dtype=np.float32,
)

print(data.shape)

In [None]:
# Predict a class for every test row with the best checkpoint, in fixed-size batches.
batch_size = 2048
num_test_set = len(data)

m.load(best_path)
m.net.eval()

predictions = []
for start in range(0, num_test_set, batch_size):
    # Each flat row becomes a 1 x 28 x 28 image.
    batch = torch.tensor(data[start: start + batch_size].reshape(-1, 1, 28, 28))
    probabilities = torch.softmax(m.inference(batch), dim=1)
    predictions.extend(probabilities.argmax(dim=1).cpu().tolist())



In [None]:
# Pair each 1-based image id with its predicted label: one row per test image.
idxs = list(range(1, num_test_set + 1))
print(len(idxs), len(predictions))
submission = np.column_stack((idxs, predictions))


In [None]:
import pandas as pd

# Name the two columns and write the submission file without the index column.
df = pd.DataFrame({'ImageId': submission[:, 0], 'Label': submission[:, 1]})
df.to_csv('submission.csv', index=False)

In [None]:
# Show the submission table as the cell's output.
df