In [None]:
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

In [None]:
SEED = 42 # "Answer to the Ultimate Question of Life, the Universe, and Everything"
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU plus every visible CUDA device), and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU rather than only the current device;
    # it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may pick different kernels from run to run, so pin it off.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Notebook-wide settings. NOTE(review): in the cells shown here the data
# loaders hard-code their own batch size instead of reading `batch_size`.
img_size, batch_size = 64, 64

In [None]:
# Load the per-image metadata table that ships with the dataset and preview it.
chin_mnist_df = pd.read_csv(
    '../input/chinese-mnist/chinese_mnist.csv'
)
chin_mnist_df.head()  # head() shows the first 5 rows by default

In [None]:
# Number of distinct class values, followed by the values themselves.
value_column = chin_mnist_df['value']
for summary in (value_column.nunique(), value_column.unique()):
    print(summary)

In [None]:
# Image files are named after three csv columns: f"input_{suite_id}_{sample_id}_{code}.jpg"
item = chin_mnist_df.iloc[0]  # first metadata row

img_path = f"../input/chinese-mnist/data/data/input_{item['suite_id']}_{item['sample_id']}_{item['code']}.jpg"
img = Image.open(img_path)
print(type(img))

# Convert the PIL image into a NumPy array before plotting it.
img = np.array(img)
print(type(img))
plt.imshow(img, cmap='gray')

In [None]:
# Display the 'character' column (an element-wise identity copy of it).
chin_mnist_df.loc[:, 'character'].copy()

In [None]:
# Count the samples of every class and order the counts by class value, so each
# bar is labelled with the class it actually belongs to. `value_counts()` alone
# is ordered by frequency, which does not line up with the sorted labels.
# The builtin `str` replaces the `np.str` alias, which current NumPy no longer provides.
class_counts = chin_mnist_df['value'].value_counts().sort_index()
plt.bar(class_counts.index.astype(str), class_counts.values)

Each class has 1000 samples, so every class is split into 700 training, 200 validation and 100 test samples.

In [None]:
# get sample for each class
# sample is already without replacement
#chin_mnist_df.groupby('value').apply(lambda x: x.sample(1))

### Note: `sample` does not keep track of previously sampled indices — separate calls draw independently, so splits must explicitly exclude rows that were already drawn

In [None]:
#testing_df = chin_mnist_df.groupby('value').apply(lambda x: x.sample(3)).reset_index(drop=True)

#testing_df_1 = testing_df.groupby('value').apply(lambda x: x.sample(1))
#testing_df_2 = testing_df.groupby('value').apply(lambda x: x.sample(2))

#testing_df

In [None]:
#testing_df_1

In [None]:
#testing_df_2

# Training, validation and testing sets and loaders

In [None]:
# Distinct class values in ascending order.
distinct_values = chin_mnist_df['value'].unique()
labels = np.sort(distinct_values)
labels

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that reads Chinese MNIST images from disk on demand.

    The base class is referenced by its fully qualified name so that
    re-running this cell does not make the class inherit from an earlier
    definition of itself.

    Args:
        x: DataFrame whose first three columns are the values used to build
            the image file name (`input_{col0}_{col1}_{col2}.jpg`).
        y: Series with the class value of every row of `x`.
        img_dir: directory that contains the image files.

    Attributes:
        n_classes: number of distinct values in `y`.
        labels: sorted array of the distinct values in `y`; the position of a
            value in this array is the target index returned by `__getitem__`.
    """

    def __init__(self, x, y, img_dir="../input/chinese-mnist/data/data"):
        self.x = x
        self.y = y
        self.img_dir = img_dir
        self.n_classes = y.nunique()
        self.labels    = np.sort(y.unique())

    def __len__(self):
        """Return the number of samples (rows of `x`)."""
        return len(self.x)

    def __getitem__(self, index):
        """Return `(image, label_index)` for the sample at position `index`.

        `image` is the decoded picture as a NumPy array and `label_index` is
        the position of the sample's class value inside `self.labels`.
        """
        # Read the three file-name fields one by one so each keeps its own dtype.
        suite_id, sample_id, code = (self.x.iloc[index, col] for col in range(3))

        # The context manager closes the file handle as soon as the pixels are copied.
        with Image.open(f"{self.img_dir}/input_{suite_id}_{sample_id}_{code}.jpg") as img_file:
            img = np.array(img_file)

        label = self.y.iloc[index]
        label_index = np.where(self.labels == label)[0][0]

        return img, label_index

In [None]:
# Train (70%), valid (20%) and test (hold-out) (10%) splits
#
# Every split takes the same number of samples from each class. The splits are
# drawn one after another from the rows that are still unused: `sample` has no
# memory of earlier draws, so sampling each split from the full frame would let
# validation and test rows overlap the training rows.

def _sample_per_class(df, n, seed):
    """Draw `n` rows without replacement from every 'value' class of `df`.

    The result keeps the row index of `df`, so the drawn rows can be removed
    from `df` with `df.drop(result.index)`.
    """
    return pd.concat(
        [group.sample(n, random_state=seed) for _, group in df.groupby('value')]
    )


train_part = _sample_per_class(chin_mnist_df, 700, SEED)
unused_after_train = chin_mnist_df.drop(train_part.index)

valid_part = _sample_per_class(unused_after_train, 200, SEED)
unused_after_valid = unused_after_train.drop(valid_part.index)

test_part = _sample_per_class(unused_after_valid, 100, SEED)

# Guard against leakage between the splits.
assert train_part.index.intersection(valid_part.index).empty
assert train_part.index.intersection(test_part.index).empty
assert valid_part.index.intersection(test_part.index).empty

# Features are every column except the last two; the target is the second-to-last column.
train_df = train_part.reset_index(drop=True)
x_train, y_train  = train_df.iloc[:, :-2], train_df.iloc[:, -2]

valid_df = valid_part.reset_index(drop=True)
x_valid, y_valid  = valid_df.iloc[:, :-2], valid_df.iloc[:, -2]

test_df  = test_part.reset_index(drop=True)
x_test, y_test    = test_df.iloc[:, :-2], test_df.iloc[:, -2]

In [None]:
# Wrap each split in the image-loading dataset ...
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)
test_ds = Dataset(x_test, y_test)

# ... and give every dataset a shuffled loader that yields 16 samples per batch.
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_ds, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=True)

In [None]:
# Pull a single batch from the test loader and inspect its first sample.
batch_iterator = iter(test_dataloader)
images, labels = next(batch_iterator)

plt.imshow(images[0], cmap='gray')
print(images.shape)
print(labels[0])

# Model

In [None]:
class ConvNet(nn.Module):
    """Convolutional classifier for single-channel 64x64 images.

    The convolutional stack reduces a 64x64 input to a 512x2x2 feature map
    (2048 values once flattened), which one linear layer maps to the class
    scores. The model returns per-sample log-probabilities.

    Args:
        n_classes: number of output classes (15 by default).
    """

    def __init__(self, n_classes=15):
        super(ConvNet, self).__init__()

        convs = [
            nn.Conv2d(1, 32, kernel_size=3, stride=1),
            nn.Conv2d(32, 64, kernel_size=3, stride=1),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=2, stride=1, dilation=2),
            nn.Conv2d(128, 256, kernel_size=2, stride=2, dilation=2),
            nn.MaxPool2d(2),
            nn.ReLU(),
            nn.Conv2d(256, 512, kernel_size=2, stride=1, dilation=2),
            nn.MaxPool2d(2),
            nn.GELU()
        ]

        self.conv = nn.Sequential(*convs)

        # 2048 = 512 channels * 2 * 2 spatial positions for a 64x64 input.
        self.linear = nn.Linear(2048, n_classes)

        # Normalise over the class dimension (dim=1). With dim=0 the softmax would
        # run across the samples of a batch, coupling every prediction to the
        # other samples that happen to share its batch.
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities of shape (batch, n_classes).

        Args:
            x: float tensor of shape (batch, height, width); the channel
                dimension is added here.
        """
        x = x.unsqueeze(1) # single channel image

        hidden = self.conv(x)
        hidden = torch.flatten(hidden, start_dim=1)

        return self.log_softmax(self.linear(hidden))

In [None]:
# Build the network directly on the selected device and show its layout.
model = ConvNet().to(device)
print(model)
print(device)

# Adam with a learning rate of 1e-3, a plateau-based learning-rate scheduler
# with default settings, and cross-entropy as the training criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
criterion = nn.CrossEntropyLoss()

In [None]:
epochs = 10
log_every = 10  # number of batches averaged into each running-loss report
train_size = len(train_ds)
valid_size = len(valid_ds)

valid_conf_matrixes = []


def _run_epoch(loader, phase, loss_label, epoch, training):
    """Run one full pass over `loader` with the notebook's model.

    Args:
        loader: DataLoader yielding `(images, target_indices)` batches.
        phase: prefix used in the progress messages ('Training' / 'Validation').
        loss_label: description of the reported loss used in the messages.
        epoch: zero-based epoch number (printed one-based).
        training: when True the model is put in train mode and updated with
            the optimizer; when False it runs in eval mode without gradients.

    Returns:
        `(targets, predictions, mean_loss)` where the first two are 1-D NumPy
        arrays of class indices and `mean_loss` is the average batch loss.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.train(training)  # the mode only needs to be set once per pass

    targets, predictions = [], []
    window_loss, window_steps = 0.0, 0
    epoch_loss, n_batches = 0.0, 0

    # Gradients are only tracked while training; evaluation skips the graph.
    with torch.set_grad_enabled(training):
        for step, (batch_inputs, batch_labels) in enumerate(loader, start=1):
            batch_inputs = batch_inputs.to(device).type(torch.float)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)  # model scores vs. target class indices

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # .item() yields a plain float, so the running sums do not keep the
            # autograd graph of every batch alive.
            batch_loss = loss.item()
            window_loss += batch_loss
            window_steps += 1
            epoch_loss += batch_loss
            n_batches += 1

            targets.append(batch_labels.detach().cpu())
            # One argmax per batch instead of concatenating item by item.
            predictions.append(outputs.argmax(dim=1).detach().cpu())

            # Report the mean over exactly the batches seen since the last report.
            if window_steps == log_every:
                print(f'{phase} Epoch: {epoch+1}, step: {step}, {loss_label}: {window_loss/window_steps}')
                window_loss, window_steps = 0.0, 0

    if n_batches == 0:
        raise ValueError(f'{phase} loader produced no batches')

    return torch.cat(targets).numpy(), torch.cat(predictions).numpy(), epoch_loss / n_batches


for epoch in range(epochs):
    # ---- training pass ----
    labels, preds, train_loss = _run_epoch(
        train_dataloader, 'Training', 'moving average of training loss', epoch, training=True)

    print('Calculating conf_matrix')
    conf_mat = confusion_matrix(labels, preds)

    total = np.sum(conf_mat)
    correct_count = np.trace(conf_mat)  # the diagonal holds the correct predictions

    print(f'Training Epoch {epoch+1}:\n Accuracy: {correct_count/total}\n{conf_mat}\n')
    print()

    # ---- validation pass ----
    labels, preds, valid_loss = _run_epoch(
        valid_dataloader, 'Validation', 'running average validation loss', epoch, training=False)

    print('Calculating conf_matrix')
    conf_mat = confusion_matrix(labels, preds)

    total = np.sum(conf_mat)
    correct_count = np.trace(conf_mat)

    valid_conf_matrixes.append(conf_mat)

    print(f'Validation Epoch {epoch+1}:\n Accuracy: {correct_count/total}\n{conf_mat}')
    print()

    # ReduceLROnPlateau only acts when it is stepped with the monitored metric.
    scheduler.step(valid_loss)

In [None]:
test_size = len(test_ds)
log_every = 10  # number of batches averaged into each running-loss report

model.eval()  # evaluation mode only needs to be set once

batch_targets, batch_preds = [], []
test_running_loss, window_steps = 0.0, 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for step, (batch_inputs, batch_labels) in enumerate(test_dataloader, start=1):
        batch_inputs = batch_inputs.to(device).type(torch.float)
        batch_labels = batch_labels.to(device)

        outputs = model(batch_inputs)

        loss = criterion(outputs, batch_labels)  # model scores vs. target class indices

        test_running_loss += loss.item()
        window_steps += 1

        batch_targets.append(batch_labels.cpu().numpy())
        # A single argmax per batch; no inner loop, so the batch counter used
        # for the progress report is never overwritten.
        batch_preds.append(outputs.argmax(dim=1).cpu().numpy())

        if window_steps == log_every:
            print(f'Testing Step: {step}, mean testing loss: {test_running_loss / window_steps}')
            test_running_loss, window_steps = 0.0, 0

labels = np.concatenate(batch_targets) if batch_targets else np.array([])
preds  = np.concatenate(batch_preds) if batch_preds else np.array([])

conf_mat = confusion_matrix(labels, preds)

total_preds = np.sum(conf_mat)
correct_preds = np.trace(conf_mat)  # the diagonal holds the correct predictions

print(f'Test Accuracy: {correct_preds/total_preds}\n{conf_mat}')
print()