# Chinese Digit Recognizer, This time with PyTorch and GPU

This is my very first time using **PyTorch**, and I will try to do my best with this dataset :3

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# For the dataset
# main module, I will use it to create the tensors
import torch
# base class to create a dataset class
from torch.utils.data import Dataset
# create the datasets for train and test
from torch.utils.data import DataLoader

# for the model
# the trainable layers (dense layers, convolutional layers ...)
import torch.nn as nn # idk why dropout is in nn x_x
# not trainable layers (dropout, batchnormalization, pooling, ...)
import torch.nn.functional as F
# for the model optimizers
import torch.optim as optim
# 
from sklearn.metrics import confusion_matrix

# Reading the data
This dataset has `15000 rows, each holding one 64x64 image`. Every row of the .csv file stores the pixels of one image, and the last two columns are the labels (either one can be used as the target with a categorical encoder).

In [None]:
# Load the Chinese MNIST csv into a DataFrame and preview it
csv_path = '../input/chinese-mnist-digit-recognizer/chineseMNIST.csv'
ds = pd.read_csv(csv_path)
print(ds.shape)
ds.head()

In [None]:
# Split the DataFrame `ds` into the feature matrix and the label vector

# every column except the last two is treated as a feature
feature_cols = ds.columns[:-2]
features = ds[feature_cols].to_numpy().astype('float32')

# the character column (the last one) is used as the target,
# so these values are going to be strings
label_cols = ds.columns[-1:]
# selecting with a list of columns gives shape (15000, 1);
# squeeze drops the extra dimension
labels = ds[label_cols].to_numpy().squeeze()

features.shape, labels.shape

In [None]:
# display the raw label array and the feature matrix
labels, features

# Split the Data
It may seem a bit early to split the data, but in this case it's okay since `the data and target processing is included inside the CustomDataset class`. The data is also normalized inside that class.

In [None]:
# options shared by both splits so they stay reproducible
split_options = dict(random_state=314, shuffle=True)

# first hold out 20% of the samples as the test set (3000 images)
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, **split_options
)
# then take 1/12 of the remaining train set as the val set (1000 images)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=1/12, **split_options
)

# the targets and the data are still unprocessed at this point
y_test, x_test.shape

# Creating the Dataset Class
This is part of the data processing, but as far as I can see, creating a dataset class is important in PyTorch.

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset of single-channel 64x64 images and their labels.

    The pixel rows are divided by 255 and reshaped to (N, 1, 64, 64). The
    labels are mapped to integer ids and are also stored one-hot encoded.

    Args:
        data: array-like of shape (N, 64*64) with the pixel values
            (presumably in the 0-255 range, since they are divided by 255).
        labels: iterable with one hashable label (character) per image.
        labels_ids: optional dict {label: id}. Pass the ``id_labels`` of the
            training dataset when building the val/test datasets so that all
            of them share the same ids; otherwise the ids depend on the order
            in which the labels appear and the accuracy will be LOW.

    Attributes:
        data: float32 tensor with shape (N, 1, 64, 64).
        labels: float32 tensor with the one-hot encoded labels.
        labels_n: int64 tensor with the label ids (no encoding).
        distrib: dict {label: count} for the labels of this dataset.
        id_labels: dict {label: id} used to encode the labels.
        num_labels: number of classes, i.e. the size of ``id_labels``.
    """

    ######################### THE "BASIC" NEEDED FUNCTIONS

    def __init__(self, data, labels, labels_ids=None):
        # process the data and the labels
        self.process_data(data)
        self.process_labels(labels, labels_ids)

    def __len__(self):
        return self.labels_n.shape[0]

    def __getitem__(self, idx):
        # the image and its label id (not the one-hot version)
        return self.data[idx], self.labels_n[idx]

    ######################### THE PROCESSING FUNCTIONS

    def process_data(self, data):
        """Convert ``data`` to a normalized (N, 1, 64, 64) float32 tensor."""
        # force float32 so the data always matches the model weights
        self.data = torch.as_tensor(data, dtype=torch.float32)
        # normalize the data
        self.data = self.data / 255
        # keep the batch dim, add the channel dim and reshape the images
        self.data = self.data.view(self.data.shape[0], 1, 64, 64)

    def process_labels(self, labels, labels_ids):
        """Build ``distrib``, ``id_labels``, ``labels_n`` and ``labels``.

        Raises:
            KeyError: if a label has no id in ``labels_ids``.
        """
        # count all the different values in labels
        self.distrib, _ = self.__count_values(labels)

        if not labels_ids:
            # no ids were given: number the labels in order of appearance
            self.id_labels = {
                label: i for i, label in enumerate(self.distrib)
            }
        else:
            # reuse the given ids !important
            self.id_labels = labels_ids

        # the number of classes comes from the id mapping, so every dataset
        # that shares the mapping gets one-hot vectors of the same width,
        # even if one of the classes is missing from this dataset
        self.num_labels = len(self.id_labels)

        try:
            ids = [self.id_labels[label] for label in labels]
        except KeyError as err:
            raise KeyError(
                f'label {err.args[0]!r} has no id in labels_ids'
            ) from err

        # labels without encoding
        self.labels_n = torch.tensor(ids, dtype=torch.int64)
        # one hot encoding, float32 like the data
        self.labels = torch.zeros(
            (len(ids), self.num_labels), dtype=torch.float32
        )
        self.labels[torch.arange(len(ids)), self.labels_n] = 1.0

    ######################### EXTRA FUNCTIONS

    def __count_values(self, arr):
        """Return ({value: count}, number of distinct values) for ``arr``."""
        counts = {}
        for val in arr:
            counts[val] = counts.get(val, 0) + 1
        return counts, len(counts)

    def decode(self, labels):
        """Decode one-hot labels or scores of shape (N, classes) to ids.

        Returns a numpy array with the index of the max element of each row.
        """
        # detach and move to the cpu first: numpy() does not work on tensors
        # that live on the gpu
        return torch.argmax(labels, dim=1).detach().cpu().numpy()
        
        

#### DATASET CLASS ATTRIBUTES: (the ones I use)
# data: all the images with shape (batch, 1, 64, 64), pytorch tensor
# labels: all the labels with one hot encoding, pytorch tensor
# distrib: a dict with how many samples there are of each label {character: number}
# id_labels: a dict with an id for each label or character {character: id}
## this must be the same for every dataset, so it is passed as a param
## to the test and val datasets. The data was shuffled, so not passing the
## dict to the others would give different ids and LOW val and test accuracy.
# labels_n: the labels without one hot encoding (the ids from id_labels), int64 pytorch tensor
# num_labels: how many different labels or classes there are


# build the train dataset first, it defines the label ids
train_set = CustomDataset(data=x_train, labels=y_train)
shared_ids = train_set.id_labels
# the next ones reuse the ids of train_set
test_set = CustomDataset(data=x_test, labels=y_test, labels_ids=shared_ids)
val_set = CustomDataset(data=x_val, labels=y_val, labels_ids=shared_ids)


# see some data of the datasets:
# the lengths, the shape and dtypes of an image and its label,
# and an example of the labels
first_image, first_label = test_set[0]
(len(train_set), len(test_set), first_image.shape, first_image.dtype,
 first_label.dtype, test_set[1000][1])

In [None]:
# Lets plot some images using the function of my other notebook

# plot multiple images, preds is for the titles
# preds must be like [[real, pred]]
def plot_images(imgs, dims, figsize, title_size, preds=None):
    """Plot a grid of grayscale images.

    Args:
        imgs: sequence or array of images; each one is squeezed before
            being shown, so a leading channel dim of size 1 is fine.
        dims: (rows, cols) of the subplot grid.
        figsize: size of the whole figure, passed to ``plt.figure``.
        title_size: font size of each image title.
        preds: optional sequence of (real, pred) pairs, one per image. When
            given (and not empty) the titles show them instead of the
            image number.
    """
    # check the length instead of comparing with []: the comparison is
    # ambiguous when preds is a numpy array
    has_preds = preds is not None and len(preds) > 0

    plt.figure(figsize=figsize)
    for i, img in enumerate(imgs):
        plt.subplot(dims[0], dims[1], i + 1)
        plt.imshow(np.squeeze(img), cmap='gray')
        plt.axis('off')
        if has_preds:
            title = f'Real: {preds[i][0]}, Pred: {preds[i][1]}'
        else:
            title = f'Image {i+1}'
        plt.title(title, fontsize=title_size)
    plt.show()


# collect the first 10 images of the test set as numpy arrays
sample_images = [test_set[idx][0].numpy() for idx in range(10)]

# show them in a grid of 2 rows and 5 columns
plot_images(np.array(sample_images), dims=(2,5), figsize=(16,8), title_size=22)

In [None]:
# wrap each dataset in a DataLoader so the model receives batches
batch_size = 16

# the same options for the three loaders; no shuffling here because
# the datasets are already shuffled
loader_options = dict(batch_size=batch_size, shuffle=False)

train_loader = DataLoader(dataset=train_set, **loader_options)
test_loader = DataLoader(dataset=test_set, **loader_options)
val_loader = DataLoader(dataset=val_set, **loader_options)

# Create the Model and set the device
With **PyTorch** we can easily place the model and the tensors on a specific device (cpu, gpu, tpu).

In [None]:
# number of label classes reported by the test dataset
num_labels = test_set.num_labels

class Network(nn.Module):
    """Small CNN classifier for batches of 1x64x64 images.

    One 5x5 convolution with max pooling, followed by four dense layers
    with dropout between them. The output is the raw scores (logits),
    one per class.

    Args:
        num_classes: number of outputs of the last dense layer. Defaults
            to 15, the value that used to be hard-coded.
    """

    # in the init are going to be defined the layers
    def __init__(self, num_classes=15):
        super().__init__()
        # define the layers, here the order is not important
        # CONVOLUTIONAL LAYERS
        # here there's no input shape, only the number of input channels
        # and the number of output channels
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5)
        self.pool1 = nn.MaxPool2d(2)
        # DENSE LAYERS
        # the first param is like the input shape, the second the number
        # of outputs, like the number of neurons.
        # 64x64 input -> 60x60 after the 5x5 conv -> 30x30 after the pooling
        self.dense1 = nn.Linear(32*30*30, 256)
        self.dense2 = nn.Linear(256, 256)
        self.dense3 = nn.Linear(256, 256)
        self.dense4 = nn.Linear(256, num_classes)
        # DROPOUT LAYER
        self.dropout = nn.Dropout(0.3)

    # here we define the forward propagation of the net
    def forward(self, x):
        """Return the class scores for a batch ``x`` of shape (N, 1, 64, 64)."""
        # convolutional process
        x = F.relu(self.pool1(self.conv1(x)))
        # apply the flatten process, conserving the batch dim
        x = torch.flatten(x, 1)
        # dense process with dropout layers
        x = self.dropout(F.relu(self.dense1(x)))
        x = self.dropout(F.relu(self.dense2(x)))
        x = self.dropout(F.relu(self.dense3(x)))
        # no activation on the output: these are the raw scores
        x = self.dense4(x)
        return x
        
        
model = Network()

# a prediction with random data: playing with the forward function and
# running this again shows how x.shape changes through the net
dummy_batch = torch.randn((32,1,64,64))
model(dummy_batch)

# SET THE DEVICE
# use the gpu when cuda is available, otherwise stay on the cpu
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
# move the model to the selected device
model = model.to(device)
device

# Define the loss function and the optimizer, and train the model

In [None]:
# learning rate used by the optimizer
lr = 0.001

# create an instance of the loss function
criterion = nn.CrossEntropyLoss()
# the optimizer gets the weights of the model and the learning rate
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# define a metric
def check_accuracy(loader, model):
    """Print and return the accuracy of ``model`` over ``loader``.

    Args:
        loader: iterable of (x, y) batches, where y holds the class ids.
        model: the network; its scores must have shape (batch, classes).

    Returns:
        The accuracy as a float between 0 and 1 (0.0 for an empty loader).

    The model is switched to evaluation mode while measuring, and is put
    back in the mode it had before the call.
    """
    # run on the device where the model lives (cpu if it has no weights)
    first_param = next(model.parameters(), None)
    if first_param is None:
        run_device = torch.device('cpu')
    else:
        run_device = first_param.device

    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()  # changes the behavior of layers like Dropout, BatchNorm

    try:
        # no gradients are needed here: less memory, faster computations
        with torch.no_grad():
            for x, y in loader:
                # move the data and targets to the device
                x = x.to(run_device)
                y = y.to(run_device)
                # obtain the scores
                scores = model(x)
                # the prediction is the index of the max of the second dim
                _, preds = scores.max(1)
                # select the correct preds and sum them
                num_correct += (preds == y).sum()
                # count the num of samples
                num_samples += preds.shape[0]
    finally:
        # restore the previous mode (train or eval) of the model
        model.train(was_training)

    # num_correct is a tensor after the first batch, make it a plain int
    num_correct = int(num_correct)
    # an empty loader gives 0.0 instead of a division by zero
    acc = num_correct / num_samples if num_samples else 0.0
    print(f'Got {num_correct} / {num_samples} with accuracy {acc*100}%')

    return acc

In [None]:
# train variables
epochs = 24
hist = {
    'accuracy': [],
    'val accuracy': []
}

# early stopping variables
patience = 8  # epochs without improvement allowed before stopping
wait = 0      # epochs since the last improvement
max_acc = 0   # best train accuracy seen so far

# train the network
for epoch in range(epochs):
    # data and targets are the batch for each train step
    for data, targets in train_loader:
        # move the data and targets to the model's device
        data = data.to(device)
        targets = targets.to(device)

        # forward propagation, predict and measure the error
        scores = model(data)
        loss = criterion(scores, targets)

        # backward propagation, use the error to fit the weights.
        # Clean the gradient first: it is needed because otherwise the
        # gradients of every step keep adding up and the training fails
        optimizer.zero_grad()
        # back propagation of the gradient
        loss.backward()

        # gradient descent or adam step
        optimizer.step()

    print(f'==> Epoch{epoch}')
    # check the accuracy from train and val in each epoch
    acc = check_accuracy(train_loader, model)
    val_acc = check_accuracy(val_loader, model)

    # register the accuracy values
    hist['accuracy'].append(acc)
    hist['val accuracy'].append(val_acc)

    # a simple manual early stopping: stop after `patience` epochs in a row
    # without a new best train accuracy.
    # NOTE(review): early stopping usually watches val_acc, confirm which
    # metric is wanted here
    if acc > max_acc:
        # remember the new best value (this assignment was inverted before,
        # so max_acc never changed and the stop never triggered)
        max_acc = acc
        wait = 0
    else:
        wait += 1
    if wait >= patience:
        break

In [None]:
# plot how the accuracy evolved, one point per trained epoch
x = np.arange(len(hist['accuracy']))
plt.figure(figsize=(10,6))
# one line for the train accuracy and one for the val accuracy
for hist_key, line_label in (('accuracy', 'train'), ('val accuracy', 'val')):
    plt.plot(x, hist[hist_key], label=line_label)
plt.title('Accuracy')
plt.grid(True)
plt.legend()
plt.xlabel('Epoch')
plt.show()

# Evaluate the model and Confusion Matrix

In [None]:
# evaluate the model with the test set; check_accuracy prints the result,
# so the returned value is discarded
_ = check_accuracy(test_loader, model)

In [None]:
# flatten one level of nesting of a 2 dims list
def flatten2d(x):
    """Return a new list with the items of every row of ``x``, in order."""
    flat = []
    for row in x:
        flat.extend(row)
    return flat
        

# define a function to get y_real, y_pred
def pred_and_real(loader, model):
    """Collect the real and the predicted class ids over ``loader``.

    Args:
        loader: iterable of (x, y) batches, where y holds the class ids.
        model: the network; its scores must have shape (batch, classes).

    Returns:
        (y_real, y_pred): two flat lists of ints with the same length.

    The model is left in evaluation mode after the call.
    """
    # run on the device where the model lives (cpu if it has no weights)
    first_param = next(model.parameters(), None)
    if first_param is None:
        run_device = torch.device('cpu')
    else:
        run_device = first_param.device

    y_real = []
    y_pred = []
    model.eval()  # changes the behavior of layers like Dropout, BatchNorm

    # no gradients are needed here: less memory, faster computations
    with torch.no_grad():
        for x, y in loader:
            # obtain the scores of the batch
            scores = model(x.to(run_device))
            # the prediction is the index of the max score of each row;
            # decoding here avoids depending on a dataset object
            preds = torch.argmax(scores, dim=1)

            # move the results to the cpu and extend the flat lists
            y_real.extend(y.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    return y_real, y_pred
            
# use the functions: real and predicted class ids of the whole test set
answers, predicts = pred_and_real(test_loader, model)
# sanity check, there must be one prediction per answer
len(answers) == len(predicts)

In [None]:
# the class ids, used to index the matrix and as labels for the plot
# (the characters throw warnings when used as tick labels)
labels = list(test_set.id_labels.values())
# define the matrix with the real classes (rows) and the predicted ones
# (columns). Passing the ids keeps one row and one column per class even
# if a class never appears, so the matrix always matches the tick labels
m = confusion_matrix(answers, predicts, labels=labels)
plt.figure(figsize=(20, 8))
# create the plot
heatmap = sns.heatmap(m, xticklabels=labels, yticklabels=labels, annot=True, fmt='d', color='blue')
# labels for the axes
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix')
plt.show()
# print the ids and the labels
print('Labels and ids')
print(test_set.id_labels.keys())
print(test_set.id_labels.values())

# See some examples

In [None]:
# select n images from the test_set
num_samples = 15
sample_images = []
sample_labels = []
sample_preds = []

# only predictions are made here: turn off dropout explicitly instead of
# relying on the mode left by a previous cell
model.eval()

# take only the first batch (it has batch_size examples)
first_batch = next(iter(test_loader), None)
if first_batch is not None:
    x, y = first_batch
    # no gradients are needed to predict
    with torch.no_grad():
        # move the images to the model's device
        batch_images = x[:num_samples].to(device)
        # make the predictions, move them to the cpu and decode them
        sample_preds = test_set.decode(model(batch_images).to('cpu'))
    # get the labels with the same format of sample_preds
    sample_labels = y[:num_samples].numpy()
    # move the images to cpu and convert to numpy
    sample_images = batch_images.to('cpu').numpy()

In [None]:
# pair every real label with its prediction: (real, pred), the format
# that plot_images expects for the titles
preds = list(zip(sample_labels, sample_preds))

# plot the images with the labels and the predictions
plot_images(sample_images, (3,5), figsize=(16,10), title_size=18, preds=preds)

# Save the model
[Here](https://pytorch.org/tutorials/beginner/saving_loading_models.html) there is more information about saving and loading **PyTorch** models.

In [None]:
# save only the weights of the model (its state dict), not the class
model_weights = model.state_dict()
torch.save(model_weights, './model.pth')

In [None]:
# load the model, we need to have the model class defined
m = Network() # define a new model
# read the weights mapping them to the current device, so weights saved
# from a gpu can also be loaded on a machine that only has cpu
state = torch.load('./model.pth', map_location=device)
m.load_state_dict(state) # load the weights
m.eval() # set the model on 'evaluation mode', this just changes
# the behavior of some layers like batchnorm and dropout

# evaluate the loaded model
m = m.to(device) # first move the model to the device
_ = check_accuracy(test_loader, m) # and evaluate the model