# Chinese Digit Recognizer with a Keras CNN

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix
import seaborn as sns

# location of the CSV file with the dataset (read with pd.read_csv in the next cell)
path = '../input/chinese-mnist-digit-recognizer/chineseMNIST.csv'

In [None]:
# Read the dataset from `path` into a DataFrame; the trailing expression
# displays its (rows, columns) shape as the cell output
ds = pd.read_csv(path)
ds.shape

In [None]:
# list the column names of the DataFrame (the next cell relies on 'label' and 'character')
ds.columns

In [None]:
# names of the two annotation columns: the label and the character
labels_cols = ['label', 'character']
# image part: everything that is left once the annotation columns are dropped
data = ds.drop(columns=labels_cols).values
# annotation part: one array per annotation column, in the order listed above
labels, characters = (ds[name].values for name in labels_cols)

# show the shapes of the three arrays
data.shape, labels.shape, characters.shape

# Processing and Useful functions

In [None]:
# to process the data and convert it to 64x64 single-channel images
# receives the flattened pixel data, one image per row
def process_data(x):
    """Reshape flattened pixel rows into float32 images scaled by 1/255.

    Parameters
    ----------
    x : array-like
        One image per entry along the first axis; every entry must hold
        exactly 64*64 values.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(x), 64, 64, 1)`` and dtype ``float32`` whose
        values are the input values divided by 255. Empty input yields an
        array of shape ``(0, 64, 64, 1)``.

    Raises
    ------
    ValueError
        If the entries cannot be reshaped to ``(64, 64, 1)`` each.
    """
    pixels = np.asarray(x)
    # a single vectorized reshape replaces the per-row Python loop; keeping
    # len(pixels) as the first dimension still rejects rows of the wrong size
    images = pixels.reshape(len(pixels), 64, 64, 1)
    # return the images in an appropriate format for the network
    return images.astype('float32') / 255

# receives the character of every sample and the width of the one-hot rows
def process_target(chars, num_classes):
    """One-hot encode ``chars`` and return the character -> class index map.

    Class indices are assigned in order of first appearance in ``chars``;
    only the first ``num_classes`` distinct characters receive an index.

    Parameters
    ----------
    chars : iterable of hashable
        The character (class name) of every sample.
    num_classes : int
        Number of columns of the one-hot target.

    Returns
    -------
    tuple
        ``(target, class_names)`` where ``target`` is a float32 array of shape
        ``(len(chars), num_classes)`` with a single 1 per row, and
        ``class_names`` is a dict mapping each character to its class index.

    Raises
    ------
    KeyError
        If ``chars`` contains more than ``num_classes`` distinct values.
    """
    ###### add the labels for the dict
    # dict.fromkeys drops duplicates while keeping first-appearance order;
    # zip stops after num_classes entries
    class_names = dict(zip(dict.fromkeys(chars), range(num_classes)))
    ###### create the labels data, the numbers
    # column of the 1 in every row
    positions = np.asarray([class_names[char] for char in chars], dtype=np.intp)
    # create the target [0,0,0...,1,...] for all rows at once
    target = np.zeros((len(positions), num_classes), dtype='float32')
    target[np.arange(len(positions)), positions] = 1
    return target, class_names


def count_values(arr):
    """Tally how many times each value occurs in ``arr``.

    Returns a dict mapping every distinct value to its number of
    occurrences; keys are in order of first appearance.
    """
    tally = {}
    for item in arr:
        # unseen values start from 0
        tally[item] = tally.get(item, 0) + 1
    return tally

# plot multiple images in a grid, preds is optional and used for the titles
# preds must be like [[real, pred]], one pair per image
def plot_images(imgs, dims, figsize, title_size, preds=None):
    """Show ``imgs`` in a grid of subplots, one image per subplot.

    Parameters
    ----------
    imgs : iterable of array-like
        The images; each one is squeezed before being drawn in gray scale.
    dims : tuple
        ``(rows, cols)`` of the subplot grid.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    title_size : int
        Font size of the subplot titles.
    preds : sequence of (real, pred) pairs, optional
        When given and not empty, every title shows the real and predicted
        value of the image; otherwise the titles are ``Image 1``, ``Image 2``...
    """
    # None replaces a shared mutable [] default; checking the length instead
    # of comparing with [] also works for numpy arrays and empty tuples
    has_preds = preds is not None and len(preds) > 0
    plt.figure(figsize=figsize)
    for i, img in enumerate(imgs):
        plt.subplot(dims[0], dims[1], i+1)
        plt.imshow(np.squeeze(img), cmap='gray')
        plt.axis('off')
        if has_preds:
            title = f'Real: {preds[i][0]}, Pred: {preds[i][1]}'
        else:
            title = f'Image {i+1}'
        plt.title(title, fontsize=title_size)
    plt.show()
    
# quick visual check: take every 1001st row below index 8008 (rows 0, 1001,
# ..., 7007), i.e. 8 images, and show them in a 2x4 grid
# (presumably the stride is chosen so the samples come from different classes - TODO confirm)
sample_data = process_data(data[:8008:1001])
plot_images(sample_data, dims=(2,4), figsize=(16,8), title_size=22)

# Applying the Processing Functions

In [None]:
# get the images from the df as arrays
X = process_data(data)

# and obtain the one-hot target data from the characters;
# class_names maps every character to its class index
Y, class_names = process_target(characters, num_classes=15)

# display the shapes and dtypes of the processed arrays
X.shape, X.dtype, Y.shape, Y.dtype

# Split the Data into *train* and *test*

In [None]:
# split the dataset into train (80%) and test (20%), shuffled with a fixed seed.
# the validation set is taken from the training part later (validation_split in model.fit)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=314, shuffle=True)
x_train.shape, x_test.shape

In [None]:
y_train[0] # one training target: the one-hot row the model has to learn to predict

# The Model, a Convolutional Neural Network

In [None]:
# to reset the keras session
#keras.backend.clear_session()

input_shape = (64,64,1) # the dimension of the data
num_classes = 15 # the number of classes

model = Sequential([
    # define the input shape with a layer
    layers.InputLayer(input_shape=input_shape),

    # convolutional part: 32 filters of 5x5 with relu, then 2x2 max pooling
    layers.Conv2D(filters=32, kernel_size=5, activation='relu'),
    layers.MaxPooling2D(pool_size=2),

    # flatten the pooled feature maps into one vector for the dense layers
    # (presumably 30x30x32 with Keras' default 'valid' padding;
    # model.summary() below prints the actual shapes)
    layers.Flatten(),
    
    # dense part, with neurons
    layers.Dense(256, activation='relu'),
    layers.Dropout(rate=.3), # turn off random neurons in each step
    layers.Dense(256, activation='relu'),
    layers.Dropout(rate=.3), # it helps to prevent overfitting
    # output layer: one softmax neuron per class
    layers.Dense(num_classes, activation='softmax')
])

# the last layer has multiple (15) neurons since the result we
# want looks like the one-hot target shown in the cell above:
# each neuron provides one of these numbers

# categorical cross-entropy matches the one-hot encoded targets
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics = ['accuracy'],
)

# print the layers with their output shapes and parameter counts
model.summary()

In [None]:
# the model might stop improving after a few epochs, so an early
# stopping callback can end the training before all the epochs are run
# NOTE(review): no `monitor` is given, so the callback's default
# quantity is used (val_loss according to the Keras docs) - confirm

early_stopping = EarlyStopping(
    min_delta=.005, # minimum change that counts as an improvement
    patience=10, # epochs without improvement until the stop
    restore_best_weights=True # keep the weights of the best epoch
)

# train on the training split; `hist` keeps the per-epoch metrics
hist = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=.1,# hold out 10% of the training data, as there's no separate val data
    callbacks=[early_stopping]
)

In [None]:
plt.figure(figsize=(15, 10))

# one panel per metric, left to right:
# (train history key, validation history key, panel title)
for panel, (train_key, val_key, heading) in enumerate(
    [
        ('loss', 'val_loss', 'Loss Function'),
        ('accuracy', 'val_accuracy', 'Accuracy'),
    ],
    start=1,
):
    plt.subplot(1, 2, panel)
    # train and validation curves of the same metric share the panel
    plt.plot(hist.history[train_key], label='train')
    plt.plot(hist.history[val_key], label='validation')
    plt.title(heading)
    plt.grid(True)
    plt.legend()

plt.show()

# Evaluate the Model with *test* set

In [None]:
# evaluate on the test split, which was not used for training;
# the printed message labels the result as test loss and test accuracy
results = model.evaluate(x_test, y_test, batch_size=64)
print("test loss, test acc:", results)

# Confusion Matrix

In [None]:
def max_index(arr):
    """Return the position of the largest value in ``arr``.

    When the maximum occurs more than once, the first position is returned.
    An empty ``arr`` returns 0.

    The maximum is taken over the actual values, so input whose values are
    all negative is handled correctly (a running maximum that starts at 0
    would always answer 0 for such input).
    """
    values = list(arr)
    if not values:
        return 0
    # max() keeps the first of several equal candidates
    return max(range(len(values)), key=values.__getitem__)

# prepare the data for the matrix
# the model outputs one row of class scores per test image
preds = model.predict(x_test)
# the predicted class number is the position of the largest score
y_pred = [max_index(scores) for scores in preds]

# obtain the y_real the same way: the position of the 1 in each one-hot row of y_test
y_real = [max_index(one_hot) for one_hot in y_test]

In [None]:
# define the matrix with the real classes and the predicted
m = confusion_matrix(y_real, y_pred)
# the labels for the plot: the class indices stored in class_names,
# not the characters themselves
# NOTE(review): this rebinds `labels`, which earlier held the values of the
# 'label' column - re-running earlier cells afterwards sees the new value
labels = list(class_names.values()) # the characters throw warnings
plt.figure(figsize=(20, 8))
# create the plot; annot=True with fmt='d' writes the integer count in every cell
# NOTE(review): `color` is not a documented seaborn.heatmap parameter, so it is
# presumably forwarded to matplotlib - confirm it has the intended effect
heatmap = sns.heatmap(m, xticklabels=labels, yticklabels=labels, annot=True, fmt='d', color='blue')
# labels for the axes
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix')
plt.show()

# Samples of Predictions from *test* set

In [None]:
# sanity check: shape of a slice holding the first 10 test images
x_test[:10].shape

In [None]:
sample_data = x_test[:15] # the first 15 test images
# same idea as for the confusion matrix, but keeping (real, predicted) pairs
preds = model.predict(sample_data) # make predictions

# zip stops after the 15 predictions, so only the matching first rows of
# y_test are used; every pair is (real class, predicted class), real first
y_pred = [(max_index(real), max_index(pred)) for pred, real in zip(preds, y_test)]

# delete the extra dim: sample_data is (15, 64, 64, 1)
images = np.squeeze(sample_data)

# one subplot per prediction, 3 rows x 5 columns = 15 images
plot_images(images, (3,5), figsize=(25,15), title_size=22, preds=y_pred)

# Save the Model

In [None]:
# save the trained model to an .h5 file
model.save('Chinese_Digit_Recognizer.h5')
# also I will save the processed data
# NOTE(review): X holds ALL the processed images and Y ALL the one-hot
# targets (not the train/test split), so the file names below are misleading
np.save('training_data.npy', X)
np.save('testing_data.npy', Y)

# Conclusion
This is my first practice project where the objective is to recognize handwritten digits, and I liked it. `The model reaches an accuracy of almost 0.96 on the validation and test sets and 0.99 on the training set`. I guess that is because there was a good quantity of data, `1K samples of each class`, and because borders and `lines are exactly the kind of feature that CNNs are good at`. I think this dataset proves that and helps us experiment with CNNs. It was a really good dataset. **Thank you :D**