In [None]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
%matplotlib inline

# Seed both NumPy and TensorFlow. Seeding NumPy alone leaves TensorFlow's
# global RNG (used for weight initialisation and dropout) unseeded, so two
# runs of the notebook would train different models.
# NOTE(review): some GPU kernels may still be non-deterministic.
np.random.seed(2)
tf.random.set_seed(2)

print(f'Tensorflow version: {tf.__version__}')

In [None]:
# Load the CSV index of the dataset and preview its first rows.
data = pd.read_csv(
    '../input/chinese-mnist/chinese_mnist.csv',
    low_memory=False,
)
data.head()

In [None]:
# Directory the image files are read from.
IMAGE_PATH = "../input/chinese-mnist/data/data/"

# Compare the number of index rows with the number of files on disk.
print("dataframe rows:", len(data))
print("image files :", len(os.listdir(IMAGE_PATH)))

In [None]:
def file_path_col(data):
    """Build the image file name for one row of the index dataframe.

    Parameters
    ----------
    data : pandas.Series or sequence
        One row of the index. Only its first three fields are used, in
        positional order.

    Returns
    -------
    str
        File name of the form ``input_<field0>_<field1>_<field2>.jpg``.
    """
    # Rows produced by ``DataFrame.apply(axis=1)`` are indexed by column
    # name, so ``data[0]`` is a label lookup that only works through a
    # deprecated positional fallback. ``.iloc`` is explicitly positional;
    # plain sequences (list/tuple) are still accepted as before.
    row = data.iloc if hasattr(data, "iloc") else data
    return f"input_{row[0]}_{row[1]}_{row[2]}.jpg"

# Derive the image file name of every row and keep it as a new column.
data["file_path"] = data.apply(func=file_path_col, axis=1)
data.head()

In [None]:
# Number of rows per (value, character) pair.
data.groupby(by=["value", "character"]).size()

In [None]:
# Count of missing values in each column.
data.isna().sum()

In [None]:
# Show the image referenced by the row labelled 0 as a quick visual check.
path = os.path.join(IMAGE_PATH, data.loc[0, 'file_path'])
path_img = imread(path)
plt.imshow(path_img, cmap='gray')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, stratified on the "code" column.
train, test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    stratify=data["code"].values,
    random_state=0,
)

print(len(train))
print(len(test))

In [None]:
# Split the non-test rows again: 20% of them become the validation set,
# stratified on the "code" column.
train_data, val_data = train_test_split(
    train,
    test_size=0.2,
    shuffle=True,
    stratify=train["code"].values,
    random_state=0,
)

In [None]:
# Report the size of each partition. `train` is the pre-split frame that
# still contains the validation rows, so the training partition is `train_data`.
print(f"Train set rows: {train_data.shape[0]}")
print(f"Test set rows: {test.shape[0]}")
print(f"Val set rows: {val_data.shape[0]}")

In [None]:
import skimage.io
import skimage.transform

def read_image(file_name):
    """Read one image from ``IMAGE_PATH`` and resize it to (64, 64, 1).

    Parameters
    ----------
    file_name : str
        Name of the file inside ``IMAGE_PATH``.

    Returns
    -------
    array
        The result of ``skimage.transform.resize`` with output shape
        ``(64, 64, 1)`` and ``mode='reflect'``.
    """
    full_path = os.path.join(IMAGE_PATH, file_name)
    pixels = skimage.io.imread(full_path)
    return skimage.transform.resize(pixels, (64, 64, 1), mode='reflect')

def character_encoder(dataset, var='character', classes=None):
    """Turn a slice of the index dataframe into model inputs and targets.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to encode. Must contain a ``file_path`` column and the column
        named by ``var``.
    var : str, default 'character'
        Column holding the class label to one-hot encode.
    classes : sequence, optional
        Full, ordered list of class labels. When given, the one-hot frame is
        reindexed to exactly these columns (absent classes become all-zero
        columns), so every subset gets the same column layout. When omitted,
        the columns are the classes present in ``dataset``, as before.

    Returns
    -------
    X : numpy.ndarray
        The images returned by ``read_image`` stacked along a new first axis.
    y : pandas.DataFrame
        One-hot targets as float32, one column per class.
    """
    # Build the list explicitly instead of stacking a Series of arrays.
    X = np.stack([read_image(name) for name in dataset['file_path']])

    y = pd.get_dummies(dataset[var], drop_first=False)
    if classes is not None:
        y = y.reindex(columns=list(classes), fill_value=0)
    # The default dummy dtype differs between pandas versions (uint8 vs
    # bool); pin it so the targets are always numeric.
    y = y.astype(np.float32)
    return X, y

In [None]:
# Encode each partition. The training arrays must come from `train_data`:
# `train` is the frame that `val_data` was split out of, so encoding it
# would put every validation image into the training set and make the
# validation metrics meaningless.
X_train, y_train = character_encoder(train_data)
X_val, y_val = character_encoder(val_data)
X_test, y_test = character_encoder(test)

print(X_train.shape, ",", y_train.shape)
print(X_val.shape, ",", y_val.shape)
print(X_test.shape, ",", y_test.shape)

In [None]:
# CNN classifier: four 5x5 "same"-padded convolution stages, each followed
# by 2x2 max pooling, then a dense head ending in a softmax with one unit
# per one-hot target column.
# NOTE(review): the last convolution uses 164 filters, which is unusual next
# to 32/64/64 - possibly meant to be 128; confirm before changing.
model = Sequential([
    Conv2D(filters=32, kernel_size=(5, 5), input_shape=X_train.shape[1:],
           padding='same', activation='relu'),
    MaxPool2D(pool_size=(2, 2)),

    Conv2D(64, kernel_size=(5, 5), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2, 2)),

    Conv2D(64, kernel_size=(5, 5), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2, 2)),

    Conv2D(164, kernel_size=(5, 5), activation='relu', padding='same'),
    MaxPool2D(pool_size=(2, 2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dense(y_train.shape[1], activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 25 epochs in batches of 128, evaluating on the validation
# arrays after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=25,
    validation_data=(X_val, y_val),
)

In [None]:
# Accuracy and loss live on different scales, so each gets its own labelled
# panel instead of sharing one unlabeled axis.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

ax_acc.plot(history.history['accuracy'], label='accuracy')
ax_acc.plot(history.history['val_accuracy'], label='val_accuracy', linestyle='--')
ax_acc.set(title='Accuracy per epoch', xlabel='epoch', ylabel='accuracy')
ax_acc.legend()

ax_loss.plot(history.history['loss'], label='loss')
ax_loss.plot(history.history['val_loss'], label='val_loss', linestyle='--')
ax_loss.set(title='Loss per epoch', xlabel='epoch', ylabel='loss')
ax_loss.legend()

plt.show()

In [None]:
# Score the trained model on the held-out test arrays; with the compiled
# metrics, evaluate() yields the loss followed by the accuracy.
ModelLoss, ModelAccuracy = model.evaluate(x=X_test, y=y_test)

print(f'Test Loss is {ModelLoss}')
print(f'Test Accuracy is {ModelAccuracy}')

In [None]:
# Class labels in the column order of the one-hot targets.
columns_names = y_test.columns
print(list(columns_names))

# Build a character -> value lookup from the distinct (value, character)
# pairs found in the index dataframe.
character_value = data.groupby(["value", "character"]).size()
dic_df = dict(character_value)
dic_l = list(dic_df.keys())
dic_v = [value for value, _ in dic_l]
dic_k = [character for _, character in dic_l]
dic = {character: value for value, character in dic_l}
print(dic)

In [None]:
!pip install pyplotz

In [None]:
from pyplotz.pyplotz import PyplotZ

# pltz.xlabel is used below for the axis captions, which contain the
# Chinese class names.
pltz = PyplotZ()
pltz.enable_chinese()

predict = model.predict(X_test)

# Loop-invariant lookups, computed once for the whole test set.
columns_names = list(y_test.columns)
predicted_labels = np.argmax(predict, axis=1)
true_labels = np.argmax(y_test.values, axis=1)

# The grid has 15 x 10 = 150 cells; never index past the end of the test
# set if it holds fewer samples than that.
n_shown = min(150, len(X_test))

plt.figure(figsize=(20, 31))
for i in range(n_shown):
    predicted_label = predicted_labels[i]
    true_label = true_labels[i]

    # Misclassified samples get a red caption.
    color = 'red' if predicted_label != true_label else 'black'

    plt.subplot(15, 10, i+1)
    plt.imshow(X_test[i], cmap='gray')
    # Caption format: "<predicted> (<confidence>%) - <true>".
    pltz.xlabel(f"{columns_names[predicted_label]} ({100*predict[i][predicted_label]:2.0f}%) - {columns_names[true_label]}",
                fontsize=12, color=color)
    plt.xticks([])
    plt.yticks([])