## Almost the same script as the one used on the classic MNIST dataset: https://www.kaggle.com/guillaumes/sklearn-ensembling-keras-cnn

### Data import

In [None]:
import pandas as pd
import numpy as np

# Seed passed as random_state wherever this notebook needs one.
SEED = 1234

# Location of the dataset CSV (path under /kaggle/input, i.e. presumably a
# Kaggle-attached dataset -- verify when running elsewhere).
DATASET_CSV = "/kaggle/input/chinese-mnist-digit-recognizer/chineseMNIST.csv"

# Load the whole table into memory and preview its first rows.
data = pd.read_csv(filepath_or_buffer=DATASET_CSV)
data.head()

### data exploration

In [None]:
# (n_rows, n_columns) of the raw table loaded from the CSV.
data.shape

In [None]:
# List each distinct (label, character) combination once, to see which
# character every raw label value stands for.
label_character_pairs = data.loc[:, ['label', 'character']]
label_character_pairs.drop_duplicates()

### data preprocessing

In [None]:
# Give every distinct raw label a consecutive integer index (0, 1, 2, ...)
# in order of first appearance, then overwrite the column with those indices.
unique_labels = data['label'].unique()
map_label = dict(zip(unique_labels, range(len(unique_labels))))
data['label'] = data['label'].map(map_label)

In [None]:
# Show the raw-label -> integer-index mapping built in the previous cell.
map_label

In [None]:
# Preview the table again: 'label' now holds the mapped integer indices.
data.head()

In [None]:
# Distinct values of the remapped label column, in order of first appearance.
data['label'].unique()

In [None]:
from sklearn.model_selection import train_test_split

# The 'character' column is dropped before splitting; 'label' and the
# remaining feature columns are kept.
labelled_features = data.drop('character', axis=1)

# 80/20 train/validation split. Stratifying on the label keeps the per-class
# proportions the same in both parts, so every class is represented in the
# training set and the validation accuracy is not skewed by a lucky or
# unlucky draw.
train_df, val_df = train_test_split(
    labelled_features,
    train_size=0.8,
    random_state=SEED,
    stratify=labelled_features['label'],
)
train_df.shape, val_df.shape

In [None]:
# Number of distinct classes that ended up in the training split.
train_df['label'].nunique()

In [None]:
# Preview the training split; the 'character' column was dropped before splitting.
train_df.head()

In [None]:
# Separate the target column from the remaining feature columns, per split.
X_train, y_train = train_df.drop(columns='label'), train_df['label']
X_val, y_val = val_df.drop(columns='label'), val_df['label']

### ML modeling

In [None]:
import time

from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Display name -> estimator class (classes, not instances) for the
# scikit-learn models considered for the ensemble.
dict_model = dict(SVC=SVC, RF=RandomForestClassifier)

In [None]:
"""dict_pred = {}
for model_str, model in dict_model.items():
    start = time.time()
    dict_pred[model_str] = {}
    try:
        model = model(probability=True)
    except TypeError:
        model = model()
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_val)
    dict_pred[model_str]['y_pred_prob'] = y_pred_prob
    accuracy = accuracy_score(np.argmax(y_pred_prob, axis=1), y_val)
    dict_pred[model_str]['accuracy'] = accuracy
    print(f"{round(time.time()-start, 2)}s")
    print(f"{model_str}: accuracy={accuracy}")

    
y_pred_prob_ensemble = np.average([dict_pred[mdl]["y_pred_prob"] for mdl in dict_model.keys()], axis=0)

accuracy_ensemble = accuracy_score(np.argmax(y_pred_prob_ensemble, axis=1), y_val)
print(accuracy_ensemble)"""

### Keras model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

# Side length of the square images. Each feature row is assumed to hold
# shape * shape pixel values in the 0-255 range (TODO confirm against the CSV).
shape = 64


def _to_image_batch(pixels):
    """Turn a table of flattened pixel rows into a Keras-ready image batch.

    The rows are reshaped to (n_samples, shape, shape), cast to float32,
    divided by 255 and given a trailing channel axis of size 1.
    """
    images = np.asarray(pixels).reshape(-1, shape, shape)
    images = images.astype("float32") / 255.0
    return np.expand_dims(images, -1)


X_train = _to_image_batch(X_train)
X_val = _to_image_batch(X_val)

# Labels are consecutive integer indices starting at 0, so the number of
# classes is the largest index + 1. Both splits are inspected: counting only
# the distinct training labels would under-size the one-hot encoding (and
# make to_categorical fail on y_val) if a class were missing from training.
num_classes = int(max(np.max(y_train), np.max(y_val))) + 1
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)

In [None]:
# SEED was so far only applied to the train/validation split. Seed NumPy and
# TensorFlow as well, so that weight initialisation and the other random ops
# start from a fixed state (some GPU kernels may still be non-deterministic).
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Square, single-channel images.
input_shape = (shape, shape, 1)

# L2 penalty factor applied to the kernels of the first convolution of each stage.
L2_FACTOR = 0.0005

# Small CNN: two convolutional stages followed by two dense layers and a
# softmax output. Layers that feed directly into BatchNormalization are
# created with use_bias=False and without activation; the ReLU is applied
# after the normalisation instead.
model = keras.Sequential([
    # Input layer
    keras.Input(shape=input_shape, name='input'),

    # Stage 1: two 5x5 convolutions with 32 filters, then 2x2 max-pooling.
    layers.Conv2D(filters=32, kernel_size=5, strides=1,
                  activation='relu', kernel_regularizer=regularizers.l2(L2_FACTOR), name='conv1'),

    layers.Conv2D(filters=32, kernel_size=5, strides=1, use_bias=False, name='conv2'),

    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPool2D(pool_size=2, strides=2, name='pool1'),
    layers.Dropout(0.25),

    # Stage 2: two 3x3 convolutions with 64 filters, then 2x2 max-pooling.
    layers.Conv2D(filters=64, kernel_size=3, activation='relu',
                  kernel_regularizer=regularizers.l2(L2_FACTOR), name='conv3'),

    layers.Conv2D(filters=64, kernel_size=3, use_bias=False, name='conv4'),

    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.MaxPool2D(pool_size=2, strides=2, name='pool2'),
    layers.Dropout(0.25),

    # Classifier head: 256 -> 128 dense units, each batch-normalised.
    layers.Flatten(name='flatten'),
    layers.Dense(256, use_bias=False, name='dense1'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.25),

    layers.Dense(128, use_bias=False, name='dense2'),
    layers.BatchNormalization(),
    layers.Activation('relu'),

    # One probability per class.
    layers.Dense(num_classes, activation='softmax', name='output')
])

In [None]:
# One-hot targets -> categorical cross-entropy; Adam optimiser with its
# default settings; accuracy reported alongside the loss.
compile_options = dict(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.compile(**compile_options)

# 50 epochs in mini-batches of 128, evaluating on the validation split
# after every epoch.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=128,
)
model.fit(X_train, y_train, **fit_options)

In [None]:
## It would be interesting to try data augmentation here.