In [128]:
import pandas as pd
import numpy as np
import sys
import os
import random
from pathlib import Path
import imageio
import skimage
import skimage.io
import skimage.transform
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import scipy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization,LeakyReLU
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from keras.utils import to_categorical
import tensorflow_addons as tfa
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import tensorflow as tf


In [149]:
# Directory containing the image files; used by os.listdir() and by the image readers.
# NOTE(review): '.' is the notebook's working directory, whose listing further down shows
# only ['data', 'chinese_mnist.csv', 'archive.zip'] - confirm the intended image folder.
IMAGE_PATH = '.'
# Target image dimensions; used for resizing and for the model's input_shape.
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64
IMAGE_CHANNELS = 1
# Seed passed to train_test_split so the splits are reproducible.
RANDOM_STATE = 42
# Fraction of all rows held out as the test set.
TEST_SIZE = 0.2
# Fraction of the remaining training rows held out as the validation set.
VAL_SIZE = 0.2
# Number of filters for the convolution layers.
# NOTE(review): CONV_2D_DIM_3 and CONV_2D_DIM_4 are not referenced by any visible cell.
CONV_2D_DIM_1 = 16
CONV_2D_DIM_2 = 16
CONV_2D_DIM_3 = 32
CONV_2D_DIM_4 = 64
# Pool size of the MaxPool2D layer.
MAX_POOL_DIM = 2
# Kernel size shared by all convolution layers.
KERNEL_SIZE = 3
# Mini-batch size and maximum number of epochs for model.fit.
BATCH_SIZE = 32
NO_EPOCHS = 50
# Drop rate of the Dropout layers.
DROPOUT_RATIO = 0.5
# Number of epochs without improvement tolerated by the EarlyStopping callback.
PATIENCE = 5
# Verbosity level passed to the training callbacks.
VERBOSE = 1

In [150]:
# list the contents of the working directory
os.listdir(".")

['data', 'chinese_mnist.csv', 'archive.zip']

In [151]:
# load the dataset file
data_df=pd.read_csv('./chinese_mnist.csv')

In [152]:
# print number of columns and rows
data_df.shape

(15000, 5)

In [153]:
# take a peek at the raw data
# suite_id - each suite corresponds to a set of handwritten samples by one volunteer;
# sample_id - each sample will contain a complete set of 15 characters for Chinese numbers;
# code - for each Chinese character we are using a code, with values from 1 to 15;
# value - this is the actual numerical value associated with the Chinese character for number;
# character - the Chinese character;
data_df.sample(100).head()

Unnamed: 0,suite_id,sample_id,code,value,character
1357,40,7,11,10,十
13817,82,7,8,7,七
12044,12,4,7,6,六
14613,64,3,9,8,八
8744,76,4,3,2,二


In [154]:
# check for missing data (note: may be mislabeled or other errors)
def missing_data(data):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the number of null entries
        ('Total') and the share of null entries in percent ('Percent'),
        each sorted in descending order before being combined.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask gives the number of rows per column
    row_counts = null_mask.count()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data(data_df)

Unnamed: 0,Total,Percent
character,0,0.0
value,0,0.0
code,0,0.0
sample_id,0,0.0
suite_id,0,0.0


In [155]:
# check the image data
# os.listdir already returns a list of entry names for the directory
image_files = os.listdir(IMAGE_PATH)
n_image_files = len(image_files)
print("Number of image files: {}".format(n_image_files))

Number of image files: 15000


In [156]:
# check that each line in the dataset has a corresponding image
def create_file_name(x):
    """Build the image file name that belongs to one row of the dataset.

    Parameters
    ----------
    x : pandas.Series or sequence
        A dataset row (or any indexable sequence) whose first three
        positions hold the suite id, the sample id and the character code.

    Returns
    -------
    str
        File name of the form ``input_<suite_id>_<sample_id>_<code>.jpg``.
    """
    # Integer keys on a label-indexed Series are a deprecated positional
    # fallback in pandas; go through .iloc so the lookup is explicitly
    # positional. Plain sequences (tuple, list) are indexed directly.
    fields = x.iloc if isinstance(x, pd.Series) else x
    return f"input_{fields[0]}_{fields[1]}_{fields[2]}.jpg"
# derive the expected file name of every row, then count how many of those
# names also appear in the image directory listing
data_df["file"] = data_df.apply(create_file_name, axis=1)
file_names = data_df['file'].tolist()

matching_files = set(file_names) & set(image_files)
print("Matching image names: {}".format(len(matching_files)))

Matching image names: 15000


In [157]:
# check image sizes
def read_image_sizes(file_name):
    """Read one image and return its array shape.

    Parameters
    ----------
    file_name : str
        Name of the image file inside IMAGE_PATH.

    Returns
    -------
    list
        The dimensions of the loaded image array (``image.shape`` as a list).
    """
    # Build the path with os.path.join: plain concatenation only works when
    # IMAGE_PATH ends with a path separator ('.' + name gives '.input_...jpg').
    image = skimage.io.imread(os.path.join(IMAGE_PATH, file_name))
    return list(image.shape)

# one row of image dimensions per dataset row
m = np.stack(data_df['file'].apply(read_image_sizes))
# NOTE(review): an image array's shape is (rows, columns), so 'w' receives the row
# count; this is harmless while the images are square.
# Reuse data_df's index so the concat below aligns rows even without a default RangeIndex.
df = pd.DataFrame(m, columns=['w','h'], index=data_df.index)
# Drop size columns left over from an earlier run of this cell, so re-running it
# does not append duplicate 'w'/'h' columns.
data_df = pd.concat([data_df.drop(columns=['w','h'], errors='ignore'), df], axis=1, sort=False)

data_df.head()

Unnamed: 0,suite_id,sample_id,code,value,character,file,w,h
0,1,1,10,9,九,input_1_1_10.jpg,64,64
1,1,10,10,9,九,input_1_10_10.jpg,64,64
2,1,2,10,9,九,input_1_2_10.jpg,64,64
3,1,3,10,9,九,input_1_3_10.jpg,64,64
4,1,4,10,9,九,input_1_4_10.jpg,64,64


In [158]:
# check the suites of the images
print(f"Number of suites: {data_df.suite_id.nunique()}")
print(f"Samples: {data_df.sample_id.unique()}")

Number of suites: 100
Samples: [ 1 10  2  3  4  5  6  7  8  9]


In [159]:
# split the dataset into training and test set (80% training set, 20% test set), RANDOM_STATE ensure reproducibility
train_df, test_df = train_test_split(data_df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=data_df["code"].values)

In [160]:
# further split the training set into training (80%) and validation (20%) sets
train_df, val_df = train_test_split(train_df, test_size=VAL_SIZE, random_state=RANDOM_STATE, stratify=train_df["code"].values)

In [161]:
# make sure it was done correctly
print("Train set rows: {}".format(train_df.shape[0]))
print("Test  set rows: {}".format(test_df.shape[0]))
print("Val   set rows: {}".format(val_df.shape[0]))

Train set rows: 9600
Test  set rows: 3000
Val   set rows: 2400


In [162]:
# function for reading images, scaled to IMAGE_WIDTH x IMAGE_HEIGHT x IMAGE_CHANNELS
def read_image(file_name):
    """Load one image and resize it to the model's input shape.

    Parameters
    ----------
    file_name : str
        Name of the image file inside IMAGE_PATH.

    Returns
    -------
    numpy.ndarray
        The resized image with shape (IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS).
    """
    # os.path.join instead of string concatenation: 'IMAGE_PATH + file_name' only
    # works when IMAGE_PATH ends with a path separator.
    image = skimage.io.imread(os.path.join(IMAGE_PATH, file_name))
    # Use IMAGE_CHANNELS (not a literal 1) so the output always matches the
    # input_shape the model is built with. The (width, height) order is kept as
    # in the model definition; it only matters for non-square targets.
    image = skimage.transform.resize(image, (IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS), mode='reflect')
    return image

In [163]:
# function to create dummy variables corresponding to categorical target variable
def categories_encoder(dataset, var='character'):
    """Turn one split of the dataset into image inputs and one-hot targets.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to encode; must provide a 'file' column and the ``var`` column.
    var : str
        Name of the categorical target column (default 'character').

    Returns
    -------
    tuple
        ``(X, y)`` where X is the stacked array of images returned by
        read_image and y is the indicator DataFrame built by pd.get_dummies,
        with one column per category present in ``dataset[var]``.
    """
    images = [read_image(name) for name in dataset['file']]
    pixel_data = np.stack(images)
    one_hot_targets = pd.get_dummies(dataset[var], drop_first=False)
    return pixel_data, one_hot_targets

In [164]:
# populate the training, validation, and test sets with image data
# and create dummy variables corresponding to the categorical target variable (character)
X_train, y_train = categories_encoder(train_df)
X_val, y_val = categories_encoder(val_df)
X_test, y_test = categories_encoder(test_df)

In [165]:
# create model and print out summary
# NOTE(review): the second pair of convolutions reuses CONV_2D_DIM_2; CONV_2D_DIM_3 and
# CONV_2D_DIM_4 are defined in the configuration but not used here - confirm this is intended.
model = Sequential([
    # first convolution pair, followed by max-pooling and dropout
    Conv2D(CONV_2D_DIM_1, kernel_size=KERNEL_SIZE,
           input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS),
           activation='relu', padding='same'),
    Conv2D(CONV_2D_DIM_2, kernel_size=KERNEL_SIZE, activation='relu', padding='same'),
    MaxPool2D(MAX_POOL_DIM),
    Dropout(DROPOUT_RATIO),
    # second convolution pair, followed by dropout
    Conv2D(CONV_2D_DIM_2, kernel_size=KERNEL_SIZE, activation='relu', padding='same'),
    Conv2D(CONV_2D_DIM_2, kernel_size=KERNEL_SIZE, activation='relu', padding='same'),
    Dropout(DROPOUT_RATIO),
    # classifier head: one softmax unit per one-hot target column
    Flatten(),
    Dense(y_train.columns.size, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 64, 64, 16)        160       
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 64, 64, 16)        2320      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 32, 32, 16)        0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 32, 32, 16)        0         
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 32, 32, 16)        2320      
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 32, 32, 16)        2320      
_________________________________________________________________
dropout_11 (Dropout)         (None, 32, 32, 16)       

In [166]:
# Training callbacks.
# annealer: sets the learning rate of each epoch to 1e-3 * 0.99 ** (epoch + NO_EPOCHS),
#   i.e. it starts at roughly 6e-4 for NO_EPOCHS = 50 and shrinks by 1% per epoch.
# earlystopper: stops training once the validation loss has not improved for PATIENCE
#   consecutive epochs. It monitors 'val_loss' (not the training 'loss') so that the
#   stopping decision is based on the validation set, as intended.
# checkpointer: writes the weights to best_model.h5 whenever val_accuracy improves;
#   the best weights are reloaded later for the test set prediction.
annealer = LearningRateScheduler(lambda epoch: 1e-3 * 0.99 ** (epoch+NO_EPOCHS))
earlystopper = EarlyStopping(monitor='val_loss', patience=PATIENCE, verbose=VERBOSE)
checkpointer = ModelCheckpoint('best_model.h5',
                                monitor='val_accuracy',
                                verbose=VERBOSE,
                                save_best_only=True,
                                save_weights_only=True)

In [170]:
# train the model; the returned History object is used below to plot the learning curves
train_model  = model.fit(X_train, y_train,
                  batch_size=BATCH_SIZE,
                  epochs=NO_EPOCHS,
                  # use the shared VERBOSE setting, like the callbacks do
                  verbose=VERBOSE,
                  validation_data=(X_val, y_val),
                  callbacks=[earlystopper, checkpointer, annealer])

Epoch 1/50
Epoch 00001: val_accuracy did not improve from 0.97375
Epoch 2/50
Epoch 00002: val_accuracy did not improve from 0.97375
Epoch 3/50
Epoch 00003: val_accuracy did not improve from 0.97375
Epoch 4/50
Epoch 00004: val_accuracy did not improve from 0.97375
Epoch 5/50
Epoch 00005: val_accuracy improved from 0.97375 to 0.97458, saving model to best_model.h5
Epoch 6/50
Epoch 00006: val_accuracy did not improve from 0.97458
Epoch 7/50
Epoch 00007: val_accuracy did not improve from 0.97458
Epoch 8/50
Epoch 00008: val_accuracy did not improve from 0.97458
Epoch 9/50
Epoch 00009: val_accuracy did not improve from 0.97458
Epoch 10/50
Epoch 00010: val_accuracy did not improve from 0.97458
Epoch 11/50
Epoch 00011: val_accuracy did not improve from 0.97458
Epoch 12/50
Epoch 00012: val_accuracy did not improve from 0.97458
Epoch 13/50
Epoch 00013: val_accuracy did not improve from 0.97458
Epoch 14/50
Epoch 00014: val_accuracy did not improve from 0.97458
Epoch 15/50
Epoch 00015: val_accurac

In [171]:
# evaluate model by plotting the loss error for the training and validation sets
def create_trace(x,y,ylabel,color):
    """Build a Plotly scatter trace drawn as markers connected by lines.

    Parameters
    ----------
    x, y : sequence
        Coordinates of the points; ``x`` is also used as the hover text.
    ylabel : str
        Name of the trace, shown in the legend.
    color : str
        Marker colour.

    Returns
    -------
    plotly.graph_objs.Scatter
    """
    return go.Scatter(
        x=x,
        y=y,
        name=ylabel,
        text=x,
        mode="markers+lines",
        marker={"color": color},
    )
    
def plot_accuracy_and_loss(train_model):
    """Plot training/validation accuracy and loss per epoch side by side.

    Parameters
    ----------
    train_model : keras History object
        Return value of ``model.fit``; its ``history`` dict must contain the
        keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    # plotly.tools.make_subplots is deprecated in favour of plotly.subplots;
    # fall back to the old location for plotly releases that predate it.
    try:
        from plotly.subplots import make_subplots
    except ImportError:
        make_subplots = tools.make_subplots

    hist = train_model.history
    acc = hist['accuracy']
    val_acc = hist['val_accuracy']
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = list(range(1,len(acc)+1))
    #define the traces
    trace_ta = create_trace(epochs,acc,"Training accuracy", "Green")
    trace_va = create_trace(epochs,val_acc,"Validation accuracy", "Red")
    trace_tl = create_trace(epochs,loss,"Training loss", "Blue")
    trace_vl = create_trace(epochs,val_loss,"Validation loss", "Magenta")
    fig = make_subplots(rows=1,cols=2, subplot_titles=('Training and validation accuracy',
                                                       'Training and validation loss'))
    #add traces to the figure (add_trace with row/col replaces the deprecated append_trace)
    fig.add_trace(trace_ta, row=1, col=1)
    fig.add_trace(trace_va, row=1, col=1)
    fig.add_trace(trace_tl, row=1, col=2)
    fig.add_trace(trace_vl, row=1, col=2)
    #set the layout for the figure
    fig['layout']['xaxis'].update(title = 'Epoch')
    fig['layout']['xaxis2'].update(title = 'Epoch')
    fig['layout']['yaxis'].update(title = 'Accuracy', range=[0,1])
    # Loss is not bounded by 1 (early epochs are often above it), so widen the
    # axis when needed instead of clipping those points; never go below [0, 1].
    loss_upper = max([1.0] + list(loss) + list(val_loss))
    fig['layout']['yaxis2'].update(title = 'Loss', range=[0,loss_upper])
    #plot
    iplot(fig, filename='accuracy-loss')

plot_accuracy_and_loss(train_model)

In [126]:
# now run the model on the test set and evaluate the loss and accuracy
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.18712486326694489
Test accuracy: 0.9649999737739563


In [102]:
# test the accuracy per class
def test_accuracy_report(model):
    """Print a per-class classification report and the overall test metrics.

    Uses the notebook-level ``X_test`` and ``y_test``: the class with the
    highest predicted score is compared against the position of the hot
    column in ``y_test``, with the column names of ``y_test`` as class labels.

    Parameters
    ----------
    model : keras model
        Compiled model providing ``predict`` and ``evaluate``.
    """
    class_scores = model.predict(X_test)
    predicted_labels = np.argmax(class_scores, axis=1)
    true_labels = np.argmax(y_test.values, axis=1)
    report = metrics.classification_report(true_labels, predicted_labels, target_names=y_test.columns)
    print(report)
    test_res = model.evaluate(X_test, y_test.values, verbose=0)
    loss_value = test_res[0]
    accuracy_value = test_res[1]
    print('Loss function: %s, accuracy:' % loss_value, accuracy_value)
test_accuracy_report(model)

              precision    recall  f1-score   support

           一       0.99      0.99      0.99       200
           七       0.97      0.97      0.97       200
           万       0.95      0.98      0.97       200
           三       0.99      0.98      0.98       200
           九       0.91      0.94      0.92       200
           二       0.97      0.98      0.98       200
           五       1.00      0.99      0.99       200
           亿       0.93      0.97      0.95       200
           八       1.00      0.99      1.00       200
           六       0.98      0.98      0.98       200
           十       0.94      0.97      0.96       200
           千       0.98      0.93      0.95       200
           四       0.99      0.99      0.99       200
           百       0.99      0.93      0.96       200
           零       0.99      0.99      0.99       200

   micro avg       0.97      0.97      0.97      3000
   macro avg       0.97      0.97      0.97      3000
weighted avg       0.97   

In [103]:
# predict using the best model
# NOTE: model_optimal is a second name for the same model object (no copy is made),
# so loading the checkpoint also replaces the weights held by `model`.
model_optimal = model
model_optimal.load_weights('best_model.h5')
score = model_optimal.evaluate(X_test, y_test, verbose=0)
# the score is computed on the test set, so label it as test loss rather than validation loss
print(f'Best model test loss: {score[0]}, accuracy: {score[1]}')

test_accuracy_report(model_optimal)

Best validation loss: 0.12490731477737427, accuracy: 0.9723333120346069
              precision    recall  f1-score   support

           一       1.00      0.98      0.99       200
           七       0.98      0.98      0.98       200
           万       0.95      0.98      0.97       200
           三       0.98      0.98      0.98       200
           九       0.94      0.94      0.94       200
           二       0.97      0.97      0.97       200
           五       1.00      0.98      0.99       200
           亿       0.95      0.97      0.96       200
           八       1.00      1.00      1.00       200
           六       0.96      0.99      0.98       200
           十       0.96      0.96      0.96       200
           千       0.97      0.93      0.95       200
           四       0.98      0.95      0.97       200
           百       0.97      0.94      0.96       200
           零       0.98      1.00      0.99       200

   micro avg       0.97      0.97      0.97      3000
   macro