# Building Handwritten Mathematical Expressions Recognition Model

In [51]:
from __future__ import absolute_import, division, print_function, unicode_literals

import matplotlib.pylab as plt
import shutil # copy, move file
import os # miscellaneous operation system interfaces
import pathlib
import random
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
%matplotlib inline

In [3]:
data_root = pathlib.Path('train')

In [4]:
# Find all image's path

all_image_paths = list(data_root.glob("*/*"))
all_image_paths = [str(path) for path in all_image_paths if path.is_file()]
random.shuffle(all_image_paths)

image_count = len(all_image_paths)
image_count

52496

In [None]:
for index, image_path in enumerate(all_image_paths):
    try:
        image = tf.io.read_file(image_path)
        image = tf.image.decode_jpeg(image, channels=3)
    except:
        print(all_image_paths[index])
    if index % 200 == 0: 
        print(index)

In [6]:
# Find all label names

label_names = sorted(item.name for item in data_root.glob('*/') if item.is_dir())
label_names = np.array(label_names)
label_names

array(['*', '+', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       '=', 'div'], dtype='<U3')

In [7]:
label_to_index = dict((name, index) for index, name in enumerate(label_names))

label_to_index

{'*': 0,
 '+': 1,
 '-': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 '=': 13,
 'div': 14}

In [8]:
# Find all image's labels

all_image_labels = [label_to_index[pathlib.Path(path).parent.name]
                    for path in all_image_paths]

print("First 10 labels indices: ", all_image_labels[:10])

First 10 labels indices:  [4, 3, 13, 5, 3, 11, 12, 7, 2, 9]


In [9]:
# Count #image each class

from collections import Counter
count = Counter()

for label in all_image_labels:
    count[label_names[label]] += 1
  
count

Counter({'1': 4000,
         '0': 4000,
         '=': 4000,
         '2': 4000,
         '8': 3068,
         '9': 3737,
         '4': 4000,
         '-': 4000,
         '6': 3118,
         '*': 3251,
         '3': 4000,
         '+': 4000,
         '5': 3545,
         '7': 2909,
         'div': 868})

In [10]:
df = pd.DataFrame(columns=['path','label'])
df['path'] = all_image_paths
df['label'] = all_image_labels

In [14]:
mnist = pd.read_csv("train.csv")

In [15]:
newdf = pd.DataFrame(columns=mnist.columns)

In [None]:
newdf.drop('label',axis=1,inplace=True)

In [20]:
newdf.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783


In [None]:
from PIL import Image
for i in range(df.shape[0]):
    img = Image.open(df.loc[i,'path'])
    img.load()
    img = img.resize((28, 28), Image.BILINEAR)
    img = np.asarray( img, dtype="int64" )
    img = img.ravel()
#   Chang to black background
    img = 255 - img
    newdf.loc[i,:]=img
    if i % 200 == 0: 
        print(i)

In [22]:
label = pd.DataFrame(columns=['label'])
label['label'] = df['label']
data = label.join(newdf)

In [23]:
#   Save a flattern dataframe
data.to_csv('data.csv')

## Building Model

In [48]:
# Import the modules
from sklearn.externals import joblib
from sklearn import datasets
from skimage.feature import hog
from sklearn.svm import LinearSVC
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [35]:
df = pd.read_csv(r"data.csv")

In [36]:
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,20.0,104.0,142.0,150.0,150.0,150.0,147.0,113.0,...,0.0,0.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0
4,3,1.0,0.0,6.0,86.0,135.0,93.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Extract the features and labels
X = df.drop('label', axis=1).to_numpy()
y = df['label'].to_numpy()

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state = 101)

In [39]:
# Extract the hog features

def ExtractHog(features):
    list_hog_fd = []
    for feature in features:
        fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualize=False)
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, 'float64')
    return hog_features

In [40]:
X_train_hog = ExtractHog(X_train)
X_test_hog = ExtractHog(X_test)

### Tunning hyperparameters

In [None]:
from sklearn.model_selection import cross_val_score

val_rate = []
c_range =  range(1,200,20)

for i in c_range:
    
    svm = SVC(C=i, kernel='linear')
    val_error = 1 - cross_val_score(svm, X_train_hog, y_train,cv=5).mean()
    val_rate.append(val_error)


# Plot settings
plt.figure(figsize=(15,7))
plt.plot(c_range, val_rate, color='orange', linestyle='dashed', marker='o',
         markerfacecolor='black', markersize=5, label='Validation Error')

plt.xticks(np.arange(c_range.start, c_range.stop, c_range.step), rotation=60)
plt.grid()
plt.legend()
plt.title('Validation Error vs. C Value')
plt.xlabel('C')
plt.ylabel('Validation Error')
plt.show()

In [None]:
best_c = c_range[val_rate.index(min(val_rate))]
best_c

### Building Linear SVM model

In [41]:
# Create an linear SVM object
clf = LinearSVC()

In [42]:
# Perform the training
clf.fit(X_train_hog, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

### Evaluate the accuracy of model

In [50]:
clf_predictions = clf.predict(X_test_hog)

clf_acc = accuracy_score(clf_predictions, y_test)
print(clf_acc)

0.803809523809524


In [33]:
# Save the classifier
joblib.dump(clf, "model_cls.pkl", compress=3)

['model_cls.pkl']