## Imports

In [1]:
import os

import numpy as np
import pandas as pd
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Declaring Constants and Variables

In [2]:
# Root folder of the training split; each sub-folder holds the images of one symbol.
dir_path = 'archive/train'

# Folder name -> numeric class label. Every "original ..."/"other ..." folder
# shares label 15, which reverse_mapping reports as illegible.
mapping = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "equal": 10, "plus": 11, "minus": 12, "times": 13, "div": 14,
    "original number": 15, "original sign": 15,
    "other number": 15, "other sign": 15,
    "decimal": 16,
}

# Numeric class label -> value shown to the user (digits stay ints, operators
# become their character). The label for class 15 was misspelled "Illigible".
reverse_mapping = {
    0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9,
    10: "=", 11: "+", 12: "-", 13: "*", 14: "/",
    15: "Illegible",
    16: ".",
}

# Accumulators filled by the dataset-loading cells below.
folders = []
files = []
symbols = []
image_pixel = []

# Every image is resized to this (width, height) before being flattened.
new_size = (32, 32)

## Creating a Dataframe from the Dataset

In [3]:
# Start from empty lists so that re-running this cell rebuilds the table
# instead of appending a second copy of every row.
folders = []
files = []
symbols = []

# os.listdir returns entries in arbitrary order. Sorting both levels keeps the
# row order - and therefore the seeded train/test split further down -
# identical from one machine or run to the next.
for item in sorted(os.listdir(dir_path)):
    folder_path = os.path.join(dir_path, item)
    if os.path.isdir(folder_path):
        files_in_folder = sorted(os.listdir(folder_path))
        files += files_in_folder
        folders += [item] * len(files_in_folder)
        # mapping[item] raises KeyError for a folder that has no class label,
        # which surfaces an unexpected directory immediately.
        symbols += [mapping[item]] * len(files_in_folder)

# One row per image: numeric label, folder name and file name.
data = {'Symbol': symbols, 'Digit': folders, 'Sample': files}

df = pd.DataFrame(data)

print(df)

      Symbol    Digit     Sample
0         16  decimal   1003.jpg
1         16  decimal   1029.jpg
2         16  decimal   1040.jpg
3         16  decimal   1097.jpg
4         16  decimal   1111.jpg
...      ...      ...        ...
7686       0     zero  98821.jpg
7687       0     zero  98860.jpg
7688       0     zero  99044.jpg
7689       0     zero  99420.jpg
7690       0     zero  99987.jpg

[7691 rows x 3 columns]


## Converting the Images to Pixels

In [4]:
# Rebuild the list from scratch so this cell can be re-run safely; appending to
# the list created in the constants cell would double its length on a second
# run and break the column assignment in the next cell.
image_pixel = []

for folder, sample in zip(df["Digit"], df["Sample"]):
    path = os.path.join(dir_path, folder, sample)

    # The context manager closes the image file once its pixels are copied out.
    with Image.open(path) as img:
        # Resize first, then convert to 8-bit grayscale ("L").
        img_gray = img.resize(new_size).convert("L")
        # Flatten row by row into a 1-D vector of raw intensities.
        pixels = np.asarray(img_gray, dtype=np.float64).ravel()

    # Scale intensities from 0-255 to 0-1 and store compactly as float32.
    image_pixel.append((pixels / 255.0).astype('float32'))

In [5]:
# Store each image's flattened, normalised pixel vector next to its label and
# file name (one array per row).
df["Pixels"] = image_pixel
# Display the frame to check that the new column was attached.
df

Unnamed: 0,Symbol,Digit,Sample,Pixels
0,16,decimal,1003.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,16,decimal,1029.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,16,decimal,1040.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,16,decimal,1097.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,16,decimal,1111.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...
7686,0,zero,98821.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
7687,0,zero,98860.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
7688,0,zero,99044.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
7689,0,zero,99420.jpg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


## Machine Learning

In [6]:
# Features are the per-image pixel vectors; the target is the numeric class label.
X, y = df["Pixels"], df["Symbol"]

In [7]:
# Expand the Series of per-image pixel vectors into a 2-D table:
# one row per image, one column per pixel position.
pixel_df = pd.DataFrame(X.tolist())

In [8]:
# Display the expanded feature table built in the previous cell.
pixel_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7686,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7687,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7688,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7689,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Keep 70% of the images for training and hold out the rest for testing;
# the fixed seed makes the shuffle repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    pixel_df,
    y,
    train_size=0.7,
    random_state=0,
)

In [10]:
# Five baseline classifiers trained on the same flattened-pixel features.
# The numbered names are kept because later cells refer to them directly.
model1 = RandomForestClassifier(random_state=0)
# Seed the decision tree as well as the forest: an unseeded tree permutes the
# candidate features randomly at each split, so its score changed between runs.
model2 = DecisionTreeClassifier(random_state=0)
model3 = KNeighborsClassifier()
model4 = SVC()
model5 = GaussianNB()

models = [model1, model2, model3, model4, model5]

for model in models:
    model.fit(train_X, train_y)

# Predictions on the held-out 30% of the training data.
y_pred1, y_pred2, y_pred3, y_pred4, y_pred5 = (
    model.predict(test_X) for model in models
)

# type(model).__name__ yields exactly the class names used in the report lines.
for model, y_pred in zip(models, (y_pred1, y_pred2, y_pred3, y_pred4, y_pred5)):
    print(f"{type(model).__name__} Accuracy Score: ", accuracy_score(test_y, y_pred))

RandomForestClassifier Accuracy Score:  0.8019930675909879
DecisionTreeClassifier Accuracy Score:  0.5186308492201039
KNeighborsClassifier Accuracy Score:  0.652946273830156
SVC Accuracy Score:  0.7452339688041595
GaussianNB Accuracy Score:  0.33708838821490467


## Testing with Eval Dataset

In [11]:
# Root folder of the held-out evaluation split.
dir_path = 'archive/eval'

# The eval split names several folders differently from the training split
# (for example "plus val" instead of "plus"), so it needs its own
# folder-name -> class-label mapping. The numeric labels are the same.
mapping = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "equal val": 10, "plus val": 11, "minus val": 12, "times val": 13, "div val": 14,
    "original number": 15, "sign": 15, "other number": 15, "number": 15,
    "decimal val": 16,
}

# Numeric class label -> value shown to the user. The label for class 15 was
# misspelled "Illigible".
reverse_mapping = {
    0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9,
    10: "=", 11: "+", 12: "-", 13: "*", 14: "/",
    15: "Illegible",
    16: ".",
}

new_size = (32, 32)

# --- Index the eval images -------------------------------------------------
folders = []
files = []
symbols = []

# Sorted listings give the same row order on every machine.
for item in sorted(os.listdir(dir_path)):
    folder_path = os.path.join(dir_path, item)
    if os.path.isdir(folder_path):
        files_in_folder = sorted(os.listdir(folder_path))
        files += files_in_folder
        folders += [item] * len(files_in_folder)
        # A folder missing from `mapping` raises KeyError here.
        symbols += [mapping[item]] * len(files_in_folder)

data = {'Symbol': symbols, 'Digit': folders, 'Sample': files}

df = pd.DataFrame(data)

# --- Convert every image to a flat, normalised pixel vector ----------------
# Same preprocessing as the training images: resize, grayscale, flatten,
# scale to 0-1, store as float32.
image_pixel = []

for folder, sample in zip(df["Digit"], df["Sample"]):
    path = os.path.join(dir_path, folder, sample)

    # The context manager closes the image file once its pixels are copied out.
    with Image.open(path) as img:
        img_gray = img.resize(new_size).convert("L")
        pixels = np.asarray(img_gray, dtype=np.float64).ravel()

    image_pixel.append((pixels / 255.0).astype('float32'))

df["Pixels"] = image_pixel

X = df["Pixels"]
y = df["Symbol"]

# One row per image, one column per pixel position.
pixel_df = pd.DataFrame(X.tolist())

# --- Score the already-trained models on the eval split --------------------
y_pred1 = model1.predict(pixel_df)
y_pred2 = model2.predict(pixel_df)
y_pred3 = model3.predict(pixel_df)
y_pred4 = model4.predict(pixel_df)
y_pred5 = model5.predict(pixel_df)

eval_predictions = (
    ("RandomForestClassifier", y_pred1),
    ("DecisionTreeClassifier", y_pred2),
    ("KNeighborsClassifier", y_pred3),
    ("SVC", y_pred4),
    ("GaussianNB", y_pred5),
)

for model_name, y_pred in eval_predictions:
    print(f"{model_name} Accuracy Score: ", accuracy_score(y, y_pred))

RandomForestClassifier Accuracy Score:  0.8178256611165524
DecisionTreeClassifier Accuracy Score:  0.6101860920666013
KNeighborsClassifier Accuracy Score:  0.6190009794319294
SVC Accuracy Score:  0.7110675808031341
GaussianNB Accuracy Score:  0.31047992164544563


## Testing with a new Image

In [12]:
def imageToPixel(path, size=None):
    """Load one image and return its pixels as a single-row DataFrame.

    The image is resized, converted to 8-bit grayscale, flattened row by row
    and scaled from 0-255 to 0-1, so the result can be passed straight to a
    fitted model's ``predict``.

    Parameters
    ----------
    path : str
        Location of the image file to read.
    size : tuple of (int, int), optional
        Target ``(width, height)`` handed to ``Image.resize``. When omitted,
        the notebook-level ``new_size`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row with one column per pixel (``width * height`` columns).
    """
    if size is None:
        size = new_size

    # The context manager closes the image file once its pixels are copied out.
    with Image.open(path) as img:
        img_gray = img.resize(size).convert("L")
        pixels = np.asarray(img_gray, dtype=np.float64).ravel()

    # Scale in float64 first, then narrow to float32.
    pixels = (pixels / 255.0).astype('float32')

    # Build a single column, then transpose so the pixels become the columns
    # of one row.
    pixel_df = pd.DataFrame(list(pixels))

    return pixel_df.transpose()

In [13]:
# Classify one eval image with the random forest and print the readable label.
test = "archive/eval/one/21738.jpg"
df_pixel = imageToPixel(test)
prediction = model1.predict(df_pixel)
# predict returns one label per input row; there is exactly one row here.
predicted_label = prediction[0]
print(reverse_mapping[predicted_label])

1
