In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import os 

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, average_precision_score, recall_score

from sklearn.decomposition import PCA 

In [2]:
train_path = "./Devnagari-Handwritten-Dataset/Train"

test_path = "./Devnagari-Handwritten-Dataset/Test"

In [3]:
unique_labels = os.listdir(train_path)

In [4]:
def list_of_images(folder):
    
    return os.listdir(os.path.join(train_path, folder))

In [5]:
def read_image(folder, image):
    
    folder_path = os.path.join(train_path,folder)
    
    image_path = os.path.join(folder_path, image)
    
    image = plt.imread(image_path)
    
    return image.reshape(image.shape[0]*image.shape[1],)

In [6]:
def stacking_row_vector(folder):
    
    images_list  = list_of_images(folder)
    
    images = []
    
    for img in images_list:
        
        images.append(read_image(folder,img))
        
    return np.array(images[0:1360]) ,np.array(images[1360:])

In [7]:
train_data = []

cv_data = []

for folder in unique_labels:
    
    train_folder_matrix, cv_folder_matrix = stacking_row_vector(folder)
    
    train_data.append(train_folder_matrix)
    
    cv_data.append(cv_folder_matrix)
    
train_data = np.concatenate(train_data, axis = 0)

cv_data = np.concatenate(cv_data, axis = 0)

In [8]:
train_labels = []

cv_labels = []

for folder_name in unique_labels:
    
    train_labels = train_labels + [folder_name] * 1360
    
    cv_labels = cv_labels + [folder_name]*340

In [9]:
train_data = pd.DataFrame(data=train_data)

train_data['labels'] = train_labels

train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna


In [10]:
cv_data = pd.DataFrame(data = cv_data) 

cv_data['labels'] = cv_labels

cv_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna


Training data using scikit learn

In [11]:
obj = GaussianNB()

features = np.array(train_data.iloc[:,0:1024])

labels = np.array(train_data.iloc[:,1024])

obj.fit(features, labels)

GaussianNB()

In [12]:
cv_features = np.array(cv_data.iloc[:,0:1024])

cv_actual_results = np.array(cv_data.iloc[:,1024])

cv_predicted_category = obj.predict(cv_features)

cv_predicted_category

accuracy_score(y_true = cv_actual_results, y_pred = cv_predicted_category)

0.47154731457800514

In [13]:
classification_report(y_true = cv_actual_results, y_pred = cv_predicted_category, target_names=unique_labels)

'                           precision    recall  f1-score   support\n\n         character_10_yna       0.68      0.61      0.64       340\n    character_11_taamatar       0.87      0.71      0.78       340\n        character_12_thaa       0.68      0.70      0.69       340\n         character_13_daa       0.44      0.17      0.24       340\n        character_14_dhaa       0.51      0.52      0.51       340\n        character_15_adna       0.44      0.43      0.43       340\n      character_16_tabala       0.51      0.69      0.59       340\n         character_17_tha       0.28      0.18      0.22       340\n          character_18_da       0.33      0.29      0.31       340\n         character_19_dha       0.63      0.51      0.56       340\n           character_1_ka       0.64      0.52      0.58       340\n          character_20_na       0.33      0.28      0.31       340\n          character_21_pa       0.30      0.47      0.36       340\n         character_22_pha       0.26      0.7

## Applying PCA

In [14]:
data = pd.concat([train_data, cv_data])

X = np.array(data.iloc[:,0:1024])

In [15]:
def cross_validation(n_eig_vectors):
    
    pca_obj = PCA(n_components = n_eig_vectors)
    
    X_new = pca_obj.fit_transform(X)
    
    X_train_new = X_new[0:62560,:]
    
    X_cv_new = X_new[62560:,:]
    
    obj = GaussianNB()
    
    obj.fit(X_train_new, labels)
    
    cv_predicted_category = obj.predict(X_cv_new)
    
    return accuracy_score(y_true = cv_actual_results, y_pred = cv_predicted_category)

In [16]:
number_eig_vectors = [512, 256, 128, 64, 32, 16, 8, 4, 2,1]

for i in number_eig_vectors:
    
    accuracy=cross_validation(i)
    
    print(i ,":",accuracy)

512 : 0.4563299232736573
256 : 0.5062020460358057
128 : 0.5653452685421995
64 : 0.5499360613810742
32 : 0.5319053708439898
16 : 0.46585677749360616
8 : 0.31643222506393864
4 : 0.15939897698209718
2 : 0.06604859335038363
1 : 0.03618925831202046
