## Import libraries

In [1]:
# non ml imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2 
import joblib
import random as rd
# ml imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import (loguniform, randint)

## Creating $X$ and $y$ from image dataset ***(ctn from here)***

In [None]:
root = "asl_alphabet_train" # main training folder
alphabets = os.listdir("asl_alphabet_train") # subfolder for each alphabets
k = len(alphabets) # |multiclasses|

image_df = []
sample_size = 50 # small sample size 
for alph in alphabets:
    images = os.listdir(root+"\\"+alph) # name of the images
    images = rd.sample(os.listdir(root+"\\"+alph), sample_size) # name of the images
    for i in images:
        if '.jpg' not in i: continue # skip non image files
        # print(root + "\\" + alph + "\\" + i)
        img = cv2.imread(root + "\\" + alph + "\\" + i) # load each image as (200,200,3) cube
        img = img.reshape(120000)
        img = img/255 # normalise the data
        image_df.append(img)

Cannot increase the sample size more with my current laptop / computational power and space available to me, especially without using any convolutional and pooling layer

In [19]:
# convert it to a matrix
X = np.array(image_df)
n, d = X.shape

In [20]:
y = np.zeros(n)
correct_val = 0
for ind in range(0, n, n//k):
    y[ind: ind+n//k] = correct_val
    correct_val +=1 
print(y[:11], y[-11:]) # mutliclass

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [28. 28. 28. 28. 28. 28. 28. 28. 28. 28. 28.]


# Performing cross validation on $k$-NN, $SVC$ and *Random Forest*

In [5]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

Will try K-NN too however, it will most likely not perform the best with such high dimensional data

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn_grid = {
    'n_neighbors': [3, 5, 7, 10, 15, 20],
    'weights': ['uniform', 'distance'],
}

grid_knn = GridSearchCV(KNeighborsClassifier(), knn_grid, cv=5)
grid_knn.fit(X_train, y_train)
print('best for KNN:', grid_knn.best_params_, "& best score is ", grid_knn.best_score_)

best for KNN: {'n_neighbors': 3, 'weights': 'distance'} & best score is  0.26415094339622647


In [21]:
svc_grid = {
    'C': loguniform(1e-2, 1e2), 
    'gamma': randint(1, 1000),
}

random_svc = RandomizedSearchCV(SVC(kernel = "rbf"), svc_grid, cv=5, scoring='accuracy', n_iter=10, random_state=42, n_jobs=-1)
random_svc.fit(X_train, y_train)
print('best for SVC:', random_svc.best_params_, "& best score is ", random_svc.best_score_)

best for SVC: {'C': 0.31489116479568624, 'gamma': 861} & best score is  0.030696661828737305


In [22]:
rf_grid = {
    'n_estimators': [10, 50, 100, 150, 200],
    'max_depth': [5, 10, 20, 30, None],
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=123), rf_grid, cv=5, n_jobs=-1)
grid_rf.fit(X_train, y_train)
print('best for RandomForest:', grid_rf.best_params_, "& best score is ", grid_rf.best_score_)

best for RandomForest: {'max_depth': 20, 'n_estimators': 100} & best score is  0.3560957910014514


# Training the model with best score

In [10]:
final_model = RandomForestClassifier(max_depth = 20, n_estimators = 100) 
final_model.fit(X_train, y_train)

# Testing the model

### Seeing how confident is with the first prediction which should be "A"

In [13]:
root_test = "asl_alphabet_test" # main testing folder
img = cv2.imread(root_test + "\\A_test.jpg")
img = img/255 # normalise the data
proba = final_model.predict_proba([img.reshape(120000)])
print(f"relative confidence in class A is: {proba[0][0]/np.sum(proba[0])}")

relative confidence in class A is: 0.08726880828196616


### Now checking accuracy on the relatively small test data

In [14]:
root_test = "asl_alphabet_test" # main testing folder

image_test_df = []
for x_i in os.listdir(root_test):
    if '.jpg' not in x_i: continue # only select the image files
    img = cv2.imread(root_test + "\\" + x_i) # load each image as (200,200,3) cube
    img = img.reshape(120000)
    img = img/255 # normalise the data
    image_test_df.append(img)
    
X_test = np.array(image_test_df)
n_test, d = X_test.shape
print(n_test)

28


In [15]:
y_test = [i for i in range(0,28)]
eval_ = {}

eval_["accuracy"] = accuracy_score(y_test, final_model.predict(X_test))
eval_["error"] = 1 - eval_["accuracy"]
eval_["precision"] = precision_score(y_test, final_model.predict(X_test), average = "weighted") # weighted avg for multiclass
eval_["recall"] = recall_score(y_test, final_model.predict(X_test), average = "weighted") # weighted avg for multiclass
eval_["f1 score"] = f1_score(y_test, final_model.predict(X_test), average = "weighted") # weighted avg for multiclass

eval_df = pd.DataFrame.from_dict(eval_, orient = "index", columns = ["calculation"])
eval_df

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,calculation
accuracy,0.142857
error,0.857143
precision,0.089286
recall,0.142857
f1 score,0.107143


# Testing loading and deploying to be in use for the project

In [16]:
import joblib
joblib.dump(final_model, 'rf_model.pkl')

['rf_model.pkl']

In [17]:
rf_model = joblib.load('rf_model.pkl')
rf_model