In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,ConfusionMatrixDisplay,confusion_matrix,classification_report

In [3]:
df = pd.read_csv("csvs/hog_95/extracted_features_hog_95.csv",header =None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3239,3240,3241,3242,3243,3244,3245,3246,3247,3248
0,Rachel_Leigh_Cook,0.101289,1.330157,-0.964354,0.278769,-0.54351,1.725658,-0.641089,-0.416891,-0.197742,...,52.0,689.0,2015.0,543.0,19.0,301.0,450.0,355.0,289.0,11011.0
1,Roseanne_Barr,0.391125,0.704784,1.217563,-0.780129,-1.41438,0.959051,1.196409,1.637598,0.144761,...,44.0,654.0,1721.0,548.0,26.0,416.0,510.0,451.0,378.0,7875.0
2,Vladimir_Putin,-1.246895,-0.68008,0.591671,-2.354134,-1.920589,-0.013368,0.060721,-0.179192,1.438939,...,35.0,504.0,3792.0,564.0,27.0,420.0,679.0,460.0,611.0,17388.0
3,Carlos_Menem,0.597257,-0.661087,1.356273,-0.878202,-0.58822,-0.435263,-0.077499,-0.152108,0.537329,...,53.0,495.0,1593.0,566.0,34.0,343.0,563.0,399.0,338.0,9557.0
4,Lynne_Thigpen,-1.589231,-0.049223,-1.306154,-0.461846,0.250779,-2.198609,-0.026885,0.607296,-0.321626,...,39.0,520.0,1744.0,502.0,24.0,339.0,531.0,398.0,394.0,14844.0


In [4]:
df_test = pd.read_csv("csvs/hog_95/extracted_features_test_hog_95.csv",header=None)
df = pd.concat([df,df_test],axis = 0)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13233 entries, 0 to 2646
Columns: 3249 entries, 0 to 3248
dtypes: float64(3248), object(1)
memory usage: 328.1+ MB


##### Restricting the minimum no of samples per class to be 50

In [85]:
df_modified_50 = df[df[0].map(df[0].value_counts()) >= 50]
# Reference Stack overflow

In [86]:
df_modified_50.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1560 entries, 6 to 2641
Columns: 3249 entries, 0 to 3248
dtypes: float64(3248), object(1)
memory usage: 38.7+ MB


In [87]:
df_modified_50[0].unique().shape

(12,)

In [88]:
Features = df_modified_50.drop(0,axis = 1)
print(Features.shape)
target = df_modified_50[0]
print(target.shape)

(1560, 3248)
(1560,)


In [89]:
Features_HoG_95 = Features.loc[:,:946]
Features_CNN = Features.loc[:,946:2993]
Features_LBP = Features.loc[:,2993:]

In [90]:
print("No of CNN features: ",Features_CNN.shape[1])
print("No of HoG features covering 0.95 variance: ",Features_HoG_95.shape[1])
print("No of LBP Features are: ",Features_LBP.shape[1])

No of CNN features:  2048
No of HoG features covering 0.95 variance:  946
No of LBP Features are:  256


Default Hyperparameters training on LBP Features

In [91]:
X_train_LBP, X_test_LBP, y_train_LBP,y_test_LBP = train_test_split(Features_LBP,target,train_size=0.8,random_state=42)

In [92]:
model_LBP_default = RandomForestClassifier()

In [93]:
model_LBP_default.fit(X_train_LBP,y_train_LBP)

In [94]:
y_predicted_LBP = model_LBP_default.predict(X_test_LBP)

In [95]:
print(accuracy_score(y_true=y_test_LBP,y_pred=y_predicted_LBP))

0.3301282051282051


Default Hyperparameters training on CNN features

In [96]:
X_train_CNN,X_test_CNN,y_train_CNN,y_test_CNN = train_test_split(Features_CNN,target,train_size=0.8,random_state=42)

In [97]:
model_CNN_default = RandomForestClassifier()

In [98]:
model_CNN_default.fit(X_train_CNN,y_train_CNN)

In [99]:
y_predicted_CNN = model_CNN_default.predict(X_test_CNN)

In [100]:
print(accuracy_score(y_true=y_test_CNN,y_pred=y_predicted_CNN))

0.4775641025641026


Default Hyperparameters training using HoG Features

In [101]:
X_train_HoG,X_test_HoG,y_train_HoG,y_test_HoG = train_test_split(Features_HoG_95,target,train_size=0.8,random_state=42)

In [102]:
model_HoG_default = RandomForestClassifier()

In [103]:
model_HoG_default.fit(X_train_HoG,y_train_HoG)

In [104]:
y_predicted_HoG = model_HoG_default.predict(X_test_HoG)

In [105]:
print(accuracy_score(y_true=y_test_HoG,y_pred=y_predicted_HoG))

0.34935897435897434


##### If Minimum number of samples per class is increased to 70

In [106]:
df_modified_70 = df[df[0].map(df[0].value_counts()) >= 70]

In [122]:
Features_70 = df_modified_70.drop(0,axis=1)
target_70 = df_modified_70[0]

Features_HoG_95_70 = Features_70.loc[:,:946]
Features_CNN_70 = Features_70.loc[:,946:2993]
Features_LBP_70  = Features_70.loc[:,2993:]

Default Hyper parameters training for LBP Features

In [123]:
X_train_LBP_70,X_test_LBP_70,y_train_LBP_70,y_test_LBP_70 = train_test_split(Features_LBP_70,target_70,train_size=0.8,random_state=42)

In [124]:
model_LBP_default_70 = RandomForestClassifier()

In [125]:
model_LBP_default_70.fit(X_train_LBP_70,y_train_LBP_70)

In [126]:
y_predicted_LBP_70 = model_LBP_default_70.predict(X_test_LBP_70)

In [127]:
print(accuracy_score(y_true = y_test_LBP_70,y_pred = y_predicted_LBP_70))

0.4418604651162791


Default Hyper parameters training for CNN Features

In [128]:
X_train_CNN_70,X_test_CNN_70,y_train_CNN_70,y_test_CNN_70 = train_test_split(Features_CNN_70,target_70,train_size=0.8,random_state=42)

In [129]:
model_CNN_default_70 = RandomForestClassifier()
model_CNN_default_70.fit(X_train_CNN_70,y_train_CNN_70)

In [130]:
y_predicted_CNN_70 = model_CNN_default_70.predict(X_test_CNN_70)
print(accuracy_score(y_true=y_test_CNN_70,y_pred = y_predicted_CNN_70))

0.5813953488372093


Default Hyper parameters training for HoG Features covering 0.95 Variance

In [131]:
X_train_HoG_70,X_test_HoG_70,y_train_HoG_70,y_test_HoG_70 = train_test_split(Features_HoG_95_70,target_70,train_size=0.8,random_state=42)

In [132]:
model_HoG_default_70 = RandomForestClassifier()
model_HoG_default_70.fit(X_train_HoG_70,y_train_HoG_70)

In [133]:
y_predicted_HoG_70 = model_HoG_default_70.predict(X_test_HoG_70)
print(accuracy_score(y_true=y_test_HoG_70,y_pred=y_predicted_HoG_70))

0.4186046511627907
