In [1]:
import numpy as np
import pandas as pd

# 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [3]:
# load numpy array
data = np.load('data_pca_45_target.npz') #loading the file where we save pca data and y 
data.files

['arr_0', 'arr_1']

In [4]:
data.allow_pickle = True

In [5]:
X = data['arr_0'] # pca data with 50 components
y = data['arr_1'] # target or dependent variable

In [6]:
X.shape,y.shape

((7401, 45), (7401,))

In [7]:
X

array([[ 0.89208069,  0.22794021,  0.38151244, ..., -1.4473214 ,
         0.30559631,  1.11401326],
       [ 1.18895475, -0.31423106,  0.33196464, ...,  0.21265328,
        -0.93588488,  0.65166503],
       [-0.79452487, -0.31443967,  0.33835987, ...,  0.12823152,
        -0.70007941,  1.16151387],
       ...,
       [-1.27374698,  0.92625135, -0.25434603, ..., -1.02348641,
         1.09023431, -0.03677684],
       [ 1.40538475,  0.63222861, -1.17093043, ..., -3.79534324,
         0.81106028,  0.29549103],
       [ 1.26083041,  0.85328375,  0.25573004, ..., -0.07864408,
         0.99574579, -0.64199835]], shape=(7401, 45))

In [8]:
y

array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
      shape=(7401,), dtype=object)

#### Split the data into train and test 

In [10]:
x_train,x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(5920, 45) (1481, 45) (5920,) (1481,)


### training machine learning model

In [11]:
model_svc = SVC(probability=True)

param_grid = {'C':[0.5,1,10,20,30,50],
             'kernel':['rbf','poly'],
             'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]}

In [12]:
model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring='accuracy',cv=3,verbose=2)

In [13]:
model_grid.fit(x_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=  15.9s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=  13.7s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=  13.4s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   3.9s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   4.2s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   4.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=  11.5s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   9.7s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   8.7s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   2.0s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.7s
[CV] END ............C=0.5, coef0=0, gamma=0.0

In [14]:
model_grid.best_params_

{'C': 10, 'coef0': 0, 'gamma': 0.05, 'kernel': 'rbf'}

In [15]:
model_finale = model_grid.best_estimator_

In [17]:
model_finale.get_params()

{'C': 10,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.05,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

### Model Evaluation
- Classification Report
    - Precision, Recall, F1-Score
- Kappa Score(used for multiclass classification)
    - -ve (worst model)
    - 0 to 0.5 (bad model)
    - 0.5 to 0.7 (Good Model)
    - 0.7 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)
- AUC
    - Less than 0.5 (Worst Model)
    - 0.5 to 0.6 (Bad Model)
    - 0.6 to 0.8 (Good Model)
    - 0.8 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)

In [18]:
y_pred = model_finale.predict(x_test) # Predicted values

In [19]:
y_pred

array(['female', 'male', 'female', ..., 'female', 'female', 'female'],
      shape=(1481,), dtype=object)

#### Classification Report

In [30]:
metrics.confusion_matrix(y_test, y_pred)

array([[803,  32],
       [ 61, 585]])

In [28]:
cr = metrics.classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

      female       0.93      0.96      0.95       835
        male       0.95      0.91      0.93       646

    accuracy                           0.94      1481
   macro avg       0.94      0.93      0.94      1481
weighted avg       0.94      0.94      0.94      1481



In [29]:
cr_df = metrics.classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(cr_df).T

Unnamed: 0,precision,recall,f1-score,support
female,0.929398,0.961677,0.945262,835.0
male,0.948136,0.905573,0.926366,646.0
accuracy,0.937205,0.937205,0.937205,0.937205
macro avg,0.938767,0.933625,0.935814,1481.0
weighted avg,0.937572,0.937205,0.93702,1481.0


#### Kappa Score

In [31]:
metrics.cohen_kappa_score(y_test, y_pred)

np.float64(0.8716780066689089)

#### Area under Curve(AUC)

In [32]:
metrics.roc_auc_score(np.where(y_test=='male',1,0), 
                      np.where(y_pred=='male',1,0))

np.float64(0.9336247010622717)

#### Save Face Recognition Model


In [33]:
import pickle

In [34]:
with open('model_svm.pickle', 'wb') as file: 
      
    # A new file will be created 
    pickle.dump(model_finale, file)