In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics



In [2]:
#load numpy array

data = np.load('./data/pri_data_pca_50_target.npz')
data.files

['arr_0', 'arr_1']

In [3]:
data.allow_pickle = True

In [4]:
X = data['arr_0'] #pca with 50 components
Y = data['arr_1'] #gender variables

In [5]:
X.shape, Y.shape

((4319, 50), (4319,))

In [6]:
X

array([[ 0.87500479,  0.25726321,  0.2497782 , ...,  0.9083734 ,
        -0.12257948, -0.60648072],
       [ 1.17320121, -0.23166085,  0.32997371, ..., -0.99894255,
         0.73620793,  1.70836639],
       [-0.7846088 , -0.32022847,  0.35730457, ..., -0.62253751,
         1.00815899,  1.60921544],
       ...,
       [ 1.32978927,  0.62625788, -1.24674449, ...,  1.49936503,
         1.42904642,  0.97104247],
       [-1.23761691, -0.43789963, -0.43623775, ..., -0.53505545,
         0.60400384, -0.79097151],
       [ 1.2394078 ,  0.94157044,  0.11207923, ...,  0.34329051,
         0.06169439,  0.02482428]])

In [7]:
Y

array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
      dtype=object)

# Split the data for training and testing

In [8]:
x_test, x_train, y_test, y_train = train_test_split(X,Y, test_size = 0.2, stratify = Y) #applying stratified sampling

In [9]:
x_test.shape, x_train.shape, y_test.shape, y_train.shape

((3455, 50), (864, 50), (3455,), (864,))

# Training the ML model

In [10]:
model_svc = SVC(probability = True)

#ref: https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/

param_grid = {'C':[0.5, 1,10,20,30,50],                     #Create a dictionary called param_grid and fill out some parameters for kernels, C and gamma
                 'kernel':['rbf', 'poly'],                      
                 'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
                 'coef0':[0,1]}

In [11]:
model_grid = GridSearchCV(model_svc, 
                          param_grid = param_grid , 
                          scoring = 'accuracy',
                          cv = 3,
                          verbose = 2)

In [None]:
model_grid.fit(x_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   0.0s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   0.0s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   0.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   0.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   0.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   0.0s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   0.0s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   0.0s
[CV] END ............C=0.5, coef0=0, gamma=0.0

[CV] END ..............C=1, coef0=0, gamma=0.005, kernel=rbf; total time=   0.0s
[CV] END ..............C=1, coef0=0, gamma=0.005, kernel=rbf; total time=   0.0s
[CV] END ..............C=1, coef0=0, gamma=0.005, kernel=rbf; total time=   0.0s
[CV] END .............C=1, coef0=0, gamma=0.005, kernel=poly; total time=   0.0s
[CV] END .............C=1, coef0=0, gamma=0.005, kernel=poly; total time=   0.0s
[CV] END .............C=1, coef0=0, gamma=0.005, kernel=poly; total time=   0.0s
[CV] END ................C=1, coef0=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ................C=1, coef0=1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ................C=1, coef0=1, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END ...............C=1, coef0=1, gamma=0.1, kernel=poly; total time=   0.0s
[CV] END ...............C=1, coef0=1, gamma=0.1, kernel=poly; total time=   0.0s
[CV] END ...............C=1, coef0=1, gamma=0.1, kernel=poly; total time=   0.0s
[CV] END ...............C=1,

In [None]:
model_grid.best_params_

In [None]:
model_final = model_grid.best_estimator_

In [None]:
model_final.get_params()

### Model Evaluation
- Classification Report
    - Precision, Recall, F1-Score
#    
- Kappa Score
    - -ve (worst model)
    - 0 to 0.5 (bad model)
    - 0.5 to 0.7 (Good Model)
    - 0.7 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)
#    
- AUC(Area Under the Receiver Operating Characteristic Curve)
    - Less than 0.5 (Worst Model)
    - 0.5 to 0.6 (Bad Model)
    - 0.6 to 0.8 (Good Model)
    - 0.8 to 0.9 (Excellent Model)
    - 0.9 to 1.0 (Perfect Model)

In [None]:
y_pred = model_final.predict(x_test)

In [None]:
y_pred

# Generating classification report


In [None]:
class_report = metrics.classification_report(y_test, y_pred, output_dict = True)

In [None]:
class_report

In [None]:
pd.DataFrame(class_report).T

# Kappa Score

In [None]:
metrics.cohen_kappa_score(y_test, y_pred)

# Area under Curve

In [None]:
metrics.roc_auc_score(np.where(y_test == 'male',1,0), 
                      np.where(y_pred== 'male',1,0))

# Save the model

In [None]:
import pickle

In [None]:
pickle.dump(model_final,open('./Pri_Data/gender_classification_model_svm.pickle',mode='wb'))