In [1]:
import numpy as np
import pandas as pd

# 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
data = np.load('./data/data_pca_50_target.npz')
data.files

['arr_0', 'arr_1']

In [3]:
data.allow_pickle = True

In [4]:
X = data['arr_0'] # pca data with 50 components
y = data['arr_1'] # target or dependent variable

In [5]:
X.shape,y.shape

((4319, 50), (4319,))

In [6]:
X

array([[ 0.90240489,  0.2236217 , -0.25336184, ...,  0.37265942,
         0.38251832,  1.45763833],
       [ 1.19382934, -0.26459231, -0.32776914, ..., -0.33907283,
        -1.57870643, -1.49896107],
       [-0.76315049, -0.29718844, -0.34934871, ...,  0.21651087,
        -1.54593592, -1.88283978],
       ...,
       [ 1.36621404,  0.58709013,  1.23486043, ...,  1.57363417,
        -1.73405957, -0.60622027],
       [-1.21542345, -0.38749324,  0.4420646 , ..., -1.08303089,
        -0.41748562,  0.57976519],
       [ 1.27880825,  0.88333391, -0.12038309, ...,  0.22486837,
        -0.02681867, -0.48386561]], shape=(4319, 50))

In [7]:
y

array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
      shape=(4319,), dtype=object)

In [8]:
#split the data into tarining and testing

x_train,x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3455, 50) (864, 50) (3455,) (864,)


In [9]:
#Training machine model

model_svc = SVC(probability=True)

param_grid = {'C':[0.5,1,10,20,30,50],
             'kernel':['rbf','poly'],
             'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]}

In [10]:
model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring='accuracy',cv=3,verbose=2)

In [11]:
model_grid.fit(x_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.4s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.2s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.3s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.7s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   1.9s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   1.9s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.2s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.3s
[CV] END ............C=0.5, coef0=0, gamma=0.0

In [12]:
model_grid.best_params_

{'C': 1, 'coef0': 1, 'gamma': 0.01, 'kernel': 'poly'}

In [13]:
model_final = model_grid.best_estimator_

In [14]:
model_final.get_params()

{'C': 1,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 1,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.01,
 'kernel': 'poly',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [16]:
#Model Evoluatation

y_pred = model_final.predict(x_test) # predicted values

In [17]:
y_pred

array(['female', 'male', 'male', 'female', 'female', 'male', 'male',
       'male', 'female', 'male', 'female', 'female', 'male', 'female',
       'male', 'male', 'male', 'female', 'male', 'male', 'female',
       'female', 'male', 'female', 'male', 'male', 'female', 'male',
       'male', 'female', 'female', 'male', 'male', 'female', 'female',
       'female', 'male', 'female', 'female', 'female', 'female', 'female',
       'male', 'male', 'male', 'female', 'female', 'female', 'male',
       'female', 'female', 'male', 'male', 'male', 'male', 'female',
       'female', 'male', 'female', 'female', 'male', 'male', 'female',
       'male', 'female', 'male', 'male', 'female', 'male', 'male', 'male',
       'male', 'female', 'male', 'male', 'female', 'female', 'female',
       'male', 'male', 'female', 'female', 'female', 'female', 'female',
       'female', 'male', 'male', 'female', 'female', 'female', 'male',
       'female', 'male', 'female', 'female', 'female', 'female', 'female',
    

In [20]:
#Classification report

cr = metrics.classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
female,0.797495,0.799163,0.798328,478.0
male,0.750649,0.748705,0.749676,386.0
accuracy,0.77662,0.77662,0.77662,0.77662
macro avg,0.774072,0.773934,0.774002,864.0
weighted avg,0.776566,0.77662,0.776592,864.0


In [21]:
#Kappa score

metrics.cohen_kappa_score(y_test,y_pred)

np.float64(0.548004467044703)

In [23]:
#Area under Curve(AUC)
metrics.roc_auc_score(np.where(y_test=="male",1,0),
                      np.where(y_pred=="male",1,0))

np.float64(0.7739339215643767)

In [24]:
#Saving model

import pickle
pickle.dump(model_final,open('./model/model_svm.pickle',mode='wb'))