In [1]:
import joblib

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [4]:
# Load the train / cross-validation / test splits from disk. Each split is
# stored as two comma-delimited files: one read into X_*, one into y_*.
X_train, y_train = (
    np.genfromtxt("Data/X_train.csv", delimiter=','),
    np.genfromtxt("Data/y_train.csv", delimiter=','),
)

X_cv, y_cv = (
    np.genfromtxt("Data/X_cv.csv", delimiter=','),
    np.genfromtxt("Data/y_cv.csv", delimiter=','),
)

X_test, y_test = (
    np.genfromtxt("Data/X_test.csv", delimiter=','),
    np.genfromtxt("Data/y_test.csv", delimiter=','),
)

## Using stratified set - in pipeline

In [5]:
# Inspect the shape of the loaded training array.
X_train.shape

(11494, 151)

In [6]:
# Display the training array that is later passed to fit() as X.
X_train

array([[-1.67239838, -0.9568655 , -1.60575708, ..., -0.41906592,
        -1.45427135,  1.45427135],
       [ 0.56402217, -0.26671559,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [ 0.78359801, -0.83903503,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       ...,
       [ 0.73480338,  2.00572921,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [-1.67239838, -0.41821192,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [ 0.65347899, -0.31721437,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958]])

In [7]:
# Display the training array that is later passed to fit() as y.
y_train

array([0., 0., 0., ..., 0., 0., 0.])

# SVM

In [8]:
from sklearn.svm import SVC

In [9]:
# Hyper-parameter grid consumed by cross_validate_C: every C is paired with
# every gamma.
# A wider C range is kept here for reference (presumably an earlier sweep):
#   C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]
C_list = [100, 200, 500]
gamma_list = [0.0002, 0.0005, 0.001]

In [10]:
def cross_validate_C(C_list,gamma_list, X_train, y_train, X_cv, y_cv):
    """Grid-search an RBF-kernel SVC over ``C`` and ``gamma``; return the best fit.

    One ``SVC(kernel='rbf', class_weight='balanced', random_state=42)`` is
    fitted on the training split for every ``(C, gamma)`` pair. Each model is
    scored with ``f1_score`` on the training split and on the
    cross-validation split. The model with the highest cross-validation F1 is
    selected; on ties the first one wins, because ``np.argmax`` returns the
    first maximum.

    Parameters
    ----------
    C_list : sequence
        Candidate values for the SVC ``C`` parameter (outer loop).
    gamma_list : sequence
        Candidate values for the SVC ``gamma`` parameter (inner loop).
    X_train, y_train
        Data the models are fitted on.
    X_cv, y_cv
        Data used to pick the best model.

    Returns
    -------
    The fitted model with the highest cross-validation F1 score.

    Raises
    ------
    ValueError
        If ``C_list`` or ``gamma_list`` is empty (there is nothing to fit).

    Notes
    -----
    Prints the list of train F1 scores, the list of CV F1 scores, and the
    ``C`` / ``gamma`` values of the selected model.
    """
    if len(C_list) == 0 or len(gamma_list) == 0:
        raise ValueError("C_list and gamma_list must each contain at least one value")

    models = []
    params = []  # (C, gamma) of each fitted model, aligned with `models`
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        for gamma in gamma_list:
            svm = SVC(kernel = 'rbf', class_weight = 'balanced', C = C, gamma = gamma, random_state =42)
            svm.fit(X_train, y_train)
            models.append(svm)
            params.append((C, gamma))

            y_train_pred = svm.predict(X_train)
            train_f1_score.append(f1_score(y_train, y_train_pred))

            y_cv_pred = svm.predict(X_cv)
            cv_f1_score.append(f1_score(y_cv, y_cv_pred))

    print(train_f1_score)
    print(cv_f1_score)

    # Look the winning pair up directly instead of re-deriving grid indices
    # from the flat position: the flat index advances by len(gamma_list) per
    # C value, so dividing by len(C_list) is only right for square grids.
    best_idx = int(np.argmax(cv_f1_score))
    best_C, best_gamma = params[best_idx]
    print('Best param C value:', best_C)
    print('Best param gamma value:', best_gamma)

    return models[best_idx]

In [11]:
# Run the grid search; `svm` is the fitted model with the best CV F1 score.
svm = cross_validate_C(
    C_list=C_list,
    gamma_list=gamma_list,
    X_train=X_train,
    y_train=y_train,
    X_cv=X_cv,
    y_cv=y_cv,
)

[0.596674182638106, 0.639697322467986, 0.6640023682652456, 0.6070575003514691, 0.655060489820006, 0.6718842291789723, 0.6366922076044528, 0.6671610797982794, 0.680857310628303]
[0.6073040623717686, 0.6092827004219409, 0.6052974381241858, 0.6053169734151329, 0.6103004291845493, 0.5974924340683097, 0.608294930875576, 0.6024305555555556, 0.599396291504959]
Best param C value: 200
Best param gamma value: 0.0005


## Evaluation

In [12]:
def evaluate(model, X, y):
    """Print classification metrics for ``model`` on one data split.

    Predicts once with ``model.predict(X)`` and prints, in order: the
    classification report, the confusion matrix, and the recall, precision
    and F1 scores of the predictions against ``y``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Inputs passed to ``model.predict``.
    y
        True labels the predictions are compared with.

    Returns
    -------
    None
        The function only prints. It used to return the ``evaluate`` function
        object itself, which made every notebook cell ending in an
        ``evaluate(...)`` call echo ``<function __main__.evaluate(...)>``.
    """
    h = model.predict(X)
    print(classification_report(y,h))
    print('Confusion matrix:\n',confusion_matrix(y,h))
    print('Recall Score = ',recall_score(y, h))
    print('Precision Score = ',precision_score(y, h))
    print('F1 score = ', f1_score(y,h))

### Train set

In [13]:
# Metrics on the split the model was fitted on.
evaluate(model=svm, X=X_train, y=y_train)

              precision    recall  f1-score   support

         0.0       0.91      0.81      0.86      8613
         1.0       0.57      0.77      0.66      2881

    accuracy                           0.80     11494
   macro avg       0.74      0.79      0.76     11494
weighted avg       0.83      0.80      0.81     11494

Confusion matrix:
 [[6936 1677]
 [ 661 2220]]
Recall Score =  0.7705657757723012
Precision Score =  0.5696689761354888
F1 score =  0.655060489820006


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [14]:
# Metrics on the cross-validation split used to select the model.
evaluate(model=svm, X=X_cv, y=y_cv)

              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83      2876
         1.0       0.52      0.74      0.61       956

    accuracy                           0.76      3832
   macro avg       0.71      0.76      0.72      3832
weighted avg       0.80      0.76      0.78      3832

Confusion matrix:
 [[2213  663]
 [ 245  711]]
Recall Score =  0.7437238493723849
Precision Score =  0.517467248908297
F1 score =  0.6103004291845493


<function __main__.evaluate(model, X, y)>

### Test set

In [15]:
# Metrics on the test split, which took no part in fitting or model selection.
evaluate(model=svm, X=X_test, y=y_test)

              precision    recall  f1-score   support

         0.0       0.91      0.78      0.84      2877
         1.0       0.54      0.78      0.64       955

    accuracy                           0.78      3832
   macro avg       0.73      0.78      0.74      3832
weighted avg       0.82      0.78      0.79      3832

Confusion matrix:
 [[2239  638]
 [ 208  747]]
Recall Score =  0.7821989528795812
Precision Score =  0.5393501805054152
F1 score =  0.6384615384615384


<function __main__.evaluate(model, X, y)>

## Export model

In [16]:
# Persist the selected model to disk with joblib.
joblib.dump(
    value=svm,
    filename="Models/svm_no_under_sample_balanced_class_weight.pkl",
)

['Models/svm_no_under_sample_balanced_class_weight.pkl']