In [1]:
import joblib

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [4]:
# Load the train / cross-validation / test splits from comma-separated files.
# Each split ships as a feature matrix (X_*) plus a matching label vector (y_*).
X_train, y_train = (
    np.genfromtxt("Data/X_train.csv", delimiter=','),
    np.genfromtxt("Data/y_train.csv", delimiter=','),
)

X_cv, y_cv = (
    np.genfromtxt("Data/X_cv.csv", delimiter=','),
    np.genfromtxt("Data/y_cv.csv", delimiter=','),
)

X_test, y_test = (
    np.genfromtxt("Data/X_test.csv", delimiter=','),
    np.genfromtxt("Data/y_test.csv", delimiter=','),
)

## Using stratified set - in pipeline

In [5]:
# Sanity check: dimensions (rows, columns) of the loaded training matrix.
X_train.shape

(11494, 151)

In [6]:
# Peek at the training features; NumPy abbreviates the repr of large arrays.
X_train

array([[-1.67239838, -0.9568655 , -1.60575708, ..., -0.41906592,
        -1.45427135,  1.45427135],
       [ 0.56402217, -0.26671559,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [ 0.78359801, -0.83903503,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       ...,
       [ 0.73480338,  2.00572921,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [-1.67239838, -0.41821192,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [ 0.65347899, -0.31721437,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958]])

In [7]:
# Peek at the training labels that accompany X_train.
y_train

array([0., 0., 0., ..., 0., 0., 0.])

### Recursive Feature Elimination

In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination driven by a logistic-regression estimator.
# Both the estimator and RFE are used with their default settings.
logreg = LogisticRegression()
rfe = RFE(estimator=logreg).fit(X_train, y_train)

# support_: boolean mask of the retained columns; ranking_: rank per column
# (the retained columns carry rank 1).
for fitted_attr in (rfe.support_, rfe.ranking_):
    print(fitted_attr)

[ True  True  True  True  True  True  True  True  True False  True False
 False  True  True  True False False False  True  True False  True  True
 False  True  True  True False False False False False  True False False
  True False  True  True False  True  True  True False  True  True False
 False  True False  True  True False False False False  True False  True
  True False  True False False False False False False False  True  True
  True  True False False False False  True False False  True  True  True
  True  True False False False False False False  True False False False
  True False False  True  True  True  True  True  True False  True  True
 False  True False  True  True False False  True False  True  True False
 False  True False False  True False False  True False False False  True
 False False  True False False False  True False  True False  True  True
 False False False  True  True  True  True]
[ 1  1  1  1  1  1  1  1  1 69  1 19 56  1  1  1  6 48 30  1  1 12  1  1
 14  1 

In [9]:
# Keep only the columns RFE marked as selected, using the same column indices
# for every split. NOTE: this overwrites X_train / X_cv / X_test in place, so
# re-running the cell without reloading the data would apply the original-width
# column indices to the already-reduced arrays.
X_train, X_cv, X_test = (
    split[:, np.flatnonzero(rfe.support_)]
    for split in (X_train, X_cv, X_test)
)

# SVM

In [10]:
from sklearn.svm import SVC

In [11]:
# Hyperparameter candidates for the RBF-kernel SVM search.
# A wider sweep over C is kept here, disabled:
# C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]
C_list = [1, 10, 100]             # values tried for SVC's C
gamma_list = [0.001, 0.01, 0.1]   # values tried for SVC's gamma

In [12]:
def cross_validate_C(C_list,gamma_list, X_train, y_train, X_cv, y_cv):
    """Grid-search an RBF SVM over C and gamma, keeping the best model by CV F1.

    One ``SVC(kernel='rbf', class_weight='balanced')`` is fitted on the
    training split for every (C, gamma) combination, with ``C_list`` as the
    outer loop and ``gamma_list`` as the inner loop. F1 is computed on both
    the training and the cross-validation split. The two score lists and the
    winning hyperparameters are printed.

    Parameters
    ----------
    C_list : sequence
        Candidate values for the SVC ``C`` parameter.
    gamma_list : sequence
        Candidate values for the SVC ``gamma`` parameter.
    X_train, y_train
        Features and labels every candidate model is fitted on.
    X_cv, y_cv
        Features and labels used to score the candidates and pick the winner.

    Returns
    -------
    SVC
        The fitted model with the highest cross-validation F1 score.

    Raises
    ------
    ValueError
        If ``C_list`` or ``gamma_list`` is empty.
    """
    if len(C_list) == 0 or len(gamma_list) == 0:
        raise ValueError("C_list and gamma_list must each contain at least one value")

    # param_grid[i] is the (C, gamma) pair behind models[i] and both score
    # lists, so the winner's hyperparameters are read back directly. The old
    # lookup derived C by dividing the flat index by len(C_list) instead of
    # len(gamma_list), which is wrong when the two lists differ in length.
    param_grid = []
    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        for gamma in gamma_list:
            svm = SVC(kernel = 'rbf', class_weight = 'balanced', C = C, gamma = gamma)
            svm.fit(X_train, y_train)
            param_grid.append((C, gamma))
            models.append(svm)

            y_train_pred = svm.predict(X_train)
            train_f1_score.append(f1_score(y_train, y_train_pred))

            y_cv_pred = svm.predict(X_cv)
            cv_f1_score.append(f1_score(y_cv, y_cv_pred))

    print(train_f1_score)
    print(cv_f1_score)

    best_idx = int(np.argmax(cv_f1_score))
    best_C, best_gamma = param_grid[best_idx]
    print('Best param C value:', best_C)
    print('Best param gamma value:', best_gamma)

    return models[best_idx]

In [13]:
# Run the (C, gamma) search and keep the model that scores best on the CV split.
svm = cross_validate_C(
    C_list=C_list,
    gamma_list=gamma_list,
    X_train=X_train,
    y_train=y_train,
    X_cv=X_cv,
    y_cv=y_cv,
)

[0.586928662964545, 0.6124873243517311, 0.6868148148148149, 0.6006227002547411, 0.6456460881096213, 0.7542047801711419, 0.62093553907587, 0.6835255841467021, 0.8153404801449494]
[0.6037267080745342, 0.606931530008453, 0.6162018592297477, 0.6095862351495289, 0.6188034188034188, 0.5763765541740674, 0.612008281573499, 0.6080554352533564, 0.5245009074410163]
Best param C value: 10
Best param gamma value: 0.01


## Evaluation

In [14]:
def evaluate(model, X, y):
    """Print classification metrics for ``model`` on the data set ``(X, y)``.

    Predictions are computed once with ``model.predict(X)`` and compared with
    ``y``. The classification report, confusion matrix, recall, precision and
    F1 score are printed in that order.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Feature matrix to predict on.
    y
        True labels for ``X``.

    Returns
    -------
    None
        Everything is reported through ``print``. The function used to return
        itself, which made notebook cells echo a stray function repr.
    """
    h = model.predict(X)
    print(classification_report(y,h))
    print('Confusion matrix:\n',confusion_matrix(y,h))
    print('Recall Score = ',recall_score(y, h))
    print('Precision Score = ',precision_score(y, h))
    print('F1 score = ', f1_score(y,h))

### Train set

In [15]:
# Trailing semicolon keeps the notebook from echoing the call's return value.
evaluate(svm, X_train, y_train);

              precision    recall  f1-score   support

         0.0       0.91      0.80      0.85      8613
         1.0       0.56      0.76      0.65      2881

    accuracy                           0.79     11494
   macro avg       0.73      0.78      0.75     11494
weighted avg       0.82      0.79      0.80     11494

Confusion matrix:
 [[6898 1715]
 [ 690 2191]]
Recall Score =  0.7604998264491496
Precision Score =  0.5609318996415771
F1 score =  0.6456460881096213


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [16]:
# Trailing semicolon keeps the notebook from echoing the call's return value.
evaluate(svm, X_cv, y_cv);

              precision    recall  f1-score   support

         0.0       0.91      0.77      0.83      2876
         1.0       0.52      0.76      0.62       956

    accuracy                           0.77      3832
   macro avg       0.71      0.76      0.73      3832
weighted avg       0.81      0.77      0.78      3832

Confusion matrix:
 [[2216  660]
 [ 232  724]]
Recall Score =  0.7573221757322176
Precision Score =  0.523121387283237
F1 score =  0.6188034188034188


<function __main__.evaluate(model, X, y)>

### Test set

In [17]:
# Trailing semicolon keeps the notebook from echoing the call's return value.
evaluate(svm, X_test, y_test);

              precision    recall  f1-score   support

         0.0       0.92      0.78      0.85      2877
         1.0       0.55      0.80      0.65       955

    accuracy                           0.79      3832
   macro avg       0.74      0.79      0.75      3832
weighted avg       0.83      0.79      0.80      3832

Confusion matrix:
 [[2255  622]
 [ 190  765]]
Recall Score =  0.8010471204188482
Precision Score =  0.55155010814708
F1 score =  0.6532877882152007


<function __main__.evaluate(model, X, y)>

## Export model

In [18]:
# Persist the selected SVM to disk so it can be reloaded later with joblib.
joblib.dump(
    value=svm,
    filename="Models/svm_no_under_sample_balanced_class_weight.pkl",
)

['Models/svm_no_under_sample_balanced_class_weight.pkl']