In [63]:
import joblib

In [64]:
import numpy as np
import pandas as pd

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [66]:
# X_train = np.genfromtxt("Data/X_train_under.csv", delimiter=',')
# y_train = np.genfromtxt("Data/y_train_under.csv", delimiter=',')
X_train = np.genfromtxt("Data/X_train.csv", delimiter=',')
y_train = np.genfromtxt("Data/y_train.csv", delimiter=',')

X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

X_test = np.genfromtxt("Data/X_test.csv", delimiter=',')
y_test = np.genfromtxt("Data/y_test.csv", delimiter=',')

##Using strattified set - in pipeline

In [67]:
X_train.shape

(11494, 149)

In [68]:
# X_train

In [69]:
# y_train

### Recursive Feature Elimination

In [70]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(logreg)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True False False  True False
 False  True  True  True False False False  True  True False  True  True
 False False False  True False False  True False False  True  True False
  True False  True  True False  True  True  True False  True  True False
 False False  True  True  True False False False False  True  True False
  True False  True False False False False  True False False  True  True
  True False False False False False False  True  True False False  True
  True  True False False False False False  True False  True False False
 False  True False False  True  True False False False  True  True False
  True  True  True False  True  True False False  True False  True  True
 False False  True  True False False False False False  True False  True
 False False False  True  True  True  True False  True  True  True  True
  True False False  True  True]
[ 1  1  1  1  1  1  1  1 16 60  1 10 30  1  1  1 37 69 33  1  1  8  1  1
 53 62 18  1 42 25 

In [71]:
X_train = X_train[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]
X_cv = X_cv[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]
X_test = X_test[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]

# SVM

In [72]:
from sklearn.svm import SVC

In [73]:
# C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]
C_list =[0.01,1,100]
gamma_list =[0.01, 1, 100]

In [74]:
def cross_validate_C(C_list,gamma_list, X_train, y_train, X_cv, y_cv):
    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        for gamma in gamma_list:
            svm = SVC(kernel = 'rbf', class_weight = 'balanced', C = C, gamma = gamma)
            svm.fit(X_train, y_train)
            models.append(svm)

            y_train_pred = svm.predict(X_train)
            train_f1_score.append(f1_score(y_train, y_train_pred))

            y_cv_pred = svm.predict(X_cv)
            cv_f1_score.append(f1_score(y_cv, y_cv_pred))
        
    print(train_f1_score)
    print(cv_f1_score)
    
    print('Best param C value:', C_list[int(np.argmax(cv_f1_score)/ len(gamma_list))])
    print('Best param gamma value:', gamma_list[np.argmax(cv_f1_score)% len(gamma_list)])
    best_model = models[np.argmax(cv_f1_score)]
    
    return best_model

In [75]:
svm = cross_validate_C(C_list, gamma_list, X_train, y_train, X_cv, y_cv)

[0.5436347673397717, 0.40619549291844, 0.4046082309667569, 0.6196748878923768, 0.8361669242658423, 0.9866215071972904, 0.6908771416019914, 0.9508357915437562, 0.9962406015037594]
[0.5480195333695063, 0.3899344470289702, 0.3878838872528397, 0.5934924078091107, 0.5002773155851358, 0.09561752988047809, 0.5869565217391304, 0.4081632653061225, 0.10119047619047619]
Best param C value: 1
Best param gamma value: 0.01


## Basic Evaluation

In [76]:
def evaluate(model, X, y):
    h = model.predict(X)
    errors = abs(h - y)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print(classification_report(y,h))
    print('Confusion matrix:\n',confusion_matrix(y,h))
    print('Recall Score = ',recall_score(y, h))
    print('Precision Score = ',precision_score(y, h))
    print('F1 score = ', f1_score(y,h))

    return evaluate

### Train set

In [77]:
evaluate(svm, X_train, y_train)

Average Error: 0.2361 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83      8579
         1.0       0.52      0.76      0.62      2915

    accuracy                           0.76     11494
   macro avg       0.71      0.76      0.72     11494
weighted avg       0.81      0.76      0.78     11494

Confusion matrix:
 [[6569 2010]
 [ 704 2211]]
Recall Score =  0.7584905660377359
Precision Score =  0.5238095238095238
F1 score =  0.6196748878923768


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [78]:
evaluate(svm, X_cv, y_cv)

Average Error: 0.2445 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.76      0.83      2910
         1.0       0.49      0.74      0.59       922

    accuracy                           0.76      3832
   macro avg       0.70      0.75      0.71      3832
weighted avg       0.80      0.76      0.77      3832

Confusion matrix:
 [[2211  699]
 [ 238  684]]
Recall Score =  0.7418655097613883
Precision Score =  0.4945770065075922
F1 score =  0.5934924078091107


<function __main__.evaluate(model, X, y)>

### Test set

In [79]:
evaluate(svm, X_test, y_test)

Average Error: 0.2370 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83      2895
         1.0       0.51      0.74      0.61       937

    accuracy                           0.76      3832
   macro avg       0.71      0.76      0.72      3832
weighted avg       0.81      0.76      0.78      3832

Confusion matrix:
 [[2226  669]
 [ 239  698]]
Recall Score =  0.7449306296691569
Precision Score =  0.5106071689831748
F1 score =  0.6059027777777778


<function __main__.evaluate(model, X, y)>

### Coefficients

In [80]:
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(svm, "Models/svm_no_under_sample_balanced_class_weight.pkl")

['Models/svm_no_under_sample_balanced_class_weight.pkl']