In [28]:
import joblib

In [29]:
import numpy as np
import pandas as pd

In [30]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [31]:
# X_train = np.genfromtxt("Data/X_train_under.csv", delimiter=',')
# y_train = np.genfromtxt("Data/y_train_under.csv", delimiter=',')
X_train = np.genfromtxt("Data/X_train.csv", delimiter=',')
y_train = np.genfromtxt("Data/y_train.csv", delimiter=',')

X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

X_test = np.genfromtxt("Data/X_test.csv", delimiter=',')
y_test = np.genfromtxt("Data/y_test.csv", delimiter=',')

##Using strattified set - in pipeline

In [32]:
X_train.shape

(11494, 149)

In [33]:
# X_train

In [34]:
# y_train

### Recursive Feature Elimination

In [35]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(logreg)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True False False  True False
 False  True  True  True False False False  True  True False  True  True
 False False False  True False False  True False False  True  True False
  True False  True  True False  True  True  True False  True  True False
 False False  True  True  True False False False False  True  True False
  True False  True False False False False  True False False  True  True
  True False False False False False False  True  True False False  True
  True  True False False False False False  True False  True False False
 False  True False False  True  True False False False  True  True False
  True  True  True False  True  True False False  True False  True  True
 False False  True  True False False False False False  True False  True
 False False False  True  True  True  True False  True  True  True  True
  True False False  True  True]
[ 1  1  1  1  1  1  1  1 16 60  1 10 30  1  1  1 37 69 33  1  1  8  1  1
 53 62 18  1 42 25 

In [36]:
X_train = X_train[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]
X_cv = X_cv[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]
X_test = X_test[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]

# SVM

In [37]:
from sklearn.svm import SVC

In [38]:
# C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]
C_list =[100,1000, 10000]

In [39]:
def cross_validate_C(C_list, X_train, y_train, X_cv, y_cv):
    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        svm = SVC(kernel = 'rbf', class_weight = 'balanced', C = C)
        svm.fit(X_train, y_train)
        models.append(svm)

        y_train_pred = svm.predict(X_train)
        train_f1_score.append(f1_score(y_train, y_train_pred))

        y_cv_pred = svm.predict(X_cv)
        cv_f1_score.append(f1_score(y_cv, y_cv_pred))
        
    print(train_f1_score)
    print(cv_f1_score)
    
    print('Best param value:', C_list[np.argmax(cv_f1_score)])
    best_model = models[np.argmax(cv_f1_score)]
    
    return best_model

In [40]:
svm = cross_validate_C(C_list, X_train, y_train, X_cv, y_cv)

[0.7009098914000588, 0.7405421507298183, 0.7800090593386684]
[0.5805580558055805, 0.5648021828103684, 0.541647701411015]
Best param value: 100


## Basic Evaluation

In [47]:
def evaluate(model, X, y):
    h = model.predict(X)
    errors = abs(h - y)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print(classification_report(y,h))
    print('Confusion matrix:\n',confusion_matrix(y,h))
    print('Recall Score = ',recall_score(y, h))
    print('Precision Score = ',precision_score(y, h))
    print('F1 score = ', f1_score(y,h))

    return evaluate

### Train set

In [48]:
evaluate(svm, X_train, y_train)

Average Error: 0.1773 degrees.
              precision    recall  f1-score   support

         0.0       0.93      0.82      0.87      8579
         1.0       0.61      0.82      0.70      2915

    accuracy                           0.82     11494
   macro avg       0.77      0.82      0.79     11494
weighted avg       0.85      0.82      0.83     11494

Confusion matrix:
 [[7068 1511]
 [ 527 2388]]
Recall Score =  0.8192109777015437
Precision Score =  0.6124647345473199
F1 score =  0.7009098914000588


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [49]:
evaluate(svm, X_cv, y_cv)

Average Error: 0.2432 degrees.
              precision    recall  f1-score   support

         0.0       0.89      0.77      0.83      2910
         1.0       0.50      0.70      0.58       922

    accuracy                           0.76      3832
   macro avg       0.69      0.74      0.70      3832
weighted avg       0.80      0.76      0.77      3832

Confusion matrix:
 [[2255  655]
 [ 277  645]]
Recall Score =  0.6995661605206074
Precision Score =  0.49615384615384617
F1 score =  0.5805580558055805


<function __main__.evaluate(model, X, y)>

### Test set

In [50]:
evaluate(svm, X_test, y_test)

Average Error: 0.1952 degrees.
              precision    recall  f1-score   support

         0.0       0.92      0.82      0.86      2895
         1.0       0.58      0.77      0.66       937

    accuracy                           0.80      3832
   macro avg       0.75      0.79      0.76      3832
weighted avg       0.83      0.80      0.81      3832

Confusion matrix:
 [[2365  530]
 [ 218  719]]
Recall Score =  0.767342582710779
Precision Score =  0.5756605284227382
F1 score =  0.6578225068618482


<function __main__.evaluate(model, X, y)>

### Coefficients

In [51]:
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(svm, "Models/svm_no_under_sample_balanced_class_weight.pkl")

['Models/svm_no_under_sample_balanced_class_weight.pkl']