In [1]:
import joblib

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [4]:
# Training split: the full (not under-sampled) files are used. The alternative
# "Data/X_train_under.csv" / "Data/y_train_under.csv" pair can be swapped in
# below to train on the "_under" variant instead.
X_train, y_train = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_train.csv", "Data/y_train.csv")
)

# Cross-validation split (used for model selection).
X_cv, y_cv = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_cv.csv", "Data/y_cv.csv")
)

# Test split.
X_test, y_test = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_test.csv", "Data/y_test.csv")
)

## Using stratified set - in pipeline

In [5]:
# Sanity check: dimensions of the training matrix as loaded from CSV.
X_train.shape

(11494, 148)

In [6]:
# X_train

In [7]:
# y_train

### Recursive Feature Elimination

In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination driven by a default-configured logistic
# regression; RFE's own defaults decide how many features are kept.
logreg = LogisticRegression()
rfe = RFE(logreg).fit(X_train, y_train)  # fit() returns the fitted selector

# support_ is the boolean keep/drop mask per column; in ranking_ the kept
# columns carry rank 1.
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True  True False  True False
 False  True  True  True False False False  True  True False  True  True
 False  True  True  True False False False False False  True False False
  True False  True  True False  True  True  True False  True  True False
 False  True False  True  True False False False False  True False  True
  True False  True False False False False False False False  True  True
  True  True False False False False  True False False  True  True  True
  True  True False False False False False False  True False False False
  True False False  True  True  True  True  True  True False  True  True
 False  True False  True  True False False  True False  True  True False
 False  True False False False False False  True False False  True False
  True False False  True False False False  True  True  True  True  True
 False False  True  True]
[ 1  1  1  1  1  1  1  1  1 71  1 24 58  1  1  1  6 49 32  1  1 12  1  1
 18  1  1  1 67 72 36 28 

In [9]:
# Keep only the columns RFE selected, applying the same mask to every split.
# Indexing columns with the boolean mask selects exactly the positions of its
# True entries, so the integer index array no longer has to be rebuilt (twice)
# for each split.
# NOTE: this overwrites the full-width matrices, so the cell is not re-runnable
# on its own: a second run raises IndexError because the mask length no longer
# matches the column count. Re-run the data-loading cell first.
feature_mask = np.asarray(rfe.support_, dtype=bool)
X_train = X_train[:, feature_mask]
X_cv = X_cv[:, feature_mask]
X_test = X_test[:, feature_mask]

# SVM

In [10]:
from sklearn.svm import SVC

In [11]:
# Hyper-parameter grid for the RBF SVM search: every C is paired with every gamma.
# Wider candidate sweep for C, currently disabled:
# C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]
C_list = [1, 10, 100]
gamma_list = [0.001, 0.01, 0.1]

In [12]:
def cross_validate_C(C_list, gamma_list, X_train, y_train, X_cv, y_cv):
    """Grid-search C and gamma for a class-balanced RBF SVC, ranked by F1.

    One SVC is fitted on the training split for every (C, gamma) pair, with C
    varying slowest. The train and CV F1 scores are printed as two lists in
    that same order, followed by the C and gamma of the pair with the highest
    CV F1 (the first such pair when several tie).

    Args:
        C_list: candidate values for the SVC ``C`` parameter.
        gamma_list: candidate values for the SVC ``gamma`` parameter.
        X_train, y_train: data every candidate is fitted on.
        X_cv, y_cv: data used to rank the candidates.

    Returns:
        The fitted SVC with the highest F1 score on the CV split.
    """
    fitted = []
    f1_on_train = []
    f1_on_cv = []

    # Flattened grid; position i corresponds to
    # (C_list[i // len(gamma_list)], gamma_list[i % len(gamma_list)]).
    grid = [(c_val, g_val) for c_val in C_list for g_val in gamma_list]
    for c_val, g_val in grid:
        candidate = SVC(kernel='rbf', class_weight='balanced', C=c_val, gamma=g_val)
        candidate.fit(X_train, y_train)
        fitted.append(candidate)

        f1_on_train.append(f1_score(y_train, candidate.predict(X_train)))
        f1_on_cv.append(f1_score(y_cv, candidate.predict(X_cv)))

    print(f1_on_train)
    print(f1_on_cv)

    # Map the flat index of the best CV score back to its (C, gamma) position.
    winner = np.argmax(f1_on_cv)
    c_pos, gamma_pos = divmod(winner, len(gamma_list))
    print('Best param C value:', C_list[c_pos])
    print('Best param gamma value:', gamma_list[gamma_pos])

    return fitted[winner]

In [13]:
# Run the C/gamma grid search; `svm` is the fitted model with the best F1 on the CV split.
svm = cross_validate_C(C_list, gamma_list, X_train, y_train, X_cv, y_cv)

[0.5920960759670437, 0.608986476661335, 0.6886750555144338, 0.5995224048321394, 0.647162599235519, 0.7605011053795137, 0.6210842511800887, 0.6842260153031194, 0.8214123695750795]
[0.6060851926977687, 0.6071126164267571, 0.6140273489192766, 0.6076360682372055, 0.6195142735406902, 0.5793226381461676, 0.610857853294654, 0.6105624731644482, 0.5290909090909091]
Best param C value: 10
Best param gamma value: 0.01


## Basic Evaluation

In [14]:
def evaluate(model, X, y):
    """Print classification metrics for ``model`` on the labelled set ``(X, y)``.

    Prints, in order: the fraction of misclassified samples, the
    classification report, the confusion matrix, and the recall, precision
    and F1 scores (each computed with the metric function's defaults).

    Args:
        model: fitted estimator exposing ``predict(X)``.
        X: feature matrix to predict on.
        y: true labels aligned with the rows of ``X``.

    Returns:
        None. Results are only printed, so a notebook cell that ends with an
        ``evaluate(...)`` call shows nothing after the report.
    """
    h = model.predict(X)

    # Fraction of wrong predictions. Equal to mean(|h - y|) for 0/1 labels,
    # and still meaningful for any other label encoding.
    error_rate = np.mean(h != y)
    print('Average Error (misclassification rate): {:0.4f}'.format(error_rate))

    print(classification_report(y, h))
    print('Confusion matrix:\n', confusion_matrix(y, h))
    print('Recall Score = ', recall_score(y, h))
    print('Precision Score = ', precision_score(y, h))
    print('F1 score = ', f1_score(y, h))

### Train set

In [15]:
# Metrics on the data the model was fitted on (an optimistic estimate).
evaluate(svm, X_train, y_train)

Average Error: 0.2088 degrees.
              precision    recall  f1-score   support

         0.0       0.91      0.80      0.85      8613
         1.0       0.56      0.76      0.65      2881

    accuracy                           0.79     11494
   macro avg       0.74      0.78      0.75     11494
weighted avg       0.82      0.79      0.80     11494

Confusion matrix:
 [[6893 1720]
 [ 680 2201]]
Recall Score =  0.763970843457133
Precision Score =  0.5613363937770977
F1 score =  0.647162599235519


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [16]:
# Metrics on the CV split. This split was also used to pick C and gamma, so
# these numbers are biased towards the selected model.
evaluate(svm, X_cv, y_cv)

Average Error: 0.2330 degrees.
              precision    recall  f1-score   support

         0.0       0.91      0.77      0.83      2876
         1.0       0.52      0.76      0.62       956

    accuracy                           0.77      3832
   macro avg       0.71      0.76      0.73      3832
weighted avg       0.81      0.77      0.78      3832

Confusion matrix:
 [[2212  664]
 [ 229  727]]
Recall Score =  0.7604602510460251
Precision Score =  0.5226455787203451
F1 score =  0.6195142735406902


<function __main__.evaluate(model, X, y)>

### Test set

In [17]:
# Metrics on the test split, which took no part in fitting or model selection.
evaluate(svm, X_test, y_test)

Average Error: 0.2127 degrees.
              precision    recall  f1-score   support

         0.0       0.92      0.78      0.85      2877
         1.0       0.55      0.80      0.65       955

    accuracy                           0.79      3832
   macro avg       0.74      0.79      0.75      3832
weighted avg       0.83      0.79      0.80      3832

Confusion matrix:
 [[2251  626]
 [ 189  766]]
Recall Score =  0.8020942408376963
Precision Score =  0.5502873563218391
F1 score =  0.6527481891776736


<function __main__.evaluate(model, X, y)>

### Save model

In [18]:
# Persist the selected SVC to disk with joblib.
# NOTE(review): the model was fitted on the RFE-selected columns only, and the
# `rfe` selector is not saved here, so anything loading this pickle must apply
# the same column selection before calling predict().
# Earlier dump targets, kept for reference (`log_reg` is not defined in the
# visible cells of this notebook):
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(svm, "Models/svm_no_under_sample_balanced_class_weight.pkl")

['Models/svm_no_under_sample_balanced_class_weight.pkl']