In [22]:
import joblib

In [23]:
import numpy as np
import pandas as pd

In [24]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [25]:
# X_train = np.genfromtxt("Data/X_train_under.csv", delimiter=',')
# y_train = np.genfromtxt("Data/y_train_under.csv", delimiter=',')
X_train = np.genfromtxt("Data/X_train.csv", delimiter=',')
y_train = np.genfromtxt("Data/y_train.csv", delimiter=',')

X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

X_test = np.genfromtxt("Data/X_test.csv", delimiter=',')
y_test = np.genfromtxt("Data/y_test.csv", delimiter=',')

##Using strattified set - in pipeline

In [26]:
X_train.shape

(12260, 147)

In [27]:
# X_train

In [28]:
# y_train

### Recursive Feature Elimination

In [29]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(logreg)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True False False  True False
 False  True  True  True False False False  True False  True  True False
 False  True  True False False False False False  True  True False  True
 False  True  True False  True  True  True False  True  True False False
  True  True  True  True False  True False False  True  True  True  True
 False  True  True False False  True  True False  True  True  True False
 False  True False False  True False  True False False False  True  True
 False False False False False False False  True False  True False  True
 False False  True  True  True False  True  True False False  True False
 False False False  True False False  True False  True  True False False
 False False False False False False  True  True False False False False
 False False  True  True False  True  True  True  True  True  True False
 False  True  True]
[ 1  1  1  1  1  1  1  1 20 38  1 26 23  1  1  1 62 37 46  1 11  1  1 63
 48  1  1 55 71 14 51 57  1  1 

In [30]:
X_train = X_train[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]
X_cv = X_cv[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]
X_test = X_test[:,np.argwhere(np.array( rfe.support_)).reshape(np.argwhere(np.array( rfe.support_)).shape[0],)]

# SVM

In [31]:
from sklearn.svm import SVC

In [32]:
# C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]
C_list =[0.001,1,100]

In [33]:
def cross_validate_C(C_list, X_train, y_train, X_cv, y_cv):
    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        svm = SVC(kernel = 'rbf', class_weight = 'balanced', C = C)
        svm.fit(X_train, y_train)
        models.append(svm)

        y_train_pred = svm.predict(X_train)
        train_f1_score.append(f1_score(y_train, y_train_pred))

        y_cv_pred = svm.predict(X_cv)
        cv_f1_score.append(f1_score(y_cv, y_cv_pred))
        
    print(train_f1_score)
    print(cv_f1_score)
    
    print('Best param value:', C_list[np.argmax(cv_f1_score)])
    best_model = models[np.argmax(cv_f1_score)]
    
    return best_model

In [34]:
svm = cross_validate_C(C_list, X_train, y_train, X_cv, y_cv)

[0.0, 0.617007075156855, 0.6887380523618231]
[0.0, 0.5821697099892588, 0.5826330532212886]
Best param value: 100


### LGBM for more insight

In [35]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
# from sklearn.linear_model import LinearRegression, LogisticRegression
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split as tts
# from sklearn.model_selection import RandomizedSearchCV
# from lightgbm.sklearn import LGBMClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier 
# import six
# import sys
# sys.modules['sklearn.externals.six'] = six
# from imblearn.over_sampling import SVMSMOTE
# from imblearn.ensemble import EasyEnsembleClassifier
# from sklearn.inspection import permutation_importance
# import eli5
# from eli5.sklearn import PermutationImportance

In [36]:
# easy_lgbm = EasyEnsembleClassifier(base_estimator= LGBMClassifier(random_state=42), n_estimators=250, n_jobs=1,
#                        random_state=42, replacement=True,
#                        sampling_strategy='auto', verbose=0,
#                        warm_start=True)
# easy_lgbm.fit(X_train, y_train)


In [37]:
# evaluate(easy_lgbm, X_train, y_train)

In [38]:
# evaluate(easy_lgbm, X_test, y_test)

In [39]:
# print(f1_score(y_train, easy_lgbm.predict(X_train)))
# print(f1_score(y_test, easy_lgbm.predict(X_test)))

# predict_proba_easy_lgbm = pd.DataFrame(easy_lgbm.predict_proba(X_test))
# predict_proba_easy_lgbm

In [40]:
# prediction = pd.DataFrame(easy_lgbm.predict(X_test))
# prediction.value_counts()

## Basic Evaluation

In [41]:
def evaluate(model, X, y):
    h = model.predict(X)
    errors = abs(h - y)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print(classification_report(y,h))
    print('Confusion matrix:\n',confusion_matrix(y,h))
    print('Recall Score = ',recall_score(y, h))
    print('Precision Score = ',precision_score(y, h))
    print('F1 score = ', f1_score(y,h))

    return evaluate

### Train set

In [42]:
evaluate(svm, X_train, y_train)

Average Error: 0.1833 degrees.
              precision    recall  f1-score   support

         0.0       0.93      0.82      0.87      9193
         1.0       0.60      0.81      0.69      3067

    accuracy                           0.82     12260
   macro avg       0.76      0.81      0.78     12260
weighted avg       0.85      0.82      0.82     12260

Confusion matrix:
 [[7527 1666]
 [ 581 2486]]
Recall Score =  0.8105640691229214
Precision Score =  0.598747591522158
F1 score =  0.6887380523618231


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [43]:
evaluate(svm, X_cv, y_cv)

Average Error: 0.2430 degrees.
              precision    recall  f1-score   support

         0.0       0.88      0.78      0.83      2296
         1.0       0.51      0.68      0.58       770

    accuracy                           0.76      3066
   macro avg       0.70      0.73      0.71      3066
weighted avg       0.79      0.76      0.77      3066

Confusion matrix:
 [[1801  495]
 [ 250  520]]
Recall Score =  0.6753246753246753
Precision Score =  0.5123152709359606
F1 score =  0.5826330532212886


<function __main__.evaluate(model, X, y)>

### Test set

In [44]:
evaluate(svm, X_test, y_test)

Average Error: 0.1962 degrees.
              precision    recall  f1-score   support

         0.0       0.92      0.81      0.86      2871
         1.0       0.58      0.79      0.67       961

    accuracy                           0.80      3832
   macro avg       0.75      0.80      0.76      3832
weighted avg       0.83      0.80      0.81      3832

Confusion matrix:
 [[2324  547]
 [ 205  756]]
Recall Score =  0.7866805411030177
Precision Score =  0.580199539524175
F1 score =  0.667844522968198


<function __main__.evaluate(model, X, y)>

### Coefficients

In [45]:
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(svm, "Models/svm_no_under_sample_balanced_class_weight.pkl")

['Models/svm_no_under_sample_balanced_class_weight.pkl']