In [57]:
import joblib

In [58]:
import numpy as np
import pandas as pd

In [59]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [60]:
# Load the training split from CSV. The under-sampled variant is kept below,
# commented out, as an alternative input for the same variable names.
# X_train = np.genfromtxt("Data/X_train_under.csv", delimiter=',')
# y_train = np.genfromtxt("Data/y_train_under.csv", delimiter=',')
X_train = np.genfromtxt("Data/X_train.csv", delimiter=',')
y_train = np.genfromtxt("Data/y_train.csv", delimiter=',')

# Load the cross-validation split (passed to cross_validate_C to choose C).
X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

## Using stratified set - in pipeline
# NOTE(review): presumably the stratified split is produced upstream by the
# data pipeline that wrote these CSV files — confirm.

In [61]:
X_train.shape

(12260, 147)

In [62]:
# X_train

In [63]:
# y_train

# Batch Logistic Regression

In [64]:
from sklearn.linear_model import LogisticRegression

In [65]:
# Baseline estimator. class_weight="balanced" reweights the classes from the
# label frequencies in y instead of resampling the training data; random_state
# is pinned so repeated runs are comparable.
log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")

In [66]:
# Fit the baseline on the full training split.
# NOTE(review): the later active cells evaluate and save `logreg` (the model
# returned by cross_validate_C), not this fitted `log_reg` — confirm whether
# this fit is still needed.
log_reg.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

In [67]:
# Candidate values for LogisticRegression's C (inverse regularization
# strength: smaller C means stronger regularization), on a log grid 1e-4..1e2.
C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]

In [68]:
def cross_validate_C(model, C_list, X_train, y_train, X_cv, y_cv):
    """Pick the regularization value ``C`` that maximizes F1 on a validation split.

    For every candidate in ``C_list`` an unfitted copy of ``model`` is created
    with that ``C``, fitted on the training split, and scored with F1 on both
    the training and the validation split. The two score lists and the winning
    ``C`` are printed, and the fitted model with the highest validation F1 is
    returned (the first one wins on ties, following ``np.argmax``).

    Parameters
    ----------
    model : estimator or None
        Template estimator exposing a ``C`` hyper-parameter. It is cloned for
        every candidate, so the object passed in is never refitted or mutated.
        ``None`` falls back to a balanced ``LogisticRegression`` with
        ``max_iter=1000`` and ``random_state=42``.
    C_list : iterable of float
        Candidate values for ``C``; must not be empty.
    X_train, y_train
        Training features and labels used to fit every candidate.
    X_cv, y_cv
        Validation features and labels used to select the best candidate.

    Returns
    -------
    The fitted estimator that achieved the highest validation F1 score.

    Raises
    ------
    ValueError
        If ``C_list`` is empty.
    """
    C_values = list(C_list)
    if not C_values:
        # Fail early with a clear message instead of an opaque argmax error.
        raise ValueError("C_list must contain at least one candidate value for C")

    # Imported locally so this cell does not rely on another import cell.
    from sklearn.base import clone

    if model is None:
        model = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")

    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_values:
        # clone() copies the hyper-parameters only, so every candidate starts
        # unfitted and differs from the template in C alone.
        candidate = clone(model).set_params(C=C)
        candidate.fit(X_train, y_train)
        models.append(candidate)

        train_f1_score.append(f1_score(y_train, candidate.predict(X_train)))
        cv_f1_score.append(f1_score(y_cv, candidate.predict(X_cv)))

    print(train_f1_score)
    print(cv_f1_score)

    # Selection uses the validation score only; training F1 is printed for
    # spotting over/under-fitting.
    best_idx = int(np.argmax(cv_f1_score))
    print('Best param value:', C_values[best_idx])

    return models[best_idx]

In [69]:
# Select the model with the best validation F1 over C_list. Note the naming:
# `logreg` is the selected model, `log_reg` is the baseline fitted earlier.
logreg = cross_validate_C(log_reg, C_list, X_train, y_train, X_cv, y_cv)

[0.5819536423841061, 0.6186065243656866, 0.6223710649698594, 0.6206527554842162, 0.6205512443136205, 0.6204682274247492, 0.6204682274247492]
[0.5627462014631401, 0.5951219512195122, 0.5920344456404736, 0.5931108719052746, 0.5927918235610543, 0.5927918235610543, 0.5927918235610543]
Best param value: 0.001


### LGBM for more insight

In [70]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
# from sklearn.linear_model import LinearRegression, LogisticRegression
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split as tts
# from sklearn.model_selection import RandomizedSearchCV
# from lightgbm.sklearn import LGBMClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier 
# import six
# import sys
# sys.modules['sklearn.externals.six'] = six
# from imblearn.over_sampling import SVMSMOTE
# from imblearn.ensemble import EasyEnsembleClassifier
# from sklearn.inspection import permutation_importance
# import eli5
# from eli5.sklearn import PermutationImportance

In [71]:
# easy_lgbm = EasyEnsembleClassifier(base_estimator= LGBMClassifier(random_state=42), n_estimators=250, n_jobs=1,
#                        random_state=42, replacement=True,
#                        sampling_strategy='auto', verbose=0,
#                        warm_start=True)
# easy_lgbm.fit(X_train, y_train)


In [72]:
# evaluate(easy_lgbm, X_train, y_train)

In [73]:
# evaluate(easy_lgbm, X_test, y_test)

In [74]:
# print(f1_score(y_train, easy_lgbm.predict(X_train)))
# print(f1_score(y_test, easy_lgbm.predict(X_test)))

# predict_proba_easy_lgbm = pd.DataFrame(easy_lgbm.predict_proba(X_test))
# predict_proba_easy_lgbm

In [75]:
# prediction = pd.DataFrame(easy_lgbm.predict(X_test))
# prediction.value_counts()

## Basic Evaluation

In [76]:
def evaluate(model, X, y):
    """Print classification metrics for ``model`` on the labelled set ``(X, y)``.

    Printed, in order: the mean absolute difference between predicted and true
    labels, scikit-learn's classification report, the confusion matrix, and
    the recall, precision and F1 scores.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Feature matrix, passed unchanged to ``model.predict``.
    y
        True labels aligned with the rows of ``X``.

    Returns
    -------
    None
        All results are reported via ``print``. Returning nothing keeps a
        notebook cell that ends with ``evaluate(...)`` from echoing a stray
        object representation after the metrics.
    """
    h = model.predict(X)
    # With 0/1 labels this mean is the misclassification rate, so it carries
    # no unit.
    errors = np.abs(np.asarray(h) - np.asarray(y))
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    print(classification_report(y, h))
    print('Confusion matrix:\n', confusion_matrix(y, h))
    print('Recall Score = ', recall_score(y, h))
    print('Precision Score = ', precision_score(y, h))
    print('F1 score = ', f1_score(y, h))

### Train set

In [77]:
# Metrics on the same split the model was fitted on; expect these to be
# optimistic compared with the held-out splits.
evaluate(logreg, X_train, y_train)

Average Error: 0.2317 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83      9193
         1.0       0.53      0.75      0.62      3067

    accuracy                           0.77     12260
   macro avg       0.71      0.76      0.73     12260
weighted avg       0.81      0.77      0.78     12260

Confusion matrix:
 [[7115 2078]
 [ 763 2304]]
Recall Score =  0.7512226931855233
Precision Score =  0.5257873117298038
F1 score =  0.6186065243656866


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [78]:
# Re-reads the same CV files that were loaded in the first data-loading cell,
# so this section can be run without depending on that earlier cell's state.
X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

In [79]:
# C was chosen by maximizing F1 on this very split (see cross_validate_C), so
# these scores are biased upward as an estimate of generalization.
evaluate(logreg, X_cv, y_cv)

Average Error: 0.2436 degrees.
              precision    recall  f1-score   support

         0.0       0.89      0.77      0.83      2296
         1.0       0.51      0.71      0.60       770

    accuracy                           0.76      3066
   macro avg       0.70      0.74      0.71      3066
weighted avg       0.79      0.76      0.77      3066

Confusion matrix:
 [[1770  526]
 [ 221  549]]
Recall Score =  0.712987012987013
Precision Score =  0.5106976744186047
F1 score =  0.5951219512195122


<function __main__.evaluate(model, X, y)>

### Test set

In [80]:
# Load the test split; it is first read here, after C has been selected.
X_test = np.genfromtxt("Data/X_test.csv", delimiter=',')
y_test = np.genfromtxt("Data/y_test.csv", delimiter=',')

In [81]:
# Held-out check: in the visible cells this split is used neither for fitting
# nor for choosing C.
evaluate(logreg, X_test, y_test)

Average Error: 0.2299 degrees.
              precision    recall  f1-score   support

         0.0       0.91      0.77      0.83      2871
         1.0       0.53      0.76      0.62       961

    accuracy                           0.77      3832
   macro avg       0.72      0.77      0.73      3832
weighted avg       0.81      0.77      0.78      3832

Confusion matrix:
 [[2219  652]
 [ 229  732]]
Recall Score =  0.7617065556711758
Precision Score =  0.5289017341040463
F1 score =  0.6243070362473347


<function __main__.evaluate(model, X, y)>

### Coefficients

In [82]:
# Position of the largest-magnitude coefficient. np.argmax without `axis`
# indexes into the flattened array.
# NOTE(review): for a binary problem coef_ presumably has a single row, making
# this the feature column index — confirm.
np.argmax(np.abs(logreg.coef_))

70

In [83]:
# Magnitude of the largest coefficient of the selected model.
np.max(np.abs(logreg.coef_))

0.3323515408317608

In [84]:
# Persist the selected model `logreg` (returned by cross_validate_C). The
# commented-out lines are earlier variants that saved the baseline `log_reg`
# under different file names.
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(logreg, "Models/log_reg_no_under_sample_balanced_class_weight.pkl")

['Models/log_reg_no_under_sample_balanced_class_weight.pkl']