In [1]:
import joblib

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [4]:
# Load the pre-split feature matrices (X_*) and label vectors (y_*) from CSV.
# np.genfromtxt parses every field as float by default, so the labels are
# loaded as floats as well.
#
# The commented-out pair below is an alternative training set; the "_under"
# suffix presumably refers to an under-sampled variant -- TODO confirm.
# X_train = np.genfromtxt("Data/X_train_under.csv", delimiter=',')
# y_train = np.genfromtxt("Data/y_train_under.csv", delimiter=',')
X_train = np.genfromtxt("Data/X_train.csv", delimiter=',')
y_train = np.genfromtxt("Data/y_train.csv", delimiter=',')

# Cross-validation split: used further down to choose the hyperparameter C.
X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

# Test split: only used for the final evaluation in this notebook.
X_test = np.genfromtxt("Data/X_test.csv", delimiter=',')
y_test = np.genfromtxt("Data/y_test.csv", delimiter=',')

## Using stratified set - in pipeline
## (author's note; the split itself is not performed in this notebook)

In [5]:
# Sanity check: dimensions (rows, columns) of the loaded training matrix.
X_train.shape

(11494, 151)

In [6]:
# Peek at the loaded training features (NumPy abbreviates large arrays on display).
X_train

array([[-1.67239838, -0.9568655 , -1.60575708, ..., -0.41906592,
        -1.45427135,  1.45427135],
       [ 0.56402217, -0.26671559,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [ 0.78359801, -0.83903503,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       ...,
       [ 0.73480338,  2.00572921,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [-1.67239838, -0.41821192,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958],
       [ 0.65347899, -0.31721437,  0.6227592 , ..., -0.41906592,
         0.68762958, -0.68762958]])

In [7]:
# Peek at the loaded training labels (NumPy abbreviates large arrays on display).
y_train

array([0., 0., 0., ..., 0., 0., 0.])

# Batch Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Candidate values for LogisticRegression's `C` hyperparameter, spanning
# several orders of magnitude. In scikit-learn, C is the inverse of the
# regularization strength: smaller values mean stronger regularization.
C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]

In [10]:
def cross_validate_C(C_list, X_train, y_train, X_cv, y_cv,
                     max_iter=1000, random_state=42, class_weight="balanced"):
    """Select the LogisticRegression ``C`` value with the best hold-out F1 score.

    One model is fitted on ``(X_train, y_train)`` for every candidate in
    ``C_list``. Each model is scored with ``f1_score`` on both the training
    data and the hold-out data ``(X_cv, y_cv)``; the model with the highest
    hold-out F1 is returned. The returned model is the one fitted on the
    training data only -- it is not refitted on train + hold-out.

    Parameters
    ----------
    C_list : sequence of float
        Candidate values for ``C``. Must be non-empty and indexable.
    X_train, y_train : array-like
        Data used to fit every candidate model.
    X_cv, y_cv : array-like
        Hold-out data used to compare the candidates.
    max_iter : int, default=1000
        Forwarded to ``LogisticRegression``.
    random_state : int, default=42
        Forwarded to ``LogisticRegression``.
    class_weight : default="balanced"
        Forwarded to ``LogisticRegression``.

    Returns
    -------
    LogisticRegression
        The fitted model with the highest hold-out F1 score. On ties, the
        candidate that appears first in ``C_list`` wins (``np.argmax``
        returns the first maximum).

    Raises
    ------
    ValueError
        If ``C_list`` is empty.

    Side effects
    ------------
    Prints the list of training F1 scores, the list of hold-out F1 scores
    (both in ``C_list`` order) and the selected ``C`` value.
    """
    # Fail early with a clear message instead of an opaque argmax error
    # raised only after the (empty) loop and the prints.
    if len(C_list) == 0:
        raise ValueError("C_list must contain at least one candidate value")

    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        log_reg = LogisticRegression(
            max_iter=max_iter,
            random_state=random_state,
            class_weight=class_weight,
            C=C,
        )
        log_reg.fit(X_train, y_train)
        models.append(log_reg)

        train_f1_score.append(f1_score(y_train, log_reg.predict(X_train)))
        cv_f1_score.append(f1_score(y_cv, log_reg.predict(X_cv)))

    print(train_f1_score)
    print(cv_f1_score)

    # Compute the winning index once and reuse it for both the report and
    # the returned model so the two can never disagree.
    best_idx = int(np.argmax(cv_f1_score))
    print('Best param value:', C_list[best_idx])

    return models[best_idx]

In [11]:
# Fit one model per candidate C on the training split and keep the one with
# the highest F1 score on the cross-validation split.
logreg = cross_validate_C(C_list, X_train, y_train, X_cv, y_cv)

[0.5828454332552693, 0.612250794568044, 0.6132333861899958, 0.6133256583681106, 0.6133256583681106, 0.6133256583681106, 0.6132374100719424]
[0.5992317541613316, 0.6178929765886287, 0.6188077246011754, 0.619387326898867, 0.6188679245283019, 0.6188679245283019, 0.6188679245283019]
Best param value: 0.1


## Basic Evaluation

In [12]:
def evaluate(model, X, y):
    """Print classification metrics for ``model`` on the data ``(X, y)``.

    Predictions are obtained with ``model.predict(X)`` and compared against
    ``y``. The following are printed, in order: the scikit-learn
    classification report, the confusion matrix, and the recall, precision
    and F1 scores (each computed with scikit-learn's default arguments).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature matrix to predict on.
    y : array-like
        True labels corresponding to the rows of ``X``.

    Returns
    -------
    None
        The function only prints. It deliberately returns nothing so that a
        notebook cell ending in ``evaluate(...)`` shows just the report and
        no trailing function repr.
    """
    y_pred = model.predict(X)
    print(classification_report(y, y_pred))
    print('Confusion matrix:\n', confusion_matrix(y, y_pred))
    print('Recall Score = ', recall_score(y, y_pred))
    print('Precision Score = ', precision_score(y, y_pred))
    print('F1 score = ', f1_score(y, y_pred))

### Train set

In [13]:
# Metrics on the training split, i.e. the data the model was fitted on.
evaluate(logreg, X_train, y_train)

Average Error: 0.2338 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.78      0.83      8613
         1.0       0.52      0.74      0.61      2881

    accuracy                           0.77     11494
   macro avg       0.71      0.76      0.72     11494
weighted avg       0.80      0.77      0.78     11494

Confusion matrix:
 [[6676 1937]
 [ 750 2131]]
Recall Score =  0.7396737244012496
Precision Score =  0.5238446411012783
F1 score =  0.6133256583681106


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [14]:
# Metrics on the cross-validation split, which was also used to choose C.
evaluate(logreg, X_cv, y_cv)

Average Error: 0.2367 degrees.
              precision    recall  f1-score   support

         0.0       0.91      0.76      0.83      2876
         1.0       0.52      0.77      0.62       956

    accuracy                           0.76      3832
   macro avg       0.71      0.77      0.72      3832
weighted avg       0.81      0.76      0.78      3832

Confusion matrix:
 [[2187  689]
 [ 218  738]]
Recall Score =  0.7719665271966527
Precision Score =  0.5171688857743518
F1 score =  0.619387326898867


<function __main__.evaluate(model, X, y)>

### Test set

In [15]:
# Metrics on the test split, which this notebook uses neither for fitting
# nor for choosing C.
evaluate(logreg, X_test, y_test)

Average Error: 0.2302 degrees.
              precision    recall  f1-score   support

         0.0       0.92      0.76      0.83      2877
         1.0       0.53      0.79      0.63       955

    accuracy                           0.77      3832
   macro avg       0.72      0.78      0.73      3832
weighted avg       0.82      0.77      0.78      3832

Confusion matrix:
 [[2198  679]
 [ 203  752]]
Recall Score =  0.787434554973822
Precision Score =  0.5255066387141859
F1 score =  0.6303436714165969


<function __main__.evaluate(model, X, y)>

### Coefficients

In [16]:
# Position of the coefficient with the largest absolute value. np.argmax
# without `axis` indexes into the flattened array; presumably coef_ has a
# single row here (binary labels), making this the feature column -- confirm.
np.argmax(np.abs(logreg.coef_))

72

In [17]:
# Largest absolute coefficient value of the fitted model.
np.max(np.abs(logreg.coef_))

0.4385732243318753

In [18]:
# Persist the selected model with joblib. joblib.dump returns the list of
# file names it wrote, which is what the cell displays.
# NOTE(review): presumably the Models/ directory must already exist -- confirm.
#
# The two commented-out calls are earlier variants kept for reference. They
# reference `log_reg`, which is not assigned at notebook level in the cells
# shown here (it is only a local name inside cross_validate_C).
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(logreg, "Models/log_reg_no_under_sample_balanced_class_weight.pkl")

['Models/log_reg_no_under_sample_balanced_class_weight.pkl']