In [1]:
import joblib

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score, precision_score, roc_auc_score, roc_curve, auc

In [4]:
# X_train = np.genfromtxt("Data/X_train_under.csv", delimiter=',')
# y_train = np.genfromtxt("Data/y_train_under.csv", delimiter=',')
X_train = np.genfromtxt("Data/X_train.csv", delimiter=',')
y_train = np.genfromtxt("Data/y_train.csv", delimiter=',')

X_cv = np.genfromtxt("Data/X_cv.csv", delimiter=',')
y_cv = np.genfromtxt("Data/y_cv.csv", delimiter=',')

X_test = np.genfromtxt("Data/X_test.csv", delimiter=',')
y_test = np.genfromtxt("Data/y_test.csv", delimiter=',')

##Using strattified set - in pipeline

In [5]:
X_train.shape

(11494, 149)

In [6]:
X_train

array([[-1.64652041, -1.01234952,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       [ 0.16373428, -0.81131152, -1.59884705, ..., -0.22905834,
        -1.00944077,  1.4364654 ],
       [ 0.66478691, -0.47624819,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321],
       ...,
       [ 0.47891255, -1.02910269, -1.59884705, ..., -0.22905834,
         0.99064753, -0.69615321],
       [-1.17779375, -0.57676719,  0.6254507 , ..., -0.22905834,
        -1.00944077,  1.4364654 ],
       [ 0.74560186,  1.46711915,  0.6254507 , ..., -0.22905834,
         0.99064753, -0.69615321]])

In [7]:
y_train

array([0., 1., 0., ..., 1., 0., 0.])

# Batch Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100]

In [10]:
def cross_validate_C(C_list, X_train, y_train, X_cv, y_cv):
    models = []
    train_f1_score = []
    cv_f1_score = []

    for C in C_list:
        log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced", C=C)
        log_reg.fit(X_train, y_train)
        models.append(log_reg)

        y_train_pred = log_reg.predict(X_train)
        train_f1_score.append(f1_score(y_train, y_train_pred))

        y_cv_pred = log_reg.predict(X_cv)
        cv_f1_score.append(f1_score(y_cv, y_cv_pred))
        
    print(train_f1_score)
    print(cv_f1_score)
    
    print('Best param value:', C_list[np.argmax(cv_f1_score)])
    best_model = models[np.argmax(cv_f1_score)]
    
    return best_model

In [11]:
logreg = cross_validate_C(C_list, X_train, y_train, X_cv, y_cv)

[0.5825869916311849, 0.6173367550613058, 0.6194061656485297, 0.6185683912119065, 0.6191555681496174, 0.6190678566369173, 0.6190678566369173]
[0.5681610247026533, 0.5977924944812361, 0.5953324526640247, 0.5951648351648352, 0.5949033391915641, 0.5949033391915641, 0.5949033391915641]
Best param value: 0.001


## Basic Evaluation

In [12]:
def evaluate(model, X, y):
    h = model.predict(X)
    errors = abs(h - y)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print(classification_report(y,h))
    print('Confusion matrix:\n',confusion_matrix(y,h))
    print('Recall Score = ',recall_score(y, h))
    print('Precision Score = ',precision_score(y, h))
    print('F1 score = ', f1_score(y,h))

    return evaluate

### Train set

In [13]:
evaluate(logreg, X_train, y_train)

Average Error: 0.2335 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83      8579
         1.0       0.53      0.74      0.62      2915

    accuracy                           0.77     11494
   macro avg       0.71      0.76      0.72     11494
weighted avg       0.80      0.77      0.78     11494

Confusion matrix:
 [[6645 1934]
 [ 750 2165]]
Recall Score =  0.7427101200686106
Precision Score =  0.5281776042937302
F1 score =  0.6173367550613058


<function __main__.evaluate(model, X, y)>

### Cross validation set

In [14]:
evaluate(logreg, X_cv, y_cv)

Average Error: 0.2377 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.77      0.83      2910
         1.0       0.50      0.73      0.60       922

    accuracy                           0.76      3832
   macro avg       0.70      0.75      0.71      3832
weighted avg       0.81      0.76      0.78      3832

Confusion matrix:
 [[2244  666]
 [ 245  677]]
Recall Score =  0.7342733188720173
Precision Score =  0.5040953090096798
F1 score =  0.5977924944812361


<function __main__.evaluate(model, X, y)>

### Test set

In [15]:
evaluate(logreg, X_test, y_test)

Average Error: 0.2278 degrees.
              precision    recall  f1-score   support

         0.0       0.90      0.78      0.84      2895
         1.0       0.52      0.73      0.61       937

    accuracy                           0.77      3832
   macro avg       0.71      0.76      0.73      3832
weighted avg       0.81      0.77      0.78      3832

Confusion matrix:
 [[2271  624]
 [ 249  688]]
Recall Score =  0.7342582710779082
Precision Score =  0.524390243902439
F1 score =  0.611827478879502


<function __main__.evaluate(model, X, y)>

### Coefficients

In [16]:
np.argmax(np.abs(logreg.coef_))

72

In [17]:
np.max(np.abs(logreg.coef_))

0.3205078533262729

In [18]:
# joblib.dump(log_reg, "Models/default_log_reg_first_ver.pkl")
# joblib.dump(log_reg, "Models/log_reg_under_sample.pkl")
joblib.dump(logreg, "Models/log_reg_no_under_sample_balanced_class_weight.pkl")

['Models/log_reg_no_under_sample_balanced_class_weight.pkl']