In [1]:
import joblib

In [2]:
import numpy as np

In [3]:
# Load a previously saved estimator from disk.
# joblib.load unpickles the file, which can execute arbitrary code, so only
# load model files that come from a trusted source.
# NOTE(review): in the visible cells `clf` is only displayed, never used for
# prediction or selection - confirm whether this load is still needed.
clf = joblib.load("Models/log_reg_no_under_sample_balanced_class_weight.pkl")

In [4]:
# Echo the loaded estimator so its repr (and hyperparameters) shows in the output.
clf

LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

In [5]:
# Training split: features (X) and labels (y), each read from a
# comma-delimited text file into a NumPy array.
X_train, y_train = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_train.csv", "Data/y_train.csv")
)

In [6]:
# Cross-validation split: features (X) and labels (y), each read from a
# comma-delimited text file into a NumPy array.
X_cv, y_cv = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_cv.csv", "Data/y_cv.csv")
)

In [7]:
# Test split: features (X) and labels (y), each read from a
# comma-delimited text file into a NumPy array.
X_test, y_test = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in ("Data/X_test.csv", "Data/y_test.csv")
)

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

In [10]:
# Grid of candidate C values; each one is passed as `C=` to LogisticRegression
# in the sweep below. In scikit-learn, C is the inverse of the regularisation
# strength, so smaller values mean stronger regularisation.
C_list = [0.0001, 0.001, 0.01, 0.1, 1.0, 3.0, 5.0, 10.0, 100]

In [11]:
# Sweep the regularisation grid: fit one class-balanced logistic regression per
# value in C_list, keep every fitted estimator, and record its F1 score on the
# training and cross-validation splits. All three lists are index-aligned with
# C_list.
models, train_f1_score, cv_f1_score = [], [], []

for C in C_list:
    # fit() returns the estimator itself, so construction and fitting chain.
    log_reg = LogisticRegression(
        C=C,
        class_weight="balanced",
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)
    models.append(log_reg)

    # Score on the data the model was fitted on, then on the held-out CV split.
    train_f1_score.append(f1_score(y_train, log_reg.predict(X_train)))
    cv_f1_score.append(f1_score(y_cv, log_reg.predict(X_cv)))

In [12]:
# Training-set F1 of each fitted model, in the same order as C_list.
print(train_f1_score)

[0.5819536423841061, 0.6186065243656866, 0.6223710649698594, 0.6206527554842162, 0.6205512443136205, 0.6204682274247492, 0.6204682274247492, 0.6204682274247492, 0.6204682274247492]


In [13]:
# Cross-validation F1 of each fitted model, in the same order as C_list.
print(cv_f1_score)

[0.5627462014631401, 0.5951219512195122, 0.5920344456404736, 0.5931108719052746, 0.5927918235610543, 0.5927918235610543, 0.5927918235610543, 0.5927918235610543, 0.5927918235610543]


In [14]:
# Diagnostic only: print every candidate's test-set F1, in C_list order.
# The model that is kept is chosen from the cross-validation scores, not
# from these numbers.
for candidate in models:
    print(f1_score(y_test, candidate.predict(X_test)))

0.5907692307692308
0.6243070362473347
0.6256366723259763
0.6228813559322034
0.6228813559322034
0.6226175349428208
0.6226175349428208
0.6226175349428208
0.6226175349428208


In [15]:
# Select the estimator with the highest cross-validation F1. np.argmax returns
# the first index when several scores tie, so the earliest entry in `models`
# wins a tie.
best_idx = np.argmax(cv_f1_score)
best_model = models[best_idx]

In [16]:
# Final report: test-set F1 of the model selected on the cross-validation split.
test_f1 = f1_score(y_test, best_model.predict(X_test))
print(test_f1)

0.6243070362473347


In [17]:
# Persist the selected model to disk. Kept as the cell's last expression so the
# list of written file paths returned by joblib.dump is shown as the output.
joblib.dump(best_model, 'Models/best_model.pkl')

['Models/best_model.pkl']