# Training Logistic Regression 

## Code for training a logistic regression model using cross-validation

In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV,RepeatedStratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
"""Functions def"""
def read_data(raw_clinical_note):
    data = pd.read_csv(raw_clinical_note, header=0,na_filter=True)
    return data

def saving_model(model,model_name,logs_file):
    model_name = model_name + '.pkl'
    url_save = os.path.join(logs_file,str(model_name))
    joblib.dump(model, url_save)

In [3]:
"""Loading training sets"""
path = "C:/Users/Salvador/Modelo_COVID19/Libretas manuscrito/BCM Infectius diseases/BCM/"
x_train = pd.read_csv(path + "X_train.csv")
y_train = pd.read_csv(path + "y_train.csv")
print ("x_train:", x_train.shape, "y_train:",y_train.shape)

x_train: (9251, 35) y_train: (9251, 2)


In [4]:
x_train = x_train.drop(['Unnamed: 0'],axis=1)
y_train = y_train.drop(['Unnamed: 0'],axis=1)
print ("x_train:", x_train.shape, "y_train:", y_train.shape)

x_train: (9251, 34) y_train: (9251, 1)


In [5]:
y_train.value_counts()

lethality
0            7953
1            1298
dtype: int64

In [6]:
x_train

Unnamed: 0,Sex,Age,Fever,Cough,Odynophagia,Dyspnea,Irritability,Diarrhea,Chest pain,Chills,...,COPD,Asthma,Immunosuppression,Hypertension,Cardiovascular disease,Obesity,CKD,Smoking,Use of antipyretics,Days elapsed
0,0.923870,-1.200365,0.426006,0.390861,-1.264644,-0.886320,-0.425121,-0.519490,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,0.226939
1,-1.082403,0.340091,0.426006,0.390861,0.790736,-0.886320,-0.425121,-0.519490,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,-1.026691
2,-1.082403,0.054821,-2.347382,0.390861,-1.264644,1.128261,-0.425121,-0.519490,-0.696870,0.957761,...,7.039312,-0.138453,-0.078738,1.783339,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,-0.190938
3,0.923870,-1.314472,-2.347382,0.390861,0.790736,-0.886320,-0.425121,-0.519490,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,1.898446
4,-1.082403,-0.173394,-2.347382,0.390861,0.790736,-0.886320,-0.425121,-0.519490,1.434989,0.957761,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,-1.026691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9246,-1.082403,-0.515718,0.426006,0.390861,-1.264644,-0.886320,-0.425121,1.924966,1.434989,-1.044102,...,-0.142059,-0.138453,-0.078738,-0.560746,-0.147124,-0.452107,-0.172061,-0.258154,0.949495,-0.608815
9247,-1.082403,0.682414,0.426006,0.390861,0.790736,1.128261,-0.425121,1.924966,-0.696870,0.957761,...,-0.142059,-0.138453,-0.078738,1.783339,-0.147124,2.211867,-0.172061,-0.258154,-1.053191,0.226939
9248,0.923870,1.538223,0.426006,0.390861,-1.264644,1.128261,2.352274,-0.519490,-0.696870,-1.044102,...,-0.142059,-0.138453,-0.078738,1.783339,-0.147124,-0.452107,-0.172061,3.873653,-1.053191,-0.608815
9249,0.923870,0.796522,0.426006,0.390861,-1.264644,-0.886320,2.352274,-0.519490,1.434989,0.957761,...,-0.142059,-0.138453,-0.078738,1.783339,-0.147124,-0.452107,-0.172061,-0.258154,-1.053191,0.644816


In [7]:
estimator = LogisticRegression()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=442)
sample_weights = class_weight.compute_sample_weight('balanced', y_train)
parameters={"C":np.logspace(-3,3,7), 
            "penalty":["l1","l2","elasticnet"],
            "class_weight": ['balanced'], 
            "l1_ratio":[0.1,0.2,0.4,0.5,0.6,0.7,0.8,0.9],
            "solver":['saga','liblinear'],
            "random_state": [422]}
grid = GridSearchCV(estimator=estimator, param_grid=parameters, cv = cv, scoring='roc_auc',refit = True)
grid.fit(x_train,y_train,sample_weight = sample_weights)

In [8]:
print('Best params achieved via GridSearch: \n',grid.best_params_)
print('Best score in training:', grid.best_score_)

Best params achieved via GridSearch: 
 {'C': 0.01, 'class_weight': 'balanced', 'l1_ratio': 0.5, 'penalty': 'elasticnet', 'random_state': 422, 'solver': 'saga'}
Best score in training: 0.8474042005913957


In [9]:
"""Visualing metrics from gridSearchCV"""
cv_results = pd.DataFrame.from_dict(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_l1_ratio,param_penalty,param_random_state,param_solver,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
0,0.148197,0.048504,0.007911,0.003321,0.001,balanced,0.1,l1,422,saga,...,0.820164,0.831930,0.813164,0.797663,0.820948,0.870576,0.844625,0.831898,0.014699,271
1,0.047482,0.016088,0.009942,0.003644,0.001,balanced,0.1,l1,422,liblinear,...,0.820842,0.832322,0.813454,0.797663,0.820735,0.871306,0.844920,0.832135,0.014663,263
2,0.176633,0.066700,0.007615,0.002767,0.001,balanced,0.1,l2,422,saga,...,0.841485,0.851746,0.827934,0.813662,0.831427,0.866638,0.863493,0.845275,0.014288,130
3,0.057154,0.017368,0.008869,0.003317,0.001,balanced,0.1,l2,422,liblinear,...,0.841795,0.851621,0.828428,0.814340,0.831950,0.867179,0.862796,0.845500,0.014277,122
4,0.170838,0.061328,0.007538,0.002891,0.001,balanced,0.1,elasticnet,422,saga,...,0.839516,0.850668,0.830392,0.814427,0.830377,0.872298,0.859860,0.845826,0.014222,121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.043130,0.013865,0.005872,0.001995,1000,balanced,0.9,l1,422,liblinear,...,0.843169,0.851108,0.828428,0.816788,0.835946,0.874011,0.860581,0.847043,0.014314,45
332,0.446745,0.168726,0.004831,0.001862,1000,balanced,0.9,l2,422,saga,...,0.837576,0.851679,0.826463,0.812540,0.833982,0.871892,0.847615,0.842345,0.014723,243
333,0.049501,0.015823,0.006353,0.002081,1000,balanced,0.9,l2,422,liblinear,...,0.843169,0.851108,0.828418,0.816788,0.835955,0.874011,0.860561,0.847043,0.014313,53
334,0.589832,0.231197,0.005004,0.001921,1000,balanced,0.9,elasticnet,422,saga,...,0.837576,0.851679,0.826463,0.812540,0.833982,0.871892,0.847615,0.842346,0.014723,241


In [10]:
"""Saving AUC (cross-validation)"""
cv_results = pd.DataFrame.from_dict(grid.cv_results_)
auc_training = cv_results.mean_test_score
auc_training.to_csv(path+"AUC_crossval_LR_0.csv")

In [11]:
""" Re-training the model with the best params"""
best_params = grid.best_params_
model = LogisticRegression(**best_params)
model = model.fit(x_train,y_train)

In [12]:
"""Saving model"""
saving_model(model,"LR_model_0",path)