In [89]:
# basic modules
import os
import time
import random as rn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# plotting style
plt.style.use('seaborn-v0_8-notebook')
# sns.set_style('notebook')
sns.set_style('darkgrid')

# pandas tricks for better display
pd.options.display.max_columns = 50  
pd.options.display.max_rows = 500     
pd.options.display.max_colwidth = 100
pd.options.display.precision = 3

# preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# metrics & utilities 
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, auc
from sklearn.metrics import average_precision_score, precision_recall_curve, PrecisionRecallDisplay
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.utils import resample

# warnings
import warnings
warnings.filterwarnings("ignore")

# user defined functions
from utility_functions import process_max_glu_serum, process_A1Cresult, process_medical_specialty, process_race
from utility_functions import process_diag_codes, process_age, process_discharge_disposition_id, process_admission_type_id
from utility_functions import process_admission_source_ID, process_readmitted, process_diabetesMed_and_change, preprocess_df
from utility_functions import get_previous_encounters, aggregate_previous_encounters, get_last_encounter, aggregate_encounters
from utility_functions import get_performance_metrics, get_results_df, plot_performance_metrics
from utility_functions import plot_ROC_curves, plot_PR_curves
from imblearn.over_sampling import SMOTE, SMOTENC

from imblearn.over_sampling import RandomOverSampler

In this code I first explore elastic net and lasso logisitic regression on our diabetes dataset, finding lasso to perform just as well a elastic net. Since lasso requires less hyperparameter tuning, we continue on with lasso logistic regression.

Next, we explore polynomial lasso logistic regression. To limit the number of terms, we use polynomial features only of terms that did not go to zero in lasso logistic regression, which ends up resulting in about 6000 features in the polynomial logistic regression. After the training, about 700 terms remain. This ends up being the best logistic regression model.

To improve the results of logistic regression, we attempt to use SMOTENC and random oversampling, but find that the oversampling typically results in worse test performance.

<h2> Loading in Training and Test Data </h2>

In [3]:
X_train_transformed = pd.read_csv('../data/X_train_transformed.csv', na_values='?',
                            low_memory=False, # silence the mixed dtypes warning
                            index_col='patient_nbr')

In [5]:
X_test_transformed = pd.read_csv('../data/X_test_transformed.csv', na_values='?',
                            low_memory=False, # silence the mixed dtypes warning
                            index_col='patient_nbr')

In [6]:
y_train = pd.read_csv('../data/y_train.csv', na_values='?',
                            low_memory=False, # silence the mixed dtypes warning
                            index_col='patient_nbr')

In [7]:
y_test = pd.read_csv('../data/y_test.csv', na_values='?',
                            low_memory=False, # silence the mixed dtypes warning
                            index_col='patient_nbr')

In [8]:
X_train_transformed.shape, X_test_transformed.shape, y_train.shape, y_test.shape

((55992, 135), (13998, 135), (55992, 1), (13998, 1))

In [10]:
def print_stats(dataset_type, model, X, y):
    y_pred_prob = model.predict_proba(X)[:,1]
    print(f'------------------{dataset_type}----------------------')
    print(f'Readmitted Rate:\tActual:{np.round(np.mean(y),4)}\tPredicted:{np.round(np.mean(y_pred_prob),4)}')
    print(f'Accuracy:\t\tNaive:{np.round(1-np.mean(y),3)}\tBase Model:{np.round(model.score(X,y),3)}')
    print(f'AUC:\t\t\t{np.round(roc_auc_score(y, y_pred_prob),3)}')


<h2> Trying Elastic Net Logistic </h2>

In [11]:
# Lasso Logistic Regression, uses elastic net and unbalanced

start_time = time.time()

# C values to try 
Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# logistic regression model with elasticnet penalty and 10-fold CV
lrr = LogisticRegressionCV(
    Cs=Cs, cv=2, penalty='elasticnet', solver='saga', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109, l1_ratios=[0, 0.5, 1],
).fit(X_train_transformed, y_train)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

Model fitting time: 440.6 seconds


In [12]:
print_stats('TRAIN-LASSO ELASTIC NET', lrr, X_train_transformed, y_train)
print_stats('TEST-LASSO ELASTIC NET', lrr, X_test_transformed, y_test)

------------------TRAIN-LASSO ELASTIC NET----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.0496
Accuracy:		Naive:0.95	Base Model:0.71
AUC:			0.71
------------------TEST-LASSO ELASTIC NET----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.05
Accuracy:		Naive:0.95	Base Model:0.68
AUC:			0.68


<h2> Trying Lasso Logistic Regression </h2>

In [13]:
# Lasso Logistic Regression

start_time = time.time()

# C values to try 
Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# logistic regression model with elasticnet penalty and 10-fold CV
lasso_logit = LogisticRegressionCV(
    Cs=Cs, cv=3, penalty='l1', solver='liblinear', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109,
).fit(X_train_transformed, y_train)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

Model fitting time: 163.8 seconds


In [14]:
print_stats('TRAIN - LASSO', lasso_logit, X_train_transformed, y_train)
print_stats('TEST  - LASSO', lasso_logit, X_test_transformed, y_test)

------------------TRAIN - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.0496
Accuracy:		Naive:0.95	Base Model:0.711
AUC:			0.711
------------------TEST  - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.0502
Accuracy:		Naive:0.95	Base Model:0.681
AUC:			0.681


In [17]:
lasso_logit.C_

array([1.])

In [None]:
zeroed_feats = [feature for coef, feature in zip(lasso_logit.coef_[0], X_train_transformed.columns) if coef == 0]
zeroed_feats


In [None]:
unzeroed_feats = [feature for coef, feature in zip(lasso_logit.coef_[0], X_train_transformed.columns) if coef != 0]
unzeroed_feats

In [16]:
len(zeroed_feats)

27

In [20]:
len(unzeroed_feats)

108

<h2> Poly Logistic Regression Only on Non Zero Features </h2>

In [24]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

Make polynomial features only from features that did not go to zero in lasso logistic regression.

In [25]:
X_train_transformed_poly = poly.fit_transform(X_train_transformed[unzeroed_feats])

In [26]:
X_train_transformed_poly.shape

(55992, 5994)

In [27]:
X_test_transformed_poly = poly.transform(X_test_transformed[unzeroed_feats])

In [28]:
start_time = time.time()

# C values to try 
Cs = [1e-2, 1e-1, 1e0]

# logistic regression model with elasticnet penalty and 10-fold CV
lasso_logit_poly = LogisticRegressionCV(
    Cs=Cs, cv=2, penalty='l1', solver='liblinear', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109,
).fit(X_train_transformed_poly, y_train)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

Model fitting time: 2553.7 seconds


Note, for potentially faster training later, that the optimal C for logisitic lasso with polynomial features was 0.1.

In [131]:
lasso_logit_poly.C_

array([0.1])

In [30]:
print_stats('TRAIN - LASSO', lasso_logit_poly, X_train_transformed_poly, y_train)
print_stats('TEST  - LASSO', lasso_logit_poly, X_test_transformed_poly, y_test)

------------------TRAIN - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.0498
Accuracy:		Naive:0.95	Base Model:0.785
AUC:			0.785
------------------TEST  - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.0499
Accuracy:		Naive:0.95	Base Model:0.706
AUC:			0.706


After the training, only 747 features remain. Over 5000 features went to zero.

In [132]:
sum(lasso_logit_poly.coef_[0] != 0), sum(lasso_logit_poly.coef_[0] == 0)

(747, 5247)

<h3> SMOTE on X_TRAIN_TRANSFORMED </h3>

Try SMOTE before any polynomial features are added but after the data processing. Formally, this is not correct, as SMOTE will not properly handle the one hot encoded variables.

In [36]:
smt = SMOTE(random_state=109)

In [48]:
X_train_transformed_resampled, y_train_resampled = smt.fit_resample(X_train_transformed, y_train)

In [45]:
X_train_transformed_resampled

Unnamed: 0,ohe__race_Asian,ohe__race_Caucasian,ohe__race_Hispanic,ohe__race_Other,ohe__race_UNK,ohe__gender_Male,ohe__gender_Unknown/Invalid,ohe__age_[50-60),ohe__age_[60-70),ohe__age_[70-80),ohe__age_[80-100),ohe__admission_type_id_Emergency,ohe__admission_type_id_Other,ohe__admission_type_id_Urgent,ohe__discharge_disposition_id_Home w/ Service,ohe__discharge_disposition_id_Other,ohe__discharge_disposition_id_SNF,ohe__admission_source_id_Other,ohe__admission_source_id_Physician Referral,ohe__medical_specialty_Emergency/Trauma,ohe__medical_specialty_Family/GeneralPractice,ohe__medical_specialty_InternalMedicine,ohe__medical_specialty_Nephrology,ohe__medical_specialty_Orthopedics,ohe__medical_specialty_Other,...,ohe__med_ever_changed_Yes,ohe__diabetic_med_always_prescribed_Yes,ohe__diabetic_med_ever_prescribed_Yes,ohe__always_readmitted_Yes,ohe__ever_readmitted_Yes,quant__num_procedures,quant__num_times_glu_high,quant__num_times_a1c_high,quant__log_time_in_hospital,quant__log_num_lab_procedures,quant__log_num_medications,quant__log_number_outpatient,quant__log_number_emergency,quant__log_number_inpatient,quant__log_number_diagnoses,quant__log_num_encounters,quant__log_avg_time_in_hospital,quant__log_avg_num_lab_procedures,quant__log_avg_num_procedures,quant__log_avg_times_glu_high,quant__log_avg_times_a1c_high,quant__log_avg_times_med_changed,quant__log_num_times_med_changed,quant__log_avg_times_readmitted,quant__log_num_times_readmitted
0,0.0,1.000,0.0,0.000,0.0,1.000,0.0,0.000,0.0,0.000,1.0,0.0,0.0,1.0,0.0,0.000,0.0,0.0,1.000,0.0,0.000,0.0,0.0,0.0,1.000,...,0.000,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,-1.554,-1.408,-1.016,-0.375,-0.291,-0.564,-2.442,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
1,0.0,1.000,0.0,0.000,0.0,0.000,0.0,0.000,1.0,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,1.0,0.000,0.0,0.0,0.0,0.000,...,0.000,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.257,-3.683,-1.828,-0.375,-0.291,-0.564,-1.018,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
2,0.0,1.000,0.0,0.000,0.0,1.000,0.0,0.000,0.0,1.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,1.000,0.0,0.000,0.0,0.0,0.0,1.000,...,0.000,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,-1.554,-3.683,0.372,-0.375,-0.291,-0.564,-2.442,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
3,0.0,0.000,0.0,0.000,1.0,1.000,0.0,0.000,1.0,0.000,0.0,1.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.000,0.0,0.0,0.0,1.000,...,0.000,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,0.160,0.212,-0.280,-0.375,-0.291,-0.564,0.775,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
4,0.0,1.000,0.0,0.000,0.0,1.000,0.0,0.000,0.0,1.000,0.0,1.0,0.0,0.0,0.0,0.000,0.0,0.0,0.000,0.0,0.000,0.0,0.0,0.0,1.000,...,0.000,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.796,0.378,-0.805,-0.375,-0.291,-0.564,-1.018,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106429,0.0,0.000,0.0,0.000,0.0,1.000,0.0,0.988,0.0,0.000,0.0,1.0,0.0,0.0,0.0,0.988,0.0,0.0,0.000,0.0,0.988,0.0,0.0,0.0,0.000,...,0.000,0.0,0.0,0.0,0.0,0.382,-0.076,-0.18,0.786,-0.237,-0.601,-0.375,-0.291,-0.564,0.775,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
106430,0.0,0.648,0.0,0.352,0.0,0.648,0.0,0.352,0.0,0.648,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,1.000,0.0,0.000,0.0,0.0,0.0,1.000,...,1.000,1.0,1.0,0.0,0.0,-0.599,-0.076,-0.18,-1.554,-2.653,-0.331,-0.375,-0.291,-0.564,0.260,1.679,0.854,1.692,2.882,-0.093,-0.188,2.891,2.620,-0.283,-0.287
106431,0.0,1.000,0.0,0.000,0.0,0.893,0.0,0.107,0.0,0.000,0.0,1.0,0.0,0.0,0.0,0.000,1.0,0.0,0.000,0.0,0.000,0.0,0.0,0.0,0.107,...,0.000,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,0.501,0.612,0.663,-0.375,-0.291,-0.564,0.445,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
106432,0.0,1.000,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000,1.0,1.0,0.0,0.0,0.0,0.000,1.0,0.0,0.000,0.0,0.000,0.0,0.0,0.0,1.000,...,0.000,0.0,0.0,0.0,0.0,0.293,-0.076,-0.18,1.260,1.178,1.423,-0.375,-0.291,-0.564,0.736,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287


In [49]:
y_train_resampled

Unnamed: 0,readmitted
0,0
1,0
2,0
3,0
4,0
...,...
106429,1
106430,1
106431,1
106432,1


<h3> Lasso Using SMOTE </h3>

In [50]:
# Lasso Logistic Regression

start_time = time.time()

# C values to try 
Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# logistic regression model with elasticnet penalty and 10-fold CV
lasso_logit_SMOTE = LogisticRegressionCV(
    Cs=Cs, cv=3, penalty='l1', solver='liblinear', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109,
).fit(X_train_transformed_resampled, y_train_resampled)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

Model fitting time: 789.5 seconds


In [52]:
print_stats('TRAIN - LASSO', lasso_logit_SMOTE, X_train_transformed, y_train)
print_stats('TRAIN RESAMPLED - LASSO', lasso_logit_SMOTE, X_train_transformed_resampled, y_train_resampled)
print_stats('TEST - LASSO', lasso_logit_SMOTE, X_test_transformed, y_test)

------------------TRAIN - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.4255
Accuracy:		Naive:0.95	Base Model:0.704
AUC:			0.704
------------------TRAIN RESAMPLED - LASSO----------------------
Readmitted Rate:	Actual:0.5	Predicted:0.5
Accuracy:		Naive:0.5	Base Model:0.733
AUC:			0.733
------------------TEST - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.4262
Accuracy:		Naive:0.95	Base Model:0.672
AUC:			0.672


<h2> Reload Data to do SMOTENC </h2>

In order to properly handle the one hot encoded variables, SMOTENC should be used on the preprocessed data before one hot encoding. First, load the processed dataset.

In [53]:
# load the encounter-level data
df_patients = pd.read_csv('../data/diabetic_data_processed_2.csv', 
                            na_values='?',
                            low_memory=False, # silence the mixed dtypes warning
                            index_col='patient_nbr'
                           )
df_patients.head()

Unnamed: 0_level_0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,...,mean_diagnoses,min_diagnoses,max_diagnoses,unique_glu_measurements,avg_times_glu_high,num_times_glu_high,glu_always_high,glu_ever_high,unique_a1c_results,avg_times_a1c_high,num_times_a1c_high,a1c_always_high,a1c_ever_high,avg_times_med_changed,num_times_med_changed,med_always_changed,med_ever_changed,avg_times_diabetic_med_prescribed,num_times_diabetic_med_prescribed,diabetic_med_always_prescribed,diabetic_med_ever_prescribed,avg_times_readmitted,num_times_readmitted,always_readmitted,ever_readmitted
patient_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
135,Caucasian,Female,[50-60),Emergency,Home,Emergency Room,3,Other,31,1,14,0,0,1,injury,other,neoplasms,5,UNK,UNK,Steady,No,No,No,No,...,8.0,8.0,8.0,1.0,0.0,0.0,No,No,1.0,0.0,0.0,No,No,1.0,1.0,Yes,Yes,1.0,1.0,Yes,Yes,1.0,1.0,Yes,Yes
378,Caucasian,Female,[50-60),Elective,Home,Physician Referral,2,Other,49,1,11,0,0,0,musculoskeletal,other,neoplasms,3,UNK,UNK,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,No,No,0.0,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No
729,Caucasian,Female,[80-100),Emergency,SNF,Emergency Room,4,InternalMedicine,68,2,23,0,0,0,injury,respiratory,injury,9,UNK,>7,Steady,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,No,No,0.0,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No
774,Caucasian,Female,[80-100),Emergency,Home,Emergency Room,3,InternalMedicine,46,0,20,0,0,0,neoplasms,other,other,9,UNK,>8,Steady,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,No,No,0.0,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No
927,AfricanAmerican,Female,[0-50),Emergency,Home,Emergency Room,5,InternalMedicine,49,0,5,0,0,0,genitournary,neoplasms,neoplasms,3,UNK,UNK,No,No,No,No,Steady,...,0.0,0.0,0.0,0.0,0.0,0.0,No,No,0.0,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No,0.0,0.0,No,No


Split the dataset to get the train and test data.

In [54]:
# function to perform data partitioning
def stratified_split(df):
    y = df['readmitted']
    X = df.drop(columns=['readmitted'])
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        stratify=y, 
        train_size=0.80, 
        random_state=109
    )

    print('===============================================================')
    print('Before splitting the class percentage in our dataset is:',  
          round(df['readmitted'].sum()/len(df['readmitted']), 4))
    print('After splitting the class percentage in y_train is:', 
          round(y_train.sum()/len(y_train), 4))
    print('After splitting the class percentage in y_test is:', 
          round(y_test.sum()/len(y_test), 4))
    print('===============================================================')
    
    return X_train, X_test, y_train, y_test

In [55]:
# partition the data
X_train, X_test, y_train, y_test = stratified_split(df_patients)

# check the dimensions
print(f'Training features shape: {X_train.shape}')
print(f'Testing features shape: {X_test.shape}')
print(f'Training target shape: {y_train.shape}')
print(f'Testing target shape: {y_test.shape}')

Before splitting the class percentage in our dataset is: 0.0496
After splitting the class percentage in y_train is: 0.0496
After splitting the class percentage in y_test is: 0.0496
Training features shape: (55992, 83)
Testing features shape: (13998, 83)
Training target shape: (55992,)
Testing target shape: (13998,)


We will need a list of the categorical columns to feed into SMOTENC.

In [56]:
# get lists of qualitative and quantitative column names
qual_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]
quant_cols = [cname for cname in X_train.columns if X_train[cname].dtype != "object"]

In [None]:
qual_cols

<h2> Doing SMOTENC </h2>

In [60]:
smtnc = SMOTENC(categorical_features=qual_cols, random_state=109)

In [62]:
X_train_resampled, y_train_resampled = smtnc.fit_resample(X_train, y_train)

In [63]:
X_train_resampled.shape

(106434, 83)

<h2> Transformation of Resampled Dataframe </h2>

Now that we have done SMOTE, we can complete the one hot encoding (and standardization, which could have been done earlier too)

In [65]:
# custom transformer code based on A. Geron Book by O'Reilly 
class PrepQuant(BaseEstimator, TransformerMixin):
    
    def __init__(self, corr_threshold=0.85, cardinality_threshold=10):
        self.corr_threshold=corr_threshold
        self.cardinality_threshold=cardinality_threshold
    
    def fit(self, X, y=None):
        #source: stackoverflow
        corr_matrix = X.corr().abs()
        # Select upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.corr_cols = [column for column in upper.columns if any(upper[column] >= self.corr_threshold)] 
        
        self.high_cardinal_cols = [col for col in X.columns if X[col].nunique()>self.cardinality_threshold]
        
        return self
    
    def transform(self, X):
            
        X.drop(self.corr_cols, axis=1, inplace=True)
        
        #log Transformation is done here not because Logistic Regression
        #depends on it but to reduce the effects of outliers
        for col in X.columns:
            if col in self.high_cardinal_cols:
                X['log_'+col]=np.log1p(X[col])
                X.drop(col,axis=1,inplace=True)
        self.columns = X.columns
        return X
                
    def get_feature_names_out(self, *args, **params):
        return self.columns

In [66]:
# create the feature engineering pipeline
quant_pipeline = make_pipeline(PrepQuant(), StandardScaler())
ct = ColumnTransformer([('ohe',OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
                         qual_cols),
                       ('quant',quant_pipeline,quant_cols)])

# convert pipeline output to a dataframe
ct.set_output(transform='pandas')

# fit and transform on the training data
X_train_transformed = ct.fit_transform(X_train)
print(f'Transformed training data dimensions: {X_train_transformed.shape}')

# fit and transform on the training data
X_train_resampled_transformed = ct.transform(X_train_resampled)
print(f'Transformed training data dimensions: {X_train_resampled_transformed.shape}')

# transform the testing data
X_test_transformed = ct.transform(X_test)
print(f'Transformed testing data dimensions: {X_test_transformed.shape}')

Transformed training data dimensions: (55992, 135)
Transformed training data dimensions: (106434, 135)
Transformed testing data dimensions: (13998, 135)


In [67]:
X_train_resampled_transformed

Unnamed: 0,ohe__race_Asian,ohe__race_Caucasian,ohe__race_Hispanic,ohe__race_Other,ohe__race_UNK,ohe__gender_Male,ohe__gender_Unknown/Invalid,ohe__age_[50-60),ohe__age_[60-70),ohe__age_[70-80),ohe__age_[80-100),ohe__admission_type_id_Emergency,ohe__admission_type_id_Other,ohe__admission_type_id_Urgent,ohe__discharge_disposition_id_Home w/ Service,ohe__discharge_disposition_id_Other,ohe__discharge_disposition_id_SNF,ohe__admission_source_id_Other,ohe__admission_source_id_Physician Referral,ohe__medical_specialty_Emergency/Trauma,ohe__medical_specialty_Family/GeneralPractice,ohe__medical_specialty_InternalMedicine,ohe__medical_specialty_Nephrology,ohe__medical_specialty_Orthopedics,ohe__medical_specialty_Other,...,ohe__med_ever_changed_Yes,ohe__diabetic_med_always_prescribed_Yes,ohe__diabetic_med_ever_prescribed_Yes,ohe__always_readmitted_Yes,ohe__ever_readmitted_Yes,quant__num_procedures,quant__num_times_glu_high,quant__num_times_a1c_high,quant__log_time_in_hospital,quant__log_num_lab_procedures,quant__log_num_medications,quant__log_number_outpatient,quant__log_number_emergency,quant__log_number_inpatient,quant__log_number_diagnoses,quant__log_num_encounters,quant__log_avg_time_in_hospital,quant__log_avg_num_lab_procedures,quant__log_avg_num_procedures,quant__log_avg_times_glu_high,quant__log_avg_times_a1c_high,quant__log_avg_times_med_changed,quant__log_num_times_med_changed,quant__log_avg_times_readmitted,quant__log_num_times_readmitted
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,-1.554,-1.408,-1.016,-0.375,-0.291,-0.564,-2.442,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.257,-3.683,-1.828,-0.375,-0.291,-0.564,-1.018,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,-1.554,-3.683,0.372,-0.375,-0.291,-0.564,-2.442,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,0.160,0.212,-0.280,-0.375,-0.291,-0.564,0.775,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.796,0.378,-0.805,-0.375,-0.291,-0.564,-1.018,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106429,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,0.790,-0.288,-0.614,-0.375,-0.291,-0.564,0.775,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
106430,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,-0.802,-0.076,-0.18,-1.554,-3.683,-0.440,-0.375,-0.291,-0.564,0.405,1.720,0.860,1.843,2.648,-0.093,-0.188,1.978,2.230,-0.283,-0.287
106431,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,0.501,0.614,0.583,-0.375,-0.291,-0.564,0.405,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
106432,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,1.260,0.525,1.108,-0.375,-0.291,-0.564,-0.008,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287


<h2> Lasso Logistic Using SMOTENC </h2>

With the oversampled dataset, we can now try lasso logisitic regression.

In [68]:
# Lasso Logistic Regression

start_time = time.time()

# C values to try 
Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# logistic regression model with elasticnet penalty and 10-fold CV
lasso_logit_SMOTENC = LogisticRegressionCV(
    Cs=Cs, cv=3, penalty='l1', solver='liblinear', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109,
).fit(X_train_resampled_transformed, y_train_resampled)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

Model fitting time: 3217.4 seconds


Lasso logistic regression trained on SMOTENC oversampled data performs much worse on the test set, so don't move forward with this.

In [70]:
print_stats('TRAIN - LASSO', lasso_logit_SMOTENC, X_train_transformed, y_train)
print_stats('TRAIN RESAMPLED - LASSO', lasso_logit_SMOTENC, X_train_resampled_transformed, y_train_resampled)
print_stats('TEST - LASSO', lasso_logit_SMOTENC, X_test_transformed, y_test)

------------------TRAIN - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.1988
Accuracy:		Naive:0.95	Base Model:0.583
AUC:			0.583
------------------TRAIN RESAMPLED - LASSO----------------------
Readmitted Rate:	Actual:0.5	Predicted:0.5
Accuracy:		Naive:0.5	Base Model:0.94
AUC:			0.94
------------------TEST - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.2007
Accuracy:		Naive:0.95	Base Model:0.569
AUC:			0.569


In [83]:
lasso_logit_SMOTENC.coefs_paths_[1][0].shape

(7, 136)

In [84]:
X_train_resampled_transformed.describe()

Unnamed: 0,ohe__race_Asian,ohe__race_Caucasian,ohe__race_Hispanic,ohe__race_Other,ohe__race_UNK,ohe__gender_Male,ohe__gender_Unknown/Invalid,ohe__age_[50-60),ohe__age_[60-70),ohe__age_[70-80),ohe__age_[80-100),ohe__admission_type_id_Emergency,ohe__admission_type_id_Other,ohe__admission_type_id_Urgent,ohe__discharge_disposition_id_Home w/ Service,ohe__discharge_disposition_id_Other,ohe__discharge_disposition_id_SNF,ohe__admission_source_id_Other,ohe__admission_source_id_Physician Referral,ohe__medical_specialty_Emergency/Trauma,ohe__medical_specialty_Family/GeneralPractice,ohe__medical_specialty_InternalMedicine,ohe__medical_specialty_Nephrology,ohe__medical_specialty_Orthopedics,ohe__medical_specialty_Other,...,ohe__med_ever_changed_Yes,ohe__diabetic_med_always_prescribed_Yes,ohe__diabetic_med_ever_prescribed_Yes,ohe__always_readmitted_Yes,ohe__ever_readmitted_Yes,quant__num_procedures,quant__num_times_glu_high,quant__num_times_a1c_high,quant__log_time_in_hospital,quant__log_num_lab_procedures,quant__log_num_medications,quant__log_number_outpatient,quant__log_number_emergency,quant__log_number_inpatient,quant__log_number_diagnoses,quant__log_num_encounters,quant__log_avg_time_in_hospital,quant__log_avg_num_lab_procedures,quant__log_avg_num_procedures,quant__log_avg_times_glu_high,quant__log_avg_times_a1c_high,quant__log_avg_times_med_changed,quant__log_num_times_med_changed,quant__log_avg_times_readmitted,quant__log_num_times_readmitted
count,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106400.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,...,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0,106434.0
mean,0.004,0.854,0.012,0.009,0.014,0.446,2.819e-05,0.131,0.214,0.304,0.243,0.614,0.084,0.145,0.107,0.125,0.179,0.118,0.251,0.038,0.048,0.148,0.007,0.018,0.675,...,0.172,0.236,0.261,0.031,0.103,-0.115,0.028,0.018,0.055,0.077,0.078,-0.048,-0.032,0.122,0.075,0.144,0.134,0.126,0.073,0.024,0.007,0.11,0.135,0.106,0.135
std,0.061,0.353,0.107,0.093,0.119,0.497,0.005309,0.337,0.41,0.46,0.429,0.487,0.278,0.352,0.31,0.33,0.383,0.323,0.433,0.191,0.214,0.355,0.084,0.135,0.468,...,0.378,0.424,0.439,0.173,0.304,0.915,1.078,0.949,0.973,0.945,0.944,0.924,0.961,1.111,0.904,1.137,1.088,1.073,1.023,1.031,0.926,1.064,1.153,1.074,1.166
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-1.554,-3.683,-4.028,-0.375,-0.291,-0.564,-4.875,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.796,-0.049,-0.44,-0.375,-0.291,-0.564,-0.477,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,0.16,0.325,0.136,-0.375,-0.291,-0.564,0.405,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
75%,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.354,-0.076,-0.18,0.79,0.635,0.681,-0.375,-0.291,0.987,0.775,1.08,1.02,1.443,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.668,93.627,33.835,2.216,1.617,3.408,8.812,14.809,6.14,2.638,7.912,3.278,2.585,4.555,12.309,6.337,2.891,11.132,4.285,12.951


<h2> Random Oversampling </h2>

Another technique is random oversampling, which will not impute data like SMOTENC does in the quantitative variables.

In [92]:
rand_oversamp = RandomOverSampler(random_state=109)

X_train_transformed_rand_resamp, y_train_rand_resamp = rand_oversamp.fit_resample(X_train_transformed, y_train)

In [93]:
X_train_transformed_rand_resamp

Unnamed: 0,ohe__race_Asian,ohe__race_Caucasian,ohe__race_Hispanic,ohe__race_Other,ohe__race_UNK,ohe__gender_Male,ohe__gender_Unknown/Invalid,ohe__age_[50-60),ohe__age_[60-70),ohe__age_[70-80),ohe__age_[80-100),ohe__admission_type_id_Emergency,ohe__admission_type_id_Other,ohe__admission_type_id_Urgent,ohe__discharge_disposition_id_Home w/ Service,ohe__discharge_disposition_id_Other,ohe__discharge_disposition_id_SNF,ohe__admission_source_id_Other,ohe__admission_source_id_Physician Referral,ohe__medical_specialty_Emergency/Trauma,ohe__medical_specialty_Family/GeneralPractice,ohe__medical_specialty_InternalMedicine,ohe__medical_specialty_Nephrology,ohe__medical_specialty_Orthopedics,ohe__medical_specialty_Other,...,ohe__med_ever_changed_Yes,ohe__diabetic_med_always_prescribed_Yes,ohe__diabetic_med_ever_prescribed_Yes,ohe__always_readmitted_Yes,ohe__ever_readmitted_Yes,quant__num_procedures,quant__num_times_glu_high,quant__num_times_a1c_high,quant__log_time_in_hospital,quant__log_num_lab_procedures,quant__log_num_medications,quant__log_number_outpatient,quant__log_number_emergency,quant__log_number_inpatient,quant__log_number_diagnoses,quant__log_num_encounters,quant__log_avg_time_in_hospital,quant__log_avg_num_lab_procedures,quant__log_avg_num_procedures,quant__log_avg_times_glu_high,quant__log_avg_times_a1c_high,quant__log_avg_times_med_changed,quant__log_num_times_med_changed,quant__log_avg_times_readmitted,quant__log_num_times_readmitted
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,-1.554,-1.408,-1.016,-0.375,-0.291,-0.564,-2.442,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.257,-3.683,-1.828,-0.375,-0.291,-0.564,-1.018,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.224,-0.076,-0.18,-1.554,-3.683,0.372,-0.375,-0.291,-0.564,-2.442,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,0.160,0.212,-0.280,-0.375,-0.291,-0.564,0.775,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.802,-0.076,-0.18,-0.796,0.378,-0.805,-0.375,-0.291,-0.564,-1.018,-0.501,-0.521,-0.539,-0.371,-0.093,-0.188,-0.374,-0.357,-0.283,-0.287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106429,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,-0.802,-0.076,-0.18,1.948,0.182,-0.805,1.339,-0.291,0.987,-0.008,1.080,2.209,1.864,-0.371,-0.093,-0.188,2.891,1.802,-0.283,-0.287
106430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,-0.802,-0.076,-0.18,1.040,0.270,-0.131,-0.375,-0.291,1.895,0.775,2.005,1.871,1.973,0.655,-0.093,-0.188,-0.374,-0.357,2.389,2.640
106431,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.354,-0.076,-0.18,0.160,-0.528,1.257,-0.375,-0.291,1.895,-1.658,2.005,2.105,1.826,3.703,-0.093,-0.188,2.891,3.065,4.285,4.351
106432,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.354,-0.076,-0.18,-1.554,-1.640,-0.805,-0.375,2.321,4.353,-1.658,4.510,1.871,2.011,1.682,-0.093,-0.188,2.587,6.119,2.917,7.278


<h2> Lasso Using Random Oversampling </h2>

In [94]:
# Lasso Logistic Regression

start_time = time.time()

# C values to try 
Cs = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

# logistic regression model with elasticnet penalty and 10-fold CV
lasso_logit_rand_oversamp = LogisticRegressionCV(
    Cs=Cs, cv=3, penalty='l1', solver='liblinear', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109,
).fit(
X_train_transformed_rand_resamp, y_train_rand_resamp)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

Model fitting time: 600.6 seconds


Random oversampling performs slightly better in AUC than lasso without oversampling, but no features are zeroed out.

In [95]:
print_stats('TRAIN - LASSO', lasso_logit_rand_oversamp, X_train_transformed, y_train)
print_stats('TRAIN RESAMPLED - LASSO', lasso_logit_rand_oversamp, X_train_transformed_rand_resamp, y_train_rand_resamp)
print_stats('TEST - LASSO', lasso_logit_rand_oversamp, X_test_transformed, y_test)

------------------TRAIN - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.4353
Accuracy:		Naive:0.95	Base Model:0.716
AUC:			0.716
------------------TRAIN RESAMPLED - LASSO----------------------
Readmitted Rate:	Actual:0.5	Predicted:0.5
Accuracy:		Naive:0.5	Base Model:0.719
AUC:			0.719
------------------TEST - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.4355
Accuracy:		Naive:0.95	Base Model:0.683
AUC:			0.683


In [113]:
sum(lasso_logit_rand_oversamp.coef_[0] != 0)

135

In [111]:
max(abs(lasso_logit_rand_oversamp.coef_[0]))

5.789701824911571

In [99]:
lasso_logit_rand_oversamp.n_iter_

array([[[13, 16, 19, 29, 29, 47, 44],
        [15, 17, 24, 25, 33, 42, 40],
        [12, 18, 18, 28, 30, 41, 32]]], dtype=int32)

In [114]:
X_train_transformed_rand_resamp_poly = poly.fit_transform(X_train_transformed_rand_resamp[unzeroed_feats])

In [115]:
X_train_transformed_rand_resamp_poly.shape

(106434, 5994)

In [116]:
X_test_transformed_poly.shape

(13998, 5994)

Random oversampling seems to help a little, but since no features are zeroed out we do not eliminate any poly features like before. Ultimately, we find this to be impractical and do not move forward.

In [133]:
start_time = time.time()

# C values to try 
Cs = [1e2, 1e3, 1e4]

# logistic regression model with elasticnet penalty and 10-fold CV
lasso_logit_rand_resamp_poly = LogisticRegressionCV(
    Cs=Cs, cv=2, penalty='l1', solver='liblinear', n_jobs=-1, # solver='liblinear' for l1
    scoring='roc_auc', max_iter=1000, random_state=109,
).fit(X_train_transformed_rand_resamp_poly, y_train_rand_resamp)

end_time = time.time()
print(f"Model fitting time: {end_time - start_time:.1f} seconds")

KeyboardInterrupt: 

In [122]:
print_stats('TRAIN - LASSO', lasso_logit_rand_resamp_poly, X_train_transformed_poly, y_train)
print_stats('TRAIN RESAMPLED - LASSO', lasso_logit_rand_resamp_poly, X_train_transformed_rand_resamp_poly, y_train_rand_resamp)
print_stats('TEST - LASSO', lasso_logit_rand_resamp_poly, X_test_transformed_poly, y_test)

------------------TRAIN - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.2437
Accuracy:		Naive:0.95	Base Model:0.925
AUC:			0.925
------------------TRAIN RESAMPLED - LASSO----------------------
Readmitted Rate:	Actual:0.5	Predicted:0.5
Accuracy:		Naive:0.5	Base Model:0.927
AUC:			0.927
------------------TEST - LASSO----------------------
Readmitted Rate:	Actual:0.0496	Predicted:0.2478
Accuracy:		Naive:0.95	Base Model:0.589
AUC:			0.589


In [127]:
lasso_logit_rand_resamp_poly.C_

array([100.])

In [126]:
sum(lasso_logit_rand_resamp_poly.coef_[0] == 0)

1113

In [118]:
y_train_rand_resamp.head()

0    0
1    0
2    0
3    0
4    0
Name: readmitted, dtype: int64

In [119]:
y_train.head()

patient_nbr
17047953    0
36361287    0
16037325    0
22436523    0
60087429    0
Name: readmitted, dtype: int64