# Quantifying the World
## Satvik Ajmera
## Summer 2022

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

Your case study is to build a classifier using logistic regression to predict hospital readmittance. There is missing data that must be imputed. Once again, discuss variable importances as part of your submission.

In [2]:
pd.set_option('display.max_columns', 100)

In [3]:
pd.set_option('display.max_rows', 100)

In [4]:
diabetes_df = pd.read_csv("../dataset_diabetes/diabetic_data.csv")

In [5]:
ids = pd.read_csv("../dataset_diabetes/IDs_mapping.csv")

In [6]:
ids = ids.dropna(subset=["admission_type_id"])

In [7]:
ids

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped
9,discharge_disposition_id,description
10,1,Discharged to home


In [8]:
admission_type_id = ids[0:8]

In [9]:
admission_type_id = admission_type_id.reset_index(drop=True)

In [10]:
admission_type_id

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped


In [11]:
admission_type_id["admission_type_id"] = admission_type_id["admission_type_id"].astype(int)

In [12]:
discharge_disposition_id = ids[9:39]

In [13]:
discharge_disposition_id.reset_index(inplace=True,drop=True)

In [14]:
discharge_disposition_id.columns = ["discharge_disposition_id","description"]

In [15]:
discharge_disposition_id = discharge_disposition_id.reset_index(drop=True)

In [16]:
discharge_disposition_id["discharge_disposition_id"] = discharge_disposition_id["discharge_disposition_id"].astype(int)

In [17]:
admission_source_id = ids[40:]

In [18]:
admission_source_id.reset_index(inplace=True, drop=True)

In [19]:
admission_source_id.columns = ["admission_source_id", "description"]

In [20]:
admission_source_id["admission_source_id"] = admission_source_id["admission_source_id"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  admission_source_id["admission_source_id"] = admission_source_id["admission_source_id"].astype(int)


In [21]:
df = diabetes_df.merge(admission_type_id, on= "admission_type_id", how = "left").merge(discharge_disposition_id, on= "discharge_disposition_id", how = "left").merge(admission_source_id, on= "admission_source_id", how = "left")

In [22]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,description_x,description_y,description
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,,Not Mapped,Physician Referral
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


In [23]:
df_replaced = df.rename({'description_x': 'admission_type_id_x', 'description_y': 'discharge_disposition_id_y','description':'admission_source_id_z'}, axis=1)

In [24]:
df_replaced

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,,Not Mapped,Physician Referral
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,MC,?,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged/transferred to SNF,Emergency Room
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,MC,?,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,Emergency,Discharged/transferred to ICF,Transfer from a Skilled Nursing Facility (SNF)
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,MC,?,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,MC,Surgery-General,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Urgent,Discharged/transferred to SNF,Emergency Room


In [25]:
df_replaced["admission_type_id_x"].value_counts()

Emergency        53990
Elective         18869
Urgent           18480
Not Available     4785
Not Mapped         320
Trauma Center       21
Newborn             10
Name: admission_type_id_x, dtype: int64

In [26]:
df_replaced["admission_source_id_z"].value_counts()

 Emergency Room                                               57494
 Physician Referral                                           29565
Transfer from a hospital                                       3187
 Transfer from another health care facility                    2264
Clinic Referral                                                1104
 Transfer from a Skilled Nursing Facility (SNF)                 855
HMO Referral                                                    187
 Not Mapped                                                     161
 Not Available                                                  125
 Court/Law Enforcement                                           16
 Transfer from hospital inpt/same fac reslt in a sep claim       12
 Transfer from critial access hospital                            8
 Extramural Birth                                                 2
Normal Delivery                                                   2
 Transfer from Ambulatory Surgery Center        

In [27]:
df_replaced["admission_type_id"].value_counts()

1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtype: int64

Drop the Subset of all NaN's in the columns of admission_type_id_x 	discharge_disposition_id_y 	admission_source_id_z

In [28]:
df_replaced["patient_nbr"].value_counts()

88785891     40
43140906     28
1660293      23
88227540     23
23199021     23
             ..
11005362      1
98252496      1
1019673       1
13396320      1
175429310     1
Name: patient_nbr, Length: 71518, dtype: int64

In [29]:
from pandas_profiling import ProfileReport

In [30]:
pt = df_replaced["patient_nbr"].value_counts().reset_index()

In [31]:
df_replaced[df_replaced["patient_nbr"]== 88785891]

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
38307,119039172,88785891,Caucasian,Female,[20-30),?,1,1,7,1,OG,Emergency/Trauma,32,0,10,0,0,0,250.13,244,490,3,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
40252,125094312,88785891,Caucasian,Female,[20-30),?,1,1,7,1,BC,Emergency/Trauma,10,0,10,1,0,2,250.11,244,?,2,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30,Emergency,Discharged to home,Emergency Room
40661,126171582,88785891,Caucasian,Female,[20-30),?,1,1,7,5,BC,Emergency/Trauma,35,0,15,1,0,3,250.13,536,V58,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
44515,137245596,88785891,Caucasian,Female,[20-30),?,3,1,7,2,SP,Surgery-General,1,0,8,1,0,5,250.11,465,244,3,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Elective,Discharged to home,Emergency Room
45147,139425576,88785891,Caucasian,Female,[20-30),?,1,1,7,2,BC,Emergency/Trauma,32,0,4,1,0,6,250.11,V58,V15,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Emergency,Discharged to home,Emergency Room
45986,141994242,88785891,Caucasian,Female,[20-30),?,2,1,7,4,BC,Emergency/Trauma,8,0,14,1,0,7,250.1,276,276,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Urgent,Discharged to home,Emergency Room
50167,150986298,88785891,Caucasian,Female,[20-30),?,2,1,7,1,BC,Emergency/Trauma,2,0,9,4,1,9,8.0,250.01,V09,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Urgent,Discharged to home,Emergency Room
50393,151413846,88785891,Caucasian,Female,[20-30),?,1,1,7,4,SP,Emergency/Trauma,64,0,19,4,1,9,250.11,305,V58,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Emergency,Discharged to home,Emergency Room
50773,152188656,88785891,Caucasian,Female,[20-30),?,2,7,7,1,BC,Surgery-General,33,0,10,4,1,10,250.11,244,V15,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Urgent,Left AMA,Emergency Room
51519,153558456,88785891,Caucasian,Female,[20-30),?,2,1,7,1,BC,Emergency/Trauma,35,0,10,3,1,11,250.11,V15,V58,4,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30,Urgent,Discharged to home,Emergency Room


In [32]:
df_drop = df_replaced.drop_duplicates(keep="first",subset="patient_nbr")

In [33]:
df_drop.isna().sum()

encounter_id                     0
patient_nbr                      0
race                             0
gender                           0
age                              0
weight                           0
admission_type_id                0
discharge_disposition_id         0
admission_source_id              0
time_in_hospital                 0
payer_code                       0
medical_specialty                0
num_lab_procedures               0
num_procedures                   0
num_medications                  0
number_outpatient                0
number_emergency                 0
number_inpatient                 0
diag_1                           0
diag_2                           0
diag_3                           0
number_diagnoses                 0
max_glu_serum                    0
A1Cresult                        0
metformin                        0
repaglinide                      0
nateglinide                      0
chlorpropamide                   0
glimepiride         

In [34]:
df_drop = df_drop.dropna(subset=["admission_type_id_x","discharge_disposition_id_y","admission_source_id_z"])

In [35]:
df_drop.isna().sum()

encounter_id                  0
patient_nbr                   0
race                          0
gender                        0
age                           0
weight                        0
admission_type_id             0
discharge_disposition_id      0
admission_source_id           0
time_in_hospital              0
payer_code                    0
medical_specialty             0
num_lab_procedures            0
num_procedures                0
num_medications               0
number_outpatient             0
number_emergency              0
number_inpatient              0
diag_1                        0
diag_2                        0
diag_3                        0
number_diagnoses              0
max_glu_serum                 0
A1Cresult                     0
metformin                     0
repaglinide                   0
nateglinide                   0
chlorpropamide                0
glimepiride                   0
acetohexamide                 0
glipizide                     0
glyburid

In [36]:
df_drop["admission_type_id_x"].value_counts()

Emergency        34732
Elective         13163
Urgent           12511
Not Available     1329
Not Mapped         291
Trauma Center       21
Newborn              9
Name: admission_type_id_x, dtype: int64

In [37]:
df_drop["discharge_disposition_id_y"].value_counts()

Discharged to home                                                                                           40677
Discharged/transferred to SNF                                                                                 7727
Discharged/transferred to home with home health service                                                       7366
Discharged/transferred to another rehab fac including rehab units of a hospital .                             1370
Discharged/transferred to another short term hospital                                                         1337
Expired                                                                                                        962
Discharged/transferred to another type of inpatient care institution                                           798
Discharged/transferred to ICF                                                                                  533
Left AMA                                                                        

In [38]:
df_drop["admission_source_id_z"].value_counts()

 Emergency Room                                               35659
 Physician Referral                                           20765
Transfer from a hospital                                       2192
 Transfer from another health care facility                    1780
Clinic Referral                                                 839
 Transfer from a Skilled Nursing Facility (SNF)                 469
 Not Mapped                                                     155
 Not Available                                                   95
HMO Referral                                                     74
 Court/Law Enforcement                                           11
 Transfer from critial access hospital                            7
 Transfer from hospital inpt/same fac reslt in a sep claim        4
 Extramural Birth                                                 2
 Transfer from Ambulatory Surgery Center                          2
Normal Delivery                                 

In [39]:
next_df = df_drop.drop(columns = ["admission_type_id","discharge_disposition_id", "admission_source_id"])

In [40]:
next_df = next_df.replace("?",np.nan)

In [41]:
next_df.isna().sum()

encounter_id                      0
patient_nbr                       0
race                           1822
gender                            0
age                               0
weight                        59700
time_in_hospital                  0
payer_code                    22524
medical_specialty             30406
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency                  0
number_inpatient                  0
diag_1                           10
diag_2                          274
diag_3                         1039
number_diagnoses                  0
max_glu_serum                     0
A1Cresult                         0
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide                    0
glimepiride                       0
acetohexamide                     0
glipizide                   

Drop weight, and payer code for the reasons in the paper. However, replace medical specialty and race missing values to Missing.

In [42]:
next_df = next_df.drop(columns = ["weight","payer_code"])

In [43]:
next_df.isna().sum()

encounter_id                      0
patient_nbr                       0
race                           1822
gender                            0
age                               0
time_in_hospital                  0
medical_specialty             30406
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency                  0
number_inpatient                  0
diag_1                           10
diag_2                          274
diag_3                         1039
number_diagnoses                  0
max_glu_serum                     0
A1Cresult                         0
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide                    0
glimepiride                       0
acetohexamide                     0
glipizide                         0
glyburide                         0
tolbutamide                 

In [44]:
next_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
1,149190,55629189,Caucasian,Female,[10-20),3,,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),2,,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),2,,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),1,,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
5,35754,82637451,Caucasian,Male,[50-60),3,,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Urgent,Discharged to home,Clinic Referral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,443842016,183087545,Caucasian,Female,[70-80),9,,50,2,33,0,0,0,574,574,250.02,9,,>7,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
101755,443842022,188574944,Other,Female,[40-50),14,,73,6,26,0,1,0,592,599,518,9,,>8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
101756,443842070,140199494,Other,Female,[60-70),2,,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Emergency,Discharged to home,Emergency Room
101758,443842340,120975314,Caucasian,Female,[80-90),5,,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


In [45]:
new_df = next_df.replace(np.nan, "Missing")

In [46]:
new_df["age"].value_counts()

[70-80)     15549
[60-70)     13918
[50-60)     10887
[80-90)     10076
[40-50)      5952
[30-40)      2359
[90-100)     1658
[20-30)      1028
[10-20)       488
[0-10)        141
Name: age, dtype: int64

In [47]:
new_df["admission_source_id_z"].value_counts()

 Emergency Room                                               35659
 Physician Referral                                           20765
Transfer from a hospital                                       2192
 Transfer from another health care facility                    1780
Clinic Referral                                                 839
 Transfer from a Skilled Nursing Facility (SNF)                 469
 Not Mapped                                                     155
 Not Available                                                   95
HMO Referral                                                     74
 Court/Law Enforcement                                           11
 Transfer from critial access hospital                            7
 Transfer from hospital inpt/same fac reslt in a sep claim        4
 Extramural Birth                                                 2
 Transfer from Ambulatory Surgery Center                          2
Normal Delivery                                 

In [48]:
new_df["discharge_disposition_id_y"].value_counts()

Discharged to home                                                                                           40677
Discharged/transferred to SNF                                                                                 7727
Discharged/transferred to home with home health service                                                       7366
Discharged/transferred to another rehab fac including rehab units of a hospital .                             1370
Discharged/transferred to another short term hospital                                                         1337
Expired                                                                                                        962
Discharged/transferred to another type of inpatient care institution                                           798
Discharged/transferred to ICF                                                                                  533
Left AMA                                                                        

In [49]:
new_df["admission_type_id_x"].value_counts()

Emergency        34732
Elective         13163
Urgent           12511
Not Available     1329
Not Mapped         291
Trauma Center       21
Newborn              9
Name: admission_type_id_x, dtype: int64

In [50]:
new_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
1,149190,55629189,Caucasian,Female,[10-20),3,Missing,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),2,Missing,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),2,Missing,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),1,Missing,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
5,35754,82637451,Caucasian,Male,[50-60),3,Missing,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Urgent,Discharged to home,Clinic Referral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,443842016,183087545,Caucasian,Female,[70-80),9,Missing,50,2,33,0,0,0,574,574,250.02,9,,>7,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
101755,443842022,188574944,Other,Female,[40-50),14,Missing,73,6,26,0,1,0,592,599,518,9,,>8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
101756,443842070,140199494,Other,Female,[60-70),2,Missing,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Emergency,Discharged to home,Emergency Room
101758,443842340,120975314,Caucasian,Female,[80-90),5,Missing,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


Removed hospice and expired for the id columns

In [51]:
removed_df = new_df[(new_df["discharge_disposition_id_y"] != "Hospice / home") & (new_df["discharge_disposition_id_y"] != "Hospice / medical facility") & (new_df["discharge_disposition_id_y"] != "Expired") & (new_df["discharge_disposition_id_y"] != "Expired at home. Medicaid only, hospice.")]

In [52]:
removed_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
1,149190,55629189,Caucasian,Female,[10-20),3,Missing,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
2,64410,86047875,AfricanAmerican,Female,[20-30),2,Missing,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
3,500364,82442376,Caucasian,Male,[30-40),2,Missing,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,16680,42519267,Caucasian,Male,[40-50),1,Missing,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
5,35754,82637451,Caucasian,Male,[50-60),3,Missing,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Urgent,Discharged to home,Clinic Referral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,443842016,183087545,Caucasian,Female,[70-80),9,Missing,50,2,33,0,0,0,574,574,250.02,9,,>7,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
101755,443842022,188574944,Other,Female,[40-50),14,Missing,73,6,26,0,1,0,592,599,518,9,,>8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
101756,443842070,140199494,Other,Female,[60-70),2,Missing,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Emergency,Discharged to home,Emergency Room
101758,443842340,120975314,Caucasian,Female,[80-90),5,Missing,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


In [53]:
removed_df.reset_index(inplace=True,drop=True)

In [54]:
removed_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
0,149190,55629189,Caucasian,Female,[10-20),3,Missing,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
1,64410,86047875,AfricanAmerican,Female,[20-30),2,Missing,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
2,500364,82442376,Caucasian,Male,[30-40),2,Missing,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
3,16680,42519267,Caucasian,Male,[40-50),1,Missing,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,35754,82637451,Caucasian,Male,[50-60),3,Missing,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Urgent,Discharged to home,Clinic Referral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60667,443842016,183087545,Caucasian,Female,[70-80),9,Missing,50,2,33,0,0,0,574,574,250.02,9,,>7,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
60668,443842022,188574944,Other,Female,[40-50),14,Missing,73,6,26,0,1,0,592,599,518,9,,>8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
60669,443842070,140199494,Other,Female,[60-70),2,Missing,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Emergency,Discharged to home,Emergency Room
60670,443842340,120975314,Caucasian,Female,[80-90),5,Missing,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room


Finally, we need to drop the encounter_id and patient_nbr

In [55]:
final_df = removed_df.drop(columns = ["encounter_id","patient_nbr"])

In [56]:
final_df.head()

Unnamed: 0,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
0,Caucasian,Female,[10-20),3,Missing,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,Emergency,Discharged to home,Emergency Room
1,AfricanAmerican,Female,[20-30),2,Missing,11,5,13,2,0,1,648,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Emergency,Discharged to home,Emergency Room
2,Caucasian,Male,[30-40),2,Missing,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
3,Caucasian,Male,[40-50),1,Missing,51,0,8,0,0,0,197,157.0,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Emergency,Discharged to home,Emergency Room
4,Caucasian,Male,[50-60),3,Missing,31,6,16,0,0,0,414,411.0,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,Urgent,Discharged to home,Clinic Referral


In [58]:
final_df['readmitted'].value_counts()

NO     36176
>30    19116
<30     5380
Name: readmitted, dtype: int64

Convert NO to class 1, >30 to class 3, <30 to class 2

In [59]:
final_df['readmitted'] = final_df['readmitted'].replace({"NO":1,">30":3,"<30":2})

In [60]:
final_df

Unnamed: 0,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
0,Caucasian,Female,[10-20),3,Missing,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,3,Emergency,Discharged to home,Emergency Room
1,AfricanAmerican,Female,[20-30),2,Missing,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,1,Emergency,Discharged to home,Emergency Room
2,Caucasian,Male,[30-40),2,Missing,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,1,Emergency,Discharged to home,Emergency Room
3,Caucasian,Male,[40-50),1,Missing,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,1,Emergency,Discharged to home,Emergency Room
4,Caucasian,Male,[50-60),3,Missing,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,3,Urgent,Discharged to home,Clinic Referral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60667,Caucasian,Female,[70-80),9,Missing,50,2,33,0,0,0,574,574,250.02,9,,>7,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,3,Emergency,Discharged to home,Emergency Room
60668,Other,Female,[40-50),14,Missing,73,6,26,0,1,0,592,599,518,9,,>8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,3,Emergency,Discharged to home,Emergency Room
60669,Other,Female,[60-70),2,Missing,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,3,Emergency,Discharged to home,Emergency Room
60670,Caucasian,Female,[80-90),5,Missing,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,1,Emergency,Discharged to home,Emergency Room


In [61]:
final_df.head()

Unnamed: 0,race,gender,age,time_in_hospital,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type_id_x,discharge_disposition_id_y,admission_source_id_z
0,Caucasian,Female,[10-20),3,Missing,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,3,Emergency,Discharged to home,Emergency Room
1,AfricanAmerican,Female,[20-30),2,Missing,11,5,13,2,0,1,648,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,1,Emergency,Discharged to home,Emergency Room
2,Caucasian,Male,[30-40),2,Missing,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,1,Emergency,Discharged to home,Emergency Room
3,Caucasian,Male,[40-50),1,Missing,51,0,8,0,0,0,197,157.0,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,1,Emergency,Discharged to home,Emergency Room
4,Caucasian,Male,[50-60),3,Missing,31,6,16,0,0,0,414,411.0,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,3,Urgent,Discharged to home,Clinic Referral


In [62]:
X = final_df.drop("readmitted", axis = 1).copy()
y = final_df["readmitted"].values.copy()

In [63]:
numeric_features = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                    'num_medications','number_outpatient','number_emergency',
                    'number_inpatient','number_diagnoses']

In [64]:
categorical_features = [i for i in X.columns if i not in numeric_features]

## Preprocessing and Standard Scaling use sklearn's pipeline.

In [65]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler(with_mean=False))])


categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(drop="first"))])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [66]:
X_scaled = pipeline.fit_transform(X)

In [67]:
X_scaled

<60672x2310 sparse matrix of type '<class 'numpy.float64'>'
	with 1764138 stored elements in Compressed Sparse Row format>

In [68]:
y

array([3, 1, 1, ..., 3, 1, 1])

# Logistic Regression with no regularization

In [71]:
from sklearn.model_selection import StratifiedShuffleSplit 
sss_cv = StratifiedShuffleSplit(n_splits=5,test_size=0.10, random_state=42)

In [72]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
import time

lr_clf = LogisticRegression(multi_class = "multinomial",max_iter=5000, random_state=42)
acc_scores = []
iter_num=0

for train_indices, test_indices in sss_cv.split(X_scaled,y): 
    X_train = X_scaled[train_indices]
    y_train = y[train_indices]
    
    X_test = X_scaled[test_indices]
    y_test = y[test_indices]
    lr_clf.fit(X_train,y_train)  # train object

    y_hat = lr_clf.predict(X_test) # get test set predictions
    print("====Iteration",iter_num," ====")
    acc = mt.accuracy_score(y_test,y_hat)
    acc_scores.append(acc)
    print('Accuracy:', acc)
    iter_num+=1

====Iteration 0  ====
Accuracy: 0.6143704680290046
====Iteration 1  ====
Accuracy: 0.6058009228740936
====Iteration 2  ====
Accuracy: 0.6092617007251153
====Iteration 3  ====
Accuracy: 0.6086025049439684
====Iteration 4  ====
Accuracy: 0.6072841133816743
CPU times: user 2min 4s, sys: 4.37 s, total: 2min 8s
Wall time: 2min 8s


In [73]:
print("Mean Accuracy is:",np.mean(acc))

Mean Accuracy is: 0.6072841133816743


# L1 Regularization Grid Search

In [74]:
from sklearn.model_selection import StratifiedShuffleSplit 
sss_cv = StratifiedShuffleSplit(n_splits=5,test_size=0.10, random_state=42)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


regEstimator = LogisticRegression(multi_class = "multinomial")

parameters = { 'penalty':['l1']
              ,'C': [0.001, 0.01, 0.1]
              ,'random_state': [42]
              ,'solver': ['saga']
              ,'max_iter':[5000]
             }


GridSearch = GridSearchCV(estimator=regEstimator
                   , verbose=3
                   , param_grid=parameters
                   , cv=sss_cv 
                   , scoring='accuracy')

GridSearch.fit(X_scaled, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END C=0.001, max_iter=5000, penalty=l1, random_state=42, solver=saga;, score=0.602 total time=  46.0s
[CV 2/5] END C=0.001, max_iter=5000, penalty=l1, random_state=42, solver=saga;, score=0.603 total time=  45.8s
[CV 3/5] END C=0.001, max_iter=5000, penalty=l1, random_state=42, solver=saga;, score=0.602 total time=  45.0s
[CV 4/5] END C=0.001, max_iter=5000, penalty=l1, random_state=42, solver=saga;, score=0.601 total time=  45.7s
[CV 5/5] END C=0.001, max_iter=5000, penalty=l1, random_state=42, solver=saga;, score=0.602 total time=  47.1s
[CV 1/5] END C=0.01, max_iter=5000, penalty=l1, random_state=42, solver=saga;, score=0.608 total time= 2.6min


In [None]:
l1_best_est = GridSearch.best_estimator_

In [None]:
print(l1_best_est)

In [None]:
l1_best_params = GridSearch.best_params_

In [None]:
print(l1_best_params)

In [None]:
l1_best_score = GridSearch.best_score_

In [None]:
print('Highest Accuracy', l1_best_score )

# L2 Regularization Gridsearch

In [62]:
from sklearn.model_selection import StratifiedShuffleSplit 
sss_cv = StratifiedShuffleSplit(n_splits=5,test_size=0.10, random_state=42)

In [63]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


regEstimator = LogisticRegression(multi_class = "multinomial")

parameters = { 'penalty':['l2']
              ,'C': [0.001, 0.01, 0.1]
              ,'random_state': [42]
              ,'solver': ['saga']
              ,'max_iter':[5000]
             }


l2_GridSearch = GridSearchCV(estimator=regEstimator
                   , verbose=3
                   , param_grid=parameters
                   , cv=sss_cv 
                   , scoring='accuracy')

l2_GridSearch.fit(X_scaled, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END C=0.001, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.613 total time=   0.7s
[CV 2/5] END C=0.001, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.615 total time=   0.5s
[CV 3/5] END C=0.001, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.616 total time=   0.7s
[CV 4/5] END C=0.001, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.616 total time=   0.7s
[CV 5/5] END C=0.001, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.614 total time=   0.7s
[CV 1/5] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.619 total time=   3.2s
[CV 2/5] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.621 total time=   2.7s
[CV 3/5] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=saga;, score=0.624 total time=   3.1s
[CV 4/5] END C=0.01, max_iter=5000, penalty=l2, random_

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.1,
            train_size=None),
             estimator=LogisticRegression(multi_class='multinomial'),
             param_grid={'C': [0.001, 0.01, 0.1], 'max_iter': [5000],
                         'penalty': ['l2'], 'random_state': [42],
                         'solver': ['saga']},
             scoring='accuracy', verbose=3)

In [64]:
l2_best_est = l2_GridSearch.best_estimator_

In [65]:
print(l2_best_est)

LogisticRegression(C=0.1, max_iter=5000, multi_class='multinomial',
                   random_state=42, solver='saga')


In [66]:
l2_best_params = l2_GridSearch.best_params_

In [67]:
print(l2_best_params)

{'C': 0.1, 'max_iter': 5000, 'penalty': 'l2', 'random_state': 42, 'solver': 'saga'}


In [68]:
l2_best_score = l2_GridSearch.best_score_

In [69]:
print('Highest Accuracy', l2_best_score )

Highest Accuracy 0.6212052852078633


# Variable Importance - L2 Regularization

In [70]:
numeric_features_new = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                    'num_medications','number_outpatient','number_emergency',
                    'number_inpatient','number_diagnoses']

X_columns = final_df.drop("readmitted", axis = 1).copy()
X_columns = pipeline.fit(X_columns)
X_columns.named_steps["preprocessor"].transformers_[1][1].named_steps["onehot"]

column_name = X_columns.named_steps['preprocessor'].transformers_[1][1]\
   .named_steps['onehot'].get_feature_names_out(categorical_features)

column_name = list(column_name)
column_name[:0] = numeric_features_new


In [71]:
l2_coefficients = pd.concat([pd.DataFrame(column_name),pd.DataFrame(np.transpose(l2_best_est.coef_))], axis = 1)

In [73]:
l2_coefficients.columns = ["Features", "NO", "<30", ">30"]

In [74]:
l2_coefficients['Absolute_Sum'] = abs(l2_coefficients["NO"]) + abs(l2_coefficients["<30"]) + abs(l2_coefficients[">30"])


In [None]:
l2_coefficients.sort_values(by="Absolute_Sum", ascending=False)

In [78]:
l2_coefficients.sort_values(by="Absolute_Sum", ascending=False)

Unnamed: 0,Features,NO,<30,>30,Absolute_Sum
2298,discharge_disposition_id_y_Expired,2.074097,-0.897387,-1.176710,4.148195
2301,discharge_disposition_id_y_Hospice / medical f...,0.963771,-0.221466,-0.742305,1.927542
2300,discharge_disposition_id_y_Hospice / home,0.899576,-0.516744,-0.382832,1.799152
773,diag_1_V58,-0.544282,0.820788,-0.276506,1.641577
2297,discharge_disposition_id_y_Discharged/transfer...,-0.520666,0.730072,-0.209406,1.460144
...,...,...,...,...,...
547,diag_1_671,0.003712,-0.001161,-0.002551,0.007425
632,diag_1_804,0.002320,-0.001057,-0.001263,0.004640
1604,diag_3_263,-0.002249,0.001874,0.000375,0.004497
786,diag_2_130,0.001688,-0.000708,-0.000980,0.003377
