In [2]:
pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

import category_encoders as ce

In [4]:
# Read diabetic_data.csv from the working directory and preview the first rows.
raw_csv_path = 'diabetic_data.csv'
readmission_df = pd.read_csv(raw_csv_path)
readmission_df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
# Show the full list of column names; the head() preview elides the middle columns.
readmission_df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [6]:
# One-hot encode the categorical features listed below; columns that are not
# listed are handed back by the encoder unchanged.
one_hot_columns = [
    'race', 'gender', 'age', 'admission_type_id',
    'diag_1', 'diag_2', 'diag_3',
    'number_diagnoses', 'max_glu_serum', 'A1Cresult',
]
ce_OHE = ce.OneHotEncoder(cols=one_hot_columns)

readmission_df_encoded = ce_OHE.fit_transform(readmission_df)
readmission_df_encoded.head()

Unnamed: 0,encounter_id,patient_nbr,race_1,race_2,race_3,race_4,race_5,race_6,gender_1,gender_2,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,1,0,0,0,0,0,1,0,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,1,0,0,0,0,0,1,0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,0,1,0,0,0,0,1,0,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,1,0,0,0,0,0,0,1,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,1,0,0,0,0,0,0,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [7]:
# Remove the columns that are not used as features. They are grouped into
# administrative/utilisation fields and the medication-related fields.
administrative_columns = [
    'admission_source_id', 'payer_code', 'medical_specialty',
    'number_inpatient', 'number_outpatient', 'number_emergency',
    'weight', 'patient_nbr',
]
medication_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
    'change', 'diabetesMed',
]
readmission_cleaned = readmission_df_encoded.drop(
    columns=administrative_columns + medication_columns
)
readmission_cleaned.head()

Unnamed: 0,encounter_id,race_1,race_2,race_3,race_4,race_5,race_6,gender_1,gender_2,gender_3,...,number_diagnoses_16,max_glu_serum_1,max_glu_serum_2,max_glu_serum_3,max_glu_serum_4,A1Cresult_1,A1Cresult_2,A1Cresult_3,A1Cresult_4,readmitted
0,2278392,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,NO
1,149190,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,>30
2,64410,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,NO
3,500364,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,NO
4,16680,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,NO


In [12]:
# Binarize the target column `readmitted`:
#   '<30'        -> 1  (readmitted within 30 days)
#   '>30', 'NO'  -> 0
#
# The mapping is applied element-wise with Series.map. Assigning a scalar to
# the column inside a loop broadcasts that single value to every row, which
# collapses the whole column to one constant and loses the '<30' cases.
# `.assign` returns a new frame, so `readmission_cleaned` keeps its original
# string labels and this cell gives the same result when re-run.
readmitted_codes = {'NO': 0, '>30': 0, '<30': 1}

# Fail loudly on labels outside the mapping instead of silently producing NaN.
unexpected_labels = set(readmission_cleaned['readmitted'].unique()) - set(readmitted_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected 'readmitted' labels: {sorted(unexpected_labels, key=str)}"
    )

readmission_final = readmission_cleaned.assign(
    readmitted=readmission_cleaned['readmitted'].map(readmitted_codes).astype(int)
)
readmission_final.head()

Unnamed: 0,encounter_id,race_1,race_2,race_3,race_4,race_5,race_6,gender_1,gender_2,gender_3,...,number_diagnoses_16,max_glu_serum_1,max_glu_serum_2,max_glu_serum_3,max_glu_serum_4,A1Cresult_1,A1Cresult_2,A1Cresult_3,A1Cresult_4,readmitted
0,2278392,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
1,149190,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2,64410,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,500364,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
4,16680,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
