# Preprocessing the Dataset

In this file all preprocessing steps are done and at the end two files combined and preprocessed are generated for the use of EDA and models. Combined has the original dataset + all the one-hot features and preprocessed only has one hot features and labels for training models.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 20)
df = pd.read_csv('diabetic_data.csv')
df.shape

(101766, 50)

In [3]:
# Helper function to print out the details of each column of a dataframe
def datframe_value_counts(df):
    for c in df.columns:
        print ("---- %s --- %d unique values ---" % (c, df[c].nunique()))
        print (df[c].value_counts()[:20])
        
def scatter_plot(df, x, y, title=None, hold_plot = False):
    plt.scatter(df[x], df[y])
    plt.xlabel(x)
    plt.ylabel(y)
    
    if title:
        plt.title(title)
    
    if not hold_plot:
        plt.show()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            10176

In [5]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [7]:
df['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [8]:
df = pd.get_dummies(df, columns = ['readmitted'])
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,...,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_<30,readmitted_>30,readmitted_NO
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,...,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,1
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,...,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,0,1,0
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,...,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,0,0,1
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,...,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,0,0,1
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,...,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,0,0,1


Let's first drop the encounter ID and patient ID as they won't probably be useful (specially since there is no date included in the dataset).

In [9]:
df = df.drop(['encounter_id','patient_nbr'], 1)
df.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_<30,readmitted_>30,readmitted_NO
0,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,1
1,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,0,1,0
2,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,0,0,1
3,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,0,0,1
4,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,0,0,1


Instead of the intervals for age, let's convert all the values to the midpoint of the interval instead:

In [10]:
age_dict = {'[0-10)':5, '[10-20)':15, '[20-30)':25, '[30-40)':35, '[40-50)':45, '[50-60)':55, '[60-70)':65, '[70-80)':75, '[80-90)':85, '[90-100)':95}
df['age'] = df['age'].replace(age_dict)
df.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted_<30,readmitted_>30,readmitted_NO
0,Caucasian,Female,5,?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,1
1,Caucasian,Female,15,?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,0,1,0
2,AfricanAmerican,Female,25,?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,0,0,1
3,Caucasian,Male,35,?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,0,0,1
4,Caucasian,Male,45,?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,0,0,1


In [11]:
datframe_value_counts(df)

---- race --- 6 unique values ---
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64
---- gender --- 3 unique values ---
Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64
---- age --- 10 unique values ---
75    26068
65    22483
55    17256
85    17197
45     9685
35     3775
95     2793
25     1657
15      691
5       161
Name: age, dtype: int64
---- weight --- 10 unique values ---
?            98569
[75-100)      1336
[50-75)        897
[100-125)      625
[125-150)      145
[25-50)         97
[0-25)          48
[150-175)       35
[175-200)       11
>200             3
Name: weight, dtype: int64
---- admission_type_id --- 8 unique values ---
1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtype: int64
---- discharge_disposition_id --- 26 unique valu

---- metformin-pioglitazone --- 2 unique values ---
No        101765
Steady         1
Name: metformin-pioglitazone, dtype: int64
---- change --- 2 unique values ---
No    54755
Ch    47011
Name: change, dtype: int64
---- diabetesMed --- 2 unique values ---
Yes    78363
No     23403
Name: diabetesMed, dtype: int64
---- readmitted_<30 --- 2 unique values ---
0    90409
1    11357
Name: readmitted_<30, dtype: int64
---- readmitted_>30 --- 2 unique values ---
0    66221
1    35545
Name: readmitted_>30, dtype: int64
---- readmitted_NO --- 2 unique values ---
1    54864
0    46902
Name: readmitted_NO, dtype: int64


As it was also mentioned in the PDF describing the dataset, around 97% of the weights are missing we so it would be better to drop this column. Also we can see that two columns (citoglipton and examide) have the same value in all rows so we can drop these. Columns citoglipton, glimepiride-pioglitazone, acetohexamide, tolbutamide, troglitazone, glipizide-metformin, metformin-rosiglitazone, and metformin-pioglitazone only have one or two rows with a different value, so they are not going to contribute to the list of the features and we can drop them as well.<br>

One other thing is to remove the unkown values for the gender which are 3 in total.

In [12]:
df = df.drop(['weight','examide','citoglipton','glimepiride-pioglitazone',
              'acetohexamide', 'tolbutamide', 'troglitazone', 'glipizide-metformin',
              'metformin-rosiglitazone','metformin-pioglitazone'], 1)
df = df[df.gender != 'Unknown/Invalid']
df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide-metformin,change,diabetesMed,readmitted_<30,readmitted_>30,readmitted_NO
0,Caucasian,Female,5,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,1
1,Caucasian,Female,15,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,Ch,Yes,0,1,0
2,AfricanAmerican,Female,25,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,Yes,0,0,1
3,Caucasian,Male,35,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,Ch,Yes,0,0,1
4,Caucasian,Male,45,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,Ch,Yes,0,0,1


For the rest of the missing values, we first convert them to nan and then for the categorical variables such as medical speciality, race or payer code we fill them as UNK as a separate category (especially since almost half of the mediacl specialty and payer code is missing we do not want to lose half of our data beacuse of Nans).

In [13]:
df = df.replace('?',np.nan)
df['medical_specialty'] = df['medical_specialty'].fillna('UNK')
df['diag_1'] = df['diag_1'].fillna('UNK')
df['diag_2'] = df['diag_2'].fillna('UNK')
df['diag_3'] = df['diag_3'].fillna('UNK')
df['medical_specialty'] = df['medical_specialty'].fillna('UNK')
df['payer_code'] = df['payer_code'].fillna('UNK')
df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide-metformin,change,diabetesMed,readmitted_<30,readmitted_>30,readmitted_NO
0,Caucasian,Female,5,6,25,1,1,UNK,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,UNK,UNK,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,0,1
1,Caucasian,Female,15,1,1,7,3,UNK,UNK,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,Ch,Yes,0,1,0
2,AfricanAmerican,Female,25,1,1,7,2,UNK,UNK,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,Yes,0,0,1
3,Caucasian,Male,35,1,1,7,2,UNK,UNK,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,Ch,Yes,0,0,1
4,Caucasian,Male,45,1,1,7,1,UNK,UNK,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,Ch,Yes,0,0,1


In [14]:
# Double checking if there are any null values in the numerical columns before proceeding
numerical_columns = ['age','time_in_hospital','number_diagnoses',
                     'num_lab_procedures','num_procedures','num_medications',
                     'number_outpatient','number_emergency','number_inpatient']
df[numerical_columns].isnull().sum()

age                   0
time_in_hospital      0
number_diagnoses      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_outpatient     0
number_emergency      0
number_inpatient      0
dtype: int64

 Using the UCI ID_mappings description file (https://data.world/uci/diabetes-130-us-hospitals-for-years-1999-2008), we can see that discharge dispositions 11,13,14 and 19,20 and 21 refer to patients that either expired or were hospiced after being discharged (hospice is the end of life care where there is no readmission to the hospital). We need to separate these cases, as them not being readmitted does not mean that they did not need readmission but it was because they died.

In [15]:
df = df[~df['discharge_disposition_id'].isin([11,13,14,19,20,21])]

For all the categorical variables we ocnvert them to on-hot encodings. To do that we first need to convert admission type ID, discharge disposition ID and admission source ID to strings.

In [16]:
df = df.astype({'admission_type_id': 'str','discharge_disposition_id': 'str','admission_source_id': 'str'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99340 entries, 0 to 101765
Data columns (total 40 columns):
race                        97108 non-null object
gender                      99340 non-null object
age                         99340 non-null int64
admission_type_id           99340 non-null object
discharge_disposition_id    99340 non-null object
admission_source_id         99340 non-null object
time_in_hospital            99340 non-null int64
payer_code                  99340 non-null object
medical_specialty           99340 non-null object
num_lab_procedures          99340 non-null int64
num_procedures              99340 non-null int64
num_medications             99340 non-null int64
number_outpatient           99340 non-null int64
number_emergency            99340 non-null int64
number_inpatient            99340 non-null int64
diag_1                      99340 non-null object
diag_2                      99340 non-null object
diag_3                      99340 non-null objec

Among the categorical variables, medical specialty has 73 distict values, so we need to reduce the number of its categories if we want to convert it into a one-hot vector. To do that we consider the top 10 frequent categories and then put everything else in a remainig bucket called "others":

In [17]:
def reduce_categories(df, column, num_cat):
    # value_counts returns a sorted list by default
    top_10 = df[column].value_counts()[:num_cat].keys().tolist()
    df.loc[~df[column].isin(top_10), column] = 'Others'

df['medical_specialty'].value_counts()

UNK                                 48614
InternalMedicine                    14237
Emergency/Trauma                     7419
Family/GeneralPractice               7252
Cardiology                           5278
                                    ...  
Surgery-PlasticwithinHeadandNeck        1
Pediatrics-InfectiousDiseases           1
Dermatology                             1
Speech                                  1
SportsMedicine                          1
Name: medical_specialty, Length: 73, dtype: int64

In [18]:
# value_counts returns a sorted list by default
reduce_categories(df, 'medical_specialty', 10)
df['medical_specialty'].value_counts()

UNK                           48614
InternalMedicine              14237
Others                         8199
Emergency/Trauma               7419
Family/GeneralPractice         7252
Cardiology                     5278
Surgery-General                3059
Nephrology                     1539
Orthopedics                    1392
Orthopedics-Reconstructive     1230
Radiologist                    1121
Name: medical_specialty, dtype: int64

In [19]:
reduce_categories(df, 'diag_1', 10)
df['diag_1'].value_counts()

Others    64142
428        6663
414        6549
786        4015
410        3448
486        3383
427        2720
491        2240
715        2147
682        2029
780        2004
Name: diag_1, dtype: int64

In [20]:
reduce_categories(df, 'diag_2', 10)
df['diag_2'].value_counts()

Others    57234
276        6589
428        6459
250        6051
427        4892
401        3722
496        3246
599        3212
403        2743
414        2642
411        2550
Name: diag_2, dtype: int64

In [21]:
reduce_categories(df, 'diag_3', 10)
df['diag_3'].value_counts()

Others    54172
250       11466
401        8240
276        4953
428        4412
427        3785
414        3635
496        2504
403        2277
272        1966
585        1930
Name: diag_3, dtype: int64

In [22]:
# we should deal with diags here 'diag_1','diag_2', 'diag_3'
categorical_vars = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id',
                    'admission_source_id', 'payer_code', 'medical_specialty', 'diag_1',
                    'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide',
                    'nateglinide', 'chlorpropamide', 'glimepiride',
                    'glipizide', 'glyburide', 'pioglitazone',
                    'rosiglitazone', 'acarbose', 'miglitol',
                    'tolazamide', 'insulin', 'glyburide-metformin',
                    'change', 'diabetesMed']
one_hot_features = pd.get_dummies(df[categorical_vars])
one_hot_features.columns

Index(['race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic',
       'race_Other', 'gender_Female', 'gender_Male', 'admission_type_id_1',
       'admission_type_id_2', 'admission_type_id_3',
       ...
       'insulin_Steady', 'insulin_Up', 'glyburide-metformin_Down',
       'glyburide-metformin_No', 'glyburide-metformin_Steady',
       'glyburide-metformin_Up', 'change_Ch', 'change_No', 'diabetesMed_No',
       'diabetesMed_Yes'],
      dtype='object', length=182)

In [23]:
combined_df = pd.concat([df,one_hot_features],axis = 1)
output_labels = ['readmitted_<30','readmitted_>30','readmitted_NO']
preprocessed = combined_df[numerical_columns + list(one_hot_features.columns) + output_labels]
combined_df.to_csv('combined.csv')
preprocessed.to_csv('preprocessed.csv')