In [127]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import time
import pyodbc
print(pyodbc.drivers())
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
from scipy import stats

['SQL Server', 'SQL Server Native Client RDA 11.0', 'ODBC Driver 17 for SQL Server', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)']


# SQL Connection

In [5]:
def create_sql_connection_win_auth(server, database):
    connection_string = f'DRIVER={{SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;'
    conn = pyodbc.connect(connection_string)
    return conn

server = 'SAPNA\\SQLEXPRESS'
database = 'Diabetes_Data'

conn = create_sql_connection_win_auth(server, database)

In [6]:
def query_data(conn, query):
    """
    Execute a SQL query and fetch results as a pandas DataFrame.
    
    Parameters:
    - conn: A pyodbc connection object.
    - query (str): The SQL query to be executed.
    
    Returns:
    - data: A pandas DataFrame containing the query result.
    """
    start_time = time.time()  # Start time measurement
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        
        # Fetch all results from the query
        rows = cursor.fetchall()
        
        # Get column names from cursor
        columns = [desc[0] for desc in cursor.description]
        
        # Create a pandas DataFrame from the results
        data = pd.DataFrame.from_records(rows, columns=columns)
        
    except pyodbc.Error as e:
        print(f"Error executing query: {e}")
        return None
    
    finally:
        cursor.close()
    
    end_time = time.time()  # End time measurement
    execution_time = end_time - start_time  # Calculate execution time
    
    # Print the DataFrame and execution time
    print(f"Query executed in: {execution_time:.4f} seconds")
    
    return df  

In [23]:
query = '''
select *, admission_source.description as admission_source_des, admission_type.description as admission_type_desc, discharge_disposition.description as discharge_desposition_desc 
from patient_data_train_test 
left join admission_source on admission_source.admission_source_id = patient_data_train_test.admission_source_id
left join discharge_disposition on discharge_disposition.discharge_disposition_id = patient_data_train_test.discharge_disposition_id
left join admission_type on admission_type.admission_type_id = patient_data_train_test.admission_type_id
left join readmission_status on readmission_status.encounter_id = patient_data_train_test.encounter_id'''

In [59]:
data_temp = query_data(conn, query)
data_temp.shape

Query executed in: 5.8507 seconds


(85000, 60)

In [60]:
data = data_temp.copy(deep=True)

# Data Cleaning

In [61]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed',
       'admission_source_id', 'description', 'discharge_disposition_id',
       'description', 'admission_

In [62]:
selected_col = ['encounter_id', 'race', 'gender', 'age', 'weight',
       
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed',
        'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc']

In [63]:
data = data[selected_col].iloc[:,1:]

In [64]:
data.columns

Index(['encounter_id', 'race', 'gender', 'age', 'weight', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc'],
      dtype='object')

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85000 entries, 0 to 84999
Data columns (total 49 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   encounter_id                85000 non-null  int64 
 1   race                        85000 non-null  object
 2   gender                      85000 non-null  object
 3   age                         85000 non-null  object
 4   weight                      85000 non-null  object
 5   time_in_hospital            85000 non-null  int64 
 6   payer_code                  85000 non-null  object
 7   medical_specialty           85000 non-null  object
 8   num_lab_procedures          85000 non-null  int64 
 9   num_procedures              85000 non-null  int64 
 10  num_medications             85000 non-null  int64 
 11  number_outpatient           85000 non-null  int64 
 12  number_emergency            82211 non-null  object
 13  number_inpatient            85000 non-null  in

In [39]:
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_inpatient,number_diagnoses
count,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0
mean,130473600.0,4.451612,43.300118,1.334894,15.861812,0.344929,0.631271,7.279941
std,68577670.0,3.015638,19.260734,1.687531,8.129045,1.222098,1.251215,1.959969
min,12522.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
25%,74146070.0,2.0,32.0,0.0,10.0,0.0,0.0,6.0
50%,131081300.0,4.0,44.0,1.0,15.0,0.0,0.0,8.0
75%,177405600.0,6.0,57.0,2.0,20.0,0.0,1.0,9.0
max,268762100.0,14.0,132.0,6.0,81.0,42.0,21.0,9.0


In [66]:
data.isnull().sum()

encounter_id                      0
race                              0
gender                            0
age                               0
weight                            0
time_in_hospital                  0
payer_code                        0
medical_specialty                 0
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency               2789
number_inpatient                  0
diag_1                            0
diag_2                            0
diag_3                            0
number_diagnoses                  0
max_glu_serum                 79857
A1Cresult                     70951
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide                    0
glimepiride                       0
acetohexamide                     0
glipizide                         0
glyburide                   

### Step 1
- Will drop max_glu_serum ,A1Cresult since 90% rows have null values

In [67]:
data.drop(columns={'max_glu_serum', 'A1Cresult'},inplace=True)

In [42]:
data.shape

(85000, 47)

In [68]:
data['number_emergency'] = data['number_emergency'].fillna(0)

In [None]:
data.isnull().sum()

In [109]:
# Check for unique values in categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for col in categorical_columns:
    print(f"{col}: {data[col].unique()}")


37
race: ['Caucasian' 'AfricanAmerican' 'Other' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['[80-90)' '[90-100)' '[40-50)' '[50-60)' '[60-70)' '[70-80)' '[20-30)'
 '[10-20)' '[30-40)' '[0-10)']
weight: ['?' '[75-100)' '[50-75)' '[0-25)' '[100-125)' '[25-50)' '[125-150)'
 '[175-200)' '[150-175)' '>200']
payer_code: ['Other' 'MC' 'MD' 'HM' 'UN' 'BC' 'SP' 'CP' 'SI' 'DM' 'CM' 'CH' 'PO' 'WC'
 'OT' 'OG' 'MP']
number_emergency: [False True]
diag_1: ['398' '434' '197' '250.7' '414' '157' '428' '518' '648' '999' '410' '682'
 '402' '737' '276' '572' 'V57' '189' '786' '427' '996' '277' '584' '462'
 '473' '411' '174' '486' '998' '511' '432' '626' '295' '8' '196' '250.6'
 '618' '182' '845' '423' '808' '250.4' '722' '403' '250.11' '784' '707'
 '440' '151' '715' '997' '198' '564' '812' '38' '590' '556' '578' '250.32'
 '433' 'V58' '569' '185' '536' '255' '250.13' '599' '558' '574' '250.83'
 '491' '560' '244' '250.03' '577' '730' '188' '824' '250.8' '332' '562'
 '291' '296' '510' '401' '263' '4

In [71]:
data['race'].value_counts()

race
Caucasian          63153
AfricanAmerican    16989
?                   1796
Hispanic            1558
Other               1066
Asian                438
Name: count, dtype: int64

In [72]:
# race (~2k values replaced with 'Other' category)
data['race'] = data['race'].replace('?','Other')
data['race'].value_counts()

race
Caucasian          63153
AfricanAmerican    16989
Other               2862
Hispanic            1558
Asian                438
Name: count, dtype: int64

In [73]:
# gender (Only 3 values with "Unknown/Invalid" entry replacing it with the 'Female' as female proportion in the dataset is on higher side)

data['gender'] = data['gender'].replace('Unknown/Invalid', 'Female')
data['gender'].value_counts()

gender
Female    45813
Male      39187
Name: count, dtype: int64

In [74]:
data['admission_type_desc'].value_counts()

admission_type_desc
Emergency        43691
Urgent           16026
Elective         15349
NULL              4888
Not Available     4761
Not Mapped         264
Trauma Center       13
Newborn              8
Name: count, dtype: int64

In [None]:
# # admission_type_description -  Contains 4785 Null values, replacing it with the existing category "Not Available". Also Not Mapped replace with same category 
# data['admission_type_desc'].value_counts(dropna=False)
# data['admission_type_desc'] = data['admission_type_desc'].fillna('Other').replace({'Not Mapped': 'Not Available','NULL': 'Not Available'})
# data['admission_type_desc'].value_counts(dropna=False)

In [75]:
pd.crosstab(data['admission_type_desc'], data['readmitted'])

readmitted,NO,YES
admission_type_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
Elective,8737,6612
Emergency,22414,21277
,2201,2687
Newborn,5,3
Not Available,2550,2211
Not Mapped,164,100
Trauma Center,13,0
Urgent,8449,7577


In [76]:
# Define rare categories
rare_categories = ['Trauma Center', 'Newborn','NULL','Not Available','Not Mapped']

# Replace rare categories with 'Other'
data['admission_type_desc'] = data['admission_type_desc'].apply(
    lambda x: 'Other' if x in rare_categories else x
)

data['admission_type_desc'].value_counts()

admission_type_desc
Emergency    43691
Urgent       16026
Elective     15349
Other         9934
Name: count, dtype: int64

In [78]:
# discharge_disposition_description
data['discharge_desposition_desc'].value_counts(dropna= False)

discharge_desposition_desc
Discharged to home                                                                                           49888
Discharged/transferred to SNF                                                                                11305
Discharged/transferred to home with home health service                                                      10603
NULL                                                                                                          3683
Discharged/transferred to another short term hospital                                                         1812
Discharged/transferred to another rehab fac including rehab units of a hospital .                             1640
Expired                                                                                                       1411
Discharged/transferred to another type of inpatient care institution                                          1093
Not Mapped                                           

In [86]:
## making subcategories for the discharge disposition
discharge_categories = {
    #Discharged to Home
    'Discharged to home': 'Discharged to Home',
    'Discharged/transferred to home with home health service': 'Discharged to Home',
    'Discharged/transferred to home under care of Home IV provider': 'Discharged to Home',
    
    #Transfers to Other Healthcare Facilities
    'Discharged/transferred to SNF': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another short term hospital': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another rehab fac including rehab units of a hospital .': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another type of inpatient care institution': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to ICF': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a long term care hospital.': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a federal health care facility.': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred within this institution to Medicare approved swing bed': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.': 'Transfers to Other Healthcare Facilities',
    'Neonate discharged to another hospital for neonatal aftercare': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred to this institution for outpatient services': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred another institution for outpatient services': 'Transfers to Other Healthcare Facilities',

    #Expired
    'Expired': 'Expired',
    'Expired at home. Medicaid only, hospice.': 'Expired',
    'Expired in a medical facility. Medicaid only, hospice.': 'Expired',
    
    #Hospice Care
    'Hospice / home': 'Hospice Care',
    'Hospice / medical facility': 'Hospice Care',
    
    #AMA (Against Medical Advice)
    'Left AMA': 'AMA (Against Medical Advice)',
    
    #Other
    'Not Mapped': 'Other',
    'NaN':'Other',
    'Still patient or expected to return for outpatient services': 'Other',
    'Admitted as an inpatient to this hospital': 'Other',
    
    np.nan: 'Other'
}

 

In [90]:
# Expired type not in target
pd.crosstab(data['discharge_desposition_desc'].map(discharge_categories), data['readmitted'])


readmitted,NO,YES
discharge_desposition_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
AMA (Against Medical Advice),255,262
Discharged to Home,31206,29393
Expired,1420,0
Hospice Care,501,62
Other,526,479
Transfers to Other Healthcare Facilities,8425,8788


In [89]:
data['discharge_category'] = data['discharge_desposition_desc'].map(discharge_categories).fillna('Other')
data['discharge_category'].value_counts(dropna=False)


discharge_category
Discharged to Home                          60599
Transfers to Other Healthcare Facilities    17213
Other                                        4688
Expired                                      1420
Hospice Care                                  563
AMA (Against Medical Advice)                  517
Name: count, dtype: int64

In [92]:
# admission_source_description

data['admission_source_des'].value_counts(dropna=False)

admission_source_des
Emergency Room                                               46289
Physician Referral                                           24902
NULL                                                          6514
Transfer from a hospital                                      2852
Transfer from another health care facility                    2157
Clinic Referral                                               1090
Transfer from a Skilled Nursing Facility (SNF)                 711
HMO Referral                                                   182
Not Mapped                                                     161
Not Available                                                  119
Court/Law Enforcement                                           10
Transfer from critial access hospital                            8
Extramural Birth                                                 2
Transfer from hospital inpt/same fac reslt in a sep claim        2
Normal Delivery                          

In [98]:
# ## making subcategories for the admission source 

admission_source_categories = {
    'Emergency Room': 'Emergency Admission',
    'Court/Law Enforcement': 'Emergency Admission',
    
    'Physician Referral': 'Physician Referral',
    'HMO Referral': 'Physician Referral',
    'Clinic Referral': 'Physician Referral',
    
    'Transfer from a hospital': 'Transfers from Other Facilities',
    'Transfer from another health care facility': 'Transfers from Other Facilities',
    'Transfer from a Skilled Nursing Facility (SNF)': 'Transfers from Other Facilities',
    'Transfer from hospital inpt/same fac reslt in a sep claim': 'Transfers from Other Facilities',
    'Transfer from critial access hospital': 'Transfers from Other Facilities',
    'Transfer from Ambulatory Surgery Center': 'Transfers from Other Facilities',
    
    'Extramural Birth': 'Other',
    'Normal Delivery': 'Other',
    'Sick Baby': 'Other',
    
    np.nan: 'Other',
    
    'Not Mapped': 'Other',
    'Not Available': 'Other'
}

In [110]:
data['admission_category'] = data['admission_source_des'].map(admission_source_categories).fillna('Other')
data['admission_category'].value_counts(dropna=False)

admission_category
Emergency Admission                46299
Physician Referral                 26174
Other                               6797
Transfers from Other Facilities     5730
Name: count, dtype: int64

In [100]:
pd.crosstab(data['admission_category'], data['readmitted'])


readmitted,NO,YES
admission_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Emergency Admission,22522,23777
Other,155,128
Physician Referral,14449,11725
Transfers from Other Facilities,3974,1756


In [101]:
data['medical_specialty'].value_counts(dropna=False)

medical_specialty
?                                       38623
InternalMedicine                        13479
Family/GeneralPractice                   7025
Emergency/Trauma                         5892
Cardiology                               4865
Surgery-General                          2731
Nephrology                               1541
Orthopedics-Reconstructive               1199
Orthopedics                              1191
Psychiatry                                824
Pulmonology                               823
Urology                                   650
ObstetricsandGynecology                   640
Surgery-Cardiovascular/Thoracic           629
Radiologist                               616
Gastroenterology                          518
Surgery-Vascular                          509
Surgery-Neuro                             460
PhysicalMedicineandRehabilitation         389
Oncology                                  325
Pediatrics                                242
Hematology/Oncol

In [102]:
del data['medical_specialty']

In [103]:
# Deleting columns examide and citoglipton as both columns has only "No" entry
print(data['citoglipton'].value_counts())
print(data['examide'].value_counts())

data.drop(columns={'examide', 'citoglipton'}, inplace=True)

citoglipton
No    85000
Name: count, dtype: int64
examide
No    85000
Name: count, dtype: int64


In [104]:
data.shape

(85000, 46)

In [105]:
data.head()

Unnamed: 0,encounter_id,race,gender,age,weight,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,admission_source_des,admission_type_desc,discharge_desposition_desc,discharge_category,admission_category
0,12522,Caucasian,Female,[80-90),?,13,?,68,2,28,0,False,0,398.0,427,38,8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,True,NO,Transfer from a hospital,Urgent,Discharged to home,Discharged to Home,Transfers from Other Facilities
1,15738,Caucasian,Female,[90-100),?,12,?,33,3,18,0,False,0,434.0,198,486,8,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Steady,No,No,No,No,No,Ch,True,NO,Transfer from a hospital,Elective,Discharged/transferred to SNF,Transfers to Other Healthcare Facilities,Transfers from Other Facilities
2,16680,Caucasian,Male,[40-50),?,1,?,51,0,8,0,False,0,197.0,157,250,5,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,True,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission
3,28236,AfricanAmerican,Female,[40-50),?,9,?,47,2,17,0,False,0,250.7,403,996,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,True,YES,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission
4,35754,Caucasian,Male,[50-60),?,3,?,31,6,16,0,False,0,414.0,411,250,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,True,YES,Clinic Referral,Urgent,Discharged to home,Discharged to Home,Physician Referral


In [108]:
pd.crosstab(data['payer_code'],data['readmitted'])

readmitted,NO,YES
payer_code,Unnamed: 1_level_1,Unnamed: 2_level_1
BC,2097,1401
CH,42,22
CM,683,634
CP,1114,826
DM,200,223
HM,2484,2257
MC,12362,12657
MD,1244,1314
MP,18,23
OG,384,341


In [107]:
data['payer_code'] = data['payer_code'].replace('?','Other')

In [133]:
data.shape

(85000, 48)

In [126]:
data[(data['number_outpatient']>0) & (data['number_inpatient'] > 0)].shape

(13230, 46)

## Outlier Treatment

In [134]:
def treat_outliers(df, columns):
   
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Cap outliers
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

In [135]:
data = treat_outliers(data, ['num_lab_procedures','num_medications',])
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_inpatient,number_diagnoses,number_outpatient_boxcox,number_inpatient_boxcox
count,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0
mean,130473600.0,4.451612,43.289471,1.334894,15.645718,0.344929,0.631271,7.279941,0.025882,0.129595
std,68577670.0,3.015638,19.229168,1.687531,7.382708,1.222098,1.251215,1.959969,0.060284,0.184146
min,12522.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
25%,74146070.0,2.0,32.0,0.0,10.0,0.0,0.0,6.0,0.0,0.0
50%,131081300.0,4.0,44.0,1.0,15.0,0.0,0.0,8.0,0.0,0.0
75%,177405600.0,6.0,57.0,2.0,20.0,0.0,1.0,9.0,0.0,0.357118
max,268762100.0,14.0,94.5,6.0,35.0,42.0,21.0,9.0,0.167747,0.45735


In [138]:
# Box-Cox transformation
data['number_outpatient_boxcox'], _ = stats.boxcox(data['number_outpatient'] + 1)
data['number_inpatient_boxcox'], _ = stats.boxcox(data['number_inpatient'] + 1)


In [None]:
# Log Transformation
data['number_outpatient_treated'] = np.log1p(data['number_outpatient'])
data['number_inpatient_treated'] = np.log1p(data['number_inpatient'])

In [139]:
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_inpatient,number_diagnoses,number_outpatient_boxcox,number_inpatient_boxcox,number_outpatient_treated,number_inpatient_treated
count,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0
mean,130473600.0,4.451612,43.289471,1.334894,15.645718,0.344929,0.631271,7.279941,0.025882,0.129595,0.161555,0.32485
std,68577670.0,3.015638,19.229168,1.687531,7.382708,1.222098,1.251215,1.959969,0.060284,0.184146,0.41629,0.509695
min,12522.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,74146070.0,2.0,32.0,0.0,10.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
50%,131081300.0,4.0,44.0,1.0,15.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
75%,177405600.0,6.0,57.0,2.0,20.0,0.0,1.0,9.0,0.0,0.357118,0.0,0.693147
max,268762100.0,14.0,94.5,6.0,35.0,42.0,21.0,9.0,0.167747,0.45735,3.7612,3.091042


In [140]:
del data['number_outpatient_treated']
del data['number_inpatient_treated']
         

In [142]:
del data['number_outpatient']

del data['number_inpatient']
         

In [143]:
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,number_outpatient_boxcox,number_inpatient_boxcox
count,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0,85000.0
mean,130473600.0,4.451612,43.289471,1.334894,15.645718,7.279941,0.025882,0.129595
std,68577670.0,3.015638,19.229168,1.687531,7.382708,1.959969,0.060284,0.184146
min,12522.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
25%,74146070.0,2.0,32.0,0.0,10.0,6.0,0.0,0.0
50%,131081300.0,4.0,44.0,1.0,15.0,8.0,0.0,0.0
75%,177405600.0,6.0,57.0,2.0,20.0,9.0,0.0,0.357118
max,268762100.0,14.0,94.5,6.0,35.0,9.0,0.167747,0.45735


In [146]:
data['weight'].value_counts()

weight
?            82268
[75-100)      1153
[50-75)        781
[100-125)      518
[125-150)      115
[25-50)         82
[0-25)          46
[150-175)       27
[175-200)        8
>200             2
Name: count, dtype: int64

In [147]:
del data['weight']

# Data Preprocessing

In [148]:
# Check for unique values in categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for col in categorical_columns:
    print(f"{col}: {data[col].unique()}")


36
race: ['Caucasian' 'AfricanAmerican' 'Other' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['[80-90)' '[90-100)' '[40-50)' '[50-60)' '[60-70)' '[70-80)' '[20-30)'
 '[10-20)' '[30-40)' '[0-10)']
payer_code: ['Other' 'MC' 'MD' 'HM' 'UN' 'BC' 'SP' 'CP' 'SI' 'DM' 'CM' 'CH' 'PO' 'WC'
 'OT' 'OG' 'MP']
number_emergency: [False True]
diag_1: ['398' '434' '197' '250.7' '414' '157' '428' '518' '648' '999' '410' '682'
 '402' '737' '276' '572' 'V57' '189' '786' '427' '996' '277' '584' '462'
 '473' '411' '174' '486' '998' '511' '432' '626' '295' '8' '196' '250.6'
 '618' '182' '845' '423' '808' '250.4' '722' '403' '250.11' '784' '707'
 '440' '151' '715' '997' '198' '564' '812' '38' '590' '556' '578' '250.32'
 '433' 'V58' '569' '185' '536' '255' '250.13' '599' '558' '574' '250.83'
 '491' '560' '244' '250.03' '577' '730' '188' '824' '250.8' '332' '562'
 '291' '296' '510' '401' '263' '438' '70' '250.02' '493' '642' '625' '571'
 '738' '593' '250.42' '807' '456' '446' '575' '250.41' '820' '515' '

In [149]:
data.drop(columns={'metformin_rosiglitazone','metformin_pioglitazone',
                   'diag_1','diag_2','diag_3','discharge_desposition_desc','admission_source_des' }, inplace=True)

In [152]:
# Check for unique values in categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for col in categorical_columns:
    print(f"{col}: {data[col].unique()}")


29
race: ['Caucasian' 'AfricanAmerican' 'Other' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['80-90' '90-100' '40-50' '50-60' '60-70' '70-80' '20-30' '10-20' '30-40'
 '0-10']
payer_code: ['Other' 'MC' 'MD' 'HM' 'UN' 'BC' 'SP' 'CP' 'SI' 'DM' 'CM' 'CH' 'PO' 'WC'
 'OT' 'OG' 'MP']
number_emergency: [False True]
metformin: ['No' 'Steady' 'Up' 'Down']
repaglinide: ['No' 'Up' 'Steady' 'Down']
nateglinide: ['No' 'Steady' 'Down' 'Up']
chlorpropamide: ['No' 'Steady' 'Down' 'Up']
glimepiride: ['No' 'Steady' 'Down' 'Up']
acetohexamide: ['No' 'Steady']
glipizide: ['Steady' 'No' 'Up' 'Down']
glyburide: ['No' 'Up' 'Steady' 'Down']
tolbutamide: ['No' 'Steady']
pioglitazone: ['No' 'Steady' 'Up' 'Down']
rosiglitazone: ['No' 'Steady' 'Up' 'Down']
acarbose: ['No' 'Steady' 'Up' 'Down']
miglitol: ['No' 'Steady' 'Down' 'Up']
troglitazone: ['No' 'Steady']
tolazamide: ['No' 'Steady' 'Up']
insulin: ['Steady' 'Down' 'No' 'Up']
glyburide_metformin: ['No' 'Steady' 'Down' 'Up']
glipizide_metformin: ['No' 'St

In [151]:
# Function to clean the age column using regex
def clean_age_column(age_series):
    return age_series.str.replace(r'[\[\]() ]', '', regex=True)

# Cleaning the age column
data['age'] = clean_age_column(data['age'])
data.head()

Unnamed: 0,encounter_id,race,gender,age,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,number_outpatient_boxcox,number_inpatient_boxcox
0,12522,Caucasian,Female,80-90,13,Other,68.0,2,28.0,False,8,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,Ch,True,NO,Urgent,Discharged to Home,Transfers from Other Facilities,0.0,0.0
1,15738,Caucasian,Female,90-100,12,Other,33.0,3,18.0,False,8,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Steady,No,No,No,Ch,True,NO,Elective,Transfers to Other Healthcare Facilities,Transfers from Other Facilities,0.0,0.0
2,16680,Caucasian,Male,40-50,1,Other,51.0,0,8.0,False,5,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,Ch,True,NO,Emergency,Discharged to Home,Emergency Admission,0.0,0.0
3,28236,AfricanAmerican,Female,40-50,9,Other,47.0,2,17.0,False,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,True,YES,Emergency,Discharged to Home,Emergency Admission,0.0,0.0
4,35754,Caucasian,Male,50-60,3,Other,31.0,6,16.0,False,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,True,YES,Urgent,Discharged to Home,Physician Referral,0.0,0.0


In [153]:
# Convert the target variable to binary (0 for NO, 1 for YES)
data['readmitted'] = data['readmitted'].map({'NO': 0, 'YES': 1})

In [155]:
# Calculate the total count of events (YES) and non-events (NO)
total_events = data['readmitted'].sum()
total_non_events = data['readmitted'].count() - total_events


In [156]:
# Calculate WoE and IV
def calculate_woe_iv(df, feature, target):
    # Create a dataframe to hold the counts
    woe_iv = pd.DataFrame()
    
    # Group by the feature and calculate the count of events and non-events
    woe_iv['total'] = df.groupby(feature)[target].count()
    woe_iv['events'] = df.groupby(feature)[target].sum()
    woe_iv['non_events'] = woe_iv['total'] - woe_iv['events']
    
    # Calculate event and non-event rates
    woe_iv['event_rate'] = woe_iv['events'] / total_events
    woe_iv['non_event_rate'] = woe_iv['non_events'] / total_non_events
    
    # Calculate WoE
    woe_iv['woe'] = np.log(woe_iv['event_rate'] / woe_iv['non_event_rate'])
    
    # Calculate IV
    woe_iv['iv'] = (woe_iv['event_rate'] - woe_iv['non_event_rate']) * woe_iv['woe']
    
    # Return the DataFrame with WoE and IV
    return woe_iv

In [158]:
# Calculate WoE and IV for the payer_code column
woe_iv_results = calculate_woe_iv(data, 'payer_code', 'readmitted')

# Display results
woe_iv_results


Unnamed: 0_level_0,total,events,non_events,event_rate,non_event_rate,woe,iv
payer_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BC,3498,1401,2097,0.034621,0.047089,-0.307578,0.003834841
CH,64,22,42,0.000544,0.000943,-0.550884,0.0002200603
CM,1317,634,683,0.015667,0.015337,0.021298,7.031361e-06
CP,1940,826,1114,0.020412,0.025015,-0.203374,0.0009362248
DM,423,223,200,0.005511,0.004491,0.204598,0.0002086105
HM,4741,2257,2484,0.055774,0.055779,-9e-05,4.518488e-10
MC,25019,12657,12362,0.312773,0.277592,0.119327,0.004198094
MD,2558,1314,1244,0.032471,0.027934,0.150488,0.0006826962
MP,41,23,18,0.000568,0.000404,0.340866,5.595989e-05
OG,725,341,384,0.008427,0.008623,-0.023016,4.515816e-06


In [159]:
# Calculate total IV
total_iv = woe_iv_results['iv'].sum()
print(f'Total IV for payer_code: {total_iv}')

Total IV for payer_code: 0.01856420866609557


In [161]:
# Calculate WoE and IV for the age column
woe_iv_results_age = calculate_woe_iv(data, 'age', 'readmitted')


In [164]:
woe_iv_results_age

Unnamed: 0_level_0,total,events,non_events,event_rate,non_event_rate,woe,iv
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0-10,155,28,127,0.000692,0.002852,-1.416239,0.003059
10-20,612,242,370,0.00598,0.008308,-0.328822,0.000766
20-30,1410,662,748,0.016359,0.016797,-0.026394,1.2e-05
30-40,3218,1426,1792,0.035239,0.04024,-0.132715,0.000664
40-50,8279,3810,4469,0.094151,0.100353,-0.063792,0.000396
50-60,14538,6574,7964,0.162453,0.178834,-0.096065,0.001574
60-70,18602,8915,9687,0.220303,0.217524,0.012694,3.5e-05
70-80,21929,10923,11006,0.269924,0.247143,0.088174,0.002009
80-90,14013,6974,7039,0.172338,0.158063,0.086466,0.001234
90-100,2244,913,1331,0.022562,0.029888,-0.281206,0.00206


In [165]:
# Create a mapping dictionary from payer_code to WoE values
woe_mapping = woe_iv_results_age['woe'].to_dict()

# Replace the original payer_code values with their corresponding WoE values
data['age_woe'] = data['age'].map(woe_mapping)

# Check the updated DataFrame
data[['age', 'age_woe']].head()

Unnamed: 0,age,age_woe
0,80-90,0.086466
1,90-100,-0.281206
2,40-50,-0.063792
3,40-50,-0.063792
4,50-60,-0.096065


In [166]:
woe_iv_results_discharge_category = calculate_woe_iv(data, 'discharge_category', 'readmitted')
woe_iv_results_discharge_category

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,total,events,non_events,event_rate,non_event_rate,woe,iv
discharge_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AMA (Against Medical Advice),517,262,255,0.006474,0.005726,0.122825,9.2e-05
Discharged to Home,60599,29393,31206,0.726345,0.700739,0.03589,0.000919
Expired,1420,0,1420,0.0,0.031886,-inf,inf
Hospice Care,563,62,501,0.001532,0.01125,-1.993728,0.019375
Other,4688,1962,2726,0.048484,0.061213,-0.233127,0.002967
Transfers to Other Healthcare Facilities,17213,8788,8425,0.217165,0.189186,0.137927,0.003859


In [169]:
data.shape


(85000, 39)

In [170]:
data.to_csv('../data/interim/Final_data.csv')