In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import time
import pyodbc
print(pyodbc.drivers())
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
from scipy import stats

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


# SQL Connection

In [4]:
def create_sql_connection(server, database, username, password, driver='{ODBC Driver 17 for SQL Server}'):
    """
    Open a pyodbc connection to a SQL Server database.

    Parameters:
    - server (str): The SQL Server address (e.g., 'localhost' or server IP).
    - database (str): The name of the database to connect to.
    - username (str): SQL Server username.
    - password (str): SQL Server password.
    - driver (str): ODBC driver to use. Default is '{ODBC Driver 17 for SQL Server}'.

    Returns:
    - conn: A pyodbc connection object if successful, otherwise None
      (the failure is printed instead of being raised).
    """
    # One "KEY=value;" pair per line; this exact text is what gets handed to the driver.
    odbc_settings = f"""
        DRIVER={driver};
        SERVER={server};
        DATABASE={database};
        UID={username};
        PWD={password};
    """
    try:
        connection = pyodbc.connect(odbc_settings)
    except Exception as exc:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"Failed to connect to the database. Error: {exc}")
        return None

    print("Connection established successfully!")
    return connection

In [5]:
def query_data(conn, query):
    """
    Execute a SQL query and fetch results as a pandas DataFrame.

    Parameters:
    - conn: An open database connection exposing ``cursor()`` (e.g. the object
      returned by create_sql_connection).
    - query (str): The SQL query to be executed.

    Returns:
    - df: A pandas DataFrame containing the query result, or None when the
      driver reports a pyodbc.Error (the error is printed).

    Raises:
    - ValueError: if ``conn`` is None, i.e. no connection was established.
    """
    # create_sql_connection returns None on failure; fail with a clear message
    # instead of an unrelated error from inside the try/finally below.
    if conn is None:
        raise ValueError("query_data() received conn=None: the database connection was not established.")

    start_time = time.perf_counter()  # monotonic clock, suited to measuring durations
    cursor = None  # defined up front so `finally` is safe even if conn.cursor() raises
    try:
        cursor = conn.cursor()
        cursor.execute(query)

        # Fetch all results from the query
        rows = cursor.fetchall()

        # Get column names from cursor
        columns = [desc[0] for desc in cursor.description]

        # Create a pandas DataFrame from the results
        df = pd.DataFrame.from_records(rows, columns=columns)

    except pyodbc.Error as e:
        print(f"Error executing query: {e}")
        return None

    finally:
        # Only close a cursor that was actually opened.
        if cursor is not None:
            cursor.close()

    execution_time = time.perf_counter() - start_time

    # Report how long the round trip took
    print(f"Query executed in: {execution_time:.4f} seconds")

    return df

In [13]:
# Connection settings are read from the environment so that no credential is stored
# in the notebook. Server, database and user fall back to the previous values; the
# password deliberately has no default and must be provided via DIABETES_DB_PASSWORD.
# NOTE: the password that used to be hard-coded here should be treated as exposed and rotated.
server = os.environ.get('DIABETES_DB_SERVER', 'ROHIT')
database = os.environ.get('DIABETES_DB_NAME', 'DiabetesData')
username = os.environ.get('DIABETES_DB_USER', 'rohit_kosamkar')
password = os.environ['DIABETES_DB_PASSWORD']  # KeyError if the variable is not set

# Establish connection
conn = create_sql_connection(server, database, username, password)

Connection established successfully!


In [14]:
# Patient validation records left-joined to the three lookup tables (admission source,
# discharge disposition, admission type) and to the readmission-status table.
# `select *` returns every column of every joined table, so the result contains repeated
# column names (the columns listing below shows e.g. `admission_source_id`,
# `discharge_disposition_id` and `description` more than once). The three aliases at the
# end of the select list give the lookup descriptions unique names.
query = '''
select *, admissionsource.description as admission_source_des, admissiontype.description as admission_type_desc, discharge_disposition.description as discharge_desposition_desc 
from patient_data_validation
left join admissionsource on admissionsource.admission_source_id = patient_data_validation.admission_source_id
left join discharge_disposition on discharge_disposition.discharge_disposition_id = patient_data_validation.discharge_disposition_id
left join admissiontype on admissiontype.admission_type_id = patient_data_validation.admission_type_id
left join patient_readmission_status on patient_readmission_status.encounter_id = patient_data_validation.encounter_id'''

In [15]:
data_temp = query_data(conn, query)
data_temp.shape

Query executed in: 0.6590 seconds


(16766, 60)

In [16]:
data = data_temp.copy(deep=True)

# Data Cleaning

In [17]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed',
       'admission_source_id', 'description', 'discharge_disposition_id',
       'description', 'admission_

In [18]:
selected_col = ['encounter_id', 'race', 'gender', 'age', 'weight',
       
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed',
        'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc']

In [19]:
# Keep only the modelling columns.
# The `select *` join returns some column names more than once (`encounter_id` comes
# back from two of the joined tables), so selecting by name would yield that column twice.
# Drop repeated names explicitly instead of trimming "the first column" by position:
# the positional trim breaks as soon as the query stops returning a duplicate, and
# re-running the cell would strip one more column each time.
# keep='last' retains the same copy of `encounter_id` that the positional trim kept.
# Building from `data_temp` (the untouched query result) makes the cell safe to re-run.
data = data_temp.loc[:, ~data_temp.columns.duplicated(keep='last')][selected_col]

In [20]:
data.columns

Index(['encounter_id', 'race', 'gender', 'age', 'weight', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc'],
      dtype='object')

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16766 entries, 0 to 16765
Data columns (total 49 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   encounter_id                16766 non-null  int64 
 1   race                        16766 non-null  object
 2   gender                      16766 non-null  object
 3   age                         16766 non-null  object
 4   weight                      16766 non-null  object
 5   time_in_hospital            16766 non-null  int64 
 6   payer_code                  16766 non-null  object
 7   medical_specialty           16766 non-null  object
 8   num_lab_procedures          16766 non-null  int64 
 9   num_procedures              16766 non-null  int64 
 10  num_medications             16766 non-null  int64 
 11  number_outpatient           16766 non-null  int64 
 12  number_emergency            16766 non-null  int64 
 13  number_inpatient            16766 non-null  in

In [22]:
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0
mean,341265400.0,4.113981,42.058988,1.364249,16.833174,0.493201,0.296374,0.657342,8.14589
std,54678210.0,2.80844,21.621008,1.795453,8.071631,1.468929,1.331495,1.32016,1.610011
min,268763500.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,287676800.0,2.0,29.0,0.0,11.0,0.0,0.0,0.0,8.0
50%,334883800.0,3.0,44.0,1.0,16.0,0.0,0.0,0.0,9.0
75%,390234200.0,5.0,58.0,2.0,21.0,0.0,0.0,1.0,9.0
max,443867200.0,14.0,126.0,6.0,68.0,40.0,64.0,16.0,16.0


In [23]:
data.isnull().sum()

encounter_id                      0
race                              0
gender                            0
age                               0
weight                            0
time_in_hospital                  0
payer_code                        0
medical_specialty                 0
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency                  0
number_inpatient                  0
diag_1                            0
diag_2                            0
diag_3                            0
number_diagnoses                  0
max_glu_serum                 16563
A1Cresult                     13797
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide                    0
glimepiride                       0
acetohexamide                     0
glipizide                         0
glyburide                   

### Step 1
- Drop `max_glu_serum` and `A1Cresult`, since most of their values are null (16563 and 13797 of 16766 rows, i.e. about 99% and 82% respectively)

In [24]:
# Remove the two lab-result columns that are mostly null
data = data.drop(columns=['max_glu_serum', 'A1Cresult'])

In [25]:
data.shape

(16766, 47)

In [26]:
data['number_emergency'] = data['number_emergency'].fillna(0)

In [27]:
data.isnull().sum()

encounter_id                  0
race                          0
gender                        0
age                           0
weight                        0
time_in_hospital              0
payer_code                    0
medical_specialty             0
num_lab_procedures            0
num_procedures                0
num_medications               0
number_outpatient             0
number_emergency              0
number_inpatient              0
diag_1                        0
diag_2                        0
diag_3                        0
number_diagnoses              0
metformin                     0
repaglinide                   0
nateglinide                   0
chlorpropamide                0
glimepiride                   0
acetohexamide                 0
glipizide                     0
glyburide                     0
tolbutamide                   0
pioglitazone                  0
rosiglitazone                 0
acarbose                      0
miglitol                      0
troglita

In [28]:
# List every object-dtype column together with the distinct values it holds
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for column_name, column_values in data[categorical_columns].items():
    print(f"{column_name}: {column_values.unique()}")


38
race: ['AfricanAmerican' 'Caucasian' 'Other' '?' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['[50-60)' '[80-90)' '[60-70)' '[70-80)' '[90-100)' '[40-50)' '[30-40)'
 '[20-30)' '[10-20)' '[0-10)']
weight: ['?' '[75-100)' '[100-125)' '[50-75)' '[125-150)' '[150-175)' '[25-50)'
 '[175-200)' '>200' '[0-25)']
payer_code: ['DM' 'PO' 'MC' 'MD' 'BC' 'HM' 'SP' '?' 'OG' 'CP' 'UN' 'CM' 'SI' 'CH' 'MP'
 'OT' 'WC' 'FR']
medical_specialty: ['?' 'Emergency/Trauma' 'Radiologist' 'InternalMedicine' 'Orthopedics'
 'Surgery-General' 'Family/GeneralPractice' 'Oncology' 'Cardiology'
 'Nephrology' 'Hematology' 'Gynecology' 'Podiatry'
 'ObstetricsandGynecology' 'Orthopedics-Reconstructive' 'Urology'
 'Radiology' 'Gastroenterology' 'Psychiatry' 'Hospitalist'
 'Surgery-Vascular' 'Ophthalmology' 'Pulmonology' 'Psychology'
 'Surgery-Neuro' 'Pediatrics' 'Neurology' 'InfectiousDiseases'
 'Endocrinology' 'Surgery-Cardiovascular' 'Otolaryngology'
 'Surgery-Cardiovascular/Thoracic' 'Surgery-Thoracic' 'Surgic

In [29]:
data['race'].value_counts()

race
Caucasian          12946
AfricanAmerican     2221
Hispanic             479
?                    477
Other                440
Asian                203
Name: count, dtype: int64

In [30]:
# race: the 477 '?' (unknown) entries are merged into the existing 'Other' category
data['race'] = data['race'].replace('?','Other')
data['race'].value_counts()

race
Caucasian          12946
AfricanAmerican     2221
Other                917
Hispanic             479
Asian                203
Name: count, dtype: int64

In [31]:
# gender: any "Unknown/Invalid" entry is replaced with 'Female', the more frequent gender in the dataset.
# NOTE(review): the unique-value listing for this extract shows only 'Female' and 'Male', so this
# replacement looks like a no-op here; the earlier "only 3 values" remark presumably came from a
# different dataset - confirm.

data['gender'] = data['gender'].replace('Unknown/Invalid', 'Female')
data['gender'].value_counts()

gender
Female    8898
Male      7868
Name: count, dtype: int64

In [32]:
data['admission_type_desc'].value_counts()

admission_type_desc
Emergency        10299
Elective          3520
Urgent            2454
NULL               403
Not Mapped          56
Not Available       24
Trauma Center        8
Newborn              2
Name: count, dtype: int64

In [None]:
# # admission_type_description -  Contains 4785 Null values, replacing it with the existing category "Not Available". Also Not Mapped replace with same category 
# data['admission_type_desc'].value_counts(dropna=False)
# data['admission_type_desc'] = data['admission_type_desc'].fillna('Other').replace({'Not Mapped': 'Not Available','NULL': 'Not Available'})
# data['admission_type_desc'].value_counts(dropna=False)

In [33]:
pd.crosstab(data['admission_type_desc'], data['readmitted'])

readmitted,NO,YES
admission_type_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
Elective,2425,1095
Emergency,6046,4253
,273,130
Newborn,2,0
Not Available,19,5
Not Mapped,45,11
Trauma Center,8,0
Urgent,1513,941


In [34]:
# Define rare categories ('NULL' is the literal string stored in the lookup table)
rare_categories = ['Trauma Center', 'Newborn','NULL','Not Available','Not Mapped']

# Replace rare categories with 'Other'.
# A genuinely missing value (NaN, e.g. from an unmatched left join) is also sent to
# 'Other', consistent with how the discharge and admission-source groupings treat
# missing descriptions; previously NaN passed through unchanged.
admission_type = data['admission_type_desc']
data['admission_type_desc'] = admission_type.mask(
    admission_type.isna() | admission_type.isin(rare_categories), 'Other'
)

data['admission_type_desc'].value_counts()

admission_type_desc
Emergency    10299
Elective      3520
Urgent        2454
Other          493
Name: count, dtype: int64

In [35]:
# discharge_disposition_description
data['discharge_desposition_desc'].value_counts(dropna= False)

discharge_desposition_desc
Discharged to home                                                                                           10346
Discharged/transferred to SNF                                                                                 2649
Discharged/transferred to home with home health service                                                       2299
Discharged/transferred to another rehab fac including rehab units of a hospital .                              353
Discharged/transferred to another short term hospital                                                          316
Expired                                                                                                        231
Hospice / home                                                                                                 112
Left AMA                                                                                                       106
Hospice / medical facility                           

In [36]:
## making subcategories for the discharge disposition
# Maps each raw discharge description to one of six coarse groups.
# The lookup tables store a missing description as the literal string 'NULL' (see the
# admission type / admission source value counts), so 'NULL' is mapped explicitly;
# without it `.map()` leaves those rows as NaN and a crosstab on the mapped values
# silently drops them.
discharge_categories = {
    #Discharged to Home
    'Discharged to home': 'Discharged to Home',
    'Discharged/transferred to home with home health service': 'Discharged to Home',
    'Discharged/transferred to home under care of Home IV provider': 'Discharged to Home',

    #Transfers to Other Healthcare Facilities
    'Discharged/transferred to SNF': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another short term hospital': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another rehab fac including rehab units of a hospital .': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another type of inpatient care institution': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to ICF': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a long term care hospital.': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a federal health care facility.': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred within this institution to Medicare approved swing bed': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.': 'Transfers to Other Healthcare Facilities',
    'Neonate discharged to another hospital for neonatal aftercare': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred to this institution for outpatient services': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred another institution for outpatient services': 'Transfers to Other Healthcare Facilities',

    #Expired
    'Expired': 'Expired',
    'Expired at home. Medicaid only, hospice.': 'Expired',
    'Expired in a medical facility. Medicaid only, hospice.': 'Expired',

    #Hospice Care
    'Hospice / home': 'Hospice Care',
    'Hospice / medical facility': 'Hospice Care',

    #AMA (Against Medical Advice)
    'Left AMA': 'AMA (Against Medical Advice)',

    #Other (unmapped, missing or not-yet-discharged records)
    'Not Mapped': 'Other',
    'NULL': 'Other',
    'NaN':'Other',
    'Still patient or expected to return for outpatient services': 'Other',
    'Admitted as an inpatient to this hospital': 'Other',

    np.nan: 'Other'
}

 

In [37]:
# Expired type not in target
pd.crosstab(data['discharge_desposition_desc'].map(discharge_categories), data['readmitted'])


readmitted,NO,YES
discharge_desposition_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
AMA (Against Medical Advice),57,49
Discharged to Home,7866,4779
Expired,232,0
Hospice Care,184,24
Other,6,2
Transfers to Other Healthcare Facilities,1981,1578


In [38]:
data['discharge_category'] = data['discharge_desposition_desc'].map(discharge_categories).fillna('Other')
data['discharge_category'].value_counts(dropna=False)


discharge_category
Discharged to Home                          12645
Transfers to Other Healthcare Facilities     3559
Expired                                       232
Hospice Care                                  208
AMA (Against Medical Advice)                  106
Other                                          16
Name: count, dtype: int64

In [39]:
# admission_source_description

data['admission_source_des'].value_counts(dropna=False)

admission_source_des
Emergency Room                                               11205
Physician Referral                                            4663
Transfer from a hospital                                       335
NULL                                                           267
Transfer from a Skilled Nursing Facility (SNF)                 144
Transfer from another health care facility                     107
Clinic Referral                                                 14
Transfer from hospital inpt/same fac reslt in a sep claim       10
Court/Law Enforcement                                            6
Not Available                                                    6
HMO Referral                                                     5
Transfer from Ambulatory Surgery Center                          2
Normal Delivery                                                  1
Sick Baby                                                        1
Name: count, dtype: int64

In [40]:
# ## making subcategories for the admission source
# Maps each raw admission-source description to one of four coarse groups.
# Missing descriptions arrive as the literal string 'NULL' (267 rows in the value counts
# above), so 'NULL' is mapped explicitly rather than relying on a later fillna.

admission_source_categories = {
    'Emergency Room': 'Emergency Admission',
    'Court/Law Enforcement': 'Emergency Admission',

    'Physician Referral': 'Physician Referral',
    'HMO Referral': 'Physician Referral',
    'Clinic Referral': 'Physician Referral',

    'Transfer from a hospital': 'Transfers from Other Facilities',
    'Transfer from another health care facility': 'Transfers from Other Facilities',
    'Transfer from a Skilled Nursing Facility (SNF)': 'Transfers from Other Facilities',
    'Transfer from hospital inpt/same fac reslt in a sep claim': 'Transfers from Other Facilities',
    # the misspelling "critial" is kept on purpose: keys must match the stored text exactly
    'Transfer from critial access hospital': 'Transfers from Other Facilities',
    'Transfer from Ambulatory Surgery Center': 'Transfers from Other Facilities',

    'Extramural Birth': 'Other',
    'Normal Delivery': 'Other',
    'Sick Baby': 'Other',

    np.nan: 'Other',

    'NULL': 'Other',
    'Not Mapped': 'Other',
    'Not Available': 'Other'
}

In [41]:
data['admission_category'] = data['admission_source_des'].map(admission_source_categories).fillna('Other')
data['admission_category'].value_counts(dropna=False)

admission_category
Emergency Admission                11211
Physician Referral                  4682
Transfers from Other Facilities      598
Other                                275
Name: count, dtype: int64

In [42]:
pd.crosstab(data['admission_category'], data['readmitted'])



readmitted,NO,YES
admission_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Emergency Admission,6595,4616
Other,188,87
Physician Referral,3129,1553
Transfers from Other Facilities,419,179


In [43]:
data['medical_specialty'].value_counts(dropna=False)

medical_specialty
?                                    11326
Emergency/Trauma                      1673
InternalMedicine                      1156
Radiologist                            524
Cardiology                             487
Family/GeneralPractice                 415
Surgery-General                        368
Orthopedics                            209
Nephrology                              72
Pulmonology                             48
Gastroenterology                        46
Urology                                 35
Orthopedics-Reconstructive              34
ObstetricsandGynecology                 31
Psychiatry                              30
Hematology                              28
Surgery-Vascular                        24
Oncology                                23
Surgery-Cardiovascular/Thoracic         23
Neurology                               20
Gynecology                              19
SurgicalSpecialty                       19
Podiatry                            

In [44]:
del data['medical_specialty']

In [45]:
# examide and citoglipton carry a single value ("No") for every row, so they are removed
for constant_col in ('citoglipton', 'examide'):
    print(data[constant_col].value_counts())

data = data.drop(columns=['examide', 'citoglipton'])

citoglipton
No    16766
Name: count, dtype: int64
examide
No    16766
Name: count, dtype: int64


In [46]:
data.shape

(16766, 46)

In [47]:
data.head()

Unnamed: 0,encounter_id,race,gender,age,weight,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,admission_source_des,admission_type_desc,discharge_desposition_desc,discharge_category,admission_category
0,268763496,AfricanAmerican,Female,[50-60),?,3,DM,14,0,12,1,0,1,402,496,413,9,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,YES,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission
1,268777020,Caucasian,Male,[50-60),?,4,PO,37,1,20,0,0,0,327,780,493,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission
2,268780680,Caucasian,Female,[80-90),?,4,MC,63,0,29,0,1,1,428,486,585,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency Room,Emergency,Discharged/transferred to SNF,Transfers to Other Healthcare Facilities,Emergency Admission
3,268784670,AfricanAmerican,Female,[50-60),?,2,MD,72,1,18,0,0,0,38,496,599,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission
4,268787766,Caucasian,Female,[60-70),?,1,MC,28,0,13,0,0,0,564,135,428,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission


In [48]:
pd.crosstab(data['payer_code'],data['readmitted'])

readmitted,NO,YES
payer_code,Unnamed: 1_level_1,Unnamed: 2_level_1
?,1550,693
BC,831,326
CH,58,24
CM,395,225
CP,424,169
DM,65,61
FR,1,0
HM,826,707
MC,4285,3135
MD,613,361


In [49]:
data['payer_code'] = data['payer_code'].replace('?','Other')

In [50]:
data.shape

(16766, 46)

In [51]:
data[(data['number_outpatient']>0) & (data['number_inpatient'] > 0)].shape

(1710, 46)

## Outlier Treatment

In [52]:
# def treat_outliers(df, columns):
   
#     for column in columns:
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR
#         # Cap outliers
#         df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
#         df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
#     return df

In [53]:
# data = treat_outliers(data, ['num_lab_procedures','num_medications',])
# data.describe()

In [54]:
# Box-Cox transformation
# The visit counts are shifted by +1 before the transform because they contain zeros.
# The fitted lambda of each column is kept in `boxcox_lambdas` (it used to be discarded
# into `_`, which also clobbers IPython's last-output variable) so the exact
# transformation can be reported and reproduced.
# NOTE(review): lambda is estimated from this dataset itself; if these features feed a
# model fitted on other data, the lambdas from that fit should presumably be reused - confirm.
boxcox_lambdas = {}
for count_col in ['number_outpatient', 'number_inpatient']:
    data[f'{count_col}_boxcox'], boxcox_lambdas[count_col] = stats.boxcox(data[count_col] + 1)


In [55]:
# Log Transformation
data['number_outpatient_treated'] = np.log1p(data['number_outpatient'])
data['number_inpatient_treated'] = np.log1p(data['number_inpatient'])

In [56]:
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,number_outpatient_boxcox,number_inpatient_boxcox,number_outpatient_treated,number_inpatient_treated
count,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0
mean,341265400.0,4.113981,42.058988,1.364249,16.833174,0.493201,0.296374,0.657342,8.14589,0.05068,0.136874,0.225775,0.33374
std,54678210.0,2.80844,21.621008,1.795453,8.071631,1.468929,1.331495,1.32016,1.610011,0.098564,0.191407,0.486705,0.518125
min,268763500.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,287676800.0,2.0,29.0,0.0,11.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
50%,334883800.0,3.0,44.0,1.0,16.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
75%,390234200.0,5.0,58.0,2.0,21.0,0.0,0.0,1.0,9.0,0.0,0.366708,0.0,0.693147
max,443867200.0,14.0,126.0,6.0,68.0,40.0,64.0,16.0,16.0,0.250403,0.478625,3.713572,2.833213


In [57]:
del data['number_outpatient_treated']
del data['number_inpatient_treated']
         

In [58]:
del data['number_outpatient']

del data['number_inpatient']
         

In [59]:
data.describe()

Unnamed: 0,encounter_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses,number_outpatient_boxcox,number_inpatient_boxcox
count,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0
mean,341265400.0,4.113981,42.058988,1.364249,16.833174,0.296374,8.14589,0.05068,0.136874
std,54678210.0,2.80844,21.621008,1.795453,8.071631,1.331495,1.610011,0.098564,0.191407
min,268763500.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
25%,287676800.0,2.0,29.0,0.0,11.0,0.0,8.0,0.0,0.0
50%,334883800.0,3.0,44.0,1.0,16.0,0.0,9.0,0.0,0.0
75%,390234200.0,5.0,58.0,2.0,21.0,0.0,9.0,0.0,0.366708
max,443867200.0,14.0,126.0,6.0,68.0,64.0,16.0,0.250403,0.478625


In [60]:
data['weight'].value_counts()

weight
?            16301
[75-100)       183
[50-75)        116
[100-125)      107
[125-150)       30
[25-50)         15
[150-175)        8
[175-200)        3
[0-25)           2
>200             1
Name: count, dtype: int64

In [61]:
del data['weight']

# Data Preprocessing

In [62]:
# # Check for unique values in categorical columns
# categorical_columns = data.select_dtypes(include=['object']).columns
# print(len(categorical_columns))
# for col in categorical_columns:
#     print(f"{col}: {data[col].unique()}")


In [64]:
# Remove two combination-drug flags, the raw diagnosis codes, and the raw description
# columns that discharge_category / admission_category were derived from
unused_columns = [
    'metformin_rosiglitazone', 'metformin_pioglitazone',
    'diag_1', 'diag_2', 'diag_3',
    'discharge_desposition_desc', 'admission_source_des',
]
data = data.drop(columns=unused_columns)

In [63]:
# # Check for unique values in categorical columns
# categorical_columns = data.select_dtypes(include=['object']).columns
# print(len(categorical_columns))
# for col in categorical_columns:
#     print(f"{col}: {data[col].unique()}")


In [65]:
# Function to clean the age column using regex
def clean_age_column(age_series):
    """
    Strip interval punctuation from age-bracket labels, e.g. '[50-60)' -> '50-60'.

    Parameters:
    - age_series: A pandas Series of strings.

    Returns:
    - A new Series with every '[', ']', '(', ')' and space character removed.
    """
    punctuation_pattern = r'[\[\]() ]'
    stripped = age_series.str.replace(punctuation_pattern, '', regex=True)
    return stripped

# Cleaning the age column
data['age'] = clean_age_column(data['age'])
data.head()

Unnamed: 0,encounter_id,race,gender,age,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,number_outpatient_boxcox,number_inpatient_boxcox
0,268763496,AfricanAmerican,Female,50-60,3,DM,14,0,12,0,9,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,Ch,Yes,YES,Emergency,Discharged to Home,Emergency Admission,0.234683,0.366708
1,268777020,Caucasian,Male,50-60,4,PO,37,1,20,0,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency,Discharged to Home,Emergency Admission,0.0,0.0
2,268780680,Caucasian,Female,80-90,4,MC,63,0,29,1,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,Ch,Yes,NO,Emergency,Transfers to Other Healthcare Facilities,Emergency Admission,0.0,0.366708
3,268784670,AfricanAmerican,Female,50-60,2,MD,72,1,18,0,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency,Discharged to Home,Emergency Admission,0.0,0.0
4,268787766,Caucasian,Female,60-70,1,MC,28,0,13,0,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Yes,NO,Emergency,Discharged to Home,Emergency Admission,0.0,0.0


In [66]:
# Convert the target variable to binary (0 for NO, 1 for YES)
data['readmitted'] = data['readmitted'].map({'NO': 0, 'YES': 1})

In [67]:
# # Calculate the total count of events (YES) and non-events (NO)
# total_events = data['readmitted'].sum()
# total_non_events = data['readmitted'].count() - total_events


In [68]:
# # Calculate WoE and IV
# def calculate_woe_iv(df, feature, target):
#     # Create a dataframe to hold the counts
#     woe_iv = pd.DataFrame()
    
#     # Group by the feature and calculate the count of events and non-events
#     woe_iv['total'] = df.groupby(feature)[target].count()
#     woe_iv['events'] = df.groupby(feature)[target].sum()
#     woe_iv['non_events'] = woe_iv['total'] - woe_iv['events']
    
#     # Calculate event and non-event rates
#     woe_iv['event_rate'] = woe_iv['events'] / total_events
#     woe_iv['non_event_rate'] = woe_iv['non_events'] / total_non_events
    
#     # Calculate WoE
#     woe_iv['woe'] = np.log(woe_iv['event_rate'] / woe_iv['non_event_rate'])
    
#     # Calculate IV
#     woe_iv['iv'] = (woe_iv['event_rate'] - woe_iv['non_event_rate']) * woe_iv['woe']
    
#     # Return the DataFrame with WoE and IV
#     return woe_iv

In [69]:
# # Calculate WoE and IV for the payer_code column
# woe_iv_results = calculate_woe_iv(data, 'payer_code', 'readmitted')

# # Display results
# woe_iv_results


In [70]:
# # Calculate total IV
# total_iv = woe_iv_results['iv'].sum()
# print(f'Total IV for payer_code: {total_iv}')

In [71]:
# # Calculate WoE and IV for the age column
# woe_iv_results_age = calculate_woe_iv(data, 'age', 'readmitted')

In [74]:
# Step 1: Create a dictionary to map age categories to WoE values
# Keys use the cleaned age labels ('50-60' rather than '[50-60)').
# NOTE(review): the WoE values are hard-coded; presumably they were produced by the
# commented-out calculate_woe_iv on another (training) dataset - confirm their source.
woe_mapping = {
    '0-10': -1.416239,
    '10-20': -0.328822,
    '20-30': -0.026394,
    '30-40': -0.132715,
    '40-50': -0.063792,
    '50-60': -0.096065,
    '60-70': 0.012694,
    '70-80': 0.088174,
    '80-90': 0.086466,
    '90-100': -0.281206
}

# Any age label that is not a key of woe_mapping becomes NaN in woe_age.
data['woe_age'] = data['age'].map(woe_mapping)

In [75]:
data.head()

Unnamed: 0,encounter_id,race,gender,age,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_emergency,number_diagnoses,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,number_outpatient_boxcox,number_inpatient_boxcox,woe_age
0,268763496,AfricanAmerican,Female,50-60,3,DM,14,0,12,0,9,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,Ch,Yes,1,Emergency,Discharged to Home,Emergency Admission,0.234683,0.366708,-0.096065
1,268777020,Caucasian,Male,50-60,4,PO,37,1,20,0,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,Emergency,Discharged to Home,Emergency Admission,0.0,0.0,-0.096065
2,268780680,Caucasian,Female,80-90,4,MC,63,0,29,1,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,Ch,Yes,0,Emergency,Transfers to Other Healthcare Facilities,Emergency Admission,0.0,0.366708,0.086466
3,268784670,AfricanAmerican,Female,50-60,2,MD,72,1,18,0,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,Emergency,Discharged to Home,Emergency Admission,0.0,0.0,-0.096065
4,268787766,Caucasian,Female,60-70,1,MC,28,0,13,0,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Yes,0,Emergency,Discharged to Home,Emergency Admission,0.0,0.0,0.012694


In [76]:
# Write the cleaned frame to the interim data folder.
# NOTE(review): with the default index=True the row index is written as an extra unnamed
# first column; confirm the reader of this file expects that (otherwise pass index=False).
data.to_csv('../data/interim/validation_data.csv')