In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import time
import pyodbc
print(pyodbc.drivers())
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
from scipy import stats

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


# SQL Connection

In [2]:
def create_sql_connection(server, database, username, password, driver='{ODBC Driver 17 for SQL Server}'):
    """
    Establish a connection to a SQL Server database using pyodbc.

    Parameters:
    - server (str): The SQL Server address (e.g., 'localhost' or server IP).
    - database (str): The name of the database you want to connect to.
    - username (str): SQL Server username.
    - password (str): SQL Server password.
    - driver (str): ODBC driver to use. Default is '{ODBC Driver 17 for SQL Server}'.s

    Returns:
    - conn: A pyodbc connection object if successful.
    """
    connection_string = f"""
        DRIVER={driver};
        SERVER={server};
        DATABASE={database};
        UID={username};
        PWD={password};
    """
    try:
        conn = pyodbc.connect(connection_string)
        print("Connection established successfully!")
        return conn
    except Exception as e:
        print(f"Failed to connect to the database. Error: {e}")
        return None

In [3]:
def query_data(conn, query):
    """
    Execute a SQL query and fetch results as a pandas DataFrame.
    
    Parameters:
    - conn: A pyodbc connection object.
    - query (str): The SQL query to be executed.
    
    Returns:
    - df: A pandas DataFrame containing the query result.
    """
    start_time = time.time()  # Start time measurement
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        
        # Fetch all results from the query
        rows = cursor.fetchall()
        
        # Get column names from cursor
        columns = [desc[0] for desc in cursor.description]
        
        # Create a pandas DataFrame from the results
        df = pd.DataFrame.from_records(rows, columns=columns)
        
    except pyodbc.Error as e:
        print(f"Error executing query: {e}")
        return None
    
    finally:
        cursor.close()
    
    end_time = time.time()  # End time measurement
    execution_time = end_time - start_time  # Calculate execution time
    
    # Print the DataFrame and execution time
    print(f"Query executed in: {execution_time:.4f} seconds")
    
    return df  

In [4]:
server = 'ROHIT'     
database = 'DiabetesData'  
username = 'rohit_kosamkar'       
password = 'September@2024' 

# Establish connection
conn = create_sql_connection(server, database, username, password)

Connection established successfully!


In [7]:
query = '''
select *, admissionsource.description as admission_source_des, admissiontype.description as admission_type_desc, discharge_disposition.description as discharge_desposition_desc 
from validation_data
left join admissionsource on admissionsource.admission_source_id = validation_data.admission_source_id
left join discharge_disposition on discharge_disposition.discharge_disposition_id = validation_data.discharge_disposition_id
left join admissiontype on admissiontype.admission_type_id = validation_data.admission_type_id
left join patient_readmission_status on patient_readmission_status.encounter_id = validation_data.encounter_id'''

In [8]:
data_temp = query_data(conn, query)
data_temp.shape

Query executed in: 1.1020 seconds


(16766, 60)

In [9]:
data = data_temp.copy(deep=True)

# Data Cleaning

In [10]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed',
       'admission_source_id', 'description', 'discharge_disposition_id',
       'description', 'admission_

In [11]:
selected_col = ['encounter_id', 'patient_nbr','race', 'gender', 'age', 'weight',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed',
        'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc']

In [12]:
data = data[selected_col].iloc[:,1:]

In [13]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc'],
      dtype='object')

In [14]:
data.head(3)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,admission_source_des,admission_type_desc,discharge_desposition_desc
0,268763496,89048466,AfricanAmerican,Female,[50-60),?,3,DM,?,14,0,12,1,0,1,402,496,413,9,,Norm,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,YES,Emergency Room,Emergency,Discharged to home
1,268777020,50550156,Caucasian,Male,[50-60),?,4,PO,?,37,1,20,0,0,0,327,780,493,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home
2,268780680,67522518,Caucasian,Female,[80-90),?,4,MC,?,63,0,29,0,1,1,428,486,585,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency Room,Emergency,Discharged/transferred to SNF


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16766 entries, 0 to 16765
Data columns (total 50 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   encounter_id                16766 non-null  int64 
 1   patient_nbr                 16766 non-null  int64 
 2   race                        16766 non-null  object
 3   gender                      16766 non-null  object
 4   age                         16766 non-null  object
 5   weight                      16766 non-null  object
 6   time_in_hospital            16766 non-null  int64 
 7   payer_code                  16766 non-null  object
 8   medical_specialty           16766 non-null  object
 9   num_lab_procedures          16766 non-null  int64 
 10  num_procedures              16766 non-null  int64 
 11  num_medications             16766 non-null  int64 
 12  number_outpatient           16766 non-null  int64 
 13  number_emergency            16766 non-null  in

In [16]:
data.describe()

Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0
mean,341265400.0,82243520.0,4.113981,42.058988,1.364249,16.833174,0.493201,0.296374,0.657342,8.14589
std,54678210.0,42174510.0,2.80844,21.621008,1.795453,8.071631,1.468929,1.331495,1.32016,1.610011
min,268763500.0,40104.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,287676800.0,42564930.0,2.0,29.0,0.0,11.0,0.0,0.0,0.0,8.0
50%,334883800.0,84725850.0,3.0,44.0,1.0,16.0,0.0,0.0,0.0,9.0
75%,390234200.0,105719800.0,5.0,58.0,2.0,21.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,14.0,126.0,6.0,68.0,40.0,64.0,16.0,16.0


In [17]:
data.isnull().sum()

encounter_id                      0
patient_nbr                       0
race                              0
gender                            0
age                               0
weight                            0
time_in_hospital                  0
payer_code                        0
medical_specialty                 0
num_lab_procedures                0
num_procedures                    0
num_medications                   0
number_outpatient                 0
number_emergency                  0
number_inpatient                  0
diag_1                            0
diag_2                            0
diag_3                            0
number_diagnoses                  0
max_glu_serum                 16563
A1Cresult                     13797
metformin                         0
repaglinide                       0
nateglinide                       0
chlorpropamide                    0
glimepiride                       0
acetohexamide                     0
glipizide                   

In [18]:
# data.drop(columns={'max_glu_serum', 'A1Cresult'},inplace=True)
data['max_glu_serum'] = data['max_glu_serum'].fillna('No')
data['A1Cresult'] = data['A1Cresult'].fillna('No')

In [19]:
data.shape

(16766, 50)

In [20]:
data['number_emergency'] = data['number_emergency'].fillna(0)

In [21]:
data.isnull().sum()

encounter_id                  0
patient_nbr                   0
race                          0
gender                        0
age                           0
weight                        0
time_in_hospital              0
payer_code                    0
medical_specialty             0
num_lab_procedures            0
num_procedures                0
num_medications               0
number_outpatient             0
number_emergency              0
number_inpatient              0
diag_1                        0
diag_2                        0
diag_3                        0
number_diagnoses              0
max_glu_serum                 0
A1Cresult                     0
metformin                     0
repaglinide                   0
nateglinide                   0
chlorpropamide                0
glimepiride                   0
acetohexamide                 0
glipizide                     0
glyburide                     0
tolbutamide                   0
pioglitazone                  0
rosiglit

In [22]:
# Check for unique values in categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for col in categorical_columns:
    print(f"{col}: {data[col].unique()}")


40
race: ['AfricanAmerican' 'Caucasian' 'Other' '?' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['[50-60)' '[80-90)' '[60-70)' '[70-80)' '[90-100)' '[40-50)' '[30-40)'
 '[20-30)' '[10-20)' '[0-10)']
weight: ['?' '[75-100)' '[100-125)' '[50-75)' '[125-150)' '[150-175)' '[25-50)'
 '[175-200)' '>200' '[0-25)']
payer_code: ['DM' 'PO' 'MC' 'MD' 'BC' 'HM' 'SP' '?' 'OG' 'CP' 'UN' 'CM' 'SI' 'CH' 'MP'
 'OT' 'WC' 'FR']
medical_specialty: ['?' 'Emergency/Trauma' 'Radiologist' 'InternalMedicine' 'Orthopedics'
 'Surgery-General' 'Family/GeneralPractice' 'Oncology' 'Cardiology'
 'Nephrology' 'Hematology' 'Gynecology' 'Podiatry'
 'ObstetricsandGynecology' 'Orthopedics-Reconstructive' 'Urology'
 'Radiology' 'Gastroenterology' 'Psychiatry' 'Hospitalist'
 'Surgery-Vascular' 'Ophthalmology' 'Pulmonology' 'Psychology'
 'Surgery-Neuro' 'Pediatrics' 'Neurology' 'InfectiousDiseases'
 'Endocrinology' 'Surgery-Cardiovascular' 'Otolaryngology'
 'Surgery-Cardiovascular/Thoracic' 'Surgery-Thoracic' 'Surgic

In [23]:
data['race'].value_counts()

race
Caucasian          12946
AfricanAmerican     2221
Hispanic             479
?                    477
Other                440
Asian                203
Name: count, dtype: int64

In [24]:
# race (~2k values replaced with 'Other' category)
data['race'] = data['race'].replace('?','Other')
data['race'].value_counts()

race
Caucasian          12946
AfricanAmerican     2221
Other                917
Hispanic             479
Asian                203
Name: count, dtype: int64

In [25]:
# gender (Only 3 values with "Unknown/Invalid" entry replacing it with the 'Female' as female proportion in the dataset is on higher side)

data['gender'] = data['gender'].replace('Unknown/Invalid', 'Female')
data['gender'].value_counts()

gender
Female    8898
Male      7868
Name: count, dtype: int64

In [26]:
data['admission_type_desc'].value_counts()

admission_type_desc
Emergency        10299
Elective          3520
Urgent            2454
NULL               403
Not Mapped          56
Not Available       24
Trauma Center        8
Newborn              2
Name: count, dtype: int64

In [99]:
# # admission_type_description -  Contains 4785 Null values, replacing it with the existing category "Not Available". Also Not Mapped replace with same category 
# data['admission_type_desc'].value_counts(dropna=False)
# data['admission_type_desc'] = data['admission_type_desc'].fillna('Other').replace({'Not Mapped': 'Not Available','NULL': 'Not Available'})
# data['admission_type_desc'].value_counts(dropna=False)

In [27]:
pd.crosstab(data['admission_type_desc'], data['readmitted'])

readmitted,NO,YES
admission_type_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
Elective,2425,1095
Emergency,6046,4253
,273,130
Newborn,2,0
Not Available,19,5
Not Mapped,45,11
Trauma Center,8,0
Urgent,1513,941


In [28]:
# Admission Type categories
admission_type_other_categories = ['Trauma Center', 'Newborn','NULL','Not Available','Not Mapped']

# Replacing admission type categories with 'Other'
data['admission_type_desc'] = data['admission_type_desc'].apply(
    lambda x: 'Other' if x in admission_type_other_categories else x
)

data['admission_type_desc'].value_counts()

admission_type_desc
Emergency    10299
Elective      3520
Urgent        2454
Other          493
Name: count, dtype: int64

In [29]:
# discharge_disposition_description
data['discharge_desposition_desc'].value_counts(dropna= False)

discharge_desposition_desc
Discharged to home                                                                                           10346
Discharged/transferred to SNF                                                                                 2649
Discharged/transferred to home with home health service                                                       2299
Discharged/transferred to another rehab fac including rehab units of a hospital .                              353
Discharged/transferred to another short term hospital                                                          316
Expired                                                                                                        231
Hospice / home                                                                                                 112
Left AMA                                                                                                       106
Hospice / medical facility                           

In [30]:
## making subcategories for the discharge disposition
discharge_categories = {
    #Discharged to Home
    'Discharged to home': 'Discharged to Home',
    'Discharged/transferred to home with home health service': 'Discharged to Home',
    'Discharged/transferred to home under care of Home IV provider': 'Discharged to Home',
    
    #Transfers to Other Healthcare Facilities
    'Discharged/transferred to SNF': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another short term hospital': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another rehab fac including rehab units of a hospital .': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to another type of inpatient care institution': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to ICF': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a long term care hospital.': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a federal health care facility.': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred within this institution to Medicare approved swing bed': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.': 'Transfers to Other Healthcare Facilities',
    'Neonate discharged to another hospital for neonatal aftercare': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred to this institution for outpatient services': 'Transfers to Other Healthcare Facilities',
    'Discharged/transferred/referred another institution for outpatient services': 'Transfers to Other Healthcare Facilities',

    #Expired
    'Expired': 'Expired',
    'Expired at home. Medicaid only, hospice.': 'Expired',
    'Expired in a medical facility. Medicaid only, hospice.': 'Expired',
    
    #Hospice Care
    'Hospice / home': 'Hospice Care',
    'Hospice / medical facility': 'Hospice Care',
    
    #AMA (Against Medical Advice)
    'Left AMA': 'AMA (Against Medical Advice)',
    
    #Other
    'Not Mapped': 'Other',
    'NaN':'Other',
    'Still patient or expected to return for outpatient services': 'Other',
    'Admitted as an inpatient to this hospital': 'Other',
    
    np.nan: 'Other'
}

 

In [31]:
# Expired type not in target
pd.crosstab(data['discharge_desposition_desc'].map(discharge_categories), data['readmitted'])


readmitted,NO,YES
discharge_desposition_desc,Unnamed: 1_level_1,Unnamed: 2_level_1
AMA (Against Medical Advice),57,49
Discharged to Home,7866,4779
Expired,232,0
Hospice Care,184,24
Other,6,2
Transfers to Other Healthcare Facilities,1981,1578


In [32]:
data['discharge_category'] = data['discharge_desposition_desc'].map(discharge_categories).fillna('Other')
data['discharge_category'].value_counts(dropna=False)


discharge_category
Discharged to Home                          12645
Transfers to Other Healthcare Facilities     3559
Expired                                       232
Hospice Care                                  208
AMA (Against Medical Advice)                  106
Other                                          16
Name: count, dtype: int64

In [33]:
# admission_source_description

data['admission_source_des'].value_counts(dropna=False)

admission_source_des
Emergency Room                                               11205
Physician Referral                                            4663
Transfer from a hospital                                       335
NULL                                                           267
Transfer from a Skilled Nursing Facility (SNF)                 144
Transfer from another health care facility                     107
Clinic Referral                                                 14
Transfer from hospital inpt/same fac reslt in a sep claim       10
Court/Law Enforcement                                            6
Not Available                                                    6
HMO Referral                                                     5
Transfer from Ambulatory Surgery Center                          2
Normal Delivery                                                  1
Sick Baby                                                        1
Name: count, dtype: int64

In [34]:
# ## making subcategories for the admission source 

admission_source_categories = {
    'Emergency Room': 'Emergency Admission',
    'Court/Law Enforcement': 'Emergency Admission',
    
    'Physician Referral': 'Physician Referral',
    'HMO Referral': 'Physician Referral',
    'Clinic Referral': 'Physician Referral',
    
    'Transfer from a hospital': 'Transfers from Other Facilities',
    'Transfer from another health care facility': 'Transfers from Other Facilities',
    'Transfer from a Skilled Nursing Facility (SNF)': 'Transfers from Other Facilities',
    'Transfer from hospital inpt/same fac reslt in a sep claim': 'Transfers from Other Facilities',
    'Transfer from critial access hospital': 'Transfers from Other Facilities',
    'Transfer from Ambulatory Surgery Center': 'Transfers from Other Facilities',
    
    'Extramural Birth': 'Other',
    'Normal Delivery': 'Other',
    'Sick Baby': 'Other',
    
    np.nan: 'Other',
    
    'Not Mapped': 'Other',
    'Not Available': 'Other'
}

In [35]:
data['admission_category'] = data['admission_source_des'].map(admission_source_categories).fillna('Other')
data['admission_category'].value_counts(dropna=False)

admission_category
Emergency Admission                11211
Physician Referral                  4682
Transfers from Other Facilities      598
Other                                275
Name: count, dtype: int64

In [36]:
pd.crosstab(data['admission_category'], data['readmitted'])


readmitted,NO,YES
admission_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Emergency Admission,6595,4616
Other,188,87
Physician Referral,3129,1553
Transfers from Other Facilities,419,179


In [37]:
data['medical_specialty'].value_counts(dropna=False)

medical_specialty
?                                    11326
Emergency/Trauma                      1673
InternalMedicine                      1156
Radiologist                            524
Cardiology                             487
Family/GeneralPractice                 415
Surgery-General                        368
Orthopedics                            209
Nephrology                              72
Pulmonology                             48
Gastroenterology                        46
Urology                                 35
Orthopedics-Reconstructive              34
ObstetricsandGynecology                 31
Psychiatry                              30
Hematology                              28
Surgery-Vascular                        24
Oncology                                23
Surgery-Cardiovascular/Thoracic         23
Neurology                               20
Gynecology                              19
SurgicalSpecialty                       19
Podiatry                            

In [38]:
data['medical_specialty'].nunique()

43

In [39]:
speciality_mapping = {
    "Internal Medicine": [
        "Internal Medicine",
        "Hematology",
        "Infectious Diseases",
        "Nephrology",
        "Neurology",
        "Rheumatology",
        "Endocrinology",
        "Allergy and Immunology",
        "Psychiatry-Addictive",
        "Psychiatry-Child/Adolescent",
        "Psychiatry"
    ],
    "Cardiology": [
        "Cardiology",
        "Cardiology-Pediatric",
        "Surgery-Cardiovascular",
        "Surgery-Cardiovascular/Thoracic"
    ],
    "Surgery": [
        "Surgery-General",
        "Surgery-Neuro",
        "Surgery-Plastic",
        "Surgery-PlasticwithinHeadandNeck",
        "Surgery-Thoracic",
        "Surgery-Pediatric",
        "Surgery-Maxillofacial",
        "Surgery-Vascular",
        "SurgicalSpecialty",
        "Surgery-Colon&Rectal"
    ],
    "Family/General Practice": [
        "Family/General Practice",
        "Family/general practice",
        "Hospitalist",
        "PhysicianNotFound",
        "Resident",
        "DCPTEAM"
    ],
    "Other": [
        "Missing or unknown"
    ],
    "Other": [
        "Pediatrics-Endocrinology",
        "Gastroenterology",
        "Orthopedics",
        "Orthopedics-Reconstructive",
        "Emergency/Trauma",
        "Pulmonology",
        "Obstetrics and Gynecology",
        "Obstetrics and Gynecology-Gynecologic Onco",
        "Pediatrics-Critical Care",
        "Pediatrics-Pulmonology",
        "Anesthesiology-Pediatric",
        "Radiology",
        "Psychology",
        "Podiatry",
        "Gynecology",
        "Oncology",
        "Pediatrics-Neurology",
        "Ophthalmology",
        "Pediatrics-Emergency Medicine",
        "Physical Medicine and Rehabilitation",
        "Otolaryngology",
        "Pathology",
        "Dermatology",
        "Sports Medicine",
        "Speech",
        "Perinatology",
        "Neurophysiology",
        "Endocrinology-Metabolism",
        "Dentistry",
        "Osteopath",
        "Proctology",
        "Radiologist",
        "Surgeon"
    ]
}

In [40]:
# new column with 'Other' by default
data['Specialty_Group'] = 'Other'
 
# Handling missing Medical_Specialty
data['medical_specialty'] = data['medical_specialty'].replace('?', 'Missing or unknown')

In [41]:
# replacing with specialty mapping
for category, specialties in speciality_mapping.items():
        data.loc[data['medical_specialty'].isin(specialties), 'Specialty_Group'] = category

In [42]:
data['Specialty_Group'].value_counts(dropna=False)

Specialty_Group
Other                      15634
Cardiology                   516
Surgery                      435
Internal Medicine            155
Family/General Practice       26
Name: count, dtype: int64

In [43]:
# deleting original column as no longer needed
del data['medical_specialty']

In [44]:
# Deleting columns examide and citoglipton as both columns has only "No" entry
print(data['citoglipton'].value_counts())
print(data['examide'].value_counts())

data.drop(columns={'examide', 'citoglipton'}, inplace=True)

citoglipton
No    16766
Name: count, dtype: int64
examide
No    16766
Name: count, dtype: int64


In [45]:
data.shape

(16766, 50)

In [46]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,time_in_hospital,payer_code,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,admission_source_des,admission_type_desc,discharge_desposition_desc,discharge_category,admission_category,Specialty_Group
0,268763496,89048466,AfricanAmerican,Female,[50-60),?,3,DM,14,0,12,1,0,1,402,496,413,9,No,Norm,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,YES,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other
1,268777020,50550156,Caucasian,Male,[50-60),?,4,PO,37,1,20,0,0,0,327,780,493,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other
2,268780680,67522518,Caucasian,Female,[80-90),?,4,MC,63,0,29,0,1,1,428,486,585,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency Room,Emergency,Discharged/transferred to SNF,Transfers to Other Healthcare Facilities,Emergency Admission,Other
3,268784670,47595249,AfricanAmerican,Female,[50-60),?,2,MD,72,1,18,0,0,0,38,496,599,9,No,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other
4,268787766,80279316,Caucasian,Female,[60-70),?,1,MC,28,0,13,0,0,0,564,135,428,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other


In [47]:
pd.crosstab(data['payer_code'],data['readmitted'])

readmitted,NO,YES
payer_code,Unnamed: 1_level_1,Unnamed: 2_level_1
?,1550,693
BC,831,326
CH,58,24
CM,395,225
CP,424,169
DM,65,61
FR,1,0
HM,826,707
MC,4285,3135
MD,613,361


In [48]:
def map_payor_code(code):
    if code in ['BC', 'HM', 'SP', 'MD', 'MP']:
        return 'Private Insurance'
    elif code in ['MC', 'MD', 'CH', 'DM', 'CM']:
        return 'Government Programs'
    elif code in ['OG', 'PO', 'SI', 'WC', 'UN', 'OT']:
        return 'Self-Pay/Other'
    else:
        return 'Self-Pay/Other'

In [49]:
# let's applying map_payor_code function to the data
data['payer_code_group'] = data['payer_code'].apply(map_payor_code)
data['payer_code_group'].value_counts()


payer_code_group
Government Programs    8248
Private Insurance      4861
Self-Pay/Other         3657
Name: count, dtype: int64

In [50]:
# deleting original columns as no longer needed
del data['payer_code']

In [51]:
data.shape

(16766, 50)

In [52]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,metformin_rosiglitazone,metformin_pioglitazone,change,diabetesMed,readmitted,admission_source_des,admission_type_desc,discharge_desposition_desc,discharge_category,admission_category,Specialty_Group,payer_code_group
0,268763496,89048466,AfricanAmerican,Female,[50-60),?,3,14,0,12,1,0,1,402,496,413,9,No,Norm,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,YES,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other,Government Programs
1,268777020,50550156,Caucasian,Male,[50-60),?,4,37,1,20,0,0,0,327,780,493,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other,Self-Pay/Other
2,268780680,67522518,Caucasian,Female,[80-90),?,4,63,0,29,0,1,1,428,486,585,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Emergency Room,Emergency,Discharged/transferred to SNF,Transfers to Other Healthcare Facilities,Emergency Admission,Other,Government Programs
3,268784670,47595249,AfricanAmerican,Female,[50-60),?,2,72,1,18,0,0,0,38,496,599,9,No,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other,Private Insurance
4,268787766,80279316,Caucasian,Female,[60-70),?,1,28,0,13,0,0,0,564,135,428,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,Emergency Room,Emergency,Discharged to home,Discharged to Home,Emergency Admission,Other,Government Programs


In [126]:
# data[(data['number_outpatient']>0) & (data['number_inpatient'] > 0)].shape

In [53]:
data.describe()

Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0
mean,341265400.0,82243520.0,4.113981,42.058988,1.364249,16.833174,0.493201,0.296374,0.657342,8.14589
std,54678210.0,42174510.0,2.80844,21.621008,1.795453,8.071631,1.468929,1.331495,1.32016,1.610011
min,268763500.0,40104.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,287676800.0,42564930.0,2.0,29.0,0.0,11.0,0.0,0.0,0.0,8.0
50%,334883800.0,84725850.0,3.0,44.0,1.0,16.0,0.0,0.0,0.0,9.0
75%,390234200.0,105719800.0,5.0,58.0,2.0,21.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,14.0,126.0,6.0,68.0,40.0,64.0,16.0,16.0


In [128]:
# data[data['number_emergency']>70].shape

## Outlier Treatment

In [54]:
# def treat_outliers(df, columns):
   
#     for column in columns:
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR
#         # Cap outliers
#         df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
#         df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
#     return df

In [55]:
# data = treat_outliers(data, ['num_lab_procedures','num_medications'])
# data.describe()

In [56]:
# ## Let's see distrbution of numerical variables
# num_cols = data.select_dtypes(include=['int', 'float']).columns
# data[num_cols[2:]].hist(figsize=(16, 14))
# plt.show()

In [57]:
# Log Transformation to treat extreme values in this columns
data['number_outpatient_log'] = np.log1p(data['number_outpatient'])
data['number_inpatient_log'] = np.log1p(data['number_inpatient'])
data['number_emergency_log'] = np.log1p(data['number_emergency'])


In [133]:
# ## Let's see distrbution of numerical variables
# num_cols = data.select_dtypes(include=['int', 'float']).columns
# data[num_cols[2:]].hist(figsize=(16, 14))
# plt.show()

In [58]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'metformin_rosiglitazone',
       'metformin_pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_source_des', 'admission_type_desc',
       'discharge_desposition_desc', 'discharge_category',
       'admission_category', 'Specialty_Group', 'payer_code_group',
       'number_outpatient_log', 'number_inpatient_log',
  

In [59]:
data.drop(columns={'number_outpatient', 'number_inpatient', 'number_emergency'}, inplace=True)

In [60]:
data.describe()

Unnamed: 0,encounter_id,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,number_outpatient_log,number_inpatient_log,number_emergency_log
count,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0,16766.0
mean,341265400.0,82243520.0,4.113981,42.058988,1.364249,16.833174,8.14589,0.225775,0.33374,0.145335
std,54678210.0,42174510.0,2.80844,21.621008,1.795453,8.071631,1.610011,0.486705,0.518125,0.376071
min,268763500.0,40104.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,287676800.0,42564930.0,2.0,29.0,0.0,11.0,8.0,0.0,0.0,0.0
50%,334883800.0,84725850.0,3.0,44.0,1.0,16.0,9.0,0.0,0.0,0.0
75%,390234200.0,105719800.0,5.0,58.0,2.0,21.0,9.0,0.0,0.693147,0.0
max,443867200.0,189502600.0,14.0,126.0,6.0,68.0,16.0,3.713572,2.833213,4.174387


In [61]:
data['weight'].value_counts()

weight
?            16301
[75-100)       183
[50-75)        116
[100-125)      107
[125-150)       30
[25-50)         15
[150-175)        8
[175-200)        3
[0-25)           2
>200             1
Name: count, dtype: int64

In [62]:
# Since >90% data has null values will drop this column
del data['weight']

# Data Preprocessing

In [63]:
# Checking for unique values in categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for col in categorical_columns:
    print(f"{col}: {data[col].unique()}")


39
race: ['AfricanAmerican' 'Caucasian' 'Other' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['[50-60)' '[80-90)' '[60-70)' '[70-80)' '[90-100)' '[40-50)' '[30-40)'
 '[20-30)' '[10-20)' '[0-10)']
diag_1: ['402' '327' '428' '38' '564' '802' '486' '276' '414' '576' '616' '427'
 '250' '789' '401' '682' '493' '197' '410' '491' '250.8' '152' '250.81'
 '715' '785' '998' '250.6' '189' '440' '153' '238' '786' '560' '250.82'
 '162' '569' '996' '724' '574' '780' '528' '577' '729' '345' '465' '239'
 '530' '250.11' '536' '250.2' '458' '455' '250.02' '625' '644' '515' '562'
 '211' '850' '278' '252' '250.4' '535' '158' '730' '646' '568' '575' '592'
 '721' '511' '415' '296' '250.33' '250.7' '531' '620' '433' '642' '466'
 '805' 'V57' '507' '461' '426' '337' '451' '707' '435' '348' '443' '726'
 '8' '599' '424' '618' '552' '572' '157' '280' '338' '198' '285' '453'
 '253' '434' '518' '566' '403' '820' '340' '250.13' '590' '959' '813'
 '537' '202' '295' '933' '274' '482' '595' '578' '977' '250.1' '2

In [64]:
#  dropping 'discharge_desposition_desc','admission_source_des' as we have mapped those with different categories and metformin_rosiglitazone, metformin_pioglitazone as it has only one category no use in our model
data.drop(columns={'metformin_rosiglitazone','metformin_pioglitazone'
                   ,'discharge_desposition_desc','admission_source_des' }, inplace=True)

In [65]:
data['discharge_category'].value_counts()

discharge_category
Discharged to Home                          12645
Transfers to Other Healthcare Facilities     3559
Expired                                       232
Hospice Care                                  208
AMA (Against Medical Advice)                  106
Other                                          16
Name: count, dtype: int64

In [66]:
## As Expired and hospice care patients not going to readmit so will drop those records from our data
data = data[~data['discharge_category'].isin(['Expired', 'Hospice Care'])]

In [67]:
data.shape

(16326, 45)

In [68]:
# Check for unique values in categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print(len(categorical_columns))
for col in categorical_columns:
    print(f"{col}: {data[col].unique()}")


35
race: ['AfricanAmerican' 'Caucasian' 'Other' 'Asian' 'Hispanic']
gender: ['Female' 'Male']
age: ['[50-60)' '[80-90)' '[60-70)' '[70-80)' '[90-100)' '[40-50)' '[30-40)'
 '[20-30)' '[10-20)' '[0-10)']
diag_1: ['402' '327' '428' '38' '564' '802' '486' '276' '414' '576' '616' '427'
 '250' '789' '401' '682' '493' '197' '410' '491' '250.8' '152' '250.81'
 '715' '785' '998' '250.6' '189' '440' '153' '238' '786' '560' '250.82'
 '162' '569' '996' '724' '574' '780' '528' '577' '729' '465' '239' '530'
 '250.11' '536' '250.2' '458' '455' '250.02' '625' '644' '515' '562' '211'
 '850' '278' '252' '250.4' '535' '730' '646' '568' '575' '592' '721' '511'
 '415' '296' '250.33' '250.7' '531' '620' '433' '642' '466' '805' 'V57'
 '507' '461' '426' '337' '451' '707' '435' '348' '443' '726' '8' '599'
 '424' '618' '552' '572' '280' '338' '198' '345' '285' '453' '253' '434'
 '518' '566' '403' '820' '340' '250.13' '590' '959' '813' '537' '202'
 '295' '933' '274' '482' '595' '578' '977' '250.1' '298' '188' '8

In [69]:
# Function to clean the age column using regex
def clean_age_column(age_series):
    return age_series.str.replace(r'[\[\]() ]', '', regex=True)

# Cleaning the age column
data['age'] = clean_age_column(data['age'])
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,Specialty_Group,payer_code_group,number_outpatient_log,number_inpatient_log,number_emergency_log
0,268763496,89048466,AfricanAmerican,Female,50-60,3,14,0,12,402,496,413,9,No,Norm,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,Ch,Yes,YES,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.693147,0.693147,0.0
1,268777020,50550156,Caucasian,Male,50-60,4,37,1,20,327,780,493,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency,Discharged to Home,Emergency Admission,Other,Self-Pay/Other,0.0,0.0,0.0
2,268780680,67522518,Caucasian,Female,80-90,4,63,0,29,428,486,585,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,Ch,Yes,NO,Emergency,Transfers to Other Healthcare Facilities,Emergency Admission,Other,Government Programs,0.0,0.693147,0.693147
3,268784670,47595249,AfricanAmerican,Female,50-60,2,72,1,18,38,496,599,9,No,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,Emergency,Discharged to Home,Emergency Admission,Other,Private Insurance,0.0,0.0,0.0
4,268787766,80279316,Caucasian,Female,60-70,1,28,0,13,564,135,428,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Yes,NO,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.0,0.0,0.0


In [70]:
# Convert the target variable to binary (0 for NO, 1 for YES)
data['readmitted'] = data['readmitted'].map({'NO': 0, 'YES': 1})

In [71]:
data['metformin'].value_counts()

metformin
No        12732
Steady     3341
Up          158
Down         95
Name: count, dtype: int64

In [72]:
data['discharge_category'].value_counts()

discharge_category
Discharged to Home                          12645
Transfers to Other Healthcare Facilities     3559
AMA (Against Medical Advice)                  106
Other                                          16
Name: count, dtype: int64

In [73]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide_metformin,glipizide_metformin,glimepiride_pioglitazone,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,Specialty_Group,payer_code_group,number_outpatient_log,number_inpatient_log,number_emergency_log
0,268763496,89048466,AfricanAmerican,Female,50-60,3,14,0,12,402,496,413,9,No,Norm,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,Ch,Yes,1,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.693147,0.693147,0.0
1,268777020,50550156,Caucasian,Male,50-60,4,37,1,20,327,780,493,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,Emergency,Discharged to Home,Emergency Admission,Other,Self-Pay/Other,0.0,0.0,0.0
2,268780680,67522518,Caucasian,Female,80-90,4,63,0,29,428,486,585,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,Ch,Yes,0,Emergency,Transfers to Other Healthcare Facilities,Emergency Admission,Other,Government Programs,0.0,0.693147,0.693147
3,268784670,47595249,AfricanAmerican,Female,50-60,2,72,1,18,38,496,599,9,No,Norm,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,0,Emergency,Discharged to Home,Emergency Admission,Other,Private Insurance,0.0,0.0,0.0
4,268787766,80279316,Caucasian,Female,60-70,1,28,0,13,564,135,428,9,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Yes,0,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.0,0.0,0.0


In [77]:
# data.to_csv('../data/interim/data_for_eda.csv')

In [74]:
# function to map ICD-9 codes to categories
def map_icd_category(code):
    if pd.isna(code):
        return 'Other'
   
    code_str = str(code).split('.')[0]  
   
    # Handling non-numeric ICD-9 codes starting with 'E' or 'V'
    if code_str.startswith('E') or code_str.startswith('V'):
        return 'Other'
   
    try:
        code = int(code_str) 
    except ValueError:
        return 'Other'  
   
    # Map numeric ICD-9 codes to categories
    if 390 <= code <= 459 or code == 785:
        return 'Circulatory'
    elif 460 <= code <= 519 or code == 786:
        return 'Respiratory'
    elif 520 <= code <= 579 or code == 787:
        return 'Digestive'
    elif code == 250:
        return 'Diabetes'
    elif 800 <= code <= 999:
        return 'Injury'
    elif 710 <= code <= 739:
        return 'Musculoskeletal'
    elif 580 <= code <= 629 or code == 788:
        return 'Genitourinary'
    elif 140 <= code <= 239:
        return 'Neoplasms'
    elif (780 <= code <= 799) or (240 <= code <= 279 and code != 250) or \
         (680 <= code <= 709) or (1 <= code <= 139) or (290 <= code <= 319) or \
         (code in range(280, 290)) or (code in range(320, 360)) or \
         (code in range(360, 390)) or (code in range(740, 760)):
        return 'Other'
    else:
        return 'Other'
 
# Apply mapping function to diag_3 only because in diag_3 mazimum no. of patients were diagnose with diabetes out of 1,2,3 diag, also highest no. of readmission found in diag_3
data['diag_3_cat'] = data['diag_3'].apply(map_icd_category)
 

In [75]:
data['diag_3_cat'].value_counts()

diag_3_cat
Other              5052
Circulatory        4934
Diabetes           2348
Genitourinary      1102
Respiratory        1037
Digestive           757
Injury              413
Musculoskeletal     373
Neoplasms           310
Name: count, dtype: int64

In [76]:
## Dropping other diagnosis columns as no longer neeeded
data.drop(columns={'diag_1', 'diag_2', 'diag_3'}, inplace=True)

In [77]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide_metformin', 'glipizide_metformin',
       'glimepiride_pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'admission_type_desc', 'discharge_category', 'admission_category',
       'Specialty_Group', 'payer_code_group', 'number_outpatient_log',
       'number_inpatient_log', 'number_emergency_log', 'diag_3_cat'],
      dtype='object')

In [78]:
# Handle medication changes
med_change_mapping = {
    'No': 0,
    'Steady': 1,
    'Up': 2,
    'Down': -1  
}
 
med_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
            'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
            'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide_metformin',
            'glipizide_metformin', 'glimepiride_pioglitazone']
 
for col in med_cols:
    if col in data.columns:
        data[col] = data[col].map(med_change_mapping).fillna(0)  

In [79]:
# med_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
#             'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
#             'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide_metformin',
#             'glipizide_metformin', 'glimepiride_pioglitazone']

In [80]:
#will go ahead with only metmorfin, insulin as in EDA get cleared those 2 drugs inclined towards readmission of patients
#Metformin: First-line therapy for Type 2 diabetes, widely prescribed due to its effectiveness in lowering blood glucose and improving insulin sensitivity.
#Insulin: Essential for patients with advanced Type 2 diabetes or Type 1 diabetes. It's critical to understand how insulin usage correlates with patient outcomes since improper use or mismanagement can lead to readmissions due to either hyperglycemia or hypoglycemia.

data.drop(columns={'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
            'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
            'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'glyburide_metformin',
            'glipizide_metformin', 'glimepiride_pioglitazone'}, inplace=True)

In [81]:
# med_cols

In [82]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'insulin', 'change', 'diabetesMed', 'readmitted',
       'admission_type_desc', 'discharge_category', 'admission_category',
       'Specialty_Group', 'payer_code_group', 'number_outpatient_log',
       'number_inpatient_log', 'number_emergency_log', 'diag_3_cat'],
      dtype='object')

In [83]:
# Age processing
data['age'].unique()

array(['50-60', '80-90', '60-70', '70-80', '90-100', '40-50', '30-40',
       '20-30', '10-20', '0-10'], dtype=object)

In [84]:
# to make age column numerical will insert random number between that range
#mapping
age_ranges = {
    '0-10': (0, 10),
    '10-20': (10, 20),
    '20-30': (20, 30),
    '30-40': (30, 40),
    '40-50': (40, 50),
    '50-60': (50, 60),
    '60-70': (60, 70),
    '70-80': (70, 80),
    '80-90': (80, 90),
    '90-100': (90, 100)
}
# Function to generate a random number within the age range
def patient_age_treatment(age_group):
    min_age, max_age = age_ranges[age_group]
    return np.random.randint(min_age, max_age + 1)  # +1 to include max_age


In [85]:
data['Patient_Age'] = data['age'].apply(patient_age_treatment)

In [86]:
del data['age']

In [87]:
len(data.columns)

26

In [88]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'insulin', 'change', 'diabetesMed', 'readmitted', 'admission_type_desc',
       'discharge_category', 'admission_category', 'Specialty_Group',
       'payer_code_group', 'number_outpatient_log', 'number_inpatient_log',
       'number_emergency_log', 'diag_3_cat', 'Patient_Age'],
      dtype='object')

In [89]:
data.select_dtypes(include='object').columns

Index(['race', 'gender', 'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed',
       'admission_type_desc', 'discharge_category', 'admission_category',
       'Specialty_Group', 'payer_code_group', 'diag_3_cat'],
      dtype='object')

In [90]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,max_glu_serum,A1Cresult,metformin,insulin,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,Specialty_Group,payer_code_group,number_outpatient_log,number_inpatient_log,number_emergency_log,diag_3_cat,Patient_Age
0,268763496,89048466,AfricanAmerican,Female,3,14,0,12,9,No,Norm,1,1,Ch,Yes,1,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.693147,0.693147,0.0,Circulatory,52
1,268777020,50550156,Caucasian,Male,4,37,1,20,9,No,No,0,0,No,No,0,Emergency,Discharged to Home,Emergency Admission,Other,Self-Pay/Other,0.0,0.0,0.0,Respiratory,54
2,268780680,67522518,Caucasian,Female,4,63,0,29,9,No,No,0,2,Ch,Yes,0,Emergency,Transfers to Other Healthcare Facilities,Emergency Admission,Other,Government Programs,0.0,0.693147,0.693147,Genitourinary,88
3,268784670,47595249,AfricanAmerican,Female,2,72,1,18,9,No,Norm,0,0,No,No,0,Emergency,Discharged to Home,Emergency Admission,Other,Private Insurance,0.0,0.0,0.0,Genitourinary,50
4,268787766,80279316,Caucasian,Female,1,28,0,13,9,No,No,0,1,No,Yes,0,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.0,0.0,0.0,Circulatory,60


In [91]:
data.to_csv('../data/interim/Final_data_validation.csv')

In [176]:
data.shape

(83017, 26)

In [155]:
# # Calculate the total count of events (YES) and non-events (NO)
# total_events = data['readmitted'].sum()
# total_non_events = data['readmitted'].count() - total_events


In [110]:
# # Calculate WoE and IV
# def calculate_woe_iv(df, feature, target):
#     # Create a dataframe to hold the counts
#     woe_iv = pd.DataFrame()
    
#     # Group by the feature and calculate the count of events and non-events
#     woe_iv['total'] = df.groupby(feature)[target].count()
#     woe_iv['events'] = df.groupby(feature)[target].sum()
#     woe_iv['non_events'] = woe_iv['total'] - woe_iv['events']
    
#     # Calculate event and non-event rates
#     woe_iv['event_rate'] = woe_iv['events'] / total_events
#     woe_iv['non_event_rate'] = woe_iv['non_events'] / total_non_events
    
#     # Calculate WoE
#     woe_iv['woe'] = np.log(woe_iv['event_rate'] / woe_iv['non_event_rate'])
    
#     # Calculate IV
#     woe_iv['iv'] = (woe_iv['event_rate'] - woe_iv['non_event_rate']) * woe_iv['woe']
    
#     # Return the DataFrame with WoE and IV
#     return woe_iv

In [111]:
# # Calculate WoE and IV for the payer_code column
# woe_iv_results = calculate_woe_iv(data, 'payer_code', 'readmitted')

# # Display results
# woe_iv_results


In [112]:
# # Calculate total IV
# total_iv = woe_iv_results['iv'].sum()
# print(f'Total IV for payer_code: {total_iv}')

In [113]:
# # Calculate WoE and IV for the age column
# woe_iv_results_age = calculate_woe_iv(data, 'age', 'readmitted')


In [114]:
# woe_iv_results_age

In [115]:
# # Create a mapping dictionary from payer_code to WoE values
# woe_mapping = woe_iv_results_age['woe'].to_dict()

# # Replace the original payer_code values with their corresponding WoE values
# data['age_woe'] = data['age'].map(woe_mapping)

# # Check the updated DataFrame
# data[['age', 'age_woe']].head()

In [116]:
# woe_iv_results_discharge_category = calculate_woe_iv(data, 'discharge_category', 'readmitted')
# woe_iv_results_discharge_category

In [169]:
data.shape


(85000, 39)