All imports 


In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [82]:
# Load the raw encounters file. read_csv already returns a brand-new DataFrame,
# so no extra copy is needed before cleaning.
# Note: pandas' default NA parsing reads the literal string "None" as NaN, which
# is why max_glu_serum and A1Cresult show missing values right after loading.
diabetes_data = pd.read_csv("diabetic_data.csv")


Get all the column names from the dataframe

In [83]:
diabetes_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

Before separating the target variable from the dataset, check every column for missing values.


In [84]:
pd.isna(diabetes_data).sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [85]:
diabetes_data["A1Cresult"].unique()

array([nan, '>7', '>8', 'Norm'], dtype=object)

Pandas is parsing the string "None" as NaN, which is not what I want. According to the data description, "None" means the test was "not measured", so the diabetic dataset does not actually contain missing values in this column.
Therefore, I am converting those NaN values into "not measured" for every row with the code below.

In [86]:
# NaN here stands for the raw "None" label; make it an explicit category.
diabetes_data["A1Cresult"] = diabetes_data["A1Cresult"].fillna("not measured")

In [87]:
diabetes_data["A1Cresult"].isna().sum()

0

Similarly, max_glu_serum does not have any truly missing values; pandas is reading "None" as NaN.
I am replacing the NaN values with "not measured".

In [88]:
# NaN here stands for the raw "None" label; make it an explicit category.
diabetes_data["max_glu_serum"] = diabetes_data["max_glu_serum"].fillna("not measured")

In [89]:
diabetes_data["max_glu_serum"].isna().sum()

0

Now verifying the unique values for max_glu_serum and A1Cresult.

In [90]:
diabetes_data["max_glu_serum"].unique()

array(['not measured', '>300', 'Norm', '>200'], dtype=object)

In [91]:
diabetes_data["A1Cresult"].unique()

array(['not measured', '>7', '>8', 'Norm'], dtype=object)

In [92]:
diabetes_data["weight"].unique()


array(['?', '[75-100)', '[50-75)', '[0-25)', '[100-125)', '[25-50)',
       '[125-150)', '[175-200)', '[150-175)', '>200'], dtype=object)

In [93]:
# The "?" symbol marks a missing weight. Turn it into NaN so that isna() can
# count how many weights are missing in total.
diabetes_data["weight"] = diabetes_data["weight"].mask(diabetes_data["weight"].eq("?"))


In [94]:
diabetes_data.isna().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [95]:
# Drop the weight column: 98569 of its values are missing.
# NOTE(review): this was quoted as 91% of the data — confirm the exact share
# with diabetes_data["weight"].isna().mean() before relying on that figure.
diabetes_data = diabetes_data.drop(columns=["weight"])

In [96]:
diabetes_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [97]:
# Similarly, drop the encounter ID, patient number, medical specialty and
# payer code columns, then list the columns that remain.
diabetes_data = diabetes_data.drop(
    columns=["encounter_id", "patient_nbr", "medical_specialty", "payer_code"]
)
diabetes_data.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

Now is the time to do some data exploration. I will analyze the data one-by-one for all the features starting with race.

In [98]:
diabetes_data["race"].unique()

array(['Caucasian', 'AfricanAmerican', '?', 'Other', 'Asian', 'Hispanic'],
      dtype=object)

In [99]:
row_count = diabetes_data["race"].str.contains("?", na=False, regex=False).sum()
print(row_count)

2273


In [None]:
# Remove every row whose race is recorded as the "?" placeholder.
# regex=False matches the character literally (same form as the count above);
# the earlier "\?" pattern was an invalid escape sequence in a normal string,
# which newer Python versions warn about.
diabetes_data = diabetes_data[~diabetes_data["race"].str.contains("?", na=False, regex=False)]
diabetes_data

In [None]:
diabetes_data["gender"].unique()

In [None]:
row_count = diabetes_data["gender"].str.contains("Unknown/Invalid").sum()
print(row_count)

In [None]:
#removing rows for columns "gender" where value is "Unknown/Invalid"
diabetes_data = diabetes_data[~diabetes_data["gender"].str.contains("Unknown/Invalid", na=False)]
diabetes_data

In [None]:
diabetes_data["age"].unique()

In [None]:
diabetes_data["admission_type_id"].unique()

In [None]:
# Keep a handle on the target column and show its distinct labels.
target_var = diabetes_data["readmitted"]
target_var.unique()

In [None]:
diabetes_data["discharge_disposition_id"].unique()

In [None]:
diabetes_data["admission_source_id"].unique()

In [None]:
diabetes_data["time_in_hospital"].unique()

In [None]:
diabetes_data["num_lab_procedures"].unique()

In [None]:
diabetes_data["num_procedures"].unique()

In [None]:
diabetes_data["num_medications"].unique()

In [None]:
diabetes_data["number_outpatient"].unique()

In [None]:
diabetes_data["number_emergency"].unique()

In [None]:
diabetes_data["number_inpatient"].unique()

In [None]:
diabetes_data["diag_1"].unique()

In [None]:
diabetes_data["diag_2"].unique()

In [None]:
diabetes_data["diag_3"].unique()

In [None]:
diabetes_data["number_diagnoses"].unique()

In [None]:
diabetes_data["max_glu_serum"].unique()

In [None]:
diabetes_data["A1Cresult"].unique()

In [None]:
diabetes_data["metformin"].unique()

In [None]:
diabetes_data["change"].unique()

In [None]:
diabetes_data["diabetesMed"].unique()

In [None]:
diabetes_data["readmitted"].unique()

Now that we have removed all null/redundant values, we will start doing some real exploratory data analysis.

In [None]:
diabetes_data.columns

In [None]:
# Box plot of how many days each encounter spent in hospital.
box_plot_time_hospital = diabetes_data["time_in_hospital"].plot.box(grid=True)
box_plot_time_hospital.set(xlabel="time spent in hospital", ylabel="days")
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()


In [None]:
diabetes_data["time_in_hospital"].value_counts()

In [None]:
diabetes_data[["time_in_hospital", "num_lab_procedures", "num_medications", "number_emergency", "number_diagnoses",
             "number_outpatient", "number_inpatient"]].describe()

Box plot on num_lab_procedures


In [None]:
# Box plot of the number of lab procedures per encounter.
# The axes handle is named after what it actually shows (it previously reused
# the time-in-hospital name), and the x-axis label typo is corrected.
box_plot_lab_procedures = diabetes_data["num_lab_procedures"].plot(kind="box", grid=True)
box_plot_lab_procedures.set_xlabel("Total lab procedures")
box_plot_lab_procedures.set_ylabel("Count")
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

In [None]:
# Encounters with more than 50 lab procedures, counted per length of stay.
# A notebook cell only renders its final expression, so this intermediate table
# must be shown explicitly with display() (provided by the notebook kernel).
display(
    diabetes_data[diabetes_data["num_lab_procedures"] > 50][["time_in_hospital"]]
    .value_counts()
    .sort_index()
)

# Show every row of long tables from here on instead of pandas' truncated view.
pd.set_option('display.max_rows', None)

# Number of encounters for each (time_in_hospital, num_lab_procedures) pair.
# Both columns are grouping keys, so size() is the direct way to count the rows
# in each group.
diabetes_data.groupby(["time_in_hospital", "num_lab_procedures"]).size()

In [None]:
scatter_lab_stay = diabetes_data.plot(kind="scatter", x = "time_in_hospital",y = "num_lab_procedures", grid=True, alpha=0.2)
plt.show()

Looking at the scatter plot of time in hospital against the total number of lab procedures, it appears that lab procedures tend to increase
as time in hospital increases. It also shows that, for any particular length of stay, the number of lab procedures
varies as well. I have decided to keep the data as it is, because removing outliers may discard
valuable insights.

In [None]:
medication_total = diabetes_data['num_medications']
medication_total.plot(kind="box", grid=True)
plt.show()

In [None]:
medication_total.value_counts().sort_index()

I have decided to keep the outliers in the total medication count. Patients who take a larger
number of medicines could have multiple conditions, so removing them might influence the readmission
prediction.


In [None]:
total_emergency_data = diabetes_data['number_emergency']
total_emergency_data.plot(kind="box", grid=True)
plt.show()

In [None]:
total_emergency_data.value_counts().sort_index()

Again, this is valid data and I am not removing any outliers. The maximum number of emergency visits is 76 and the minimum is 0, which is
completely valid. Removing the outliers might remove exactly those patients who have a high chance of being readmitted.

I want to know the relation between total emergency visits and number of diagnosis.


In [None]:
sns.scatterplot(data=diabetes_data, 
                x="number_diagnoses", 
                y="number_emergency", 
                hue="readmitted",  
                alpha=0.5)
plt.show()

Looking at the scatter plot, the number of emergency visits increases as the number of diagnoses increases up to 9.
Then, starting from 10, it hits a plateau.
This could be due to a number of factors, such as the patient being transferred to another department or
data collection limits.
Another key insight is that the majority of the patients with more than 9 diagnoses are readmitted to the hospital.
Readmission also increases as the number of emergency visits increases (patients who visit the emergency department more often might have poorly controlled or acute conditions).



In [None]:
total_outpatient_visits = diabetes_data['number_outpatient'].plot(kind="box", grid=True)
plt.show()

The box plot suggests that the majority of patients have 0 outpatient visits. The rest of the points in the graph
are flagged as outliers, but removing them might interfere with our prediction of the true readmission rate,
as visiting outpatient care as many as 40 times does not seem abnormal for patients with complications. So I have
decided to keep the data as is.

In [None]:
sns.boxplot(x="readmitted", y="number_outpatient", data=diabetes_data)
plt.show()

Inpatient Total Visits

In [None]:
sns.boxplot(x="readmitted", y="number_inpatient", data=diabetes_data)
plt.show()

The plot above suggests that higher inpatient visit counts are associated with a higher chance of being readmitted within 30 days.

# Now lets take a look at our categorical features


In [None]:
def stacked_bar(feature, title, data=None):
    """Draw a stacked bar chart of readmission outcome shares per feature level.

    Parameters
    ----------
    feature : str
        Name of the column whose levels are placed on the x-axis.
    title : str
        Title shown above the chart.
    data : pandas.DataFrame, optional
        Frame containing ``feature`` and a ``readmitted`` column. When omitted,
        the notebook-level ``diabetes_data`` frame is used, so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook's main frame.
        data = diabetes_data

    # normalize='index' turns each row into proportions, so every bar sums to 1.
    crosstab = pd.crosstab(data[feature], data['readmitted'], normalize='index')
    crosstab.plot(kind='bar', stacked=True)
    plt.title(title)
    plt.xlabel(feature)
    plt.ylabel("Proportion")
    plt.xticks(rotation=45)
    plt.legend(title='Readmitted')
    plt.show()
stacked_bar("race", "Proportion of readmission by Race")

Looking at the stacked bar chart, the readmission rate is similar across all races.
But I want to confirm this using the chi-square test.

In [None]:

from scipy.stats import chi2_contingency

def chisquare_test_Cramers_value(feature, data):
    """Print the chi-square independence test and Cramér's V of a feature against ``readmitted``.

    Parameters
    ----------
    feature : str
        Column of ``data`` that is cross-tabulated against the ``readmitted`` column.
    data : pandas.DataFrame
        Frame containing both ``feature`` and ``readmitted``.

    Returns
    -------
    None
        The statistic, degrees of freedom, p-value and Cramér's V are printed,
        not returned.
    """
    # Observed counts for every (feature level, readmission outcome) pair.
    observed = pd.crosstab(data[feature], data['readmitted'])
    statistic, p_value, dof, _expected = chi2_contingency(observed)

    print(f"Chi-square Statistic for {feature} is {statistic}" )
    print(f"Degrees of Freedom for {feature} is {dof}" )
    print(f"p-value for {feature} is {p_value}")

    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))), n being the total count.
    total_count = observed.to_numpy().sum()
    smaller_dimension = min(observed.shape) - 1
    effect_size = np.sqrt(statistic / (total_count * smaller_dimension))
    print(f"The Cramers Value for {feature} is {effect_size}")

chisquare_test_Cramers_value("race", diabetes_data)

The p-value is extremely small, which suggests that even though the stacked bar chart does not show
a significant relation between race and readmission within 30 days, the chi-square test says otherwise.
But the Cramér's V of 0.02175 is extremely small, which suggests that race is not a strong predictor and does not have practical significance in predicting the readmission of a diabetic patient.




here are the values of discharge_disposition_id one of the categorical features.
description
1 Discharged to home
2 Discharged/transferred to another short term hospital
3 Discharged/transferred to SNF
4 Discharged/transferred to ICF
5 Discharged/transferred to another type of inpatient care institution
6 Discharged/transferred to home with home health service
7 Left AMA
8 Discharged/transferred to home under care of Home IV provider
9 Admitted as an inpatient to this hospital
10 Neonate discharged to another hospital for neonatal aftercare
11 Expired
12 Still patient or expected to return for outpatient services
13 Hospice / home
14 Hospice / medical facility
15 Discharged/transferred within this institution to Medicare approved swing bed
16 Discharged/transferred/referred another institution for outpatient services
17 Discharged/transferred/referred to this institution for outpatient services
18 NULL
19 Expired at home. Medicaid only, hospice.
20 Expired in a medical facility. Medicaid only, hospice.
21 Expired, place unknown. Medicaid only, hospice.
22 Discharged/transferred to another rehab fac including rehab units of a hospital .
23 Discharged/transferred to a long term care hospital.
24 Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.
25 Not Mapped
26 Unknown/Invalid
27 Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere
28 Discharged/transferred to a federal health care facility.
29 Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital
30 Discharged/transferred to a Critical Access Hospital (CAH).

I will convert the "NULL", "Not Mapped", and "Unknown/Invalid" codes (18, 25, and 26) into a single new code, 31.

In [None]:
# Fold the three uninformative codes (18 NULL, 25 Not Mapped, 26 Unknown/Invalid)
# into one new code, 31.
diabetes_data["discharge_disposition_id"] = diabetes_data["discharge_disposition_id"].replace([18, 25, 26], 31)


In [None]:
diabetes_data["discharge_disposition_id"].unique()

Now lets run the chi square and Cramer's V 


In [None]:
chisquare_test_Cramers_value("discharge_disposition_id", diabetes_data)

discharge_disposition_id has an extremely low p-value and a Cramér's V of 0.1325, which is higher than the
Cramér's V obtained for race.
Thus it is a stronger predictor!
This may indicate that the way diabetic patients are discharged carries more clues about readmission!

Now lets check for gender

In [None]:


    
stacked_bar("gender", "Proportion of Readmission by Gender")

Again, looking at the graph, the readmission proportions are similar across genders.
I want to check this using the chi-square test.



In [None]:
chisquare_test_Cramers_value("gender", diabetes_data)

Again, the p-value is very low, which rejects the null hypothesis of no association between gender and readmission.
But the Cramér's V is very small, which tells us that in practice gender might be less significant in
predicting readmission.

In [None]:
chisquare_test_Cramers_value("age", diabetes_data)

With 18 degrees of freedom, a chi-square statistic of about 300, and a very small p-value, age is a strong candidate for
the prediction of readmission.

Since age is in range, I am transforming the range into a single value using the midpoint.


In [None]:
diabetes_data['age'].unique()

In [None]:
# Replace each ten-year bracket label, e.g. "[40-50)", with its midpoint (45).
diabetes_data['age'] = diabetes_data["age"].replace(
    {f"[{start}-{start + 10})": start + 5 for start in range(0, 100, 10)}
)

In [None]:
diabetes_data["age"].unique()

In [None]:
chisquare_test_Cramers_value("age", diabetes_data)

In [None]:
diabetes_data["admission_type_id"].unique()

In [None]:
diabetes_data['admission_type_id'].value_counts()


here 
1: Emergency
2: Urgent
3: Elective
4: Newborn
5: Not Available
6: NULL
7: Trauma Center
8: Not Mapped
    

In [None]:
chisquare_test_Cramers_value("admission_type_id", diabetes_data)

Now I want to group NULL, Not Available, and Not Mapped together as unknown, and redo the chi-square test.

In [None]:

# Merge only the uninformative codes into 5 (Not Available): 6 is NULL and
# 8 is Not Mapped. Code 7 (Trauma Center) is a genuine admission type, so it is
# left unchanged rather than being relabelled as unknown.
diabetes_data['admission_type_id'] = diabetes_data["admission_type_id"].replace({6: 5, 8: 5})


In [None]:
diabetes_data["admission_type_id"].unique()

In [None]:
chisquare_test_Cramers_value("admission_type_id",diabetes_data)

Admission type also has a significant p-value but lacks a meaningful Cramér's V, indicating that in practice this feature might not be significant for predicting readmission based on our data.

# Now lets check in admission source id
The values of admission source id are
1 Physician Referral
2 Clinic Referral
3 HMO Referral
4 Transfer from a hospital
5 Transfer from a Skilled Nursing Facility (SNF)
6 Transfer from another health care facility
7 Emergency Room
8 Court/Law Enforcement
9 Not Available
10 Transfer from critical access hospital
11 Normal Delivery
12 Premature Delivery
13 Sick Baby
14 Extramural Birth
15 Not Available
17 NULL
18 Transfer From Another Home Health Agency
19 Readmission to Same Home Health Agency
20 Not Mapped
21 Unknown/Invalid
22 Transfer from hospital inpt/same fac reslt in a sep claim
23 Born inside this hospital
24 Born outside this hospital
25 Transfer from Ambulatory Surgery Center
26 Transfer from Hospice

The codes that carry no information are 9 and 15 (Not Available), 17 (NULL), 20 (Not Mapped), and 21 (Unknown/Invalid). I will map all of them to a single new code, 27.

In [79]:
# NOTE(review): this recoding is still disabled, so admission_source_id keeps
# its raw codes. The note above this cell also names 21 (Unknown/Invalid), which
# the mapping below omits — TODO confirm and add 21:27 when enabling it.
#diabetes_data['admission_source_id'] = diabetes_data["admission_source_id"].replace({9:27, 15:27, 17:27,20:27})

In [80]:
#diabetes_data["admission_source_id"].unique()

# still need to do one hot encoding for admission types. Will do it later in the pipeline 