In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency


from sklearn.metrics import confusion_matrix, classification_report, plot_precision_recall_curve
from sklearn.metrics import plot_confusion_matrix,roc_curve, roc_auc_score,precision_recall_curve,accuracy_score,f1_score


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, KFold



from sklearn.preprocessing import StandardScaler



from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


from sklearn.feature_selection import RFE,SelectFromModel
from mlxtend.feature_selection import SequentialFeatureSelector as sfs


import warnings
# Notebook-wide configuration.
# Silence every warning raised from here on. NOTE(review): this is a blanket
# filter, so pandas warnings (e.g. SettingWithCopyWarning) and library
# deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')
# Demonstration that the filter is active: this warning is suppressed.
warnings.warn("this will not show")
# Remove pandas' display limits so wide/long frames are shown in full.
# NOTE(review): with max_rows unset, printing a long Series (e.g. value_counts
# of a high-cardinality column) prints every row.
pd.options.display.max_columns = None 
pd.options.display.max_rows = None
# Use seaborn's 'darkgrid' style for all subsequent plots.
sns.set(style='darkgrid')
%matplotlib inline

In [2]:
data = pd.read_csv('diabetic_data.csv')

In [3]:
data.shape

(101766, 50)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [5]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
encounter_id,101766.0,165201600.0,102640300.0,12522.0,84961194.0,152388987.0,230270900.0,443867222.0
patient_nbr,101766.0,54330400.0,38696360.0,135.0,23413221.0,45505143.0,87545950.0,189502619.0
admission_type_id,101766.0,2.024006,1.445403,1.0,1.0,1.0,3.0,8.0
discharge_disposition_id,101766.0,3.715642,5.280166,1.0,1.0,1.0,4.0,28.0
admission_source_id,101766.0,5.754437,4.064081,1.0,1.0,7.0,7.0,25.0
time_in_hospital,101766.0,4.395987,2.985108,1.0,2.0,4.0,6.0,14.0
num_lab_procedures,101766.0,43.09564,19.67436,1.0,31.0,44.0,57.0,132.0
num_procedures,101766.0,1.33973,1.705807,0.0,0.0,1.0,2.0,6.0
num_medications,101766.0,16.02184,8.127566,1.0,10.0,15.0,20.0,81.0
number_outpatient,101766.0,0.3693572,1.267265,0.0,0.0,0.0,0.0,42.0


In [6]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Working With the Numerical Data

In [7]:
# identifying the numerical and categorical features of the dataset

In [8]:
numerical_df = data.select_dtypes(include=np.number)

In [9]:
numerical_df.head()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,2278392,8222157,6,25,1,1,41,0,1,0,0,0,1
1,149190,55629189,1,1,7,3,59,0,18,0,0,0,9
2,64410,86047875,1,1,7,2,11,5,13,2,0,1,6
3,500364,82442376,1,1,7,2,44,1,16,0,0,0,7
4,16680,42519267,1,1,7,1,51,0,8,0,0,0,5


In [10]:
# finding columns that have full unique values.. and removing it..

In [11]:
# A column whose number of distinct values equals the number of rows is a pure
# identifier; collect the names of all such numeric columns.
unique_columns = [
    name for name in numerical_df.columns
    if data[name].nunique() == len(data)
]

In [12]:
unique_columns

['encounter_id']

In [13]:
numerical_df = numerical_df.drop(columns=unique_columns)

In [14]:
numerical_df.head()

Unnamed: 0,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,8222157,6,25,1,1,41,0,1,0,0,0,1
1,55629189,1,1,7,3,59,0,18,0,0,0,9
2,86047875,1,1,7,2,11,5,13,2,0,1,6
3,82442376,1,1,7,2,44,1,16,0,0,0,7
4,42519267,1,1,7,1,51,0,8,0,0,0,5


In [15]:
# finding if there are any missing values in the numerical part of the data..

In [16]:
numerical_df.isnull().sum()

patient_nbr                 0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
number_diagnoses            0
dtype: int64

In [17]:
# by the definition of the dataset 'admission_type_id' 'discharge_disposition_id' and 'admission_source_id'
# are categorical variables..

In [18]:
numerical_df = numerical_df.drop(columns=['admission_type_id','discharge_disposition_id'
                                                                           ,'admission_source_id'])

In [19]:
numerical_df['patient_nbr'].nunique()

71518

In [20]:
numerical_df_with_target = numerical_df.copy(deep=True)

In [21]:
numerical_df_with_target['readmitted'] = data['readmitted']

In [22]:
numerical_df_with_target.head()

Unnamed: 0,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted
0,8222157,1,41,0,1,0,0,0,1,NO
1,55629189,3,59,0,18,0,0,0,9,>30
2,86047875,2,11,5,13,2,0,1,6,NO
3,82442376,2,44,1,16,0,0,0,7,NO
4,42519267,1,51,0,8,0,0,0,5,NO


In [23]:
data['admission_type_id'] = data['admission_type_id'].astype('object')

In [24]:
data['admission_source_id'] = data['admission_source_id'].astype('object')

In [25]:
data['discharge_disposition_id'] = data['discharge_disposition_id'].astype('object')

# Working with the categorical data

In [26]:
# finding the meaningless entities in the categorical part of the data..

In [27]:
categorical_df = data.select_dtypes(exclude=np.number)

In [28]:
categorical_df.head(10)

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),?,6,25,1,?,Pediatrics-Endocrinology,250.83,?,?,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),?,1,1,7,?,?,276.0,250.01,255,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),?,1,1,7,?,?,648.0,250,V27,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),?,1,1,7,?,?,8.0,250.43,403,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),?,1,1,7,?,?,197.0,157,250,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,Caucasian,Male,[50-60),?,2,1,2,?,?,414.0,411,250,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
6,Caucasian,Male,[60-70),?,3,1,2,?,?,414.0,411,V45,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,Caucasian,Male,[70-80),?,1,1,7,?,?,428.0,492,250,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,>30
8,Caucasian,Female,[80-90),?,2,1,4,?,?,398.0,427,38,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,Caucasian,Female,[90-100),?,3,3,4,?,InternalMedicine,434.0,198,486,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [29]:
for col in categorical_df.columns:
    print('========'*2,col.upper(),'========'*2)
    print(categorical_df[col].nunique())
    print('----------------------------')
    print(categorical_df[col].value_counts())
    print('--'*20)

6
----------------------------
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64
----------------------------------------
3
----------------------------
Female             54708
Male               47055
Unknown/Invalid        3
Name: gender, dtype: int64
----------------------------------------
10
----------------------------
[70-80)     26068
[60-70)     22483
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: age, dtype: int64
----------------------------------------
10
----------------------------
?            98569
[75-100)      1336
[50-75)        897
[100-125)      625
[125-150)      145
[25-50)         97
[0-25)          48
[150-175)       35
[175-200)       11
>200             3
Name: weight, dtype: int64
----------------------------------------
8
----------

790
----------------------------
250       11555
401        8289
276        5175
428        4577
427        3955
414        3664
496        2605
403        2357
585        1992
272        1969
599        1941
?          1423
V45        1389
250.02     1369
707        1360
780        1334
285        1200
425        1136
250.6      1080
424        1063
584         963
305         924
250.01      915
682         887
518         854
41          727
493         694
278         680
530         625
786         584
491         574
486         568
244         540
V58         501
250.4       429
411         399
280         398
357         394
785         388
287         388
197         385
511         380
402         375
787         358
788         353
412         346
593         339
V15         334
413         329
438         318
998         316
250.8       316
294         315
799         312
571         310
560         306
295         305
995         299
997         298
789         290
458    

No        101743
Steady        23
Name: tolbutamide, dtype: int64
----------------------------------------
4
----------------------------
No        94438
Steady     6976
Up          234
Down        118
Name: pioglitazone, dtype: int64
----------------------------------------
4
----------------------------
No        95401
Steady     6100
Up          178
Down         87
Name: rosiglitazone, dtype: int64
----------------------------------------
4
----------------------------
No        101458
Steady       295
Up            10
Down           3
Name: acarbose, dtype: int64
----------------------------------------
4
----------------------------
No        101728
Steady        31
Down           5
Up             2
Name: miglitol, dtype: int64
----------------------------------------
2
----------------------------
No        101763
Steady         3
Name: troglitazone, dtype: int64
----------------------------------------
3
----------------------------
No        101727
Steady        38
Up          

# OBSERVATIONS:

In [30]:
# 'admission_type_id' 'discharge_disposition_id' and 'admission_source_id' are numerically encoded.

# '?' appears as a placeholder value in race, weight, payer_code, medical_specialty, diag_1, diag_2 and diag_3.

# gender has 3 invalid/unknown entries.

# diag1,diag2 and diag3 have numerical entries combined with characters..

# max_glu_serum and A1Cresult have 'none' for the majority of entries.

# examide and citoglipton No:101766

# metformin-pioglitazone, glimepiride-pioglitazone, acetohexamide No:101765 steady:1

# metformin-rosiglitazone No:101764 steady:2

# troglitazone No:101763 steady:3

# glipizide-metformin  No:101753 steady:13

# CONCLUSION: 

In [31]:
# Imputing with string values in numerically encoded columns 'discharge_disposition_id' and 'admission_source_id'..

# '?' appears in features like race, weight, payer_code, medical_specialty, diag_1, diag_2 and diag_3.

# It doesn't carry any meaning, hence we can replace it with NaN.

# gender has 3 invalid/unknown entries WHICH CAN BE REPLACED WITH NAN

# examide and citoglipton has No:101766... when no one was administered with these drugs they can be removed from the features..


# FEATURE ENGINEERING

In [32]:
def admission_type_func(value):
    """Translate an ``admission_type_id`` code into a readable label.

    Codes 1, 2, 3, 4 and 7 map to a named admission type; every other value
    is reported as ``'Other'``.
    """
    labels_by_code = {
        1: 'Emergency',
        2: 'Urgent',
        3: 'Elective',
        4: 'Newborn',
        7: 'Trauma Center',
    }
    return labels_by_code.get(value, 'Other')

In [33]:
 def discharge_disposition_func(value):
     """Collapse a ``discharge_disposition_id`` code into a coarse group label.

     Code 11 and anything from 19 through 21 is 'Expired'; the remaining known
     codes are looked up group by group; unlisted values become 'Other'.
     """
     if value == 11 or 19 <= value <= 21:
         return 'Expired'

     grouped_codes = (
         ((7,), 'left AMA'),
         ((9, 12), 'In/Still/Outpatient'),
         ((13, 14), 'Hospice'),
         ((1, 6, 8), 'Discharged Home'),
         ((2, 3, 4, 5, 10, 16, 17, 22, 23, 24, 27, 28, 29, 30), 'Transferred/Referred'),
         ((15,), 'Transferred Within'),
     )
     for codes, label in grouped_codes:
         if value in codes:
             return label

     return 'Other'

In [34]:
def admission_source_func(value):
    """Translate an ``admission_source_id`` code into a readable label.

    Referral and transfer codes are merged into one label each; the remaining
    listed codes keep an individual label. Unlisted values become 'Other'.
    """
    code_groups = (
        ((1, 2, 3), 'Referral'),
        ((4, 5, 6, 10, 18, 22, 25, 26), 'Transfer'),
        ((7,), 'Emergency Room'),
        ((8,), 'Law Enforcement'),
        ((11,), 'Normal Delivery'),
        ((12,), 'Premature Delivery'),
        ((13,), 'Sick Baby'),
        ((14,), 'Extramural Birth'),
        ((19,), 'Readmission'),
        ((23,), 'Born In This Hospital'),
        ((24,), 'Born Elsewhere'),
    )
    # First group containing the code wins; fall back to 'Other'.
    return next((label for codes, label in code_groups if value in codes), 'Other')

In [35]:
def transform_category_func(value):
    """Map a raw ICD-9 diagnosis code to a coarse disease group.

    Parameters
    ----------
    value : str, int, float or missing
        Diagnosis code such as ``'428'``, ``'250.83'`` or ``'V45'``. Numeric
        inputs (e.g. ``428.0``) are accepted as well as strings.

    Returns
    -------
    str or float
        One of 'Diabetes', 'Circulatory', 'Respiratory', 'Digestive',
        'Injury', 'Musculoskeletal', 'Genitourinary', 'Neoplasms', or 'other'
        for every code outside those groups (including non-numeric codes).
        ``np.nan`` is returned unchanged for missing input.
    """
    # pd.isna recognises every NaN object as well as None, not only the
    # np.nan singleton, so missing values never reach the string handling.
    if pd.isna(value):
        return np.nan

    # Split "428.01" into its integer category and optional sub-code.
    whole, _, fraction = str(value).strip().partition('.')
    if not whole.isdecimal() or (fraction and not fraction.isdecimal()):
        # Codes that are not plain numbers (e.g. 'V45') have no group here.
        return 'other'

    # Grouping is decided by the integer part of the code only.
    code = int(whole)
    if code == 250:
        return 'Diabetes'
    if 390 <= code <= 459 or code == 785:
        return 'Circulatory'
    if 460 <= code <= 519 or code == 786:
        return 'Respiratory'
    if 520 <= code <= 579 or code == 787:
        return 'Digestive'
    if 800 <= code <= 999:
        return 'Injury'
    if 710 <= code <= 739:
        return 'Musculoskeletal'
    if 580 <= code <= 629 or code == 788:
        return 'Genitourinary'
    if 140 <= code <= 239:
        return 'Neoplasms'
    return 'other'

In [36]:
def categorize_patient_frequency_func(value):
    """Bucket a visit count into a frequency label.

    Exactly 1 -> '1-time'; above 1 up to 5 -> '2-5 times'; above 5 up to
    10 -> '6-10 times'; anything else -> 'more than 10 times'.
    """
    if value == 1:
        return '1-time'
    if value > 1:
        if value <= 5:
            return '2-5 times'
        if value <= 10:
            return '6-10 times'
    # Fallback for every value not matched above.
    return 'more than 10 times'

In [37]:
categorical_df = categorical_df.replace({'?':np.nan,'Unknown/Invalid':np.nan})

In [38]:
categorical_df.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,payer_code,medical_specialty,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,,Pediatrics-Endocrinology,250.83,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,,,276.0,250.01,255,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,,,648.0,250.0,V27,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,,,8.0,250.43,403,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,,,197.0,157.0,250,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [39]:
categorical_df.shape

(101766, 40)

In [40]:
categorical_df = categorical_df.drop(columns=['examide','citoglipton'])

In [41]:
categorical_df.shape

(101766, 38)

In [42]:
100*(categorical_df.isnull().sum()/len(categorical_df)).sort_values(ascending=False)

weight                      96.858479
medical_specialty           49.082208
payer_code                  39.557416
race                         2.233555
diag_3                       1.398306
diag_2                       0.351787
diag_1                       0.020636
gender                       0.002948
age                          0.000000
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
diabetesMed                  0.000000
max_glu_serum                0.000000
A1Cresult                    0.000000
metformin                    0.000000
repaglinide                  0.000000
nateglinide                  0.000000
chlorpropamide               0.000000
readmitted                   0.000000
acetohexamide                0.000000
glipizide                    0.000000
change                       0.000000
metformin-pioglitazone       0.000000
metformin-rosiglitazone      0.000000
glimepiride-pioglitazone     0.000000
glipizide-me

In [43]:
# the weight of many patients is not provided therefore the column can be dropped..

# the patient's response to the medication and the probability of re-admission don't depend on the payer_code,
# i.e. on whether the patient pays out of pocket or is covered by health insurance or a bank.

# All the doctors irrespective of their speciality can administer the drugs for diabetes..which will not have any effect on 
# the patient's re-admission rate..

In [44]:
categorical_df = categorical_df.drop(columns=['weight','payer_code','medical_specialty'])

In [45]:
categorical_df['admission_type_id'] = categorical_df['admission_type_id'].apply(admission_type_func)

In [46]:
categorical_df['admission_source_id'] = categorical_df['admission_source_id'].apply(admission_source_func)

In [47]:
categorical_df['discharge_disposition_id'] = categorical_df['discharge_disposition_id'].apply(discharge_disposition_func)

In [48]:
categorical_df['diag_1'] = categorical_df['diag_1'].apply(transform_category_func)

In [49]:
categorical_df['diag_2'] = categorical_df['diag_2'].apply(transform_category_func)

In [50]:
categorical_df['diag_3'] = categorical_df['diag_3'].apply(transform_category_func)

In [51]:
categorical_df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),Other,Other,Referral,Diabetes,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),Emergency,Discharged Home,Emergency Room,other,Diabetes,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),Emergency,Discharged Home,Emergency Room,Neoplasms,Neoplasms,Diabetes,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [52]:
df = pd.concat([numerical_df,categorical_df],axis=1)

In [53]:
df.head()

Unnamed: 0,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,8222157,1,41,0,1,0,0,0,1,Caucasian,Female,[0-10),Other,Other,Referral,Diabetes,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,55629189,3,59,0,18,0,0,0,9,Caucasian,Female,[10-20),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,86047875,2,11,5,13,2,0,1,6,AfricanAmerican,Female,[20-30),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,82442376,2,44,1,16,0,0,0,7,Caucasian,Male,[30-40),Emergency,Discharged Home,Emergency Room,other,Diabetes,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,42519267,1,51,0,8,0,0,0,5,Caucasian,Male,[40-50),Emergency,Discharged Home,Emergency Room,Neoplasms,Neoplasms,Diabetes,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [54]:
df.dropna(subset=['gender'], how='all', inplace = True)

In [55]:
df.dropna(subset=['diag_1'], how='all', inplace = True)

In [56]:
df.isnull().sum().sort_values(ascending=False)

race                        2269
diag_3                      1422
diag_2                       357
readmitted                     0
age                            0
A1Cresult                      0
max_glu_serum                  0
diag_1                         0
admission_source_id            0
discharge_disposition_id       0
admission_type_id              0
gender                         0
diabetesMed                    0
number_diagnoses               0
number_inpatient               0
number_emergency               0
number_outpatient              0
num_medications                0
num_procedures                 0
num_lab_procedures             0
time_in_hospital               0
metformin                      0
repaglinide                    0
nateglinide                    0
troglitazone                   0
change                         0
metformin-pioglitazone         0
metformin-rosiglitazone        0
glimepiride-pioglitazone       0
glipizide-metformin            0
glyburide-

In [57]:
# Here we have dropped all the entries where diagnosis_1 have nan values... Since it should have diabetes or diabetes with
# some or the other side effect.. When a patient is not diagnosed with any disease in the first diagnosis then there is no
# meaning in going for the further diagnoses..

In [58]:
# There are certain entries where the second diagnosis is NaN but the third diagnosis records some disorder.
# These are probably wrong entries: once a person has not been diagnosed with any disorder at one step,
# he/she is unlikely to be diagnosed with one at a later step. Hence we drop all those entries.

In [59]:
index_to_be_dropped = df[df['diag_2'].isnull() & df['diag_3'].notnull()].index

In [60]:
df.drop(index_to_be_dropped,inplace=True)

In [61]:
df.isnull().sum().sort_values(ascending=False)

race                        2263
diag_3                      1422
diag_2                       278
readmitted                     0
age                            0
A1Cresult                      0
max_glu_serum                  0
diag_1                         0
admission_source_id            0
discharge_disposition_id       0
admission_type_id              0
gender                         0
diabetesMed                    0
number_diagnoses               0
number_inpatient               0
number_emergency               0
number_outpatient              0
num_medications                0
num_procedures                 0
num_lab_procedures             0
time_in_hospital               0
metformin                      0
repaglinide                    0
nateglinide                    0
troglitazone                   0
change                         0
metformin-pioglitazone         0
metformin-rosiglitazone        0
glimepiride-pioglitazone       0
glipizide-metformin            0
glyburide-

In [62]:
df.shape

(101663, 44)

# SPLITTING DATA INTO DIABETIC AND NON-DIABETIC 

In [63]:
# Splitting the dataset into diabetic and non-diabetic patients, since the drug columns describe medications
# that are only used to treat diabetic patients, especially type 2.

In [64]:
df_diabetic = df[(df['diag_1']=='Diabetes') | (df['diag_2']=='Diabetes') | (df['diag_3']=='Diabetes')]

In [65]:
# Rows where none of the three diagnosis slots is labelled 'Diabetes'
# (a missing diagnosis compares unequal, so it counts as "not Diabetes").
# .copy() makes this an independent frame: the column assignments made on
# df_non_diabetic afterwards would otherwise raise SettingWithCopyWarning.
df_non_diabetic = df[(df['diag_1']!='Diabetes') & (df['diag_2']!='Diabetes') & (df['diag_3']!='Diabetes')].copy()

In [66]:
df_non_diabetic['readmitted'] = df_non_diabetic['readmitted'].apply(lambda x:'No' if x=='NO' else 'Yes')

In [67]:
df_diabetic.shape

(37990, 44)

In [68]:
df_diabetic.to_csv('Diabetic_patients_with_patient_nbr.csv')

### LIST OF ALL CATEGORICAL VARIABLES IN THE DATASET SEGREGATED AS DIFFERENT ASPECTS...

In [69]:
drugs_list = ['metformin','repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride','acetohexamide', 'glipizide',
              'glyburide', 'tolbutamide','pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
              'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin','glimepiride-pioglitazone',
              'metformin-rosiglitazone','metformin-pioglitazone']
patient_formalities_list = ['admission_type_id','discharge_disposition_id','admission_source_id','patient_frequency_categorized']
patient_demographics_list = ['gender','age','race']
diagnosis_list = ['diag_1','diag_2','diag_3']
primary_tests_list = ['A1Cresult','max_glu_serum']
medication_changes_list = ['diabetesMed','change']

### LIST OF ALL NUMERICAL VARIABLES IN THE DATASET SEGREGATED AS DIFFERENT ASPECTS...

In [70]:
number_of_visits = ['patient_frequency','number_outpatient','number_emergency','number_inpatient']
number_of_hospital_formalities = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_diagnoses']

# FEATURE EXTRACTION

In [71]:
patient_freq_tab = df_non_diabetic['patient_nbr'].value_counts()

In [72]:
df_non_diabetic['patient_frequency'] = df_non_diabetic['patient_nbr'].map(patient_freq_tab)

In [73]:
df_non_diabetic.head()

Unnamed: 0,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,patient_frequency
6,84259809,4,70,1,21,0,0,0,7,Caucasian,Male,[60-70),Elective,Discharged Home,Referral,Circulatory,Circulatory,other,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,No,1
8,48330783,13,68,2,28,0,0,0,8,Caucasian,Female,[80-90),Urgent,Discharged Home,Transfer,Circulatory,Circulatory,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,No,1
9,63555939,12,33,3,18,0,0,0,8,Caucasian,Female,[90-100),Elective,Transferred/Referred,Transfer,Circulatory,Neoplasms,Respiratory,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,No,1
11,77391171,7,62,0,11,0,0,0,7,AfricanAmerican,Male,[60-70),Urgent,Discharged Home,Transfer,Neoplasms,other,Neoplasms,,,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,Yes,1
13,77586282,10,55,1,31,0,0,0,8,Caucasian,Male,[80-90),Emergency,Discharged Home,Emergency Room,Circulatory,Circulatory,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,No,1


In [74]:
list_of_cols = list(df_non_diabetic.columns)

In [75]:
list_of_cols = [list_of_cols.pop()] + list_of_cols

In [76]:
df_non_diabetic = df_non_diabetic[list_of_cols]

In [77]:
df_non_diabetic.head()

Unnamed: 0,patient_frequency,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
6,1,84259809,4,70,1,21,0,0,0,7,Caucasian,Male,[60-70),Elective,Discharged Home,Referral,Circulatory,Circulatory,other,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,No
8,1,48330783,13,68,2,28,0,0,0,8,Caucasian,Female,[80-90),Urgent,Discharged Home,Transfer,Circulatory,Circulatory,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,No
9,1,63555939,12,33,3,18,0,0,0,8,Caucasian,Female,[90-100),Elective,Transferred/Referred,Transfer,Circulatory,Neoplasms,Respiratory,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,No
11,1,77391171,7,62,0,11,0,0,0,7,AfricanAmerican,Male,[60-70),Urgent,Discharged Home,Transfer,Neoplasms,other,Neoplasms,,,No,No,No,No,No,No,No,Up,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,Yes
13,1,77586282,10,55,1,31,0,0,0,8,Caucasian,Male,[80-90),Emergency,Discharged Home,Emergency Room,Circulatory,Circulatory,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,No


In [78]:
df_non_diabetic['patient_frequency_categorized'] = df_non_diabetic['patient_frequency'].apply(categorize_patient_frequency_func)

In [79]:
df_non_diabetic.to_csv('Non_Diabetic_patients.csv')

## NEED NOT EXECUTE

# VISUAL ANALYSIS

In [None]:
df_visual_non_diabetic = pd.read_csv('Non_Diabetic_patients.csv',index_col=0)
df_visual_non_diabetic.head()

### DISTRIBUTION OF VARIABLES

### DISTRIBUTION OF NUMERICAL VARIABLES : UNIVARIATE ANALYSIS

In [None]:
def box_labels(ax, df, col1, col2):
    """Write the group median above each box of a boxplot.

    Parameters
    ----------
    ax : axes object the boxplot was drawn on (needs ``get_xticks`` and ``text``).
    df : DataFrame that was plotted.
    col1 : name of the grouping (x-axis) column.
    col2 : name of the numeric (y-axis) column.

    Medians are paired with the x ticks by position, not by label: tick
    positions are 0..n-1 while the medians are indexed by the group values,
    so a label lookup would raise KeyError or pick the wrong group.
    Assumes the boxes appear in the sorted group order that ``groupby``
    produces (e.g. data sorted by ``col1``) -- confirm for custom orderings.
    """
    medians = df.groupby(col1)[col2].median()
    # Small lift above the median line, proportional to the overall scale.
    vertical_offset = df[col2].median() * 0.05

    for xtick, median in zip(ax.get_xticks(), medians):
        ax.text(xtick, median + vertical_offset, median,
                horizontalalignment='center', size='x-small', color='w', weight='semibold')

In [None]:
def labels(ax):
    """Annotate every bar of ``ax`` with its share and its raw height.

    The share is the bar height as a percentage of the row count of the
    global ``df_visual_non_diabetic``. The text is anchored at the bar's
    horizontal centre, 400 data units below its top, then nudged up 8 points.
    """
    for bar in ax.patches:
        height = bar.get_height()
        caption = '%{:.1f}\n{:.0f}'.format(100*height/len(df_visual_non_diabetic), height)
        anchor = (bar.get_x() + bar.get_width() / 2, height-400)
        ax.annotate(caption, anchor,
                    ha='center', va='center',
                    size=14, xytext=(0, 8),
                    textcoords='offset points')

In [None]:
def labels_catnum(ax, df=df_visual_non_diabetic):
    """Annotate every bar of ``ax`` with its percentage of ``len(df)`` and its height.

    ``df`` defaults to the frame bound to ``df_visual_non_diabetic`` when this
    cell was executed. The text is placed 0.2 to the right of the bar's left
    edge and 27 data units below its top.
    """
    for patch in ax.patches:
        height = patch.get_height()
        caption = '%{:.1f}\n{:.0f}'.format(100*height/len(df), height)
        ax.annotate(caption, (patch.get_x()+0.2, height-27), size=16)


In [None]:
# Stacked distribution plots, one row per column listed in number_of_visits.
plt.figure(figsize=(25, 20))
for panel, column in enumerate(number_of_visits, start=1):
    plt.subplot(4, 1, panel)
    sns.distplot(df_visual_non_diabetic[column])

In [None]:
# 2x3 grid of distribution plots, one per column in number_of_hospital_formalities.
plt.figure(figsize=(25, 15))
for panel, column in enumerate(number_of_hospital_formalities, start=1):
    plt.subplot(2, 3, panel)
    sns.distplot(df_visual_non_diabetic[column])

In [None]:
# 2x2 grid of horizontal box plots, one per column in number_of_visits.
plt.figure(figsize=(25,20))
for i,col in enumerate(number_of_visits):
    plt.subplot(2,2,i+1)
    # Pass the vector as x= explicitly: a bare positional argument is only
    # accepted as `data` from seaborn 0.12 on, which changes the plot.
    sns.boxplot(x=df_visual_non_diabetic[col])

In [None]:
# 2x3 grid of horizontal box plots, one per column in number_of_hospital_formalities.
plt.figure(figsize=(25,15))
for i,col in enumerate(number_of_hospital_formalities):
    plt.subplot(2,3,i+1)
    # Pass the vector as x= explicitly: a bare positional argument is only
    # accepted as `data` from seaborn 0.12 on, which changes the plot.
    sns.boxplot(x=df_visual_non_diabetic[col])

In [None]:
# Skewness of every numeric column, as a one-column frame named 'SKEW'.
skew_of_features = (
    df_visual_non_diabetic
    .select_dtypes(include=np.number)
    .skew()
    .to_frame('SKEW')
)

In [None]:
# Show the columns from most positively skewed to least.
skew_of_features.sort_values(by='SKEW',ascending=False)

In [None]:
# The most highly skewed columns are number_outpatient and number_emergency.
# The plots and the skew values show that most of the numerical columns are right-skewed.

In [None]:
# Instead of transforming the variables towards a normal distribution
# and winsorizing to eliminate outliers so that the model performs better, we keep
# the data as it is and analyse how the model performs,
# since these data points may have a significant influence on the target variable that the model has to learn.

In [None]:
# Encounter counts per num_medications value, with the mean and median marked.
# Subtracting the column minimum lines the markers up with the 0-based bar positions.
medication_counts = df_visual_non_diabetic['num_medications']
mean = np.mean(medication_counts)
median = np.median(medication_counts)
x_shift = medication_counts.min()

plt.figure(figsize=(15, 7))
sns.countplot(data=df_visual_non_diabetic, x='num_medications')
plt.axvline(mean - x_shift, color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median - x_shift, color='black', label=f'median:{round(median,2)}')
plt.title('Number of medications given During Visit')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# an average of 16-17 distinct drugs has been administered per visit for each patient.
# do this wrt readmission in bivariate analysis..

In [None]:
# Encounter counts per num_lab_procedures value, with the mean and median marked.
# Subtracting the column minimum lines the markers up with the 0-based bar positions.
lab_procedures = df_visual_non_diabetic['num_lab_procedures']
mean = np.mean(lab_procedures)
median = np.median(lab_procedures)
x_shift = lab_procedures.min()

plt.figure(figsize=(26, 7))
sns.countplot(data=df_visual_non_diabetic, x='num_lab_procedures')
plt.axvline(mean - x_shift, color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median - x_shift, color='black', label=f'median:{round(median,2)}')
plt.title('Number of Lab Procedures Performed During Visit')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# most of the time at an average 44 tests are taken on a patient, with the exception of 1....

In [None]:
# Encounter counts per patient_frequency value, with the mean and median marked.
# Subtracting the column minimum lines the markers up with the 0-based bar positions.
visit_frequency = df_visual_non_diabetic['patient_frequency']
mean = np.mean(visit_frequency)
median = np.median(visit_frequency)
x_shift = visit_frequency.min()

plt.figure(figsize=(26, 7))
sns.countplot(data=df_visual_non_diabetic, x='patient_frequency')
plt.axvline(mean - x_shift, color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median - x_shift, color='black', label=f'median:{round(median,2)}')
plt.title('Number of Visits of patients to the hospital')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Encounter counts per time_in_hospital value, with the mean and median marked.
# Subtracting the column minimum lines the markers up with the 0-based bar positions.
stay_length = df_visual_non_diabetic['time_in_hospital']
mean = np.mean(stay_length)
median = np.median(stay_length)
x_shift = stay_length.min()

plt.figure(figsize=(15, 5))
sns.countplot(data=df_visual_non_diabetic, x='time_in_hospital', palette='muted')
plt.axvline(mean - x_shift, color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median - x_shift, color='red', label=f'median:{round(median,2)}')
plt.title('Duration of Hospital Visit in Days')
plt.legend()
plt.show()

In [None]:
# On an average patients spend 4 and half days in the hospital..

In [None]:
# Encounter counts per num_procedures value, with the mean and median marked.
plt.figure(figsize=(15,5))
sns.countplot(x='num_procedures', palette='seismic', data=df_visual_non_diabetic)
mean, median = np.mean(df_visual_non_diabetic.num_procedures), np.median(df_visual_non_diabetic.num_procedures)
# Shift by the column minimum, as the sibling cells do, so the markers line up
# with the 0-based bar positions whatever the smallest observed value is.
plt.axvline(mean-df_visual_non_diabetic.num_procedures.min(), color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median-df_visual_non_diabetic.num_procedures.min(), color='black', label=f'median:{round(median,2)}')
plt.title('Number of Procedures Performed (Except Lab)')
# Without legend() the mean/median labels above are never displayed.
plt.legend()
plt.show()

In [None]:
# For many of the patients there were no procedures performed.
# Number of manual checkups (procedures) performed by the doctor before lab procedures is usually 1 per patient.. 

In [None]:
# Distribution of number_diagnoses across encounters.
plt.figure(figsize=(15, 5))
ax = sns.countplot(data=df_visual_non_diabetic, x='number_diagnoses', palette='Accent')
plt.title('Number of Diagnoses')
plt.show()

In [None]:
# For many of the patients 9 lab diagnoses are performed...

### DISTRIBUTION OF CATEGORICAL VARIABLES

In [None]:
# 7x3 grid of count plots, one per column in drugs_list.
plt.figure(figsize=(20, 25))
for panel, drug in enumerate(drugs_list, start=1):
    plt.subplot(7, 3, panel)
    sns.countplot(x=df_visual_non_diabetic[drug])

In [None]:
# Many drugs in the dataset are not administered to patients..Only insulin seems to be administered to majority
# of the patients..
# From the domain understanding many of the drugs administered above are for type 2 diabetic patients..
# We will later perform statistical tests on these features to understand their significance wrt the target column..

In [None]:
# 2x2 grid of count plots, one per column in patient_formalities_list.
plt.figure(figsize=(26, 10))
for panel, column in enumerate(patient_formalities_list, start=1):
    plt.subplot(2, 2, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(column, fontsize=20)
    sns.countplot(x=df_visual_non_diabetic[column])

In [None]:
# admission_type_id gives the reason a patient was admitted. Many of them (around 74000 patients) were admitted
# in emergency or urgent conditions. Around 20000 patients were admitted with prior formalities verified.
# admission_source_id describes how the patient came to be admitted, e.g. through someone's referral or a transfer
# from another hospital. It is dominated by Emergency Room.
# discharge_disposition_id tells whether the patient was discharged home after treatment or was transferred to
# another hospital for various reasons. A considerably large number of patients were discharged back home. Around 20000 patients
# were transferred to other hospitals or care centres.
# Left AMA: left against medical advice, i.e. the patient refuses to stay for continued care.

In [None]:
# Side-by-side count plots, one per column in patient_demographics_list.
plt.figure(figsize=(26, 10))
for panel, column in enumerate(patient_demographics_list, start=1):
    plt.subplot(1, 3, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(column, fontsize=20)
    sns.countplot(x=df_visual_non_diabetic[column])

In [None]:
# As we can see, gender is roughly balanced, with somewhat more females than males.
# The majority of the patients are senior citizens.
# Caucasians form the largest race group.

In [None]:
# Side-by-side count plots, one per column in diagnosis_list.
plt.figure(figsize=(26, 10))
for panel, column in enumerate(diagnosis_list, start=1):
    plt.subplot(1, 3, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(column, fontsize=20)
    sns.countplot(x=df_visual_non_diabetic[column])

In [None]:
#-------> Many patients have circulatory disorders...

In [None]:
# Side-by-side count plots, one per column in primary_tests_list.
plt.figure(figsize=(15, 7))
for panel, column in enumerate(primary_tests_list, start=1):
    plt.subplot(1, 3, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(column, fontsize=20)
    sns.countplot(x=df_visual_non_diabetic[column])

In [None]:
# The details of the test results of A1C and maximum glucose serum is not specified for Most of the patients... 

In [None]:
# Side-by-side count plots, one per column in medication_changes_list.
plt.figure(figsize=(15, 7))
for panel, column in enumerate(medication_changes_list, start=1):
    plt.subplot(1, 3, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(column, fontsize=20)
    sns.countplot(x=df_visual_non_diabetic[column])

In [None]:
# For around 23000 patients no diabetes medications were given..
# For around 46000 patients medications was changed..

### BIVARIATE ANALYSIS (NUMERICAL VS NUMERICAL)

### CORRELATION AMONG NUMERICAL VARIABLES

In [None]:
# Annotated heatmap of the pairwise correlations in numerical_df_with_target.
correlations = numerical_df_with_target.corr()
plt.figure(figsize=(25, 15))
sns.heatmap(correlations, annot=True);

In [None]:
# As we can see from the heatmap there is no heavy multicollinearity among the numerical variables in the data..

In [None]:
# We analyse the relation between numerical columns that have high correlation compared to the rest..

In [None]:
# num_lab_procedures per time_in_hospital value, drawn from the frame sorted by stay length.
by_stay_length = df_visual_non_diabetic.sort_values('time_in_hospital')
plt.figure(figsize=(15, 10))
ax = sns.boxplot(data=by_stay_length, x='time_in_hospital', y='num_lab_procedures')
plt.title('Lab Procedures Based on Length of Hospital Visit')
plt.show()

In [None]:
# There is an increasing trend between time spent in the hospital and number of lab tests completed.
# This makes sense since patients with longer stays had more tests completed to properly diagnose their conditions.

In [None]:
# num_medications per time_in_hospital value.
plt.figure(figsize=(15, 10))
ax = sns.boxplot(data=df_visual_non_diabetic, x='time_in_hospital', y='num_medications')
plt.title('Medications Administered Based on Length of Hospital Visit')
plt.show()

In [None]:
# Patients who spend more time in the hospital receive more medications..

In [None]:
# number_inpatient per patient_frequency value.
plt.figure(figsize=(15,10))
ax = sns.boxplot(x='patient_frequency', y='number_inpatient', data=df_visual_non_diabetic)
# Title corrected to the real column name (was misspelled 'patient_frequeny').
plt.title('patient_frequency vs number_inpatient')
plt.show()

### NUMERICAL FEATURES VS TARGET

In [None]:
# 2x2 grid: each column in number_of_visits against the readmitted target.
readmitted = df_visual_non_diabetic['readmitted']
plt.figure(figsize=(26, 20))
for panel, column in enumerate(number_of_visits, start=1):
    plt.subplot(2, 2, panel)
    plt.ylabel(column, fontsize=20)
    plt.xlabel(column, fontsize=20)
    plt.xticks(fontsize=20)
    sns.boxplot(x=readmitted, y=df_visual_non_diabetic[column])

In [None]:
# A good number of people are visiting inpatients who were re admitted within 30 days
# For majority of the inpatients there were no visitors in the previous year..
# A maximum of 21 visitors were there for one particular patient
# usually it is limited to 3 or less than 3 for majority of the patients
#----------> Number_inpatient vs diag..vs number of days

In [None]:
# 2x3 grid: each column in number_of_hospital_formalities against the readmitted target.
readmitted = df_visual_non_diabetic['readmitted']
plt.figure(figsize=(26, 16))
for panel, column in enumerate(number_of_hospital_formalities, start=1):
    plt.subplot(2, 3, panel)
    plt.ylabel(column, fontsize=20)
    plt.xlabel(column, fontsize=20)
    plt.xticks(fontsize=20)
    sns.boxplot(x=readmitted, y=df_visual_non_diabetic[column], palette='brg')

In [None]:
# From the above boxplots we get to know that certain numerical features do not have a significant impact on the target
# such as time_in_hospital, number_of_lab_procedures,number_procedures,number_of_medications and number_of_diagnoses

### CATEGORICAL FEATURES VS TARGET

In [None]:
# Count plots of each column in patient_demographics_list, split by readmitted.
plt.figure(figsize=(26,10))
for count, i in enumerate(patient_demographics_list):
    plt.subplot(1,3,count+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(i,fontsize=20)
    # x= is passed by keyword: a bare positional vector is only accepted as
    # `data` from seaborn 0.12 on.
    sns.countplot(x=df_visual_non_diabetic[i], hue=df_visual_non_diabetic['readmitted'], palette='nipy_spectral')

In [None]:
# Count plots of each column in patient_formalities_list, split by readmitted.
plt.figure(figsize=(26,20))
for count, i in enumerate(patient_formalities_list):
    plt.subplot(2,2,count+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(i,fontsize=20)
    # x= is passed by keyword: a bare positional vector is only accepted as
    # `data` from seaborn 0.12 on.
    sns.countplot(x=df_visual_non_diabetic[i], hue=df_visual_non_diabetic['readmitted'], palette='Set1')

In [None]:
# Count plots of each column in diagnosis_list, split by readmitted.
plt.figure(figsize=(26,10))
for count, i in enumerate(diagnosis_list):
    plt.subplot(1,3,count+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(i,fontsize=20)
    # x= is passed by keyword: a bare positional vector is only accepted as
    # `data` from seaborn 0.12 on.
    sns.countplot(x=df_visual_non_diabetic[i], hue=df_visual_non_diabetic['readmitted'], palette='autumn')

In [None]:
# Based on drugs

In [None]:
# Count plots of each column in drugs_list, split by readmitted.
plt.figure(figsize=(25,35))
for count, i in enumerate(drugs_list):
    plt.subplot(7,3,count+1)
    plt.xticks(fontsize=15)
    plt.xlabel(i,fontsize=20)
    # x= is passed by keyword: a bare positional vector is only accepted as
    # `data` from seaborn 0.12 on.
    sns.countplot(x=df_visual_non_diabetic[i], hue=df_visual_non_diabetic['readmitted'], palette='coolwarm')

In [None]:
# Insulin status counts, split by readmitted.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df_visual_non_diabetic, x="insulin", hue="readmitted", palette="YlGnBu", ax=ax)
plt.show()

In [None]:
# Understanding how the major drug 'Insulin' relates to readmission:
# for the majority of the patients insulin was not administered;
# patients whose insulin dose was changed (Up or Down) are more likely to be readmitted.
# Open question: why does a decreased dose (Down) also go with more readmissions?

In [None]:
# Readmission counts by glucose serum level, restricted to encounters that
# actually have a reading.
# A missing reading comes back from the CSV as NaN, and NaN != 'None' is True,
# so the string comparison alone keeps those rows and inflates the percentage
# denominators in labels_catnum. The 'None' comparison is kept for frames that
# still carry the literal string.
has_glucose_reading = (
    df_visual_non_diabetic['max_glu_serum'].notna()
    & (df_visual_non_diabetic['max_glu_serum'] != 'None')
)
glucose_none = df_visual_non_diabetic[has_glucose_reading]

plt.figure(figsize=(15, 7))
ax = sns.countplot(x='max_glu_serum', hue='readmitted', palette='Accent', data=glucose_none)
labels_catnum(ax,glucose_none)
plt.title('Readmits By Glucose Serum Levels')
plt.show()

In [None]:
# if the glucose serum test value with

In [None]:
# Readmission counts by A1C result, restricted to encounters that actually
# have a reading.
# A missing reading comes back from the CSV as NaN, and NaN != 'None' is True,
# so the string comparison alone keeps those rows and inflates the percentage
# denominators in labels_catnum. The 'None' comparison is kept for frames that
# still carry the literal string.
has_a1c_reading = (
    df_visual_non_diabetic['A1Cresult'].notna()
    & (df_visual_non_diabetic['A1Cresult'] != 'None')
)
alc_none = df_visual_non_diabetic[has_a1c_reading]

plt.figure(figsize=(15, 9))
ax = sns.countplot(x='A1Cresult', hue='readmitted', palette='Wistia', data=alc_none)
labels_catnum(ax, alc_none)
plt.title('Readmits By A1C Test Results')
plt.show()

In [None]:
# Counts of the 'change' flag split by readmitted, annotated with share and count.
plt.figure(figsize=(15, 7))
ax = sns.countplot(data=df_visual_non_diabetic, x='change', hue='readmitted')
labels(ax)
plt.title('Change in Diabetic Medications')
plt.show()

In [None]:
# if there is no change in medication the readmission rate is less..

In [None]:
# Counts of the 'diabetesMed' flag split by readmitted, annotated with share and count.
plt.figure(figsize=(15, 7))
ax = sns.countplot(data=df_visual_non_diabetic, x='diabetesMed', hue='readmitted')
labels(ax)
plt.title('Comparison When No drugs were given vs when atleast 1 drug was administered')
plt.show()

In [None]:
#     No significant impact on patient readmission..

### CATEGORICAL VS CATEGORICAL

### PATIENT DEMOGRAPHICS VS REST

In [None]:
# 6x2 grid: every column in patient_formalities_list on the x-axis,
# split by every column in patient_demographics_list.
plt.figure(figsize=(26, 60))
panels = [(hue_col, x_col) for hue_col in patient_demographics_list for x_col in patient_formalities_list]
for slot, (hue_col, x_col) in enumerate(panels, start=1):
    plt.subplot(6, 2, slot)
    plt.xticks(fontsize=15, rotation=90)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x=x_col, hue=hue_col, palette='gnuplot')

In [None]:
# 9x1 stack: every column in diagnosis_list on the x-axis,
# split by every column in patient_demographics_list.
plt.figure(figsize=(26, 99))
panels = [(hue_col, x_col) for hue_col in patient_demographics_list for x_col in diagnosis_list]
for slot, (hue_col, x_col) in enumerate(panels, start=1):
    plt.subplot(9, 1, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x=x_col, hue=hue_col, palette='Set1_r')

In [None]:
# 3x2 grid: every column in primary_tests_list on the x-axis,
# split by every column in patient_demographics_list.
plt.figure(figsize=(26, 30))
panels = [(hue_col, x_col) for hue_col in patient_demographics_list for x_col in primary_tests_list]
for slot, (hue_col, x_col) in enumerate(panels, start=1):
    plt.subplot(3, 2, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x=x_col, hue=hue_col, palette='gnuplot')

In [None]:
# 3x2 grid: every column in medication_changes_list on the x-axis,
# split by every column in patient_demographics_list.
plt.figure(figsize=(26, 30))
panels = [(hue_col, x_col) for hue_col in patient_demographics_list for x_col in medication_changes_list]
for slot, (hue_col, x_col) in enumerate(panels, start=1):
    plt.subplot(3, 2, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x=x_col, hue=hue_col, palette='CMRmap_r')

In [None]:
# 3x1 stack: insulin status on the x-axis, split by each column in patient_demographics_list.
plt.figure(figsize=(26, 30))
for slot, hue_col in enumerate(patient_demographics_list, start=1):
    plt.subplot(3, 1, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x='insulin', hue=hue_col, palette='gnuplot')

### PATIENT FORMALITIES VS REST

In [None]:
# 6x2 grid: every column in diagnosis_list on the x-axis,
# split by every column in patient_formalities_list.
plt.figure(figsize=(26,60))
count =0
for i in patient_formalities_list:
    for j in diagnosis_list:
        ax = plt.subplot(6,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_non_diabetic, palette='brg', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title=i)
        count = count+1

In [None]:
# 4x2 grid: every column in medication_changes_list on the x-axis,
# split by every column in patient_formalities_list.
plt.figure(figsize=(26, 40))
panels = [(hue_col, x_col) for hue_col in patient_formalities_list for x_col in medication_changes_list]
for slot, (hue_col, x_col) in enumerate(panels, start=1):
    plt.subplot(4, 2, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x=x_col, hue=hue_col, palette='CMRmap')

In [None]:
# 4x2 grid: every column in primary_tests_list on the x-axis,
# split by every column in patient_formalities_list.
plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    for j in primary_tests_list:
        ax = plt.subplot(4,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_non_diabetic, palette='Dark2_r', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title=i)
        count = count+1

In [None]:
# 2x2 grid: insulin status on the x-axis, split by each column in patient_formalities_list.
plt.figure(figsize=(26, 20))
for slot, hue_col in enumerate(patient_formalities_list, start=1):
    plt.subplot(2, 2, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x='insulin', hue=hue_col, palette='Dark2')

### DIAGNOSES VS REST

In [None]:
# 3x2 grid: every column in primary_tests_list on the x-axis,
# split by every column in diagnosis_list.
plt.figure(figsize=(26,30))
count =0
for i in diagnosis_list:
    for j in primary_tests_list:
        ax = plt.subplot(3,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_non_diabetic, palette='brg_r', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title=i)
        count = count+1

In [None]:
# 3x2 grid: every column in medication_changes_list on the x-axis,
# split by every column in diagnosis_list.
plt.figure(figsize=(26,30))
count =0
for i in diagnosis_list:
    for j in medication_changes_list:
        ax = plt.subplot(3,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_non_diabetic, palette='icefire', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title=i)
        count = count+1

In [None]:
# 3x1 stack: insulin status on the x-axis, split by each column in diagnosis_list.
plt.figure(figsize=(26, 30))
for slot, hue_col in enumerate(diagnosis_list, start=1):
    plt.subplot(3, 1, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x='insulin', hue=hue_col, palette='Set1_r')

### PRIMARY TESTS VS REST

In [None]:
# 2x2 grid: every column in medication_changes_list on the x-axis,
# split by every column in primary_tests_list.
plt.figure(figsize=(26,20))
count =0
for i in primary_tests_list:
    for j in medication_changes_list:
        ax = plt.subplot(2,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_non_diabetic, palette='nipy_spectral_r', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title=i)
        count = count+1

In [None]:
# 2x1 stack: insulin status on the x-axis, split by each column in primary_tests_list.
plt.figure(figsize=(26, 20))
for slot, hue_col in enumerate(primary_tests_list, start=1):
    plt.subplot(2, 1, slot)
    plt.xticks(fontsize=15)
    plt.xlabel(hue_col, fontsize=20)
    sns.countplot(data=df_visual_non_diabetic, x='insulin', hue=hue_col, palette='CMRmap_r')

In [None]:
# reduced level of insulin for older people should be taken into consideration..doctors have to study..

In [None]:
# patient_demographics_list,hospital_formalities , drugs,diagnosis_list,primary_tests_list,medication_changes_list

In [None]:
# Have all the diabetic patients provided with insulin?

In [None]:
# comparison between patients who were not administered any drug and their diagnosis

### NUMERICAL FEATURES VS CATEGORICAL FEATURES

### patient_demographics_list vs Number_of_hospital_formalities 

In [None]:
# 8x2 grid of box plots: each column in number_of_hospital_formalities (y)
# per category of each column in patient_demographics_list (x).
# The earlier plt.legend('right') call is dropped: there is no hue, so it
# could only create an empty legend ('right' was taken as the label list).
plt.figure(figsize=(26,40))
count =0
for i in patient_demographics_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        plt.subplot(8,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='nipy_spectral')
        count += 1

In [None]:
# 4x3 grid of box plots: each column in number_of_visits (y)
# per category of each column in patient_demographics_list (x).
# The earlier plt.legend('right') call is dropped: there is no hue, so it
# could only create an empty legend ('right' was taken as the label list).
plt.figure(figsize=(26,24))
count =0
for i in patient_demographics_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        plt.subplot(4,3,count+1)
        plt.xticks(rotation=90,fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='nipy_spectral_r')
        count += 1

In [None]:
# 10x2 grid of box plots: each column in number_of_hospital_formalities (y)
# per category of each column in patient_formalities_list (x).
# The earlier plt.legend('right') call is dropped: there is no hue, so it
# could only create an empty legend ('right' was taken as the label list).
plt.figure(figsize=(26,100))
count =0
for i in patient_formalities_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        plt.subplot(10,2,count+1)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='plasma_r')
        count += 1

In [None]:
# 8x2 grid of box plots: each column in number_of_visits (y)
# per category of each column in patient_formalities_list (x).
plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='brg')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 8x2 grid of box plots: each column in number_of_hospital_formalities (y)
# per category of each column in diagnosis_list (x).
plt.figure(figsize=(26,104))
count =0
for i in diagnosis_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='Set1')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 4x3 grid of box plots: each column in number_of_visits (y)
# per category of each column in diagnosis_list (x).
plt.figure(figsize=(26,27))
count =0
for i in diagnosis_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        plt.subplot(4,3,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='plasma')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 5x2 grid of box plots: each column in number_of_hospital_formalities (y)
# per category of each column in primary_tests_list (x).
plt.figure(figsize=(26,50))
count =0
for i in primary_tests_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='autumn')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 3x3 grid of box plots: each column in number_of_visits (y)
# per category of each column in primary_tests_list (x).
plt.figure(figsize=(26,20))
count =0
for i in primary_tests_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        plt.subplot(3,3,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='brg')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 5x2 grid of box plots: each column in number_of_hospital_formalities (y)
# per category of each column in medication_changes_list (x).
plt.figure(figsize=(26,50))
count =0
for i in medication_changes_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='icefire')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 3x3 grid of box plots: each column in number_of_visits (y)
# per category of each column in medication_changes_list (x).
plt.figure(figsize=(26,20))
count =0
for i in medication_changes_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        plt.subplot(3,3,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='CMRmap')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 3x2 grid of box plots: each column in number_of_hospital_formalities (y)
# per insulin status (x).
plt.figure(figsize=(26,30))
count =0
for i in ['insulin']:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        plt.subplot(3,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='gnuplot')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 2x2 grid of box plots: each column in number_of_visits (y) per insulin status (x).
plt.figure(figsize=(26,10))
count =0
for i in ['insulin']:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        plt.subplot(2,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: there is no hue, so there is nothing to label.
        sns.boxplot(x=i, y=j, data=ordered,palette='Set2')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

# MULTIVARIATE ANALYSIS

In [None]:
# 8x2 grid of box plots split by readmitted: each column in
# number_of_hospital_formalities (y) per category of each column in
# patient_demographics_list (x).
plt.figure(figsize=(26,40))
count =0
for i in patient_demographics_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        ax = plt.subplot(8,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,hue='readmitted',palette='nipy_spectral', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1

In [None]:
# 5x3 grid of box plots split by readmitted: each column in number_of_visits (y)
# per category of each column in patient_demographics_list (x).
plt.figure(figsize=(26,24))
count =0
for i in patient_demographics_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        ax = plt.subplot(5,3,count+1)
        plt.xticks(rotation=90,fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='brg_r',hue='readmitted', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1

In [None]:
# 10x2 grid of box plots split by readmitted: each column in
# number_of_hospital_formalities (y) per category of each column in
# patient_formalities_list (x).
plt.figure(figsize=(26,100))
count =0
for i in patient_formalities_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        ax = plt.subplot(10,2,count+1)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='plasma_r',hue='readmitted', ax=ax)
        # Position the legend after drawing and via loc=: a bare 'right'
        # is taken as the label list, and a legend created before the plot
        # has nothing to show.
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1

In [None]:
# 8x2 grid of box plots split by readmitted: each column in number_of_visits (y)
# per category of each column in patient_formalities_list (x).
plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        ax = plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='gnuplot',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 8x2 grid of box plots split by readmitted: each column in
# number_of_hospital_formalities (y) per category of each column in diagnosis_list (x).
plt.figure(figsize=(26,104))
count =0
for i in diagnosis_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        ax = plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='CMRmap',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 6x2 grid of box plots split by readmitted: each column in number_of_visits (y)
# per category of each column in diagnosis_list (x).
plt.figure(figsize=(26,54))
count =0
for i in diagnosis_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        ax = plt.subplot(6,2,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='plasma',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 5x2 grid of box plots split by readmitted: each column in
# number_of_hospital_formalities (y) per category of each column in primary_tests_list (x).
plt.figure(figsize=(26,50))
count =0
for i in primary_tests_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        ax = plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='Reds',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 4x2 grid of box plots split by readmitted: each column in number_of_visits (y)
# per category of each column in primary_tests_list (x).
plt.figure(figsize=(26,40))
count =0
for i in primary_tests_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        ax = plt.subplot(4,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='icefire',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 5x2 grid of box plots split by readmitted: each column in
# number_of_hospital_formalities (y) per category of each column in
# medication_changes_list (x).
plt.figure(figsize=(26,50))
count =0
for i in medication_changes_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        ax = plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='autumn_r',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 4x2 grid of box plots split by readmitted: each column in number_of_visits (y)
# per category of each column in medication_changes_list (x).
plt.figure(figsize=(26,28))
count =0
for i in medication_changes_list:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        ax = plt.subplot(4,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='CMRmap_r',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 3x2 grid of box plots split by readmitted: each column in
# number_of_hospital_formalities (y) per insulin status (x).
plt.figure(figsize=(26,30))
count =0
for i in ['insulin']:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_hospital_formalities:
        ax = plt.subplot(3,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='gnuplot_r',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

In [None]:
# 2x2 grid of box plots split by readmitted: each column in number_of_visits (y)
# per insulin status (x).
plt.figure(figsize=(26,20))
count =0
for i in ['insulin']:
    ordered = df_visual_non_diabetic.sort_values(i)  # sorted once per x column
    for j in number_of_visits:
        ax = plt.subplot(2,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=ordered,palette='icefire_r',hue='readmitted', ax=ax)
        # Legend is positioned after drawing, via loc= ('right' passed
        # positionally is taken as the label list).
        handles, legend_labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(handles, legend_labels, loc='right', title='readmitted')
        count += 1
# Lay out THIS figure once all panels exist; `fig` is only defined by an
# unrelated earlier cell and refers to a different figure.
plt.tight_layout()

### BUSINESS INTERPRETATION AND INSIGHTS

### INSIGHTS THAT ARE FOUND USING PREVIOUS ANALYSIS

### FINDING THE CAUSE FOR EVENTS.. 

In [None]:
# Encounters with no diagnosis recorded in any of diag_1, diag_2 and diag_3.
# (The same diag_1 test was previously repeated three times.)
df_visual_non_diabetic[df_visual_non_diabetic[['diag_1', 'diag_2', 'diag_3']].isnull().all(axis=1)]

In [None]:
# Understanding Expired Patients wrt their disorders

In [None]:
# Readmission labels among encounters whose discharge disposition is 'Expired'.
df_visual_non_diabetic.loc[
    df_visual_non_diabetic['discharge_disposition_id'].eq('Expired'), 'readmitted'
].value_counts()

In [None]:
# Readmission labels among encounters whose discharge disposition is 'Hospice'.
df_visual_non_diabetic.loc[
    df_visual_non_diabetic['discharge_disposition_id'].eq('Hospice'), 'readmitted'
].value_counts()

In [None]:
# diag_1 category counts among 'Expired' encounters, as a one-column frame.
# The column is named explicitly: value_counts() names its result after the
# source column in older pandas but 'count' in pandas >= 2, which would
# mislabel this series next to the diag_2/diag_3 columns added later.
diag_tab_expired = (
    df_visual_non_diabetic.loc[df_visual_non_diabetic['discharge_disposition_id']=='Expired', 'diag_1']
    .value_counts()
    .rename('diag_1')
    .to_frame()
)

In [None]:
# diag_2 category counts among 'Expired' encounters, aligned on the existing category index.
diag_tab_expired['diag_2'] = df_visual_non_diabetic.loc[
    df_visual_non_diabetic['discharge_disposition_id'].eq('Expired'), 'diag_2'
].value_counts()

In [None]:
# diag_3 category counts among 'Expired' encounters, aligned on the existing category index.
diag_tab_expired['diag_3'] = df_visual_non_diabetic.loc[
    df_visual_non_diabetic['discharge_disposition_id'].eq('Expired'), 'diag_3'
].value_counts()

In [None]:
# Grouped bars: one group per diagnosis category, one bar per column of diag_tab_expired.
diag_tab_expired.plot(kind='bar',figsize=(26,10));

In [None]:
# From the above graph we could say that many patients who expired had Circulatory disorder...But the ratio of circulatory
# patients is also considerably high

In [None]:
# Percentage of encounters ending with discharge disposition 'Expired', per
# diagnosis category, computed separately for diag_1, diag_2 and diag_3.
# All three dicts are keyed by the categories observed in diag_1.
expired_mask = df_visual_non_diabetic['discharge_disposition_id'] == 'Expired'
fatality_percent_diag_1 = {}
fatality_percent_diag_2 = {}
fatality_percent_diag_3 = {}
fatality_targets = (
    ('diag_1', fatality_percent_diag_1),
    ('diag_2', fatality_percent_diag_2),
    ('diag_3', fatality_percent_diag_3),
)
# dropna(): a missing diagnosis never compares equal to anything, so it would
# yield an empty group and a division by zero.
for cat in df_visual_non_diabetic['diag_1'].dropna().unique():
    for diag_col, percents in fatality_targets:
        in_category = df_visual_non_diabetic[diag_col] == cat
        n_in_category = int(in_category.sum())
        n_expired = int((in_category & expired_mask).sum())
        # A diag_1 category may be absent from diag_2/diag_3: report NaN
        # instead of raising ZeroDivisionError.
        percents[cat] = 100*(n_expired/n_in_category) if n_in_category else float('nan')
    

In [None]:
# One column per diagnosis category; the three rows hold the diag_1, diag_2
# and diag_3 percentages in that order.
fatality_percent = pd.DataFrame({
    key: [fatality_percent_diag_1[key], fatality_percent_diag_2[key], fatality_percent_diag_3[key]]
    for key in fatality_percent_diag_1
})
    

In [None]:
fatality_percent = fatality_percent.T

In [None]:
fatality_percent.columns=['diag_1','diag_2','diag_3']

In [None]:
fatality_percent.plot(kind='bar',figsize=(26,10));

In [None]:
# the above plot shows the rate of expiration in terms of disease..

In [None]:
# Understanding transferred patients with respect to their disorders

In [None]:
diag_tab_transferred = pd.DataFrame(df_visual_non_diabetic[df_visual_non_diabetic['discharge_disposition_id']=='Transferred/Referred']['diag_1'].value_counts())

In [None]:
diag_tab_transferred['diag_2'] = df_visual_non_diabetic[df_visual_non_diabetic['discharge_disposition_id']=='Transferred/Referred']['diag_2'].value_counts()

In [None]:
diag_tab_transferred['diag_3'] = df_visual_non_diabetic[df_visual_non_diabetic['discharge_disposition_id']=='Transferred/Referred']['diag_3'].value_counts()

In [None]:
diag_tab_transferred.plot(kind='bar',figsize=(26,10));

In [None]:
# Many patients who were transferred had a circulatory disorder..

In [None]:
# Transfer rate (% of encounters discharged as 'Transferred/Referred') per diagnosis
# category, computed separately for the diag_1, diag_2 and diag_3 columns.
# The categories are taken from diag_1; a category that never occurs in diag_2 or
# diag_3 gets NaN instead of raising ZeroDivisionError.
is_transferred = df_visual_non_diabetic['discharge_disposition_id'] == 'Transferred/Referred'
diag_categories = df_visual_non_diabetic['diag_1'].unique()

transfer_percent_diag_1 = {}
transfer_percent_diag_2 = {}
transfer_percent_diag_3 = {}
for diag_col, percent_by_cat in [('diag_1', transfer_percent_diag_1),
                                 ('diag_2', transfer_percent_diag_2),
                                 ('diag_3', transfer_percent_diag_3)]:
    # The mean of a boolean mask within each group is the share of transferred encounters.
    transferred_rate = 100 * is_transferred.groupby(df_visual_non_diabetic[diag_col]).mean()
    for cat in diag_categories:
        percent_by_cat[cat] = transferred_rate.get(cat, np.nan)
    

In [None]:
# One column per diagnosis category; the three rows hold the diag_1, diag_2 and
# diag_3 transfer percentages, in that order.
transfer_percent = pd.DataFrame({
    category: [
        transfer_percent_diag_1[category],
        transfer_percent_diag_2[category],
        transfer_percent_diag_3[category],
    ]
    for category in transfer_percent_diag_1
})
    

In [None]:
transfer_percent = transfer_percent.T

In [None]:
transfer_percent.columns=['diag_1','diag_2','diag_3']

In [None]:
transfer_percent.plot(kind='bar',figsize=(26,10));

In [None]:
# the rate of transfer of patients who were injured and patients who had musculoskeletal disorders is high..

In [None]:
# insulin vs age vs diagnoses vs discharge_disposition_id vs patient_frequency_categorized

In [None]:
pd.crosstab(df_visual_non_diabetic['insulin'],df_visual_non_diabetic['age'])

In [None]:
pd.crosstab(df_visual_non_diabetic['insulin'],df_visual_non_diabetic['age'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['insulin'],df_visual_non_diabetic['diag_1'])

In [None]:
pd.crosstab(df_visual_non_diabetic['insulin'],df_visual_non_diabetic['diag_1'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['insulin'],df_visual_non_diabetic['patient_frequency_categorized'])

In [None]:
pd.crosstab(df_visual_non_diabetic['insulin'],df_visual_non_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['diag_1'])

In [None]:
# the age group of 1-10 are least prone to any of these diseases..

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['diag_1'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# As age increases people are more prone to circulatory disorder..

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['diag_1'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['discharge_disposition_id'])

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['discharge_disposition_id'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# Many of the inpatients are of the age 60 and above..

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['patient_frequency_categorized'])

In [None]:
# add this without inference..

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# Patients who are 50 and above tend to visit the hospital more often..

In [None]:
pd.crosstab(df_visual_non_diabetic['age'],df_visual_non_diabetic['patient_frequency_categorized'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# Many patients in all age group visit the hospital only once..

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['discharge_disposition_id'])

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['discharge_disposition_id'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# many of the inpatients are because of other disorders like headache etc..

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['patient_frequency_categorized'])

In [None]:
# circulatory respiratory and patients with other disorders tend to visit the hospital more number of times..

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# 

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['patient_frequency_categorized'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# patients with respiratory and other illness tend to visit the hospital often..

In [None]:
pd.crosstab(df_visual_non_diabetic['discharge_disposition_id'],df_visual_non_diabetic['patient_frequency_categorized'])

In [None]:
# add anyways..

In [None]:
pd.crosstab(df_visual_non_diabetic['discharge_disposition_id'],df_visual_non_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['readmitted'])

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['diag_1'],df_visual_non_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# readmission rate is very high for circulatory,respiratory other and digestive disorders

In [None]:
pd.crosstab(df_visual_non_diabetic['discharge_disposition_id'],df_visual_non_diabetic['readmitted'])

In [None]:
# add anyways..

In [None]:
pd.crosstab(df_visual_non_diabetic['discharge_disposition_id'],df_visual_non_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['discharge_disposition_id'],df_visual_non_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_non_diabetic['patient_frequency_categorized'],df_visual_non_diabetic['readmitted'])

In [None]:
# 7640 patients were readmitted in the first instance itself..

In [None]:
pd.crosstab(df_visual_non_diabetic['patient_frequency_categorized'],df_visual_non_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# patients who visit the hospital more than once are more likely to be readmitted..

In [None]:
pd.crosstab(df_visual_non_diabetic['patient_frequency_categorized'],df_visual_non_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

## Dropping Expired Records and patient_frequency_categorized feature.

In [None]:
index_to_be_dropped = df_visual_non_diabetic[df_visual_non_diabetic['discharge_disposition_id']=='Expired'].index

In [None]:
df_visual_non_diabetic.drop(index_to_be_dropped,inplace=True)

In [None]:
df_visual_non_diabetic.drop(columns=['patient_nbr'],inplace=True)

In [None]:
df_visual_non_diabetic.drop(columns=['patient_frequency_categorized'],inplace=True)

In [None]:
df_visual_non_diabetic.to_csv('Non_diabetic_data_for_stats.csv')

# STATISTICAL ANALYSIS

In [None]:
data_preprocessed = pd.read_csv('Non_diabetic_data_for_stats.csv',index_col=0)
df_stats = data_preprocessed.copy()

df_stats.head()

In [None]:
# mode imputation for statistical analysis

In [None]:
# Fill the gaps in each listed categorical column with that column's most frequent value.
for column_name in ('race', 'diag_2', 'diag_3'):
    most_frequent_value = df_stats[column_name].mode()[0]
    df_stats[column_name] = df_stats[column_name].fillna(most_frequent_value)

In [None]:
df_stats.isnull().sum()

In [None]:
df_stats.shape

### FINDING CONSTANT AND QUASI-CONSTANT FEATURES USING A DOMINANT-VALUE FREQUENCY THRESHOLD OF 0.995

In [None]:
# Flag constant / quasi-constant features: columns in which a single value
# accounts for more than 99.5% of all rows.
QUASI_CONSTANT_THRESHOLD = 0.995

quasi_constant_feat = []
n_rows = len(df_stats)

for feature in df_stats.columns:
    # value_counts() is sorted in descending order and leaves out missing values.
    value_counts = df_stats[feature].value_counts()

    if value_counts.empty:
        # Only missing values: the column is constant, and there is no top value to index.
        quasi_constant_feat.append(feature)
        continue

    # Share of all rows taken by the most frequent value. Plain division is used;
    # the `np.float` alias no longer exists in current NumPy releases.
    predominant = value_counts.iloc[0] / n_rows

    if predominant > QUASI_CONSTANT_THRESHOLD:
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

In [None]:
quasi_constant_feat

# CHI-SQUARED TEST FOR INDEPENDENCE

We want to analyze the variables in this dataset to understand any relationships between them and their overall effects.
To do this, we use:
  * the `Chi-square test` for the relationship between categorical variables and the target
  * the `analysis of variance` (`ANOVA`) test for the numerical variables.

The purpose of these tests is to determine whether there is a statistically significant relationship between 
the target variable, `readmitted`, and each independent variable. 
We use a significance level of 0.05: if the p-value is above 0.05, we cannot reject the null hypothesis.
Later, the categorical variables are one-hot encoded: the string values in a variable are converted to columns labeled 0 or 1, one column per string value.
We will also standardize the original numerical variables to a mean of 0 and a standard deviation of 1.
Finally, we look at the correlation coefficients between the independent variables to make sure they do not have a
strong influence on each other. The threshold we used is -0.7 < x < 0.7.

This test is used to test whether the categorical variables are independent or not.

<p style='text-indent:20em'> <strong> $H_{0}$: The variables are independent</strong></p>
<p style='text-indent:20em'> <strong> $H_{1}$: The variables are not independent (i.e. variables are dependent)</strong></p>

Consider a categorical variable `A` with `r` levels and variable `B` with `c` levels. Let us test the independence of variables A and B.

The test statistic is given as:
<p style='text-indent:25em'> <strong> $\chi^{2} = \sum_{i= 1}^{r}\sum_{j = 1}^{c}\frac{O_{ij}^{2}}{E_{ij}} - N$</strong></p>

Where, <br>
$O_{ij}$: Observed frequency for category (i,j) <br>
$E_{ij}$: Expected frequency for category (i,j)<br>
$N$: Total number of observations

Under $H_{0}$, the test statistic follows a chi-square distribution with $(r-1)(c-1)$ degrees of freedom.

In [None]:
categorical_columns = df_stats.select_dtypes(include='object').columns.tolist()

In [None]:
# Chi-square test of independence between each listed feature and the target.
def chisquare_test(df, var_list, target, null_list=None):
    """Run a chi-square independence test of every feature in ``var_list`` against ``target``.

    For each feature the observed contingency table and the expected-frequency
    table are displayed, and the chi-square statistic and p-value are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    var_list : iterable of str
        Names of the columns to test.
    target : str
        Name of the target column.
    null_list : list, optional
        Collector for the features whose p-value is above 0.05 (the null
        hypothesis of independence cannot be rejected). It is appended to in
        place so the caller can read the result afterwards. When omitted, a
        fresh list is created for the call instead of sharing one list
        between calls.

    Returns
    -------
    None
        Results are displayed/printed; the failing features are reported
        through ``null_list``.
    """
    if null_list is None:
        null_list = []

    for var in var_list:
        print('\n\n',var.upper())
        chi_test = pd.crosstab(df[var], df[target])
        display(chi_test)
        
        chisq_value, pvalue, dof, expected = chi2_contingency(chi_test)
        print('---'*10,'\nExpected Chi table ')
        display(pd.DataFrame(expected,index=chi_test.index,columns=chi_test.columns))
        print(f"""Chi-square value: {chisq_value:.2f}
p-value\t\t: {pvalue:.3f}         for      {var.upper()}\n""")
        print('===='*30)
        
        if pvalue > 0.05: # adds variables that fail to reject the null hypothesis
            null_list.append(var)
            
    print(f'Failed to Reject null hypothesis: {null_list}')

In [None]:
cols_cat = df_stats.select_dtypes(exclude=np.number).columns.to_list()
chi_squared_failed_features=[]
chisquare_test(df_stats, cols_cat,'readmitted',chi_squared_failed_features)

In [None]:
# The drugs: 'nateglinide', 'chlorpropamide', 'acetohexamide', 'glyburide', 'tolbutamide', 'miglitol', 'troglitazone',
#           'tolazamide', 'glyburide-metformin', 'glipizide-metformin','glimepiride-pioglitazone', 'metformin-rosiglitazone',
#           'metformin-pioglitazone'
# failed to pass the test since they have p-values greater than 0.05.
# Based on the chi-square value and p-value, we can safely say that rest of all the categorical features have safely passed
# and have relation with the target variable.

In [None]:
chi_squared_failed_features

# ONE-WAY ANOVA

It is used to check the equality of population means for more than two independent samples. Each group is considered as a `treatment`. It assumes that the samples are taken from normally distributed populations. To check this assumption we can use the `Shapiro-Wilk Test.` Also, the population variances should be equal; this can be tested using the `Levene's Test`.

The null and alternative hypothesis is given as:
<p style='text-indent:20em'> <strong> $H_{0}$: The averages of all treatments are the same. </strong></p>
<p style='text-indent:20em'> <strong> $H_{1}$: At least one treatment has a different average. </strong></p>

Consider there are `t` treatments and `N` number of total observations. The test statistic is given as:
<p style='text-indent:28em'> <strong> $F = \frac{MTrSS}{MESS} $</strong></p>

Where,<br>
MTrSS = $\frac{TrSS}{df_{Tr}}$<br>

TrSS = $\sum_{i=1}^{t}n_{i}{(\bar{x_{i}}. - \bar{x}..)}^{2}$<br> $n_{i}$ is the number of observations in $i^{th}$ treatment. <br>$\bar{x_{i}}.$ is the mean over $i^{th}$ treatment <br> $\bar{x}..$ is the grand mean (i.e. mean of all the observations). <br>

$df_{Tr}$ is the degrees of freedom for treatments (= $t-1$)

MESS = $\frac{ESS}{df_{e}}$<br>

ESS = $\sum_{i}^{t}\sum_{j}^{n_{i}}{(x_{ij} - \bar{x_{i}}.)}^{2}$

$df_{e}$ is the degrees of freedom for error (= $N-t$)

Under $H_{0}$, the test statistic follows F-distribution with ($t-1,  N-t$) degrees of freedom.

# NON-PARAMETRIC (KRUSKAL-WALLIS H TEST)

If one of the assumptions of one-way ANOVA is not satisfied, then we can perform the `Kruskal-Wallis H test` which is a non-parametric equivalent test for one-way ANOVA.

The null and alternative hypothesis is given as:
<p style='text-indent:20em'> <strong> $H_{0}$: The medians of all treatments are the same. </strong></p>
<p style='text-indent:20em'> <strong> $H_{1}$: At least one treatment has a different median. </strong></p>

Consider there are `t` treatments each with $n_{i}$ (i = 1,2,...,t) observations  and `n` be the number of total observations. The test statistic is given as:
<p style='text-indent:28em'> <strong> $H = \frac{12}{n(n+1)} \sum_{i=1}^{t}\frac{R_{i}^{2}}{n_{i}} - 3(n+1) $</strong></p>

Where $R_{i}$ is the sum of the ranks of the observations in the $i^{th}$ treatment.

Under $H_{0}$, the test statistic follows Chi-square distribution with ($t-1$) degrees of freedom.

In [None]:
# The numerical variables 
numerical_columns=df_stats.select_dtypes(include=np.number).columns.to_list()
print(numerical_columns)

In [None]:
# Per-feature assumption checks (Shapiro-Wilk, Levene) followed by one-way ANOVA and Kruskal-Wallis.
def anova_table(var_list,target_column, failed_list=None,test_list=None):
    """Print normality, equal-variance, ANOVA and Kruskal-Wallis results for each feature.

    Reads the notebook-level ``df_stats`` frame. For every feature in
    ``var_list`` the rows are split by the distinct values of
    ``target_column``; a Shapiro-Wilk test is printed for each group, then
    Levene, one-way ANOVA and Kruskal-Wallis are printed for that feature's
    groups.

    Parameters
    ----------
    var_list : iterable of str
        Names of the numerical columns of ``df_stats`` to test.
    target_column : str
        Column of ``df_stats`` whose distinct values define the groups.
    failed_list : list, optional
        Accepted for backward compatibility; this function does not use it.
    test_list : list, optional
        When given, the per-group samples of every feature are appended to it
        so the caller can inspect them. It is never fed back into the tests.

    Returns
    -------
    None
        All results are printed.
    """
    for var in var_list:
        print('\n\n')
        print(var.upper())
        print('=='*20)

        # Start from an empty list for every feature: reusing one list across
        # features would make the tests below compare groups of different columns.
        groups = []
        for cat in df_stats[target_column].unique():
            # str() keeps the label printable when the target is not a string column.
            print(str(cat).upper())
            sample = df_stats[df_stats[target_column]==cat][var]
            shapiro_stats = stats.shapiro(sample)
            print(shapiro_stats)
            if shapiro_stats[1]<0.05:
                print(f'\n-----------------Shapiro test for {var} and {cat} has failed--------------------\n')
                
            print('--'*44,'\n')
            groups.append(sample)

        if test_list is not None:
            test_list.extend(groups)
            
        print(f'\nlevene  test for {var} \n')
        print(stats.levene(*groups))
        print('--'*20,'\n\n')   
        print(f'\nanova  test for {var} \n')
        print(stats.f_oneway(*groups))
        print('--'*20,'\n\n')
        print(f'\nkruskal  test for {var}\n')
        print(stats.kruskal(*groups))
        print('--'*20,'\n\n')
        print('=='*50)  
        print('\n\n')
        

In [None]:
anova_table(numerical_columns,'readmitted')

# STATISTICAL CONCLUSIONS

### FEATURE SELECTION USING FILTER METHODS

In [None]:
insignificant_features = list(set(quasi_constant_feat).union(set(chi_squared_failed_features)))

In [None]:
insignificant_features

In [None]:
# A few drugs failed to show a significant relationship with the target variable, hence they can be eliminated

In [None]:
df_stats = df_stats.drop(columns=insignificant_features)

In [None]:
df_stats.shape

In [None]:
df_stats.head()

In [None]:
df_stats.to_csv('Non_diabetic_data_for_encoding.csv')

### ENCODING OF ORDINAL CATEGORICAL FEATURES IN THE DATASET 

In [None]:
data_processed = pd.read_csv('Non_diabetic_data_for_encoding.csv',index_col=0)
df_encoded = data_processed.copy()
df_encoded.head()

In [None]:
df_encoded.shape

In [None]:
# age,A1cresult and max_glu_serum are ordinal features...

In [None]:
df_encoded['age'] = df_encoded['age'].apply(lambda x : x[1]).astype(int)

In [None]:
df_encoded['max_glu_serum'] = df_encoded['max_glu_serum'].replace({'None':0,'Norm':1,'>200':200,'>300':300}).astype(int)

In [None]:
df_encoded['A1Cresult'] = df_encoded['A1Cresult'].replace({'None':0,'Norm':1,'>8':8,'>7':7}).astype(int)

In [None]:
# Numeric encoding of the medication columns that survived feature filtering.
drug_dose_codes = {'No':0,'Steady':0.5,'Down':-1,'Up':1}
for col in drugs_list:
    if col in df_encoded.columns:
        # Assign the result back rather than calling replace(..., inplace=True) on the
        # column selection: the in-place form mutates an intermediate object and is not
        # guaranteed to update df_encoded (it does nothing under pandas Copy-on-Write).
        df_encoded[col] = df_encoded[col].replace(drug_dose_codes)

In [None]:
df_encoded.head()

In [None]:
df_encoded = pd.get_dummies(df_encoded,drop_first=True)

In [None]:
df_encoded.head()

In [None]:
df_encoded.shape

In [None]:
# save dataset to new file for machine learning
df_encoded.to_csv('Non_diabetic_data_for_ml.csv')

# TRAIN TEST SPLIT

In [None]:
df_ml = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df_ml.head()

In [None]:
X = df_ml.drop(columns=['readmitted_Yes'])

In [None]:
y = df_ml['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,stratify=y, random_state=42)

In [None]:
sc =  StandardScaler()

In [None]:
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)

In [None]:
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
y_train.unique()

# MODEL BUILDING : ITERATION 1

In [None]:
cv_acc_train = {}
cv_acc_test = {}
cv_TPR = {}
cv_FPR = {}
cv_acc = {}

In [None]:
def plot_result(model, name:str):
    """Fit ``model`` on the current training split, report its quality and record its metrics.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    and writes one entry, keyed by ``name``, into each of the notebook-level
    dictionaries ``cv_acc_train``, ``cv_acc_test``, ``cv_TPR``, ``cv_FPR`` and
    ``cv_acc``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier; it is fitted in place.
    name : str
        Key under which the metrics are stored.

    Returns
    -------
    None
        Scores and the classification report are printed and the confusion
        matrix is plotted.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # 5-fold cross-validated balanced accuracy on each split.
    # NOTE: the "test" score re-fits the estimator on folds of the test split; it is
    # not the score of the model fitted on the training data above.
    scores_train = cross_val_score(model, X_train, y_train, cv=5, scoring = 'balanced_accuracy')
    scores_test = cross_val_score(model, X_test, y_test, cv=5, scoring = 'balanced_accuracy')
    cv_acc_train[name] = round(scores_train.mean(), 4)*100 
    cv_acc_test[name] = round(scores_test.mean(), 4)*100

    # Hold-out metrics of the fitted model; the confusion matrix is computed once.
    # Row 1 is the positive class, row 0 the negative class.
    cm = confusion_matrix(y_test, y_pred)
    cv_TPR[name] = (cm[1][1]/cm[1].sum())*100 
    cv_FPR[name] = (cm[0][1]/cm[0].sum())*100 
    cv_acc[name] = accuracy_score(y_test,y_pred)

    # accuracy scores
    print('Average Balanced Accuracy (CV=5), Test Set:', scores_test.mean())  
    print('Average Balanced Accuracy (CV=5), Training Set: ', scores_train.mean())

    # print classification report
    print(classification_report(y_test, y_pred, zero_division=0))

    # Plot Confusion Matrix
    plot_confusion_matrix(model, X_test, y_test)
    plt.show()

In [None]:
lgc = LogisticRegression()
knc = KNeighborsClassifier()
nbc = GaussianNB()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
abc = AdaBoostClassifier()
gbc = GradientBoostingClassifier()

In [None]:
plot_result(lgc, "LogisticRegression_base")

In [None]:
plot_result(dtc, "DecisionTreeClassifier_base")

In [None]:
plot_result(rfc, "RandomForestClassifier_base")

In [None]:
plot_result(abc, "AdaBoostClassifier_base")

In [None]:
plot_result(gbc, "GradientBoostingClassifier_base")

In [None]:
# Feature importances of the three fitted ensembles side by side, one row per
# feature, sorted in ascending order (random forest is the primary sort key)
# and rounded to three decimals for display.
feature_importance_df = pd.DataFrame(
    {
        'Random_Forest': rfc.feature_importances_,
        'Ada_Boost': abc.feature_importances_,
        'Gradient_Boosting': gbc.feature_importances_,
    },
    index=X_train.columns,
)

feature_importance_df = (
    feature_importance_df
    .sort_values(by=['Random_Forest','Ada_Boost','Gradient_Boosting'])
    .round(3)
)

feature_importance_df
## BUILDING MODEL AFTER REMOVING THE LEAST IMPORTANT FEATURES USING THE RANDOM FOREST `feature_importances_` ATTRIBUTE

In [None]:
# feature_importance_df is sorted in ascending order, so this reversed slice walks from
# the last row down to position 11, leaving out the 11 rows at positions 0-10
# (the lowest-ranked features).
selected_features_random_forest_embedded = list(feature_importance_df.index[-1:10:-1])

In [None]:
selected_features_random_forest_embedded

In [None]:
len(selected_features_random_forest_embedded)

In [None]:
selected_features_random_forest_embedded = selected_features_random_forest_embedded+['readmitted_Yes']

# MODEL BUILDING : ITERATION 2 (FEATURE SELECTION USING EMBEDDED METHODS)

In [None]:
df_ml_embedded_1 = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df_ml_embedded_1.head()

In [None]:
df_ml_embedded_1 = df_ml_embedded_1[selected_features_random_forest_embedded]
df_ml_embedded_1.shape

In [None]:
X = df_ml_embedded_1.drop(columns=['readmitted_Yes'])
y = df_ml_embedded_1['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
rf2 = RandomForestClassifier()

In [None]:
plot_result(rf2,'Random_Forest_Embedded')

### Feature Selection using Logistic Regression

In [None]:
df_ml_embedded_2 = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df_ml_embedded_2.head()

In [None]:
X = df_ml_embedded_2.drop(columns=['readmitted_Yes'])
y = df_ml_embedded_2['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
sel2=SelectFromModel(LogisticRegression(penalty='l1',C=0.025,solver='saga'))
sel2.fit(X_train,y_train)

In [None]:
len(X_train.columns)

In [None]:
pd.DataFrame((sel2.get_support()),index=X_train.columns).sort_values(by=0)

In [None]:
selected_features_lasso_embedded = X_train.columns[(sel2.get_support())]

In [None]:
selected_features_lasso_embedded = list(selected_features_lasso_embedded)

In [None]:
selected_features_lasso_embedded

In [None]:
selected_features_lasso_embedded = selected_features_lasso_embedded + ['readmitted_Yes']

In [None]:
len(selected_features_lasso_embedded)

In [None]:
new_df=df_ml_embedded_2[selected_features_lasso_embedded]

In [None]:
new_df.head()

In [None]:
X=new_df.drop(columns=['readmitted_Yes'])
y=new_df['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
log_reg_drop=LogisticRegression()

In [None]:
plot_result(log_reg_drop,'LassoLogistic')

## Gradient Boosting Using above X_train and Y_Train

In [None]:
gd1=GradientBoostingClassifier()

In [None]:
plot_result(gd1,'Embedded_Gradient_Boosting')

# MODEL BUILDING : ITERATION 3 (FEATURE SELECTION USING WRAPPER METHODS)

<a id="rfe"></a>
# 5. Recursive Feature Elimination (RFE)

It is the process that returns the significant features in the dataset by recursively removing the less significant feature subsets.

In [None]:
df = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df.head()

In [None]:
X = df.drop(columns=['readmitted_Yes'])
y = df['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
y_train.unique()

In [None]:
rfc_hybrid = RandomForestClassifier()

rfe_model = RFE(estimator=rfc_hybrid, n_features_to_select = None,verbose=2)


rfe_model = rfe_model.fit(X_train, y_train)


feat_index = pd.Series(data = rfe_model.ranking_, index = X_train.columns)


selected_features_rfe_hybrid = feat_index[feat_index==1].index


print(selected_features_rfe_hybrid)

In [None]:
selected_features_rfe_hybrid = list(selected_features_rfe_hybrid)

In [None]:
selected_features_rfe_hybrid = selected_features_rfe_hybrid + ['readmitted_Yes']

In [None]:
len(selected_features_rfe_hybrid)

# MODEL BUILDING : ITERATION 4 :HYPERPARAMETER TUNING

In [None]:
df = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df.head()

In [None]:
df_hybrid = df[selected_features_rfe_hybrid]

In [None]:
df_hybrid.head()

In [None]:
df_hybrid.shape

In [None]:
X = df_hybrid.drop(columns=['readmitted_Yes'])
y = df_hybrid['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
criterion=["gini", "entropy"]    # learning_rate = [0.01,0.1,0.25,0.5]
max_depth = [2,4,6]        # max_depth = [2,3,4,5]
n_estimators = [150,100]     # n_estimators = [20,30,50,100,150]

In [None]:
param = {'criterion':criterion,'max_depth':max_depth,'n_estimators':n_estimators}

In [None]:
grid = GridSearchCV(estimator=RandomForestClassifier(),param_grid=param)

In [None]:
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Evaluate a random forest built from the hyperparameters the grid search just selected,
# instead of values copied by hand from one particular run (the search is unseeded,
# so the winning combination can change between runs).
plot_result(RandomForestClassifier(**grid.best_params_),'Random_Forest_Grid')

In [None]:
df = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df.head()

In [None]:
X = df.drop(columns=['readmitted_Yes'])
y = df['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
y_train.unique()

In [None]:
gbc_hybrid = GradientBoostingClassifier()

gbc_model = RFE(estimator=gbc_hybrid, n_features_to_select = None,verbose=2)


gbc_model = gbc_model.fit(X_train, y_train)


feat_index = pd.Series(data = gbc_model.ranking_, index = X_train.columns)


selected_features_gbc_hybrid = feat_index[feat_index==1].index


print(selected_features_gbc_hybrid)

In [None]:
selected_features_gbc_hybrid = list(selected_features_gbc_hybrid)

In [None]:
selected_features_gbc_hybrid = selected_features_gbc_hybrid + ['readmitted_Yes']

In [None]:
len(selected_features_gbc_hybrid)

# MODEL BUILDING : ITERATION 4 :HYPERPARAMETER TUNING

In [None]:
df = pd.read_csv('Non_diabetic_data_for_ml.csv',index_col=0)
df.head()

In [None]:
df_hybrid = df[selected_features_gbc_hybrid]

In [None]:
df_hybrid.head()

In [None]:
df_hybrid.shape

In [None]:
X = df_hybrid.drop(columns=['readmitted_Yes'])
y = df_hybrid['readmitted_Yes']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
learning_rate = [0.05,0.1]
max_depth = [7,6]        
n_estimators = [150,100]     

In [None]:
param = {'learning_rate':learning_rate,'max_depth':max_depth,'n_estimators':n_estimators}

In [None]:
grid = GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=param)

In [None]:
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Evaluate a gradient boosting model built from the hyperparameters the grid search just
# selected, instead of values copied by hand from one particular run (the search is
# unseeded, so the winning combination can change between runs).
plot_result(GradientBoostingClassifier(**grid.best_params_),'GradientBoost_Grid')

In [None]:
# For every evaluated model, gather its recorded metrics in the row order used by the
# summary table: accuracy, balanced test accuracy, balanced train accuracy, FPR, TPR.
d = {
    model_name: [
        cv_acc[model_name],
        cv_acc_test[model_name],
        cv_acc_train[model_name],
        cv_FPR[model_name],
        cv_TPR[model_name],
    ]
    for model_name in cv_acc_test
}

In [None]:
model_performance_df = pd.DataFrame(d,index=['Accuracy','Balanced_Test_Accuracy','Balanced_Train_Accuracy','False_Positive_Rate','True_Positive_Rate']).T

In [None]:
model_performance_df