In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency


from sklearn.metrics import confusion_matrix, classification_report, plot_precision_recall_curve,auc
from sklearn.metrics import plot_confusion_matrix,roc_curve, roc_auc_score,precision_recall_curve,accuracy_score,f1_score


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, KFold



from sklearn.preprocessing import StandardScaler


from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.feature_selection import RFE,SelectFromModel
from mlxtend.feature_selection import SequentialFeatureSelector as sfs


import warnings
warnings.filterwarnings('ignore')
warnings.warn("this will not show")
pd.options.display.max_columns = None 
pd.options.display.max_rows = None
sns.set(style='darkgrid')
%matplotlib inline

In [2]:
# Load the diabetic-encounters extract; the first CSV column becomes the row index.
df_visual_diabetic = pd.read_csv('Diabetic_patients_with_patient_nbr.csv',index_col=0)
# Preview the first five rows.
df_visual_diabetic.head()

Unnamed: 0,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,8222157,1,41,0,1,0,0,0,1,Caucasian,Female,[0-10),Other,Other,Referral,Diabetes,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,55629189,3,59,0,18,0,0,0,9,Caucasian,Female,[10-20),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,86047875,2,11,5,13,2,0,1,6,AfricanAmerican,Female,[20-30),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,82442376,2,44,1,16,0,0,0,7,Caucasian,Male,[30-40),Emergency,Discharged Home,Emergency Room,other,Diabetes,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,42519267,1,51,0,8,0,0,0,5,Caucasian,Male,[40-50),Emergency,Discharged Home,Emergency Room,Neoplasms,Neoplasms,Diabetes,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
def categorize_patient_frequency_func(value):
    """Bucket a per-patient encounter count into a coarse frequency label.

    Returns '1-time' for exactly 1, '2-5 times' for values in (1, 5],
    '6-10 times' for values in (5, 10] and 'more than 10 times' for anything
    else (values that match none of the ranges also fall through to it).
    """
    if value == 1:
        return '1-time'
    if 1 < value <= 5:
        return '2-5 times'
    if 5 < value <= 10:
        return '6-10 times'
    return 'more than 10 times'

In [4]:
# Number of rows (encounters) per patient_nbr, indexed by patient_nbr.
patient_freq_tab = df_visual_diabetic['patient_nbr'].value_counts()

In [5]:
df_visual_diabetic['patient_nbr'].nunique()

31164

In [6]:
# Attach to every row the total number of rows recorded for that row's patient_nbr.
df_visual_diabetic['patient_frequency'] = df_visual_diabetic['patient_nbr'].map(patient_freq_tab)

In [7]:
df_visual_diabetic.head()

Unnamed: 0,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,patient_frequency
0,8222157,1,41,0,1,0,0,0,1,Caucasian,Female,[0-10),Other,Other,Referral,Diabetes,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,1
1,55629189,3,59,0,18,0,0,0,9,Caucasian,Female,[10-20),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,1
2,86047875,2,11,5,13,2,0,1,6,AfricanAmerican,Female,[20-30),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,1
3,82442376,2,44,1,16,0,0,0,7,Caucasian,Male,[30-40),Emergency,Discharged Home,Emergency Room,other,Diabetes,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,1
4,42519267,1,51,0,8,0,0,0,5,Caucasian,Male,[40-50),Emergency,Discharged Home,Emergency Room,Neoplasms,Neoplasms,Diabetes,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,1


In [8]:
list_of_cols = list(df_visual_diabetic.columns)

In [9]:
# Move 'patient_frequency' (appended last above) to the front of the column order.
# Selecting it by name instead of pop()-ing the last element keeps this cell
# idempotent: re-running it no longer rotates a different column to the front.
list_of_cols = ['patient_frequency'] + [c for c in list_of_cols if c != 'patient_frequency']

In [10]:
df_visual_diabetic = df_visual_diabetic[list_of_cols]

In [11]:
df_visual_diabetic.head()

Unnamed: 0,patient_frequency,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,1,8222157,1,41,0,1,0,0,0,1,Caucasian,Female,[0-10),Other,Other,Referral,Diabetes,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,1,55629189,3,59,0,18,0,0,0,9,Caucasian,Female,[10-20),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,1,86047875,2,11,5,13,2,0,1,6,AfricanAmerican,Female,[20-30),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,1,82442376,2,44,1,16,0,0,0,7,Caucasian,Male,[30-40),Emergency,Discharged Home,Emergency Room,other,Diabetes,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,1,42519267,1,51,0,8,0,0,0,5,Caucasian,Male,[40-50),Emergency,Discharged Home,Emergency Room,Neoplasms,Neoplasms,Diabetes,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [12]:
# Bucket the per-patient encounter count into the labels produced by categorize_patient_frequency_func.
df_visual_diabetic['patient_frequency_categorized'] = df_visual_diabetic['patient_frequency'].apply(categorize_patient_frequency_func)

In [13]:
df_visual_diabetic.head()

Unnamed: 0,patient_frequency,patient_nbr,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,patient_frequency_categorized
0,1,8222157,1,41,0,1,0,0,0,1,Caucasian,Female,[0-10),Other,Other,Referral,Diabetes,,,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO,1-time
1,1,55629189,3,59,0,18,0,0,0,9,Caucasian,Female,[10-20),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30,1-time
2,1,86047875,2,11,5,13,2,0,1,6,AfricanAmerican,Female,[20-30),Emergency,Discharged Home,Emergency Room,other,Diabetes,other,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,1-time
3,1,82442376,2,44,1,16,0,0,0,7,Caucasian,Male,[30-40),Emergency,Discharged Home,Emergency Room,other,Diabetes,Circulatory,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,1-time
4,1,42519267,1,51,0,8,0,0,0,5,Caucasian,Male,[40-50),Emergency,Discharged Home,Emergency Room,Neoplasms,Neoplasms,Diabetes,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,1-time


In [14]:
df_visual_diabetic.shape

(37990, 46)

In [15]:
df_visual_diabetic['readmitted'].value_counts()/len(df_visual_diabetic)

NO     0.547197
>30    0.343327
<30    0.109476
Name: readmitted, dtype: float64

In [16]:
# Persist the frame (including the two derived patient_frequency columns) to the working directory.
df_visual_diabetic.to_csv('Diabetic_patients.csv')

### LIST OF ALL CATEGORICAL VARIABLES IN THE DATASET SEGREGATED AS DIFFERENT ASPECTS...

In [17]:
# Per-drug columns (the preview rows show values such as 'No', 'Steady', 'Up').
drugs_list = ['metformin','repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride','acetohexamide', 'glipizide',
              'glyburide', 'tolbutamide','pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
              'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin','glimepiride-pioglitazone',
              'metformin-rosiglitazone','metformin-pioglitazone']
# Admission / discharge descriptors plus the derived visit-frequency bucket.
patient_formalities_list = ['admission_type_id','discharge_disposition_id','admission_source_id','patient_frequency_categorized']
# Demographic columns.
patient_demographics_list = ['gender','age','race']
# The three diagnosis-category columns.
diagnosis_list = ['diag_1','diag_2','diag_3']
# Test-result columns.
primary_tests_list = ['A1Cresult','max_glu_serum']
# Medication flags: whether any diabetes medication was given, and whether medication changed.
medication_changes_list = ['diabetesMed','change']

### LIST OF ALL NUMERICAL VARIABLES IN THE DATASET SEGREGATED AS DIFFERENT ASPECTS...

In [18]:
# Visit-count features (derived patient_frequency plus the three number_* visit columns).
number_of_visits = ['patient_frequency','number_outpatient','number_emergency','number_inpatient']
# Per-encounter numeric features: length of stay, procedure / medication / diagnosis counts.
number_of_hospital_formalities = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_diagnoses']

## NEED NOT EXECUTE

# VISUAL ANALYSIS

### DISTRIBUTION OF VARIABLES

### DISTRIBUTION OF NUMERICAL VARIABLES : UNIVARIATE ANALYSIS

In [None]:
def box_labels(ax, df,col1,col2):
    """Annotate each box of a categorical box plot with its group median.

    Parameters
    ----------
    ax : Axes holding the box plot (one box per ``col1`` category).
    df : DataFrame the plot was drawn from.
    col1 : name of the grouping (x-axis) column.
    col2 : name of the numeric (y-axis) column.

    The median for each box is looked up through the tick's label text, not
    its position: tick positions run 0..k-1, so indexing the medians by
    position raises KeyError (or picks the wrong group) whenever the category
    values are not exactly 0..k-1. Ticks whose label matches no group are
    skipped.
    """
    medians = df.groupby(col1)[col2].median()
    medians_by_label = {str(category): median for category, median in medians.items()}
    vertical_offset = df[col2].median() * 0.05 # offset from median for display

    for xtick, tick_label in zip(ax.get_xticks(), ax.get_xticklabels()):
        label_text = tick_label.get_text()
        if label_text not in medians_by_label:
            continue
        median = medians_by_label[label_text]
        ax.text(xtick, median + vertical_offset, median,
                horizontalalignment='center',size='x-small',color='w',weight='semibold')

In [None]:
def labels(ax, df=None):
    """Annotate every bar of ``ax`` with its share of rows and its raw count.

    Parameters
    ----------
    ax : Axes whose ``patches`` are the bars to annotate.
    df : optional sized object used as the percentage denominator
        (``len(df)``). Defaults to the notebook-level ``df_visual_diabetic``,
        looked up at call time, so existing ``labels(ax)`` calls are unchanged.

    Bars whose height is NaN are skipped because they have no position to
    annotate.
    """
    if df is None:
        df = df_visual_diabetic
    total = len(df)
    for bar in ax.patches:
        height = bar.get_height()
        if np.isnan(height):
            continue
        # Text sits 400 count units below the bar top, then is nudged up 8 points.
        ax.annotate('%{:.1f}\n{:.0f}'.format(100*height/total, height),
                    (bar.get_x() + bar.get_width() / 2, height-400),
                    ha='center', va='center',
                    size=14, xytext=(0, 8),
                    textcoords='offset points')

In [None]:
def labels_catnum(ax, df=None):
    """Annotate every bar of ``ax`` with its share of ``df`` rows and its count.

    Parameters
    ----------
    ax : Axes whose ``patches`` are the bars to annotate.
    df : sized object used as the percentage denominator (``len(df)``).
        Defaults to the notebook-level ``df_visual_diabetic``, resolved at
        call time. The previous ``df=df_visual_diabetic`` default was bound
        once at definition time and would keep pointing at a stale frame
        after the variable is reassigned.

    Bars whose height is NaN are skipped.
    """
    if df is None:
        df = df_visual_diabetic
    total = len(df)
    for p in ax.patches:
        height = p.get_height()
        if np.isnan(height):
            continue
        ax.annotate('%{:.1f}\n{:.0f}'.format(100*height/total, height),
                    (p.get_x()+0.2, height-27),size=16)


In [None]:
# Distribution (histogram + KDE) of each visit-count feature, one panel per row.
plt.figure(figsize=(25,20))
for i,col in enumerate(number_of_visits):
    plt.subplot(4,1,i+1)
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is its replacement.
    sns.histplot(df_visual_diabetic[col], kde=True, stat='density')

In [None]:
# Distribution (histogram + KDE) of each per-encounter numeric feature on a 2x3 grid.
plt.figure(figsize=(25,15))
for i,col in enumerate(number_of_hospital_formalities):
    plt.subplot(2,3,i+1)
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # is its replacement.
    sns.histplot(df_visual_diabetic[col], kde=True, stat='density')

In [None]:
# Horizontal box plot of each visit-count feature on a 2x2 grid.
plt.figure(figsize=(25,20))
for i,col in enumerate(number_of_visits):
    plt.subplot(2,2,i+1)
    # Pass the data as x= explicitly: the first positional parameter of
    # seaborn's categorical plots changed meaning across versions.
    sns.boxplot(x=df_visual_diabetic[col])

In [None]:
# Horizontal box plot of each per-encounter numeric feature on a 2x3 grid.
plt.figure(figsize=(25,15))
for i,col in enumerate(number_of_hospital_formalities):
    plt.subplot(2,3,i+1)
    # Pass the data as x= explicitly: the first positional parameter of
    # seaborn's categorical plots changed meaning across versions.
    sns.boxplot(x=df_visual_diabetic[col])

In [None]:
# Skewness of every numeric column: one row per column, single 'SKEW' column.
# (The earlier list comprehension iterated over a one-element list holding the
# whole column Index, so it produced this table only by accident.)
skew_of_features = df_visual_diabetic.select_dtypes(include=np.number).skew().to_frame('SKEW')

In [None]:
skew_of_features.sort_values(by='SKEW',ascending=False)

In [None]:
# The columns that are most highly skewed are number_outpatient and number_emergency.
# As we can see from the plots and the skew values, most of the numerical columns are right-skewed.

In [None]:
# Instead of transforming the variables to bring them closer to a normal distribution
# and winsorizing to eliminate outliers so that the model performs better, we keep
# the data as it is and analyse how the model performs,
# since these data points may have a significant influence on the target variable, which the model has to learn.

In [None]:
# Count plot of num_medications with its mean and median marked by vertical lines.
plt.figure(figsize=(15,7))
sns.countplot(x='num_medications', data=df_visual_diabetic)
mean, median = np.mean(df_visual_diabetic.num_medications), np.median(df_visual_diabetic.num_medications)
# The lines are drawn at (value - column minimum), presumably because countplot
# positions its bars at 0, 1, 2, ... rather than at the data values.
# NOTE(review): this is only exact if every integer between the minimum and the
# marked value occurs in the data - confirm.
plt.axvline(mean-df_visual_diabetic.num_medications.min(), color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median-df_visual_diabetic.num_medications.min(), color='black', label=f'median:{round(median,2)}')
plt.title('Number of medications given During Visit')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# an average of 16-17 distinct drugs has been administered per visit for each patient.
# do this wrt readmission in bivariate analysis..

In [None]:
# Count plot of num_lab_procedures with its mean and median marked by vertical lines.
plt.figure(figsize=(26,7))
sns.countplot(x='num_lab_procedures', data=df_visual_diabetic)
mean, median = np.mean(df_visual_diabetic.num_lab_procedures), np.median(df_visual_diabetic.num_lab_procedures)
# Lines are drawn at (value - column minimum), presumably to convert a data value
# into a bar position. NOTE(review): exact only if no integer value is missing
# between the minimum and the marked value - confirm.
plt.axvline(mean-df_visual_diabetic.num_lab_procedures.min(), color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median-df_visual_diabetic.num_lab_procedures.min(), color='black', label=f'median:{round(median,2)}')
plt.title('Number of Lab Procedures Performed During Visit')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# most of the time at an average 44 tests are taken on a patient, with the exception of 1....

In [None]:
# Count plot of patient_frequency (rows per patient) with its mean and median marked.
plt.figure(figsize=(26,7))
sns.countplot(x='patient_frequency', data=df_visual_diabetic)
mean, median = np.mean(df_visual_diabetic.patient_frequency), np.median(df_visual_diabetic.patient_frequency)
# Lines are drawn at (value - column minimum), presumably to convert a data value
# into a bar position. NOTE(review): exact only if no integer value is missing
# between the minimum and the marked value - confirm.
plt.axvline(mean-df_visual_diabetic.patient_frequency.min(), color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median-df_visual_diabetic.patient_frequency.min(), color='black', label=f'median:{round(median,2)}')
plt.title('Number of Visits of patients to the hospital')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Count plot of time_in_hospital with its mean (blue) and median (red) marked.
plt.figure(figsize=(15,5))
sns.countplot(x='time_in_hospital', palette='muted', data=df_visual_diabetic)
mean, median = np.mean(df_visual_diabetic['time_in_hospital']), np.median(df_visual_diabetic['time_in_hospital'])
# Lines are drawn at (value - column minimum), presumably to convert a data value
# into a bar position. NOTE(review): exact only if no integer value is missing
# between the minimum and the marked value - confirm.
plt.axvline(mean-df_visual_diabetic.time_in_hospital.min(), color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median-df_visual_diabetic.time_in_hospital.min(), color='red', label=f'median:{round(median,2)}')
plt.title('Duration of Hospital Visit in Days')
plt.legend()
plt.show()

In [None]:
# On an average patients spend 4 and half days in the hospital..

In [None]:
# Count plot of num_procedures with its mean and median marked by vertical lines.
plt.figure(figsize=(15,5))
sns.countplot(x='num_procedures', palette='seismic', data=df_visual_diabetic)
mean, median = np.mean(df_visual_diabetic.num_procedures), np.median(df_visual_diabetic.num_procedures)
# Shift by the column minimum, as the sibling cells do, so the lines are placed
# in bar-position units (a no-op when the minimum is 0).
plt.axvline(mean-df_visual_diabetic.num_procedures.min(), color='blue', label=f'mean:{round(mean,2)}')
plt.axvline(median-df_visual_diabetic.num_procedures.min(), color='black', label=f'median:{round(median,2)}')
plt.title('Number of Procedures Performed (Except Lab)')
# Without legend() the mean/median labels above are never displayed.
plt.legend()
plt.show()

In [None]:
# For many of the patients there were no procedures performed.
# Number of manual checkups (procedures) performed by the doctor before lab procedures is usually 1 per patient.. 

In [None]:
# Count plot of number_diagnoses (distribution only; readmission is not split out here).
plt.figure(figsize=(15,5))
ax = sns.countplot(x='number_diagnoses', palette='Accent', data=df_visual_diabetic)
plt.title('Number of Diagnoses')
plt.show()

In [None]:
# For many of the patients 9 lab diagnoses are performed...

### DISTRIBUTION OF CATEGORICAL VARIABLES

In [None]:
df_visual_diabetic['readmitted'].value_counts().plot(kind='bar');

In [None]:
# One count plot per drug column, laid out on a 7x3 grid.
plt.figure(figsize=(20,25))
for panel, drug in enumerate(drugs_list, start=1):
    plt.subplot(7, 3, panel)
    sns.countplot(x=drug, data=df_visual_diabetic)

In [None]:
# Many drugs in the dataset are not administered to patients..Only insulin seems to be administered to majority
# of the patients..
# From the domain understanding many of the drugs administered above are for type 2 diabetic patients..
# We will later perform statistical tests on these features to understand their significance wrt the target column..

In [None]:
# One count plot per admission/discharge descriptor, laid out on a 2x2 grid.
plt.figure(figsize=(26, 10))
for panel, formality in enumerate(patient_formalities_list, start=1):
    plt.subplot(2, 2, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(formality, fontsize=20)
    sns.countplot(x=formality, data=df_visual_diabetic)

In [None]:
# admission_type_id describes why a patient was admitted. Many patients were admitted
# under emergency and urgent conditions (around 74000 patients); around 20000 patients were admitted with prior formalities verified.
# admission_source_id describes how the patient came to be admitted, e.g. through someone's referral or a transfer
# from another hospital. It is dominated by the Emergency Room.
# discharge_disposition_id tells whether the patient was discharged home after treatment or was transferred to
# another hospital for various reasons. A considerable number of patients were discharged back home; around 20000 patients
# were transferred to other hospitals or care centres.
# Left AMA = left against medical advice: the patient refuses to stay for continued care.

In [None]:
# One count plot per demographic column, side by side.
plt.figure(figsize=(26, 10))
for panel, demographic in enumerate(patient_demographics_list, start=1):
    plt.subplot(1, 3, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(demographic, fontsize=20)
    sns.countplot(x=demographic, data=df_visual_diabetic)

In [None]:
# As we can see, gender is roughly balanced, with somewhat more females than males.
# The majority of the patients are senior citizens.
# Caucasians are the largest group by race.

In [None]:
# One count plot per diagnosis column, side by side.
plt.figure(figsize=(26, 10))
for panel, diagnosis in enumerate(diagnosis_list, start=1):
    plt.subplot(1, 3, panel)
    plt.xticks(rotation=90, fontsize=15)
    plt.xlabel(diagnosis, fontsize=20)
    sns.countplot(x=diagnosis, data=df_visual_diabetic)

In [None]:
#-------> Many patients have circulatory disorders...

In [None]:
# One count plot per test-result column, side by side.
plt.figure(figsize=(15, 7))
for i,col in enumerate(primary_tests_list):
    # Size the grid from the list; the fixed 1x3 grid left an empty third panel.
    plt.subplot(1,len(primary_tests_list),i+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(col,fontsize=20)
    sns.countplot(x=df_visual_diabetic[col])

In [None]:
# The details of the test results of A1C and maximum glucose serum is not specified for Most of the patients... 

In [None]:
# One count plot per medication flag column, side by side.
plt.figure(figsize=(15, 7))
for i,col in enumerate(medication_changes_list):
    # Size the grid from the list; the fixed 1x3 grid left an empty third panel.
    plt.subplot(1,len(medication_changes_list),i+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(col,fontsize=20)
    sns.countplot(x=df_visual_diabetic[col])

In [None]:
# For around 23000 patients no diabetes medications were given..
# For around 46000 patients medications was changed..

### BIVARIATE ANALYSIS (NUMERICAL VS NUMERICAL)

### CORRELATION AMONG NUMERICAL VARIABLES

In [None]:
# Pairwise correlation heatmap of the numeric columns.
plt.figure(figsize=(25,15))
# Restrict to numeric columns explicitly: DataFrame.corr() on a frame that also
# holds string columns raises on pandas >= 2.0. select_dtypes works on every
# pandas version and yields the matrix older versions produced implicitly.
# NOTE(review): patient_nbr is an identifier but is numeric, so it is still included.
sns.heatmap(df_visual_diabetic.select_dtypes(include=np.number).corr(),annot=True);

In [None]:
# As we can see from the heatmap there is no heavy multicollinearity among the numerical variables in the data..

In [None]:
# We analyse the relation between numerical columns that have high correlation compared to the rest..

In [None]:
# Box plot of num_lab_procedures for each length of stay (time_in_hospital).
plt.figure(figsize=(15,10))
ax = sns.boxplot(x='time_in_hospital', y='num_lab_procedures', data=df_visual_diabetic.sort_values('time_in_hospital'))
# Disabled median annotation; note it references `df`, which this notebook section never defines.
# box_labels(ax, df.sort_values('time_in_hospital'),'time_in_hospital','num_lab_procedures') 
plt.title('Lab Procedures Based on Length of Hospital Visit')
plt.show()

In [None]:
# There is an increasing trend between time spent in the hospital and number of lab tests completed.
# This makes sense since patients with longer stays had more tests completed to properly diagnose their conditions.

In [None]:
# Box plot of num_medications for each length of stay (time_in_hospital).
plt.figure(figsize=(15,10))
ax = sns.boxplot(x='time_in_hospital', y='num_medications', data=df_visual_diabetic)
plt.title('Medications Administered Based on Length of Hospital Visit')
plt.show()

In [None]:
# Patients who spend more time in the hospital receive more medications..

In [None]:
# Box plot of number_inpatient for each patient_frequency value.
plt.figure(figsize=(15,10))
ax = sns.boxplot(x='patient_frequency', y='number_inpatient', data=df_visual_diabetic)
# Title typo fixed ('patient_frequeny' -> 'patient_frequency').
plt.title('patient_frequency vs number_inpatient')
plt.show()

### NUMERICAL FEATURES VS TARGET

In [None]:
# Box plot of each visit-count feature (y) per readmitted class (x), on a 2x2 grid.
plt.figure(figsize=(26,20))
for i,col in enumerate(number_of_visits):
    plt.subplot(2,2,i+1)
    plt.ylabel(col,fontsize=20)
    # NOTE(review): x is 'readmitted', not `col`; presumably seaborn replaces this
    # label text when it draws the plot - confirm.
    plt.xlabel(col,fontsize=20)
    plt.xticks(fontsize=20)
    sns.boxplot(x=df_visual_diabetic['readmitted'],y=df_visual_diabetic[col])

In [None]:
# Patients who were readmitted within 30 days tend to have more prior inpatient visits (number_inpatient)
# For the majority of the patients there were no inpatient visits in the previous year..
# A maximum of 21 inpatient visits was recorded for one particular patient
# usually it is limited to 3 or fewer for the majority of the patients
#----------> Number_inpatient vs diag..vs number of days

In [None]:
# Box plot of each per-encounter numeric feature (y) per readmitted class (x), on a 2x3 grid.
plt.figure(figsize=(26,16))
for i,col in enumerate(number_of_hospital_formalities):
    plt.subplot(2,3,i+1)
    plt.ylabel(col,fontsize=20)
    # NOTE(review): x is 'readmitted', not `col`; presumably seaborn replaces this
    # label text when it draws the plot - confirm.
    plt.xlabel(col,fontsize=20)
    plt.xticks(fontsize=20)
    sns.boxplot(x=df_visual_diabetic['readmitted'],y=df_visual_diabetic[col],palette='brg')

In [None]:
# From the above boxplots we get to know that certain numerical features do not have a significant impact on the target
# such as time_in_hospital, number_of_lab_procedures,number_procedures,number_of_medications and number_of_diagnoses

### CATEGORICAL FEATURES VS TARGET

In [None]:
# Count plot of each demographic column split by readmitted class.
plt.figure(figsize=(26,10))
count =0
for i in patient_demographics_list:
    plt.subplot(1,3,count+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(i,fontsize=20)
    # Pass the column as x= (as the univariate cells do): the first positional
    # parameter of countplot is not x in newer seaborn releases.
    sns.countplot(x=df_visual_diabetic[i], hue=df_visual_diabetic['readmitted'], palette='nipy_spectral');
    count = count+1

In [None]:
# Count plot of each admission/discharge descriptor split by readmitted class, 2x2 grid.
plt.figure(figsize=(26,20))
count =0
for i in patient_formalities_list:
    plt.subplot(2,2,count+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(i,fontsize=20)
    # Pass the column as x= (as the univariate cells do): the first positional
    # parameter of countplot is not x in newer seaborn releases.
    sns.countplot(x=df_visual_diabetic[i], hue=df_visual_diabetic['readmitted'], palette='Set1');
    count = count+1

In [None]:
# Count plot of each diagnosis column split by readmitted class.
plt.figure(figsize=(26,10))
count =0
for i in diagnosis_list:
    plt.subplot(1,3,count+1)
    plt.xticks(rotation=90,fontsize=15)
    plt.xlabel(i,fontsize=20)
    # Pass the column as x= (as the univariate cells do): the first positional
    # parameter of countplot is not x in newer seaborn releases.
    sns.countplot(x=df_visual_diabetic[i], hue=df_visual_diabetic['readmitted'], palette='autumn');
    count = count+1

In [None]:
# Based on drugs

In [None]:
# Count plot of each drug column split by readmitted class, 7x3 grid.
plt.figure(figsize=(25,35))
count =0
for i in drugs_list:
    plt.subplot(7,3,count+1)
    plt.xticks(fontsize=15)
    plt.xlabel(i,fontsize=20)
    # Pass the column as x= (as the univariate cells do): the first positional
    # parameter of countplot is not x in newer seaborn releases.
    sns.countplot(x=df_visual_diabetic[i], hue=df_visual_diabetic['readmitted'], palette='coolwarm');
    count = count+1

In [None]:
# Count plot of insulin status split by readmitted class.
# Note: this is the only place in this section that binds the name `fig`.
fig, ax =plt.subplots(figsize=(15,5))
sns.countplot(x="insulin", hue="readmitted", data=df_visual_diabetic, palette="YlGnBu")
plt.show()

In [None]:
# Understanding how the major drug 'Insulin' relates to readmission
# For the majority of the patients insulin was not administered
# Patients whose insulin dose was changed (Up or Down) are more likely to get readmitted..
# Open question: why does a decreased (Down) dose go with higher readmission?

In [None]:
# exclude patients without a glucose reading
plt.figure(figsize=(15, 7))
# A missing reading may be stored either as the string 'None' or as NaN (the
# preview rows show empty max_glu_serum cells). `!= 'None'` alone keeps NaN rows,
# which would inflate the percentage denominator used by labels_catnum below.
has_glucose_reading = df_visual_diabetic.max_glu_serum.notna() & (df_visual_diabetic.max_glu_serum != 'None')
glucose_none = df_visual_diabetic[has_glucose_reading]

# glucose serum results and readmit impact
ax = sns.countplot(x='max_glu_serum', hue='readmitted', palette='Accent', data=glucose_none)
# Annotate each bar with its share of the tested patients and its count.
labels_catnum(ax,glucose_none)
plt.title('Readmits By Glucose Serum Levels')
plt.show()

In [None]:
# if the glucose serum test value with

In [None]:
# exclude patients without an A1C reading
# A missing reading may be stored either as the string 'None' or as NaN (the
# preview rows show empty A1Cresult cells). `!= 'None'` alone keeps NaN rows,
# which would inflate the percentage denominator used by labels_catnum below.
has_a1c_reading = df_visual_diabetic.A1Cresult.notna() & (df_visual_diabetic.A1Cresult != 'None')
alc_none = df_visual_diabetic[has_a1c_reading]
plt.figure(figsize=(15, 9))
# A1C results and readmit impact
ax = sns.countplot(x='A1Cresult', hue='readmitted', palette='Wistia', data=alc_none)
# Annotate each bar with its share of the tested patients and its count.
labels_catnum(ax, alc_none)
plt.title('Readmits By A1C Test Results')
plt.show()

In [None]:
# change in medications: count plot of 'change' split by readmitted class
plt.figure(figsize=(15,7))
ax = sns.countplot(x='change', hue='readmitted', data=df_visual_diabetic)
# Annotate each bar with its percentage of all rows in df_visual_diabetic and its count.
labels(ax)
plt.title('Change in Diabetic Medications')
plt.show()

In [None]:
# if there is no change in medication the readmission rate is less..

In [None]:
# Count plot of diabetesMed split by readmitted class.
plt.figure(figsize=(15,7))
ax = sns.countplot(x='diabetesMed', hue='readmitted', data=df_visual_diabetic)
# Annotate each bar with its percentage of all rows in df_visual_diabetic and its count.
labels(ax)
plt.title('Comparison When No drugs were given vs when atleast 1 drug was administered')
plt.show()

In [None]:
#     No significant impact on patient readmission..

### CATEGORICAL VS CATEGORICAL

### PATIENT DEMOGRAPHICS VS REST

In [None]:
# Count plots of every admission/discharge descriptor (x axis) split by every
# demographic column (hue): 3 x 4 = 12 panels on a 6x2 grid.
plt.figure(figsize=(26,60))
count =0
for i in patient_demographics_list:
    for j in patient_formalities_list:
        plt.subplot(6,2,count+1)
        plt.xticks(fontsize=15,rotation=90)
        # NOTE(review): label is set to the hue column `i`, but x is `j`; presumably
        # seaborn replaces the label text when it draws - confirm.
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='gnuplot');
        count = count+1

In [None]:
# Count plots of every diagnosis column (x axis) split by every demographic
# column (hue): 3 x 3 = 9 panels stacked vertically.
plt.figure(figsize=(26,99))
count =0
for i in patient_demographics_list:
    for j in diagnosis_list:
        plt.subplot(9,1,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='Set1_r');
        count = count+1

In [None]:
# Count plots of every test-result column (x axis) split by every demographic
# column (hue): 3 x 2 = 6 panels on a 3x2 grid.
plt.figure(figsize=(26,30))
count =0
for i in patient_demographics_list:
    for j in primary_tests_list:
        plt.subplot(3,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='gnuplot');
        count = count+1

In [None]:
# Count plots of every medication flag (x axis) split by every demographic
# column (hue): 3 x 2 = 6 panels on a 3x2 grid.
plt.figure(figsize=(26,30))
count =0
for i in patient_demographics_list:
    for j in medication_changes_list:
        plt.subplot(3,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='CMRmap_r');
        count = count+1

In [None]:
# Count plot of insulin status split by each demographic column (hue), one panel per row.
plt.figure(figsize=(26,30))
count =0
for i in patient_demographics_list:
    plt.subplot(3,1,count+1)
    plt.xticks(fontsize=15)
    plt.xlabel(i,fontsize=20)
    sns.countplot(x='insulin', hue=i,data=df_visual_diabetic, palette='gnuplot');
    count = count+1

### PATIENT FORMALITIES VS REST

In [None]:
# Count plots of every diagnosis column (x axis) split by every admission/discharge
# descriptor (hue): 4 x 3 = 12 panels on a 6x2 grid.
plt.figure(figsize=(26,60))
count =0
for i in patient_formalities_list:
    for j in diagnosis_list:
        plt.subplot(6,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='brg');
        # plt.legend('right') treated the string as a list of labels and ran before
        # anything was drawn; position the hue legend after plotting instead.
        plt.legend(title=i, loc='right')
        count = count+1

In [None]:
# Count plots of every medication flag (x axis) split by every admission/discharge
# descriptor (hue): 4 x 2 = 8 panels on a 4x2 grid.
plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    for j in medication_changes_list:
        plt.subplot(4,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='CMRmap');
        count = count+1

In [None]:
# Count plots of every test-result column (x axis) split by every admission/discharge
# descriptor (hue): 4 x 2 = 8 panels on a 4x2 grid.
plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    for j in primary_tests_list:
        plt.subplot(4,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='Dark2_r');
        # plt.legend('right') treated the string as a list of labels and ran before
        # anything was drawn; position the hue legend after plotting instead.
        plt.legend(title=i, loc='right')
        count = count+1

In [None]:
# Count plot of insulin status split by each admission/discharge descriptor (hue), 2x2 grid.
plt.figure(figsize=(26,20))
count =0
for i in patient_formalities_list:
    plt.subplot(2,2,count+1)
    plt.xticks(fontsize=15)
    plt.xlabel(i,fontsize=20)
    sns.countplot(x='insulin', hue=i,data=df_visual_diabetic, palette='Dark2');
    count = count+1

### DIAGNOSES VS REST

In [None]:
# Count plots of every test-result column (x axis) split by every diagnosis
# column (hue): 3 x 2 = 6 panels on a 3x2 grid.
plt.figure(figsize=(26,30))
count =0
for i in diagnosis_list:
    for j in primary_tests_list:
        plt.subplot(3,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='brg_r');
        # plt.legend('right') treated the string as a list of labels and ran before
        # anything was drawn; position the hue legend after plotting instead.
        plt.legend(title=i, loc='right')
        count = count+1

In [None]:
# Count plots of every medication flag (x axis) split by every diagnosis
# column (hue): 3 x 2 = 6 panels on a 3x2 grid.
plt.figure(figsize=(26,30))
count =0
for i in diagnosis_list:
    for j in medication_changes_list:
        plt.subplot(3,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='icefire');
        # plt.legend('right') treated the string as a list of labels and ran before
        # anything was drawn; position the hue legend after plotting instead.
        plt.legend(title=i, loc='right')
        count = count+1

In [None]:
# Count plot of insulin status split by each diagnosis column (hue), one panel per row.
plt.figure(figsize=(26,30))
count =0
for i in diagnosis_list:
    plt.subplot(3,1,count+1)
    plt.xticks(fontsize=15)
    plt.xlabel(i,fontsize=20)
    sns.countplot(x='insulin', hue=i,data=df_visual_diabetic, palette='Set1_r');
    count = count+1

### PRIMARY TESTS VS REST

In [None]:
# Count plots of every medication flag (x axis) split by every test-result
# column (hue): 2 x 2 = 4 panels on a 2x2 grid.
plt.figure(figsize=(26,20))
count =0
for i in primary_tests_list:
    for j in medication_changes_list:
        plt.subplot(2,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        sns.countplot(x=j, hue=i,data=df_visual_diabetic, palette='nipy_spectral_r');
        # plt.legend('right') treated the string as a list of labels and ran before
        # anything was drawn; position the hue legend after plotting instead.
        plt.legend(title=i, loc='right')
        count = count+1

In [None]:
# Count plot of insulin status split by each test-result column (hue), one panel per row.
plt.figure(figsize=(26,20))
count =0
for i in primary_tests_list:
    plt.subplot(2,1,count+1)
    plt.xticks(fontsize=15)
    plt.xlabel(i,fontsize=20)
    sns.countplot(x='insulin', hue=i,data=df_visual_diabetic, palette='CMRmap_r');
    count = count+1

In [None]:
# reduced level of insulin for older people should be taken into consideration..doctors have to study..

In [None]:
# patient_demographics_list,hospital_formalities , drugs,diagnosis_list,primary_tests_list,medication_changes_list

In [None]:
# Have all the diabetic patients provided with insulin?

In [None]:
# comparison between patients who were not administered any drug and their diagnosis

### NUMERICAL FEATURES VS CATEGORICAL FEATURES

### patient_demographics_list vs Number_of_hospital_formalities 

In [None]:
# Box plots of every per-encounter numeric feature (y) per demographic category (x):
# 3 x 5 = 15 panels on an 8x2 grid.
plt.figure(figsize=(26,40))
count =0
for i in patient_demographics_list:
    for j in number_of_hospital_formalities:
        plt.subplot(8,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        # No legend call: these box plots have no hue, and plt.legend('right')
        # only passed the string as labels for an empty legend.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='nipy_spectral')
        count += 1

In [None]:
# Box plots of every visit-count feature (y) per demographic category (x):
# 3 x 4 = 12 panels on a 4x3 grid.
plt.figure(figsize=(26,24))
count =0
for i in patient_demographics_list:
    for j in number_of_visits:
        plt.subplot(4,3,count+1)
        plt.xticks(rotation=90,fontsize=15)
        plt.xlabel(i,fontsize=20)
        # No legend call: these box plots have no hue, and plt.legend('right')
        # only passed the string as labels for an empty legend.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='nipy_spectral_r')
        count += 1

In [None]:
# Box plots of every per-encounter numeric feature (y) per admission/discharge
# category (x): 4 x 5 = 20 panels on a 10x2 grid.
plt.figure(figsize=(26,100))
count =0
for i in patient_formalities_list:
    for j in number_of_hospital_formalities:
        plt.subplot(10,2,count+1)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: these box plots have no hue, and plt.legend('right')
        # only passed the string as labels for an empty legend.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='plasma_r')
        count += 1

In [None]:
# Box plots of every visit-count feature (y) per admission/discharge category (x):
# 4 x 4 = 16 panels on an 8x2 grid.
# Bind this cell's own figure: the bare `fig` used before pointed at a figure
# created in an earlier cell (or was undefined on a fresh partial run).
fig = plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    for j in number_of_visits:
        plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: no hue here, so plt.legend('right') only made an empty legend.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='brg')
        count += 1
# Adjust spacing once all panels exist.
fig.tight_layout()

In [None]:
# Box plots of every per-encounter numeric feature (y) per diagnosis category (x):
# 3 x 5 = 15 panels on an 8x2 grid.
# Bind this cell's own figure: the bare `fig` used before pointed at a figure
# created in an earlier cell (or was undefined on a fresh partial run).
fig = plt.figure(figsize=(26,104))
count =0
for i in diagnosis_list:
    for j in number_of_hospital_formalities:
        plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: no hue here, so plt.legend('right') only made an empty legend.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='Set1')
        count += 1
# Adjust spacing once all panels exist.
fig.tight_layout()

In [None]:
# Box plots of every visit-count feature (y) per diagnosis category (x):
# 3 x 4 = 12 panels on a 4x3 grid.
# Bind this cell's own figure: the bare `fig` used before pointed at a figure
# created in an earlier cell (or was undefined on a fresh partial run).
fig = plt.figure(figsize=(26,27))
count =0
for i in diagnosis_list:
    for j in number_of_visits:
        plt.subplot(4,3,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # No legend call: no hue here, so plt.legend('right') only made an empty legend.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='plasma')
        count += 1
# Adjust spacing once all panels exist.
fig.tight_layout()

In [None]:
# One boxplot per (primary-test feature, hospital-formality count) pair.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,50))
count =0
for i in primary_tests_list:
    for j in number_of_hospital_formalities:
        plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='autumn')
        count += 1
fig.tight_layout()

In [None]:
# One boxplot per (primary-test feature, visit count) pair.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,20))
count =0
for i in primary_tests_list:
    for j in number_of_visits:
        plt.subplot(3,3,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='brg')
        count += 1
fig.tight_layout()

In [None]:
# One boxplot per (medication-change feature, hospital-formality count) pair.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,50))
count =0
for i in medication_changes_list:
    for j in number_of_hospital_formalities:
        plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='icefire')
        count += 1
fig.tight_layout()

In [None]:
# One boxplot per (medication-change feature, visit count) pair.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,20))
count =0
for i in medication_changes_list:
    for j in number_of_visits:
        plt.subplot(3,3,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='CMRmap')
        count += 1
fig.tight_layout()

In [None]:
# One boxplot of each hospital-formality count, grouped by insulin status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,30))
count =0
for i in ['insulin']:
    for j in number_of_hospital_formalities:
        plt.subplot(3,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='gnuplot')
        count += 1
fig.tight_layout()

In [None]:
# One boxplot of each visit count, grouped by insulin status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,10))
count =0
for i in ['insulin']:
    for j in number_of_visits:
        plt.subplot(2,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='Set2')
        count += 1
fig.tight_layout()

# MULTIVARIATE ANALYSIS

In [None]:
# Demographic feature vs hospital-formality count, split by readmission status.
plt.figure(figsize=(26,40))
count =0
for i in patient_demographics_list:
    for j in number_of_hospital_formalities:
        plt.subplot(8,2,count+1)
        plt.xticks(fontsize=15)
        plt.xlabel(i,fontsize=20)
        # The former plt.legend('right') was dropped: the string was taken as a
        # list of labels, not a location; seaborn adds the hue legend itself.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),hue='readmitted',palette='nipy_spectral')
        count += 1

In [None]:
# Demographic feature vs visit count, split by readmission status.
plt.figure(figsize=(26,24))
count =0
for i in patient_demographics_list:
    for j in number_of_visits:
        plt.subplot(5,3,count+1)
        plt.xticks(rotation=90,fontsize=15)
        plt.xlabel(i,fontsize=20)
        # The former plt.legend('right') was dropped: the string was taken as a
        # list of labels, not a location; seaborn adds the hue legend itself.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='brg_r',hue='readmitted')
        count += 1

In [None]:
# Patient-formality feature vs hospital-formality count, split by readmission status.
plt.figure(figsize=(26,100))
count =0
for i in patient_formalities_list:
    for j in number_of_hospital_formalities:
        plt.subplot(10,2,count+1)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        # The former plt.legend('right') was dropped: the string was taken as a
        # list of labels, not a location; seaborn adds the hue legend itself.
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='plasma_r',hue='readmitted')
        count += 1

In [None]:
# Patient-formality feature vs visit count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,40))
count =0
for i in patient_formalities_list:
    for j in number_of_visits:
        plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='gnuplot',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Diagnosis feature vs hospital-formality count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,104))
count =0
for i in diagnosis_list:
    for j in number_of_hospital_formalities:
        plt.subplot(8,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='CMRmap',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Diagnosis feature vs visit count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,54))
count =0
for i in diagnosis_list:
    for j in number_of_visits:
        plt.subplot(6,2,count+1,)
        plt.xticks(rotation=90,fontsize=9,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='plasma',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Primary-test feature vs hospital-formality count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,50))
count =0
for i in primary_tests_list:
    for j in number_of_hospital_formalities:
        plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='Reds',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Primary-test feature vs visit count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,40))
count =0
for i in primary_tests_list:
    for j in number_of_visits:
        plt.subplot(4,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='icefire',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Medication-change feature vs hospital-formality count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,50))
count =0
for i in medication_changes_list:
    for j in number_of_hospital_formalities:
        plt.subplot(5,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='autumn_r',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Medication-change feature vs visit count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,28))
count =0
for i in medication_changes_list:
    for j in number_of_visits:
        plt.subplot(4,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='CMRmap_r',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Insulin status vs hospital-formality count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,30))
count =0
for i in ['insulin']:
    for j in number_of_hospital_formalities:
        plt.subplot(3,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='gnuplot_r',hue='readmitted')
        count += 1
fig.tight_layout()

In [None]:
# Insulin status vs visit count, split by readmission status.
# `fig` is bound here so tight_layout() acts on THIS figure (it was previously
# an unbound/stale name) and is called after the subplots exist.
fig = plt.figure(figsize=(26,20))
count =0
for i in ['insulin']:
    for j in number_of_visits:
        plt.subplot(2,2,count+1,)
        plt.xticks(rotation=90,fontsize=13,fontweight='heavy')
        plt.xlabel(i,fontsize=20)
        sns.boxplot(x=i, y=j, data=df_visual_diabetic.sort_values(i),palette='icefire_r',hue='readmitted')
        count += 1
fig.tight_layout()

### BUSINESS INTERPRETATION AND INSIGHTS

### INSIGHTS THAT ARE FOUND USING PREVIOUS ANALYSIS

### FINDING THE CAUSE FOR EVENTS.. 

In [None]:
# Rows where all three diagnosis columns are missing (the original tested diag_1 three times).
df_visual_diabetic[df_visual_diabetic[['diag_1','diag_2','diag_3']].isnull().all(axis=1)]

In [None]:
# Understanding Expired Patients wrt their disorders

In [None]:
df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Expired']['readmitted'].value_counts()

In [None]:
df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Hospice']['readmitted'].value_counts()

In [None]:
diag_tab_expired = pd.DataFrame(df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Expired']['diag_1'].value_counts())

In [None]:
diag_tab_expired['diag_2'] = df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Expired']['diag_2'].value_counts()

In [None]:
diag_tab_expired['diag_3'] = df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Expired']['diag_3'].value_counts()

In [None]:
diag_tab_expired.plot(kind='bar',figsize=(26,10));

In [None]:
# From the above graph we could say that many patients who expired had Circulatory disorder...But the ratio of circulatory
# patients is also considerably high

In [None]:
# Percentage of encounters discharged as 'Expired', per diagnosis category,
# computed separately for diag_1 / diag_2 / diag_3. Keys are always the
# categories seen in diag_1 so the three dicts line up for the table built next.
fatality_percent_diag_1 = {}
fatality_percent_diag_2 = {}
fatality_percent_diag_3 = {}
expired_mask = df_visual_diabetic['discharge_disposition_id']=='Expired'
diag_1_categories = df_visual_diabetic['diag_1'].dropna().unique()
for diag_col, percent_by_cat in (('diag_1', fatality_percent_diag_1),
                                 ('diag_2', fatality_percent_diag_2),
                                 ('diag_3', fatality_percent_diag_3)):
    # Count once per column instead of re-filtering the whole frame per category.
    total_counts = df_visual_diabetic[diag_col].value_counts()
    expired_counts = df_visual_diabetic.loc[expired_mask, diag_col].value_counts()
    for cat in diag_1_categories:
        total = total_counts.get(cat, 0)
        # A category absent from this column has no denominator: record NaN
        # rather than raising ZeroDivisionError.
        percent_by_cat[cat] = 100*expired_counts.get(cat, 0)/total if total else np.nan
    

In [None]:
fatality_percent = pd.DataFrame()
for key in fatality_percent_diag_1:
    fatality_percent[key]=[fatality_percent_diag_1[key],fatality_percent_diag_2[key],fatality_percent_diag_3[key]]
    

In [None]:
fatality_percent = fatality_percent.T

In [None]:
fatality_percent.columns=['diag_1','diag_2','diag_3']

In [None]:
fatality_percent.plot(kind='bar',figsize=(26,10));

In [None]:
# the above plot shows the rate of expiration in terms of disease..

In [None]:
# Understanding transferred patients wrt their disorders

In [None]:
diag_tab_transferred = pd.DataFrame(df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Transferred/Referred']['diag_1'].value_counts())

In [None]:
diag_tab_transferred['diag_2'] = df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Transferred/Referred']['diag_2'].value_counts()

In [None]:
diag_tab_transferred['diag_3'] = df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Transferred/Referred']['diag_3'].value_counts()

In [None]:
diag_tab_transferred.plot(kind='bar',figsize=(26,10));

In [None]:
# Many patients who were transferred had a circulatory disorder and other disorders with diabetes..

In [None]:
# Percentage of encounters discharged as 'Transferred/Referred', per diagnosis
# category, computed separately for diag_1 / diag_2 / diag_3. Keys are always
# the categories seen in diag_1 so the three dicts line up for the table built next.
transfer_percent_diag_1 = {}
transfer_percent_diag_2 = {}
transfer_percent_diag_3 = {}
transferred_mask = df_visual_diabetic['discharge_disposition_id']=='Transferred/Referred'
diag_1_categories = df_visual_diabetic['diag_1'].dropna().unique()
for diag_col, percent_by_cat in (('diag_1', transfer_percent_diag_1),
                                 ('diag_2', transfer_percent_diag_2),
                                 ('diag_3', transfer_percent_diag_3)):
    # Count once per column instead of re-filtering the whole frame per category.
    total_counts = df_visual_diabetic[diag_col].value_counts()
    transferred_counts = df_visual_diabetic.loc[transferred_mask, diag_col].value_counts()
    for cat in diag_1_categories:
        total = total_counts.get(cat, 0)
        # A category absent from this column has no denominator: record NaN
        # rather than raising ZeroDivisionError.
        percent_by_cat[cat] = 100*transferred_counts.get(cat, 0)/total if total else np.nan
    

In [None]:
transfer_percent = pd.DataFrame()
for key in transfer_percent_diag_1:
    transfer_percent[key]=[transfer_percent_diag_1[key],transfer_percent_diag_2[key],transfer_percent_diag_3[key]]
    

In [None]:
transfer_percent = transfer_percent.T

In [None]:
transfer_percent.columns=['diag_1','diag_2','diag_3']

In [None]:
transfer_percent.plot(kind='bar',figsize=(26,10));

In [None]:
# the rate of transfer of patients who were injured and patients who had musculoskeletal disorders is high(all of them have diabetes)..

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['age'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['age'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# many children under the age of 10 suffer from diabetes and a steady dose of insulin was provided to many of them..

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['age'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['diag_1'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['insulin'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['insulin'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diag_2'],df_visual_diabetic['insulin'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['patient_frequency_categorized'],margins=True)

In [None]:
# although few patients visited the hospital more than 10 times they were not administered with insulin..

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['diag_1'],margins=True)

In [None]:
# the age group of 1-20 are least prone to all of these diseases except diabetes..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['diag_1'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# At a young age patients are more prone to diabetes..As age increases the diabetic patients are also prone to circulatory disorders..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['diag_1'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# Until the age of 50 patients are more prone to diabetes..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['discharge_disposition_id'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['discharge_disposition_id'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# Many of the inpatients are of the age 10-20 and 60-70 ..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['discharge_disposition_id'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# as Age increases the transfer rate of the patients is also increasing..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['patient_frequency_categorized'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['readmitted'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# add this without inference..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# Patients who are in the age 20-50 tend to visit the hospital more often..

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['patient_frequency_categorized'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# Many patients in all age group visit the hospital only once..

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['discharge_disposition_id'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['discharge_disposition_id'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['discharge_disposition_id'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['patient_frequency_categorized'],margins=True)

In [None]:
# circulatory respiratory and patients with other disorders tend to visit the hospital more number of times..

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# 

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['patient_frequency_categorized'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# patients with respiratory and other illness tend to visit the hospital often..

In [None]:
pd.crosstab(df_visual_diabetic['discharge_disposition_id'],df_visual_diabetic['patient_frequency_categorized'],margins=True)

In [None]:
# add anyways..

In [None]:
pd.crosstab(df_visual_diabetic['discharge_disposition_id'],df_visual_diabetic['patient_frequency_categorized'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['discharge_disposition_id'],df_visual_diabetic['patient_frequency_categorized'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['readmitted'],margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diag_1'],df_visual_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['discharge_disposition_id'],df_visual_diabetic['readmitted'],margins=True)

In [None]:
# add anyways..

In [None]:
pd.crosstab(df_visual_diabetic['discharge_disposition_id'],df_visual_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['discharge_disposition_id'],df_visual_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['patient_frequency_categorized'],df_visual_diabetic['readmitted'],margins=True)

In [None]:
# Nearly 4200 patients were readmitted in the first instance itself..

In [None]:
pd.crosstab(df_visual_diabetic['patient_frequency_categorized'],df_visual_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
# patients who visit the hospital more than once are more likely to be readmitted..

In [None]:
pd.crosstab(df_visual_diabetic['patient_frequency_categorized'],df_visual_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['readmitted'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['insulin'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['insulin'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
# diabetesMed,change,age,insulin,A1Cresult,max_glu_serum

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['readmitted'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['age'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['insulin'],df_visual_diabetic['age'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diabetesMed'],df_visual_diabetic['age'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['diabetesMed'],df_visual_diabetic['age'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['diabetesMed'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['diabetesMed'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['A1Cresult'],normalize='columns',margins=True)

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['A1Cresult'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['age'],df_visual_diabetic['A1Cresult'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['A1Cresult'],df_visual_diabetic['age'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['A1Cresult'],df_visual_diabetic['age'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['max_glu_serum'],df_visual_diabetic['age'],normalize='columns').plot(kind='bar',figsize=(26,10));

In [None]:
pd.crosstab(df_visual_diabetic['max_glu_serum'],df_visual_diabetic['age'],normalize='index').plot(kind='bar',figsize=(26,10));

In [None]:
df_visual_diabetic.drop(columns=['patient_frequency_categorized'],inplace=True)

In [None]:
df_visual_diabetic.drop(columns=['patient_nbr'],inplace=True)

In [None]:
index_to_be_dropped = df_visual_diabetic[df_visual_diabetic['discharge_disposition_id']=='Expired'].index

In [None]:
df_visual_diabetic.drop(index_to_be_dropped,inplace=True)

In [None]:
df_visual_diabetic.to_csv('Diabetic_data_for_stats.csv')

# Statistical Test

In [None]:
df_stat_diabetic = pd.read_csv('Diabetic_data_for_stats.csv',index_col=0)


df_stat_diabetic.head()

In [None]:
df_stat_diabetic.isnull().sum()

In [None]:
# mode imputation for statistical analysis

In [None]:
for col in ['race','diag_2','diag_3']:
    df_stat_diabetic[col] = df_stat_diabetic[col].fillna(df_stat_diabetic[col].mode()[0])

In [None]:
df_stat_diabetic.isnull().sum()

In [None]:
df_stat_diabetic.shape

In [None]:
# Flag quasi-constant features: columns whose most frequent value covers more
# than 99.8% of all rows.
quasi_constant_feat = []

for feature in df_stat_diabetic.columns:

    # value_counts() is sorted by descending frequency, so the first entry is the
    # dominant value. Dividing by the full row count (np.float was removed from
    # NumPy) keeps missing values in the denominator, as before.
    value_counts = df_stat_diabetic[feature].value_counts()
    # An all-missing column has no dominant value; treat its share as 0.
    predominant = value_counts.iloc[0] / len(df_stat_diabetic) if not value_counts.empty else 0.0

    if predominant > 0.998:

        quasi_constant_feat.append(feature)

len(quasi_constant_feat)

In [None]:
quasi_constant_feat

### CHI-SQUARED TEST FOR INDEPENDENCE

In [None]:
categorical_columns = df_stat_diabetic.select_dtypes(include='object').columns.tolist()

In [None]:
# define a function that displays the contingency table, chi-square value and p-value per variable
def chisquare_test(df, var_list, target, null_list=None):
    """Run a chi-squared test of independence between each variable and ``target``.

    For every column in ``var_list`` the observed contingency table, the
    expected table, the chi-square statistic and the p-value are shown.

    Parameters
    ----------
    df : DataFrame holding the variables and the target column.
    var_list : iterable of column names to test against ``target``.
    target : name of the target column.
    null_list : optional list; variables with p-value > 0.05 (fail to reject the
        null hypothesis of independence) are appended to it in place. A new
        list is created per call when omitted, so results never leak between calls.

    Returns
    -------
    list
        ``null_list`` (the list passed in, or the newly created one).
    """
    if null_list is None:
        null_list = []
    for var in var_list:
        print('\n\n',var.upper())
        chi_test = pd.crosstab(df[var], df[target])
        display(chi_test)
        
        chisq_value, pvalue, dof, expected = chi2_contingency(chi_test)
        print('---'*10,'\nExpected Chi table ')
        display(pd.DataFrame(expected,index=chi_test.index,columns=chi_test.columns))
        print(f"""Chi-square value: {chisq_value:.2f}
p-value\t\t: {pvalue:.3f}         for      {var.upper()}\n""")
        print('===='*30)
        
        if pvalue > 0.05: # adds variables that fail to reject the null hypothesis
            null_list.append(var)
            
    print(f'Failed to Reject null hypothesis: {null_list}')
    return null_list

In [None]:
cols_cat = df_stat_diabetic.select_dtypes(exclude=np.number).columns.to_list()
chi_squared_failed_features=[]
chisquare_test(df_stat_diabetic, cols_cat,'readmitted',chi_squared_failed_features)

In [None]:
chi_squared_failed_features

### ONE-WAY ANOVA

In [None]:
# The numerical variables 
numerical=df_stat_diabetic.select_dtypes(include=np.number).columns.to_list()
print(numerical)

In [None]:
# define a function that prints normality, variance and group-difference tests per numeric variable
def anova_table(var_list,target_column, failed_list=None,test_list=None):
    """Print Shapiro, Levene, one-way ANOVA and Kruskal-Wallis results per variable.

    Each variable in ``var_list`` is split into one sample per distinct value of
    ``df_stat_diabetic[target_column]`` (module-level frame); the tests are then
    run on exactly those samples.

    Parameters
    ----------
    var_list : iterable of numeric column names of ``df_stat_diabetic``.
    target_column : name of the grouping column.
    failed_list : optional list; variables whose Kruskal-Wallis p-value exceeds
        0.05 are appended to it. A new list is created when omitted.
    test_list : optional list; the per-group samples of every variable are
        appended to it for inspection by the caller.

    Returns
    -------
    list
        ``failed_list`` (the list passed in, or the newly created one).
    """
    if failed_list is None:
        failed_list = []
    for var in var_list:
        print('\n\n')
        print(var.upper())
        print('=='*20)
        # Fresh list per variable: previously one list accumulated across
        # variables, so later tests were contaminated by earlier variables' samples.
        groups = []
        for cat in df_stat_diabetic[target_column].unique():
            print(str(cat).upper())
            data = df_stat_diabetic[df_stat_diabetic[target_column]==cat][var]
            shapiro_stats = stats.shapiro(data)
            print(shapiro_stats)
            if shapiro_stats[1]<0.05:
                print(f'\n-----------------Shapiro test for {var} and {cat} has failed--------------------\n')
                
            print('--'*44,'\n')
            groups.append(data)

        if test_list is not None:
            test_list.extend(groups)
            
        print(f'\nlevene  test for {var} \n')
        print(stats.levene(*groups))
        print('--'*20,'\n\n')   
        print(f'\nanova  test for {var} \n')
        print(stats.f_oneway(*groups))
        print('--'*20,'\n\n')
        print(f'\nkruskal  test for {var}\n')
        kruskal_result = stats.kruskal(*groups)
        print(kruskal_result)
        # Kruskal-Wallis does not assume normality, so it decides significance here.
        if kruskal_result[1] > 0.05:
            failed_list.append(var)
        print('--'*20,'\n\n')
        print('=='*50)  
        print('\n\n')
    return failed_list
        

In [None]:
anova_table(numerical,'readmitted')

In [None]:
# All the numerical features are significant.

In [None]:
insignificant_features = list(set(quasi_constant_feat).union(set(chi_squared_failed_features)))

In [None]:
insignificant_features

In [None]:
df_stat_diabetic.drop(columns=insignificant_features,inplace=True)

In [None]:
df_stat_diabetic.shape

In [None]:
df_stat_diabetic.to_csv('Diabetic_data_for_encoding.csv')

## Encoding Ordinal Variables

In [None]:
df_encoding_diabetic = pd.read_csv('Diabetic_data_for_encoding.csv',index_col=0)


df_encoding_diabetic.head()

In [None]:
df_encoding_diabetic['age']=df_encoding_diabetic['age'].apply(lambda x: x[1]).astype(int)

In [None]:
df_encoding_diabetic['max_glu_serum']=df_encoding_diabetic['max_glu_serum'].replace({'None':0,'Norm':1,'>200':200,'>300':300}).astype(int)

In [None]:
df_encoding_diabetic['A1Cresult']=df_encoding_diabetic['A1Cresult'].replace({'None':0,'Norm':1,'>8':8,'>7':7}).astype(int)

In [None]:
# Encode each drug column's dosage change as a number.
drug_dose_codes = {'No':0,'Steady':0.5,'Down':-1,'Up':1}
for col in drugs_list:
    if col in df_encoding_diabetic.columns:
        # Assign the result back: in-place replace on a selected column is a chained
        # mutation that may not propagate to the frame (pandas copy-on-write).
        df_encoding_diabetic[col] = df_encoding_diabetic[col].replace(drug_dose_codes)

In [None]:
df_encoding_diabetic.head()

## MultiCollinearity

In [None]:
plt.figure(figsize=(26,13))
sns.heatmap(df_encoding_diabetic.corr(),annot=True)
plt.xticks(rotation=90)
plt.show()

### One Hot Encoding of Categorical Data

In [None]:
df_diabetic_for_ml=pd.get_dummies(df_encoding_diabetic.drop(columns=['readmitted']),drop_first=True)
df_diabetic_for_ml['readmitted'] = df_encoding_diabetic['readmitted']

In [None]:
df_diabetic_for_ml.to_csv('Diabetic_data_for_ml.csv')

## TRAIN TEST SPLIT and SCALING

In [None]:
df_diabetic_for_ml = pd.read_csv('Diabetic_data_for_ml.csv',index_col=0)
df_diabetic_for_ml.head()

In [None]:
X=df_diabetic_for_ml.drop(columns=['readmitted'])
y=df_diabetic_for_ml['readmitted']

In [None]:
# Stratified 70/30 split. `stratify` must use `y` from the previous cell;
# `Y` is only defined later in the notebook and raised NameError on a fresh run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)
# Fit the scaler on the training split only, then apply it to the test split.
sc =  StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

# MODEL BUILDING

## Logistic Regression:

In [None]:
cv_acc_train = {}
cv_acc_test = {}
cv_TPR = {}
cv_FPR = {}
cv_f1_train = {}
cv_f1_test = {}
cv_acc = {}

In [None]:
def plot_result(model, name:str):
    """Fit ``model``, store its scores under ``name`` and show test-set diagnostics.

    Uses the notebook-level ``X_train``/``X_test``/``y_train``/``y_test`` and
    writes into the notebook-level dicts ``cv_acc_train``, ``cv_acc_test``,
    ``cv_TPR``, ``cv_FPR``, ``cv_f1_train``, ``cv_f1_test`` and ``cv_acc``.

    Parameters
    ----------
    model : scikit-learn style classifier; fitted in place on ``X_train``.
    name : key under which the scores are recorded.

    Prints the 5-fold scores and the classification report, then plots the
    confusion matrix on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # 5-fold cross-validated scores, computed separately within each split.
    scores_train = cross_val_score(model, X_train, y_train, cv=5, scoring = 'balanced_accuracy')
    scores_test = cross_val_score(model, X_test, y_test, cv=5, scoring = 'balanced_accuracy')  
    scores_train_f1 = cross_val_score(model, X_train, y_train, cv=5, scoring = 'f1_micro')
    scores_test_f1 = cross_val_score(model, X_test, y_test, cv=5, scoring = 'f1_micro')  
    cv_acc_train[name] = round(scores_train.mean(), 4)*100 
    cv_acc_test[name] = round(scores_test.mean(), 4)*100

    # Confusion matrix computed once and reused for both rates.
    # NOTE(review): 'readmitted' appears to have three classes (the ROC helper is
    # called with n_classes=3); if so these are per-class rates for the classes at
    # index 1 and 0 rather than overall TPR/FPR - confirm the intended metric.
    cm = confusion_matrix(y_test, y_pred)
    cv_TPR[name] = (cm[1][1]/cm[1].sum())*100 
    cv_FPR[name] = (cm[0][1]/cm[0].sum())*100 
    cv_f1_train[name] = round(scores_train_f1.mean(), 4)*100 
    cv_f1_test[name] = round(scores_test_f1.mean(), 4)*100
    cv_acc[name] = accuracy_score(y_test,y_pred)
    print('Average Balanced Accuracy (CV=5), Test Set:', scores_test.mean())  
    print('Average Balanced Accuracy (CV=5), Training Set: ', scores_train.mean())
    print('Average Micro f1 (CV=5), Test Set:', scores_test_f1.mean())  
    print('Average Micro f1 (CV=5), Training Set: ', scores_train_f1.mean())

    
    print(classification_report(y_test, y_pred, zero_division=0))

    
    plot_confusion_matrix(model, X_test, y_test)
    plt.show()

In [None]:
lgc = LogisticRegression()
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
abc = AdaBoostClassifier()
gbc = GradientBoostingClassifier()

In [None]:
plot_result(lgc, "LogisticRegression_base")

In [None]:
plot_result(dtc, "DecisionTreeClassifier_base")

In [None]:
plot_result(rfc, "RandomForestClassifier_base")

In [None]:
plot_result(abc, "AdaBoostClassifier_base")

In [None]:
plot_result(gbc, "GradientBoostingClassifier_base")

In [None]:
def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(5,5)):
    """Plot one-vs-rest ROC curves, one per class, for a fitted classifier.

    Parameters
    ----------
    clf : fitted classifier
        Must expose ``decision_function`` or, failing that, ``predict_proba``.
    X_test : array-like
        Features to score.
    y_test : array-like
        True labels; one-hot encoded with ``pd.get_dummies`` so that column
        ``i`` of the indicators is compared with column ``i`` of the scores.
    n_classes : int
        Number of curves to draw (score/indicator columns ``0 .. n_classes-1``).
    figsize : tuple, default (5, 5)
        Size passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Not every estimator has decision_function (tree ensembles, for example),
    # so fall back to class probabilities, which rank samples equally well.
    if hasattr(clf, "decision_function"):
        y_score = np.asarray(clf.decision_function(X_test))
    else:
        y_score = np.asarray(clf.predict_proba(X_test))

    # A binary decision_function yields one score per sample (for the second
    # class); expand it to one column per class so the indexing below works.
    if y_score.ndim == 1:
        y_score = np.column_stack([-y_score, y_score])

    # One-hot encode the true labels once, then score each class against the rest.
    y_test_dummies = pd.get_dummies(y_test, drop_first=False).values
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_dummies[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    for i in range(n_classes):
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine()
    plt.show()

In [None]:
# ROC curves for the baseline logistic regression (already fitted in place by
# its plot_result call); n_classes=3 draws three one-vs-rest curves.
plot_multiclass_roc(lgc, X_test, y_test, n_classes=3, figsize=(16, 10))

# Feature Selection Using Embedded methods

### Feature Selection using Logistic Regression

In [None]:
# Embedded selection: fit an L1-penalised logistic regression and let
# SelectFromModel keep the features whose coefficients pass its default threshold.
# saga is a stochastic solver, so it is seeded (same seed as the stacking
# logistic model) to keep the selected feature set reproducible across runs.
sel2=SelectFromModel(LogisticRegression(penalty='l1',C=0.015,solver='saga',random_state=10))
sel2.fit(X_train,y_train)

In [None]:
# Number of candidate features before selection.
len(X_train.columns)

In [None]:
# Names of the columns the selector kept (get_support() is a boolean mask
# over X_train's columns).
selected_feat = X_train.columns[(sel2.get_support())]

In [None]:
# Number of features that survived the selection.
len(selected_feat)

In [None]:
# Restrict the modelling frame to the selected columns.
# Presumably feature columns only; the target is read from
# df_diabetic_for_ml separately further down.
df_embedded=df_diabetic_for_ml[selected_feat]

In [None]:
df_embedded.head()

In [None]:
# Feature matrix = selected columns; target = the 'readmitted' column.
X=df_embedded
Y=df_diabetic_for_ml['readmitted']

In [None]:
# Stratified 70/30 split with a fixed seed; this rebinds the notebook-level
# X_train/X_test/y_train/y_test that plot_result reads.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,stratify=Y, random_state=42)
sc =  StandardScaler()
# The scaler is fitted on the training rows only and reused for the test rows.
# Re-wrapping the scaled values in new DataFrames gives X_* a fresh default
# index, while y_* keep the row labels they had in the source frame.
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Logistic regression with default settings, trained on the selected,
# standardised features.
log_reg_drop=LogisticRegression()
log_reg_drop.fit(X_train,y_train)

In [None]:
# Train-vs-test accuracy side by side, as a quick over-fitting check.
print ('Accuracy Score of Train data set for Logistice Regression is',accuracy_score(y_train,log_reg_drop.predict(X_train)))
print ('Accuracy Score of Test data set for Logistice Regression is',accuracy_score(y_test,log_reg_drop.predict(X_test)))

In [None]:
# plot_result fits log_reg_drop again on the same split and records its
# scores under the key 'LassoLogistic'.
plot_result(log_reg_drop,'LassoLogistic')

# MODEL BUILDING : ITERATION 4 (FEATURE SELECTION USING WRAPPER METHODS)

<a id="rfe"></a>
# 5. Recursive Feature Elimination (RFE)

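Recursive feature elimination returns the significant features in the dataset by repeatedly fitting a model, removing the least significant feature subset, and refitting on the remaining features until the requested number of features is left.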

In [None]:
# Reload the modelling table from disk for the wrapper-method iteration.
df = pd.read_csv('Diabetic_data_for_ml.csv',index_col=0)

df.head()

In [None]:
# Features: every column except the target.
X = df.drop(columns=['readmitted'])

In [None]:
# Target: the 'readmitted' column.
y = df['readmitted']

In [None]:
# Stratified 70/30 split with a fixed seed; rebinds the notebook-level split
# that plot_result reads.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)

In [None]:
sc =  StandardScaler()

In [None]:
# Fit the scaler on the training rows only; the new DataFrame gets a fresh
# default index.
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)

In [None]:
# Apply the training-set scaling to the test rows.
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
# Distinct target labels present in the training split.
y_train.unique()

In [None]:
# Recursive feature elimination driven by random-forest importances.
# With n_features_to_select=None scikit-learn keeps half of the features.
# The forest is seeded so the elimination path, and therefore the selected
# feature set, is the same on every run.
rfc_hybrid = RandomForestClassifier(random_state=42)

rfe_model = RFE(estimator=rfc_hybrid, n_features_to_select=None, verbose=2)
rfe_model = rfe_model.fit(X_train, y_train)

# ranking_ is 1 for every retained feature; higher ranks were eliminated.
feat_index = pd.Series(data=rfe_model.ranking_, index=X_train.columns)
signi_feat_rfe = feat_index[feat_index == 1].index

print(signi_feat_rfe)

In [None]:
# Select from `df`, the frame this section loaded and ran RFE on, instead of
# the older `df_diabetic_for_ml` global: the selected names are guaranteed to
# be columns of `df`, and the section no longer depends on earlier state.
df_hybrid = df[list(signi_feat_rfe)+['readmitted']]

In [None]:
# Features: the RFE-selected columns (target removed).
X = df_hybrid.drop(columns=['readmitted'])

In [None]:
y = df_hybrid['readmitted']

In [None]:
# Same stratified 70/30 split and seed as before, now on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)

In [None]:
sc =  StandardScaler()

In [None]:
# Scaler fitted on the training rows only.
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)

In [None]:
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Fresh forest to be evaluated on the RFE-selected features. Note that this
# rebinds `rfe_model`, which previously held the fitted RFE selector.
# Seeded so the recorded scores are reproducible.
rfe_model = RandomForestClassifier(random_state=42)

In [None]:
# Fit and score the forest on the reduced feature set; results are stored
# under the key 'Random_Forest_RFE'.
plot_result(rfe_model,'Random_Forest_RFE')

# RFE GRADIENT BOOSTING

In [None]:
# Reload the modelling table from disk for the gradient-boosting RFE run.
df = pd.read_csv('Diabetic_data_for_ml.csv',index_col=0)

df.head()

In [None]:
# Features: every column except the target.
X = df.drop(columns=['readmitted'])

In [None]:
# Target: the 'readmitted' column.
y = df['readmitted']

In [None]:
# Stratified 70/30 split with a fixed seed; rebinds the notebook-level split
# that plot_result reads.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)

In [None]:
sc =  StandardScaler()

In [None]:
# Fit the scaler on the training rows only; the new DataFrame gets a fresh
# default index.
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)

In [None]:
# Apply the training-set scaling to the test rows.
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
# Distinct target labels present in the training split.
y_train.unique()

In [None]:
# Recursive feature elimination driven by gradient-boosting importances
# (the variable keeps the name `rfc_hybrid` although it now holds a
# GradientBoostingClassifier).
# With n_features_to_select=None scikit-learn keeps half of the features.
# The estimator is seeded so the selected feature set is the same on every run.
rfc_hybrid = GradientBoostingClassifier(random_state=42)

rfe_model = RFE(estimator=rfc_hybrid, n_features_to_select=None, verbose=2)
rfe_model = rfe_model.fit(X_train, y_train)

# ranking_ is 1 for every retained feature; higher ranks were eliminated.
feat_index = pd.Series(data=rfe_model.ranking_, index=X_train.columns)
signi_feat_rfe = feat_index[feat_index == 1].index

print(signi_feat_rfe)

In [None]:
# Select from `df`, the frame this section loaded and ran RFE on, instead of
# the older `df_diabetic_for_ml` global: the selected names are guaranteed to
# be columns of `df`, and the section no longer depends on earlier state.
df_hybrid = df[list(signi_feat_rfe)+['readmitted']]

In [None]:
# Features: the RFE-selected columns (target removed).
X = df_hybrid.drop(columns=['readmitted'])

In [None]:
y = df_hybrid['readmitted']

In [None]:
# Same stratified 70/30 split and seed as before, now on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,stratify=y, random_state=42)

In [None]:
sc =  StandardScaler()

In [None]:
# Scaler fitted on the training rows only.
X_train = pd.DataFrame(sc.fit_transform(X_train),columns=X_train.columns)

In [None]:
X_test = pd.DataFrame(sc.transform(X_test),columns=X_train.columns)

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Fresh gradient-boosting model to be evaluated on the RFE-selected features;
# seeded so the recorded scores are reproducible.
model = GradientBoostingClassifier(random_state=42)

In [None]:
# Fit and score the model on the reduced feature set; results are stored
# under the key 'Gradient_Boosting_RFE'.
plot_result(model,'Gradient_Boosting_RFE')

In [None]:
# FINAL STACKING MODEL

In [None]:
# Level-0 learners of the stacked ensemble; their predictions feed the
# Gaussian naive Bayes meta-learner below.
base_learners = [
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is
    # deprecated and rejected by newer scikit-learn releases.
    ('rf_model', RandomForestClassifier(criterion='entropy', max_features='sqrt',
                                        n_estimators=150, random_state=10)),

    ('logistic_model', LogisticRegression(penalty='l1', class_weight='balanced', solver='saga',
                                          C=0.015, random_state=10)),

    # Seeded like the other two base learners so the whole stack is reproducible.
    ('xgb_moel', XGBClassifier(base_score=0.5, booster='gbtree', verbosity=None, random_state=10)),
]

stack_model = StackingClassifier(estimators=base_learners, final_estimator=GaussianNB())

In [None]:
# plot_result uses whatever X_train/X_test are current; at this point that is
# the split built from the gradient-boosting RFE feature subset above.
plot_result(stack_model,'Final_Stacking_Classifier')

In [None]:
# Collect, for every evaluated model, its metrics in the fixed order
# accuracy, balanced test/train accuracy, micro-F1 test/train, FPR, TPR.
d = {
    key: [
        store[key]
        for store in (cv_acc, cv_acc_test, cv_acc_train, cv_f1_test, cv_f1_train, cv_FPR, cv_TPR)
    ]
    for key in cv_acc_test
}

In [None]:
# One row per model, one column per metric (same order as the lists in `d`).
model_performance_df = pd.DataFrame.from_dict(
    d,
    orient='index',
    columns=[
        'Accuracy',
        'Balanced_Test_Accuracy',
        'Balanced_Train_Accuracy',
        'Micro_f1_Test',
        'Micro_f1_Train',
        'False_Positive_Rate',
        'True_Positive_Rate',
    ],
)

In [None]:
# Display the comparison table: one row per evaluated model.
model_performance_df