## Problem Statement:
### Predicting effective treatments for diabetes, in turn reducing readmission to the hospital

In [None]:
from IPython.display import Image
Image("../input/infographic/INFOGRAPHIC.jpg")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
data = pd.read_csv('../input/diabetes/diabetic_data.csv')
data.shape

# Data Preparation

In [None]:
data.columns

In [None]:
data.info()

In [None]:
data.isnull().values.any()

In [None]:
data.race.value_counts().plot(kind = 'bar' )

In [None]:
data.payer_code.value_counts().plot(kind = 'bar' )

In [None]:
data.medical_specialty.value_counts()

In [None]:
data.max_glu_serum.value_counts().plot(kind = 'bar' )

In [None]:
data.A1Cresult.value_counts().plot(kind = 'bar' )

In [None]:
data.change.value_counts().plot(kind = 'bar' )

In [None]:
data.diabetesMed.value_counts().plot(kind = 'bar' )

In [None]:
data.readmitted.value_counts().plot(kind = 'bar' )

In [None]:
data.age.value_counts().plot(kind = 'bar')

## Filtering patients with Diabetes
### diabetesMed = Yes

In [None]:
data=data[data.diabetesMed=='Yes']
data.shape

## Filtering patients who were not readmitted
### readmitted = NO

In [None]:
data=data[data.readmitted=='NO']
data.shape

## Excluding patients who died or were discharged to hospice

In [None]:
data=data[~data.discharge_disposition_id.isin([11,13,14,19,20])]
data.shape

# Handling Missing Values

### We can observe that payer code, medical specialty and weight have more than 50% missing data, so we prefer to drop those features.

In [None]:
data = data.drop(['medical_specialty','payer_code','weight'],axis=1)

**We can observe that the "Race" Feature has some missing values**

**Missing value Imputation using MODE for Race Feature as most of the people in the Dataset are Caucasian**

##### 1. Replacing the ? with NaN's

In [None]:
data['race']=data.race.replace('?',np.nan)


##### 2. Filling the NaN's with the mode

In [None]:
# Fill missing race values with the most frequent category.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is a chained in-place operation that pandas (with
# Copy-on-Write) does not propagate back to `data`.
data['race'] = data['race'].fillna(data['race'].mode()[0])


In [None]:
data.race.isnull().sum()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
# Copy the encounter id plus the 23 drug columns into a separate frame so the
# drug values can be recoded without modifying `data`.
treatments = data[['encounter_id','metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']].copy()

In [None]:
treatments.head()

# Feature Engineering

 ### Custom encoding for the 23 Drug Features


In [None]:
# Binary-encode each drug column ('No' -> 0; 'Steady' / 'Up' / 'Down' -> 1)
# and key the frame by encounter id.
treatments = (
    treatments
    .replace(['No', 'Steady', 'Up', 'Down'], [0, 1, 1, 1])
    .set_index('encounter_id')
)


In [None]:
treatments.head()

In [None]:
treatments.sum(axis=1).value_counts()

# Patients are given a combination of at most 6 drugs for treating diabetes

### Feature Engineering - Creating a new feature "Treatments"

**1. When the value of Insulin is '1', create the classes "insulin" and "io" (insulin + others)**

In [None]:
# Encounters where insulin was prescribed, labelled by the total number of
# drugs given: exactly one drug means insulin alone, anything more is insulin
# plus other drugs ('io'). Testing "== 1" instead of replacing a hard-coded
# list of counts keeps every row labelled whatever the maximum count is.
insulin_drug_counts = treatments[treatments['insulin'] == 1].sum(axis=1)
i1 = insulin_drug_counts.map(lambda n_drugs: 'insulin' if n_drugs == 1 else 'io')

In [None]:
i1.value_counts()

**2. When the value of Insulin is '0', create the classes "other" and "no med"**

In [None]:
# Encounters without insulin, labelled by the total number of drugs given:
# zero drugs -> 'no med', one or more -> 'other'. Testing "== 0" instead of
# replacing a hard-coded list of counts keeps every row labelled whatever the
# maximum count is.
no_insulin_drug_counts = treatments[treatments['insulin'] == 0].sum(axis=1)
i0 = no_insulin_drug_counts.map(lambda n_drugs: 'no med' if n_drugs == 0 else 'other')

In [None]:
i0.value_counts()

In [None]:
# Stack both label series (indexed by encounter id) into a one-column frame.
treatments = pd.concat([i1, i0]).to_frame(name='treatments')

In [None]:
treatments.head()

**Adding the new feature to the Actual Dataframe**

In [None]:
# Attach the treatment label to every row: the 'encounter_id' column of `data`
# is matched against the index of `treatments` (set to encounter_id earlier).
data=data.join(treatments,on='encounter_id')

In [None]:
data.head()

## Since the treatments column was created from the 23 Drugs, We will be removing them

In [None]:
# The 23 individual drug columns are now summarised by 'treatments'.
drug_columns = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
]
data = data.drop(columns=drug_columns)

## Choosing the records with treatments Insulin and Insulin + other ( w.r.t Problem Statement)

In [None]:
# Keep only the two target classes. Selecting them explicitly (rather than
# excluding 'other') also removes any 'no med' rows, which the later binary
# encoding of 'treatments' and the two-class tree plot do not account for.
data = data[data.treatments.isin(['insulin', 'io'])]
data.shape

In [None]:
data.columns

# The numeric features here are discrete quantitative variables with a finite set of values (discrete data can be both quantitative and qualitative), so outlier treatment is not applicable to this dataset

**One hot encoding the nominal categorical values**

In [None]:
# One-hot encode the nominal columns.
# NOTE: 'diabetesMed' and 'readmitted' were each filtered down to a single
# value earlier ('Yes' / 'NO'), so each yields one constant indicator column.
data = pd.get_dummies(data, columns=['race', 'gender','max_glu_serum', 'A1Cresult', 'change',
       'diabetesMed', 'readmitted'])

In [None]:
data.head()

**Encoding the AGE (ordinal) categorical column**

In [None]:
data.age.value_counts()

In [None]:
# Map each age bracket (in sorted category order) to its rank 1..n.
labels = data['age'].astype('category').cat.categories.tolist()
replace_age = {'age': dict(zip(labels, range(1, len(labels) + 1)))}

print(replace_age)

In [None]:
data.replace(replace_age, inplace=True)

In [None]:
data.age.value_counts()

# Exploratory Data Analysis

### UNI VARIATE ANALYSIS

In [None]:
data.num_lab_procedures.plot(kind='hist')

In [None]:
import seaborn as sns
sns.distplot(data.time_in_hospital)

In [None]:
import matplotlib.pyplot as plt
# Frequency of each encoded age group
age_count = data['age'].value_counts()
sns.set(style="darkgrid")
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally, while the keyword form works on older releases too.
sns.barplot(x=age_count.index, y=age_count.values, alpha=0.9)
plt.title('Frequency Distribution of age')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Age', fontsize=12)
plt.show()

In [None]:
# Pie chart of the share of encounters in each encoded age group
labels = data['age'].astype('category').cat.categories.tolist()
counts = data['age'].value_counts()
sizes = counts.loc[labels].tolist()  # counts re-ordered to match `labels`
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)  # autopct prints each slice's percentage
ax1.axis('equal')
plt.show()

# Feature Identification

In [None]:
data.columns

### Considering the domain knowledge, we would like to drop the columns "diag_1", "diag_2" and "diag_3"

##### These columns contain codes for the different types of treatments given to the patient, so they don't contribute to the effectiveness of the treatment (i.e., our problem statement)

In [None]:
data = data.drop(['diag_1','diag_2','diag_3'],axis = 1)

## With respect to the problem statement given, the output variable is observed to be the “treatments” feature
## The input variables are both Discrete Quantitative and Categorical and our output variable is Categorical


## Since we have a combination of Discrete Quantitative Variables and Categorical Variables, we cannot perform general Correlation tests

In [None]:
from IPython.display import Image
Image("../input/correlation/Picture1.png")

### We will perform the Chi-Square Test of Independence to find the association between the variables

# Chi-Square Test of Independence

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    After each call to ``TestIndependence`` the instance holds that test's
    results: ``p`` (p-value), ``chi2`` (test statistic), ``dof`` (degrees of
    freedom) and the observed / expected contingency tables
    (``dfObserved`` / ``dfExpected``).
    """

    def __init__(self, dataframe):
        """Store the DataFrame to test; all result attributes start as None."""
        self.df = dataframe
        # Results of the most recent test
        self.p = None
        self.chi2 = None
        self.dof = None
        # Contingency tables of the most recent test
        self.dfObserved = None
        self.dfExpected = None

    def _print_chisquare_result(self, colX, alpha):
        """Print whether ``colX`` is judged important, i.e. ``self.p < alpha``."""
        if self.p < alpha:
            template = "{0} is IMPORTANT for Prediction"
        else:
            template = "{0} is NOT an important predictor. (Discard {0} from model)"
        print(template.format(colX))

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Test ``colX`` against ``colY`` and print the verdict for ``colX``.

        Both columns are cast to strings and cross-tabulated (``colY`` as
        rows, ``colX`` as columns); the resulting table is passed to
        ``scipy.stats.chi2_contingency``. Results are stored on the instance.

        Parameters
        ----------
        colX : column name of the candidate predictor.
        colY : column name of the target.
        alpha : significance level used for the printed verdict.
        """
        predictor = self.df[colX].astype(str)
        target = self.df[colY].astype(str)

        observed = pd.crosstab(target, predictor)
        self.dfObserved = observed

        statistic, p_value, degrees, expected = stats.chi2_contingency(observed.values)
        self.p, self.chi2, self.dof = p_value, statistic, degrees

        self.dfExpected = pd.DataFrame(
            expected, columns=observed.columns, index=observed.index
        )

        self._print_chisquare_result(colX, alpha)

In [None]:
# Random 0/1 column with equal probabilities. A seeded generator is used so
# the column (and the chi-square verdict printed for it) is identical on every
# Restart & Run All.
dummy_rng = np.random.RandomState(42)
data['dummyCat'] = dummy_rng.choice([0, 1], size=(len(data),), p=[0.5, 0.5])

data.dummyCat.value_counts()

In [None]:
#Initialize ChiSquare Class
cT = ChiSquare(data)

#Feature Selection
# Every column listed below is tested against the 'treatments' target; each
# call prints one verdict line at the default alpha of 0.05.
# 'dummyCat' is the randomly generated column — presumably included as a
# sanity check that pure noise is reported as not important (confirm).
testColumns = ['encounter_id', 'patient_nbr', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient','number_diagnoses',
       'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic',
       'race_Other', 'gender_Female', 'gender_Male',
       'max_glu_serum_>200', 'max_glu_serum_>300', 'max_glu_serum_None',
       'max_glu_serum_Norm', 'A1Cresult_>7', 'A1Cresult_>8', 'A1Cresult_None',
       'A1Cresult_Norm', 'change_Ch', 'change_No', 'diabetesMed_Yes',
       'readmitted_NO', 'dummyCat']
for var in testColumns:
    cT.TestIndependence(colX=var,colY="treatments" )

# Model Building
## Train Test Split

Since our target variable is Categorical , We would be importing the required Classification model packages

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Feature matrix: everything except identifiers, the columns excluded after
# the chi-square step, the random control column and the target itself.
excluded_columns = [
    'encounter_id', 'patient_nbr', 'num_lab_procedures', 'number_outpatient',
    'number_emergency', 'race_Asian', 'race_Other', 'diabetesMed_Yes',
    'max_glu_serum_>200', 'A1Cresult_>8', 'A1Cresult_Norm', 'readmitted_NO',
    'dummyCat', 'treatments',
]
X = data.drop(columns=excluded_columns)
Y = data['treatments']
print(X.shape)
print(Y.shape)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Base Model

In [None]:
# Majority-class baseline: predict the most frequent label for every test row.
# The label is looked up once (the previous loop recomputed the mode for each
# row) and is taken from y_train so the baseline never reads the test labels.
majority_class = y_train.mode()[0]
y_p = [majority_class] * y_test.shape[0]
len(y_p)

In [None]:
y_pred=pd.Series(y_p)

In [None]:
print("Accuracy : ",accuracy_score(y_test,y_pred))

## Our Baseline accuracy is 54% 
#### We can set the accuracy as 54% and the models we build should be giving us accuracies greater than 54%

# Predictive Model Development - Iteration 1 

## Baseline Models - Logistic Regression 

In [None]:
# Logistic regression with scikit-learn's default settings
m1 = LogisticRegression().fit(X_train, y_train)
y_pred_lr = m1.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out split
Train_Score_lr = accuracy_score(y_train, m1.predict(X_train))
Test_Score_lr = accuracy_score(y_test, y_pred_lr)

print('Training Accuracy is:', Train_Score_lr)
print('Testing Accuracy is:', Test_Score_lr)
print(classification_report(y_test, y_pred_lr))

## KNN

In [None]:
# k-nearest neighbours with scikit-learn's default settings
m2 = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = m2.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out split
Train_Score_knn = accuracy_score(y_train, m2.predict(X_train))
Test_Score_knn = accuracy_score(y_test, y_pred_knn)

print('Training Accuracy is :', Train_Score_knn)
print('Testing Accuracy is:', Test_Score_knn)
print(classification_report(y_test, y_pred_knn))


## Bernoulli Naive Bayes

In [None]:
# Bernoulli naive Bayes with scikit-learn's default settings
m3 = BernoulliNB().fit(X_train, y_train)
y_pred_bnb = m3.predict(X_test)

# Accuracy on the data the model was fitted on vs. the held-out split
Train_Score_bnb = accuracy_score(y_train, m3.predict(X_train))
Test_Score_bnb = accuracy_score(y_test, y_pred_bnb)

print('Training Accuracy :', Train_Score_bnb)
print('Testing Accuracy  :', Test_Score_bnb)
print(classification_report(y_test, y_pred_bnb))

## Decision Trees

In [None]:
# Untuned decision tree. random_state is fixed (same seed as the tuned tree's
# grid search) so the reported accuracies are reproducible across runs.
m4 = DecisionTreeClassifier(random_state=100)
m4.fit(X_train,y_train)
y_pred_dt=m4.predict(X_test)
Train_Score_dt = m4.score(X_train,y_train)
Test_Score_dt = accuracy_score(y_test,y_pred_dt)

print('Training Accuracy :',Train_Score_dt)
print('Testing Accuracy :',Test_Score_dt)
print(classification_report(y_test,y_pred_dt))

## Random Forest

In [None]:
# Untuned random forest. random_state is fixed so the reported accuracies (and
# the grid search that later reuses this estimator) are reproducible.
m5 = RandomForestClassifier(random_state=100)
m5.fit(X_train,y_train)
y_pred_rf=m5.predict(X_test)
Train_Score_rf = m5.score(X_train,y_train)
Test_Score_rf = accuracy_score(y_test,y_pred_rf)

print('Training Accuracy :',Train_Score_rf)
print('Testing Accuracy :',Test_Score_rf)
print(classification_report(y_test,y_pred_rf))

# Predictive Model Development - Iteration 2

## Hyperparameter Tuning

### For Decision Tree

In [None]:
# GridSearchCV to find the best decision-tree hyperparameters
# (max_depth, min_samples_leaf, min_samples_split and split criterion).
# KFold is imported but not used in this cell.
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 3

# parameters to build the model on
# range(5, 15, 5) -> 5, 10 and range(50, 150, 50) -> 50, 100, so the grid has
# 2 * 2 * 2 * 2 = 16 candidate combinations.
parameters = {'max_depth': range(5, 15, 5),
    'min_samples_leaf': range(50, 150, 50),
    'min_samples_split': range(50, 150, 50),
    'criterion': ["entropy", "gini"]}

# instantiate the model
dtree = DecisionTreeClassifier(random_state = 100)

# fit tree on training data
# candidates are ranked by cross-validated accuracy
tree = GridSearchCV(dtree, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
tree.fit(X_train, y_train)

In [None]:
tree.best_params_

In [None]:
# Tuned decision tree. The hyperparameters are read from the grid search
# result instead of being hand-copied, so they cannot drift from what the
# search actually selected; random_state matches the searched estimator.
m6 = DecisionTreeClassifier(random_state=100, **tree.best_params_)
m6.fit(X_train,y_train)
y_pred_tdt=m6.predict(X_test)
Train_Score_tdt = m6.score(X_train,y_train)
Test_Score_tdt = accuracy_score(y_test,y_pred_tdt)

print('Training Accuracy :',Train_Score_tdt)
print('Testing Accuracy  :',Test_Score_tdt)
print(classification_report(y_test,y_pred_tdt))


### For KNN

In [None]:
# Grid search (3-fold CV) over k = 1..49 for the KNN model
grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, grid, cv=3).fit(X_train, y_train)

print("Tuned Hyperparameter k: {}".format(knn_cv.best_params_))

In [None]:
# Tuned KNN. k is read from the grid search result instead of being
# hand-copied, so it always matches what the search selected on this data.
m7 = KNeighborsClassifier(**knn_cv.best_params_)
m7.fit(X_train,y_train)
y_pred_tknn=m7.predict(X_test)
Train_Score_tknn = m7.score(X_train,y_train)
Test_Score_tknn = accuracy_score(y_test,y_pred_tknn)


print('Training Accuracy :',Train_Score_tknn)
print('Testing Accuracy  :',Test_Score_tknn)
print(classification_report(y_test,y_pred_tknn))

### For Random Forest 

In [None]:
# Grid search over the number of trees for the random forest `m5`.
# np.arange(1, 101) yields 100 candidate values; with cv=3 that is 300
# cross-validation fits, so this cell is expensive to re-run.
parameter={'n_estimators':np.arange(1,101)}
gs = GridSearchCV(m5,parameter,cv=3)
gs.fit(X_train,y_train)
gs.best_params_



In [None]:
# Tuned random forest. n_estimators is read from the grid search result
# instead of being hand-copied, and random_state is fixed so the reported
# accuracies are reproducible across runs.
m8 = RandomForestClassifier(random_state=100, **gs.best_params_)
m8.fit(X_train,y_train)
y_pred_trf=m8.predict(X_test)
Train_Score_trf = m8.score(X_train,y_train)
Test_Score_trf = accuracy_score(y_test,y_pred_trf)


print('Training Accuracy :',Train_Score_trf)
print('Testing Accuracy  :',Test_Score_trf)
print(classification_report(y_test,y_pred_trf))

# Predictive Model Development - Iteration 3 

## CatBoostClassifier

In [None]:
# Encode the target as 0 ('insulin') / 1 ('io').
# The result is assigned back to the column: an in-place replace on a column
# obtained through attribute access is a chained operation that pandas (with
# Copy-on-Write) does not propagate back to `data`.
data['treatments'] = data['treatments'].replace(['insulin', 'io'], [0, 1])

In [None]:
# Feature matrix (a) and target (b) for CatBoost.
# NOTE(review): only 'age' and the target are dropped here, so the identifier
# columns ('encounter_id', 'patient_nbr') and the random 'dummyCat' column
# remain as features, unlike the X used for the earlier models — confirm this
# is intended.
a = data.drop(['age','treatments'],axis=1)
b = data.treatments

In [None]:
# Positional indices of the columns of `a` whose dtype compares unequal to
# `int`; these are passed to CatBoost as the categorical features.
# NOTE(review): which dtypes compare unequal to `int` depends on platform and
# pandas version (e.g. one-hot columns stored as uint8 or bool) — confirm the
# resulting list is the intended set of categorical columns.
cate_features_index = np.where(a.dtypes != int)[0]


In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(a,b,train_size=.70,random_state=2)


In [None]:
from catboost import CatBoostClassifier, Pool,cv
#let us make the catboost model, use_best_model params will make the model prevent overfitting
model = CatBoostClassifier(eval_metric='Accuracy',use_best_model=True,random_seed=42)

In [None]:
# use_best_model=True selects the best iteration on the evaluation set, so that
# set must not be the test split: otherwise the model is chosen on the very
# data it is later scored on and the test accuracy is optimistically biased.
# A validation split is carved out of the training data instead.
xfit, xval, yfit, yval = train_test_split(xtrain, ytrain, test_size=0.2, random_state=2)
model.fit(xfit, yfit, cat_features=cate_features_index, eval_set=(xval, yval))

In [None]:
# Show the hold-out accuracies. Note this is a single train/test split, not a
# cross-validated score; prefer CV accuracy when evaluating the model.
# Each score is computed once and reused for both the printout and the summary
# table (previously every prediction/score was computed twice).
test_score_catboost = accuracy_score(ytest, model.predict(xtest))
train_score_catboost = model.score(xtrain, ytrain)
print('the test accuracy is :{:.6f}'.format(test_score_catboost))
print("the train accuracy is :", train_score_catboost)

In [None]:
model.predict(xtest)

In [None]:
# One row per model: (name, training accuracy, testing accuracy)
score_rows = [
    ('Logistic Regression', Train_Score_lr, Test_Score_lr),
    ('KNN', Train_Score_knn, Test_Score_knn),
    ('Bernauli Naives Bayes', Train_Score_bnb, Test_Score_bnb),
    ('Decision Tree', Train_Score_dt, Test_Score_dt),
    ('Random Forest', Train_Score_rf, Test_Score_rf),
    ('Tuned Decison Tree', Train_Score_tdt, Test_Score_tdt),
    ('Tuned KNN', Train_Score_tknn, Test_Score_tknn),
    ('Tuned Random Forest', Train_Score_trf, Test_Score_trf),
    ('Cat Boost', train_score_catboost, test_score_catboost),
]
Model_Scores = pd.DataFrame(
    score_rows, columns=['Models', 'Training Accuracy', 'Testing Accuracy']
)

# Best-performing model on the held-out data first
Model_Scores.sort_values(by='Testing Accuracy', ascending=False)

# Model Comparison

## We have seen the individual values of each of the base model. Now let's compare and see which model is performing well for the given problem statement.

## when you evaluate the model we trained we get high scores, this just means how well our model learnt from our training data.

## Testing accuracy is a better estimate than training accuracy of out-of-sample performance

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from IPython.display import display

from sklearn.tree import export_graphviz
import graphviz

In [None]:
# Render the tuned decision tree (m6) with Graphviz.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
# class_names are display labels given in the sorted order of the fitted
# classes ('insulin' before 'io').
dot_data = export_graphviz(
    m6,
    out_file=None,
    feature_names=list(X.columns),
    class_names=['insulin', 'Insulin+others'],
    filled=True,
    rounded=True,
    special_characters=True)
graph = graphviz.Source(dot_data)
graph