In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the diabetes CSV from the Kaggle input directory into the working frame.
DATA_CSV = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_CSV)

# Data Inspection

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
# Columns in which a value of 0 is not valid.
cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

In [None]:
# Blank out every 0 in the listed columns: keep the non-zero entries and let
# `where` fill the rest with its default replacement, NaN.
for column in cols:
    non_zero = df[column] != 0
    df[column] = df[column].where(non_zero)

In [None]:
df.describe().T

# Checking missing values

In [None]:
df.isna().sum().sort_values(ascending=False)

In [None]:
# Age-group-wise mean imputation (used for Insulin, SkinThickness, BloodPressure, BMI)
def compute_agewise(col, frame=None):
    """Fill the missing values of ``col`` with the mean of the row's age group.

    Rows are bucketed by ``Age`` into 21-35, 36-50, 51-70 and over 70. For each
    bucket, the mean of the non-missing ``col`` values is rounded to one decimal
    and written into the rows of that bucket where ``col`` is NaN.

    Parameters
    ----------
    col : str
        Name of the column to impute.
    frame : pandas.DataFrame, optional
        Frame to modify in place; it must contain ``Age`` and ``col``.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is updated in place.
    """
    target = df if frame is None else frame
    age = target['Age']

    age_groups = (
        (age > 20) & (age <= 35),   # age 21 to 35
        (age > 35) & (age <= 50),   # age 36 to 50
        (age > 50) & (age <= 70),   # age 51 to 70
        age > 70,                   # age greater than 70
    )

    for in_group in age_groups:
        group_mean = round(target.loc[in_group, col].mean(), 1)
        # A single .loc[rows, column] assignment writes into the frame itself.
        # The chained form `frame[col].loc[rows] = ...` assigns into an
        # intermediate Series and leaves the frame untouched under pandas
        # copy-on-write.
        target.loc[in_group & target[col].isna(), col] = group_mean
# Impute each of these columns in turn.
for column in ('Insulin', 'SkinThickness', 'BloodPressure', 'BMI'):
    compute_agewise(column)

In [None]:
# Replace missing Glucose values with the column mean, rounded to one decimal.
# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on the Series returned by `df.Glucose` is chained
# in-place mutation, which leaves `df` unchanged under pandas copy-on-write.
df['Glucose'] = df['Glucose'].fillna(round(df['Glucose'].mean(), 1))

In [None]:
df.isna().sum().sort_values(ascending=False)

# Checking Outliers

No outliers; all data points appear to fall within clusters.

In [None]:
# One box plot per column, each shown as its own figure.
for column in df:
    sns.boxplot(y=df[column])
    plt.show()

No outliers; all data points appear to fall within clusters.

# Model Building

In [None]:
# Feature matrix: every column except the Outcome label.
X = df.drop(columns='Outcome')
X

In [None]:
# Target vector: the Outcome column.
Y = df['Outcome']
Y

# Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing BEFORE any scaling statistics are
# computed. Fitting the scaler on the full data set would let the test rows
# influence the mean/std used to transform the training rows (data leakage).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=10)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X_train)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default settings, then predict
# labels for the held-out rows.
lr = LogisticRegression().fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,recall_score,roc_curve,roc_auc_score

# Confusion matrix and per-class report for the logistic-regression predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification Report")
print(classification_report(Y_test, Y_pred))

# Headline scores, rounded to two decimals; kept for the model comparison table.
lracc, lrrecall, lrprec = (
    round(score(Y_test, Y_pred), 2)
    for score in (accuracy_score, recall_score, precision_score)
)
print('Accuracy:', lracc, 'Recall:', lrrecall, 'Precision:', lrprec)

In [None]:
# Predicted probabilities on the test set; column 1 is used as the score for label 1.
lrprob = lr.predict_proba(X_test)
lr_fpr, lr_tpr, lr_thresh = roc_curve(Y_test, lrprob[:, 1], pos_label=1)

# Area under the ROC curve, rounded to two decimals.
lr_auc_score = round(roc_auc_score(Y_test, lrprob[:, 1]), 2)
print('AUC Score:', lr_auc_score)

# Reference curve (tpr = fpr), obtained by giving every sample the same score of 0.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs, pos_label=1)

In [None]:
# ROC curve of the logistic-regression model (orange) with the constant-score
# reference curve (blue).
plt.plot(lr_fpr, lr_tpr, label='Logistic Regression', color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree and predict labels for the held-out rows.
# random_state is pinned (same seed as the train/test split) so the tree, and
# therefore every metric reported below, is identical on each re-run.
dt = DecisionTreeClassifier(random_state=10)
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)

In [None]:
# Confusion matrix and per-class report for the decision-tree predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification Report")
print(classification_report(Y_test, Y_pred))

# Headline scores, rounded to two decimals; kept for the model comparison table.
dtacc, dtrecall, dtprec = (
    round(score(Y_test, Y_pred), 2)
    for score in (accuracy_score, recall_score, precision_score)
)
print('Accuracy:', dtacc, 'Recall:', dtrecall, 'Precision:', dtprec)

In [None]:
# Predicted probabilities on the test set; column 1 is used as the score for label 1.
dtprob = dt.predict_proba(X_test)
dt_fpr, dt_tpr, dt_thresh = roc_curve(Y_test, dtprob[:, 1], pos_label=1)

# Reference curve (tpr = fpr), obtained by giving every sample the same score of 0.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs, pos_label=1)

# Area under the ROC curve, rounded to two decimals.
dt_auc_score = round(roc_auc_score(Y_test, dtprob[:, 1]), 2)
print('AUC Score:', dt_auc_score)

# Decision-tree ROC curve (orange) with the reference curve (blue).
plt.plot(dt_fpr, dt_tpr, label='Decision Tree', color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.show()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest and predict labels for the held-out rows.
# random_state is pinned (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the reported metrics, are
# reproducible across runs.
rf = RandomForestClassifier(random_state=10)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

In [None]:
# Confusion matrix and per-class report for the random-forest predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification Report")
print(classification_report(Y_test, Y_pred))

# Headline scores, rounded to two decimals; kept for the model comparison table.
rfacc, rfrecall, rfprec = (
    round(score(Y_test, Y_pred), 2)
    for score in (accuracy_score, recall_score, precision_score)
)
print('Accuracy:', rfacc, 'Recall:', rfrecall, 'Precision:', rfprec)

In [None]:
# Predicted probabilities on the test set; column 1 is used as the score for label 1.
rfprob = rf.predict_proba(X_test)
rf_fpr, rf_tpr, rf_thresh = roc_curve(Y_test, rfprob[:, 1], pos_label=1)

# Reference curve (tpr = fpr), obtained by giving every sample the same score of 0.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs, pos_label=1)

# Area under the ROC curve, rounded to two decimals.
rf_auc_score = round(roc_auc_score(Y_test, rfprob[:, 1]), 2)
print('AUC Score:', rf_auc_score)

# Random-forest ROC curve (orange) with the reference curve (blue).
plt.plot(rf_fpr, rf_tpr, label='Random Forest', color='orange', linestyle='--')
plt.plot(p_fpr, p_tpr, color='blue', linestyle='--')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.show()

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a gradient-boosting classifier and predict labels for the held-out rows.
# random_state is pinned (same seed as the train/test split) so the fitted
# model, and therefore the reported metrics, are reproducible across runs.
gb = GradientBoostingClassifier(random_state=10)
gb.fit(X_train, Y_train)
Y_pred = gb.predict(X_test)

In [None]:
# Confusion matrix and per-class report for the gradient-boosting predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification Report")
print(classification_report(Y_test, Y_pred))

# Headline scores, rounded to two decimals; kept for the model comparison table.
gbacc, gbrecall, gbprec = (
    round(score(Y_test, Y_pred), 2)
    for score in (accuracy_score, recall_score, precision_score)
)
print('Accuracy:', gbacc, 'Recall:', gbrecall, 'Precision:', gbprec)

In [None]:
# Predicted probabilities on the test set; column 1 is used as the score for label 1.
gbprob = gb.predict_proba(X_test)
gb_fpr, gb_tpr, gb_thresh = roc_curve(Y_test, gbprob[:, 1], pos_label=1)

# Reference curve (tpr = fpr), obtained by giving every sample the same score of 0.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs, pos_label=1)

# Area under the ROC curve, rounded to two decimals.
gb_auc_score = round(roc_auc_score(Y_test, gbprob[:, 1]), 2)
print('AUC Score:', gb_auc_score)

# Plot the gradient-boosting curve itself (gb_fpr/gb_tpr). The cell previously
# drew the Random Forest arrays (rf_fpr/rf_tpr) under the "Gradient Boosting" label.
plt.plot(gb_fpr, gb_tpr, linestyle='--', color='orange', label='Gradient Boosting')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.show()

# XG Boost

In [None]:
from xgboost import XGBClassifier

# Train an XGBoost classifier with default settings, then predict labels for
# the held-out rows.
xg = XGBClassifier().fit(X_train, Y_train)
Y_pred = xg.predict(X_test)

In [None]:
# Confusion matrix and per-class report for the XGBoost predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification Report")
print(classification_report(Y_test, Y_pred))

# Headline scores, rounded to two decimals; kept for the model comparison table.
xgacc, xgrecall, xgprec = (
    round(score(Y_test, Y_pred), 2)
    for score in (accuracy_score, recall_score, precision_score)
)
print('Accuracy:', xgacc, 'Recall:', xgrecall, 'Precision:', xgprec)

In [None]:
# Predicted probabilities on the test set; column 1 is used as the score for label 1.
xgprob = xg.predict_proba(X_test)
xg_fpr, xg_tpr, xg_thresh = roc_curve(Y_test, xgprob[:, 1], pos_label=1)

# Reference curve (tpr = fpr), obtained by giving every sample the same score of 0.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs, pos_label=1)

# Area under the ROC curve, rounded to two decimals.
xg_auc_score = round(roc_auc_score(Y_test, xgprob[:, 1]), 2)
print('AUC Score:', xg_auc_score)

# Plot the XGBoost curve itself (xg_fpr/xg_tpr). The cell previously drew the
# Random Forest arrays (rf_fpr/rf_tpr) under the "XG Boost" label.
plt.plot(xg_fpr, xg_tpr, linestyle='--', color='orange', label='XG Boost')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.show()

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Sub-models of the ensemble. The decision tree and gradient-boosting models
# are seeded (same seed as the train/test split) so the ensemble's predictions
# and reported metrics are reproducible across runs.
model1 = DecisionTreeClassifier(random_state=10)
model2 = GradientBoostingClassifier(random_state=10)
model3 = XGBClassifier()
estimators = [('dt', model1), ('gb', model2), ('xgb', model3)]

# Soft voting: the ensemble is built with voting='soft' over the three sub-models.
ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, Y_train)
Y_pred = ensemble.predict(X_test)

In [None]:
# Confusion matrix and per-class report for the voting-classifier predictions.
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)
print("Classification Report")
print(classification_report(Y_test, Y_pred))

# Headline scores, rounded to two decimals; kept for the model comparison table.
vcacc, vcrecall, vcprec = (
    round(score(Y_test, Y_pred), 2)
    for score in (accuracy_score, recall_score, precision_score)
)
print('Accuracy:', vcacc, 'Recall:', vcrecall, 'Precision:', vcprec)

In [None]:
# Predicted probabilities on the test set; column 1 is used as the score for label 1.
vcprob = ensemble.predict_proba(X_test)
vc_fpr, vc_tpr, vc_thresh = roc_curve(Y_test, vcprob[:, 1], pos_label=1)

# Reference curve (tpr = fpr), obtained by giving every sample the same score of 0.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs, pos_label=1)

# Area under the ROC curve, rounded to two decimals.
vc_auc_score = round(roc_auc_score(Y_test, vcprob[:, 1]), 2)
print('AUC Score:', vc_auc_score)

# Plot the voting classifier's own curve (vc_fpr/vc_tpr). The cell previously
# drew the Random Forest arrays (rf_fpr/rf_tpr) under the "Voting Classifier" label.
plt.plot(vc_fpr, vc_tpr, linestyle='--', color='orange', label='Voting Classifier')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.show()

# Comparing Models

In [None]:
# Comparison table: one row per model, one column per metric.
result = pd.DataFrame(
    [
        [lracc, lrrecall, lrprec, lr_auc_score],
        [dtacc, dtrecall, dtprec, dt_auc_score],
        [rfacc, rfrecall, rfprec, rf_auc_score],
        [gbacc, gbrecall, gbprec, gb_auc_score],
        [xgacc, xgrecall, xgprec, xg_auc_score],
        [vcacc, vcrecall, vcprec, vc_auc_score],
    ],
    columns=['Accuracy', 'Recall', 'Precision', 'Auc'],
    index=['Logistic Regression', 'Decision Tree', 'Random Forest',
           'Gradient Boosting', 'XG Boost', 'Voting Classifier'],
)

In [None]:
result

## Accuracy

In [None]:
# Horizontal bars of accuracy per model, best model first.
accuracy_ranked = result.Accuracy.sort_values(ascending=False)
sns.barplot(y=accuracy_ranked.index, x=accuracy_ranked)

## Precision

In [None]:
# Horizontal bars of precision per model, best model first.
precision_ranked = result.Precision.sort_values(ascending=False)
sns.barplot(y=precision_ranked.index, x=precision_ranked)

## Recall

In [None]:
# Horizontal bars of recall per model, best model first.
recall_ranked = result.Recall.sort_values(ascending=False)
sns.barplot(y=recall_ranked.index, x=recall_ranked)

## AUC Score

In [None]:
# Horizontal bars of AUC per model, best model first.
auc_ranked = result.Auc.sort_values(ascending=False)
sns.barplot(y=auc_ranked.index, x=auc_ranked)

In [None]:
# Overlay the ROC curves of all six models on one set of axes.
# Each entry pairs a model's OWN fpr/tpr arrays with its line style, colour and
# legend label. The Gradient Boosting, XG Boost and Voting Classifier entries
# previously re-plotted the Random Forest arrays, so four curves coincided.
roc_curves = [
    (lr_fpr, lr_tpr, '--', 'blue', 'Logistic Regression'),
    (dt_fpr, dt_tpr, '--', 'brown', 'Decision Tree'),
    (rf_fpr, rf_tpr, '--', 'green', 'Random Forest'),
    (gb_fpr, gb_tpr, '-.', 'orange', 'Gradient Boosting'),
    (xg_fpr, xg_tpr, '--', 'purple', 'XG Boost'),
    (vc_fpr, vc_tpr, '--', 'red', 'Voting Classifier'),
]
for fpr, tpr, line_style, colour, model_name in roc_curves:
    plt.plot(fpr, tpr, linestyle=line_style, color=colour, label=model_name)

# Constant-score reference curve (no legend entry, as before).
plt.plot(p_fpr, p_tpr, linestyle='--', color='yellow')

plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()