In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

In [None]:
# Read the Pima Indians diabetes dataset into a DataFrame and preview the first rows.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(csv_path)

df.head()


In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Store the target as object dtype for the exploratory cells that follow
# (presumably so it is handled as a label rather than a numeric feature — TODO confirm).
# It is cast back to int before modelling.
df['Outcome'] = df['Outcome'].astype('object')

In [None]:
# Confirm the dtype change of the Outcome column.
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
# Number of rows per Outcome class.
df.Outcome.value_counts()

In [None]:
def dist(i):
    """Draw the histogram of the i-th column of the global `df`, split by Outcome.

    The plot goes into slot ``i + 1`` of a 4x2 subplot grid on the current
    matplotlib figure, using 17 bins and a KDE overlay.
    """
    column_name = df.columns[i]
    plt.subplot(4, 2, i + 1)
    sns.histplot(data=df, x=column_name, hue=df.Outcome, bins=17, kde=True)

# One histogram per column except the last one, all on a single large figure.
plt.figure(figsize=(20, 20), dpi=300)
n_plots = len(df.columns) - 1
for i in range(n_plots):
    dist(i)

In [None]:
# Pairwise scatter plots of the columns, coloured by Outcome.
sns.pairplot(df,hue='Outcome')

In [None]:
# Annotated heatmap of the pairwise correlations returned by df.corr().
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Bar chart of the Outcome class counts.
df.Outcome.value_counts().plot(kind='bar',figsize=(10,8)) 

In [None]:
# Replace 0 with NaN in these measurement columns so that the zeros are counted
# (and later imputed) as missing values.
# np.nan is used instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)
df.isnull().sum()

In [None]:
# Fill the NaNs created by the zero -> NaN step.
# Results are assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: that form acts on an intermediate object and
# is deprecated chained assignment in recent pandas releases.
df['Glucose'] = df['Glucose'].fillna(df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].fillna(df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].fillna(df['SkinThickness'].median())

# Insulin: draw one observed value *per missing row*. The previous call to
# np.random.choice without `size` returned a single scalar, so every missing
# row received the same (unseeded) random value. A seeded generator keeps the
# notebook reproducible under Restart & Run All.
insulin_missing = df['Insulin'].isna()
insulin_observed = df.loc[~insulin_missing, 'Insulin'].to_numpy()
insulin_rng = np.random.default_rng(7)
df.loc[insulin_missing, 'Insulin'] = insulin_rng.choice(
    insulin_observed, size=int(insulin_missing.sum())
)

df['BMI'] = df['BMI'].fillna(df['BMI'].mean())

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isnull().sum()

In [None]:
def dist_box(df,col):
    """Show the distribution of ``df[col]`` as a histogram+KDE above a box plot.

    A new two-row figure is created on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    col : str
        Name of the column to plot.
    """
    fig,(ax1,ax2)=plt.subplots(2,1)
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is the replacement for its default histogram + KDE output.
    sns.histplot(x=df[col], kde=True, stat='density', ax=ax1)
    # Pass the values as x= explicitly: the meaning of the first positional
    # argument of boxplot differs between seaborn versions.
    sns.boxplot(x=df[col], ax=ax2)

# Distribution + box plot for every column except the last one.
# dist_box opens its own figure through plt.subplots, so no outer plt.figure()
# is created here — it would never be drawn on and only rendered as a blank figure.
for col in df.columns[:-1]:
    dist_box(df, col)

In [None]:
# List the rows whose Insulin lies more than 3 standard deviations from the mean.
insulin_mean = df['Insulin'].mean()
insulin_std = df['Insulin'].std()
u = insulin_mean + (3 * insulin_std)
l = insulin_mean - (3 * insulin_std)

insulin_is_outlier = (df['Insulin'] > u) | (df['Insulin'] < l)
df_out_in = df[insulin_is_outlier]
print("Number of Outliers:", len(df_out_in))
df_out_in

In [None]:
# Overwrite Insulin values of 415 or more with the column mode.
# NOTE(review): 415 is a hard-coded cutoff, not derived from the 3-sigma bounds.
insulin_mode = df['Insulin'].mode()[0]
insulin_too_high = df['Insulin'] >= 415
df['Insulin'] = np.where(insulin_too_high, insulin_mode, df['Insulin'])

In [None]:
# List the rows whose BloodPressure lies more than 3 standard deviations from the mean.
bp_mean = df['BloodPressure'].mean()
bp_std = df['BloodPressure'].std()
u = bp_mean + (3 * bp_std)
l = bp_mean - (3 * bp_std)

bp_is_outlier = (df['BloodPressure'] > u) | (df['BloodPressure'] < l)
df_out_bp = df[bp_is_outlier]

print("Number of Outliers:", len(df_out_bp))
df_out_bp

In [None]:
# Overwrite BloodPressure values >= 110 or <= 30 with the column mode.
# NOTE(review): both cutoffs are hard-coded, not derived from the 3-sigma bounds.
bp_mode = df['BloodPressure'].mode()[0]
bp_out_of_range = (df['BloodPressure'] >= 110) | (df['BloodPressure'] <= 30)
df['BloodPressure'] = np.where(bp_out_of_range, bp_mode, df['BloodPressure'])

In [None]:
# List the rows whose SkinThickness lies more than 3 standard deviations from the mean.
skin_mean = df['SkinThickness'].mean()
skin_std = df['SkinThickness'].std()
u = skin_mean + (3 * skin_std)
l = skin_mean - (3 * skin_std)

skin_is_outlier = (df['SkinThickness'] > u) | (df['SkinThickness'] < l)
df_out_st = df[skin_is_outlier]

print("Number of Outliers:", len(df_out_st))
df_out_st

In [None]:
# Overwrite SkinThickness values of 56 or more with the column mode.
# NOTE(review): 56 is a hard-coded cutoff, not derived from the 3-sigma bounds.
skin_mode = df['SkinThickness'].mode()[0]
skin_too_high = df['SkinThickness'] >= 56
df['SkinThickness'] = np.where(skin_too_high, skin_mode, df['SkinThickness'])

In [None]:
# List the rows whose Pregnancies lies more than 3 standard deviations from the mean.
preg_mean = df['Pregnancies'].mean()
preg_std = df['Pregnancies'].std()
u = preg_mean + (3 * preg_std)
l = preg_mean - (3 * preg_std)

preg_is_outlier = (df['Pregnancies'] > u) | (df['Pregnancies'] < l)
df_out_pr = df[preg_is_outlier]

print("Number of Outliers:", len(df_out_pr))
df_out_pr

In [None]:
# Overwrite Pregnancies values above 13 with the column mode.
# NOTE(review): 13 is a hard-coded cutoff, not derived from the 3-sigma bounds.
preg_mode = df['Pregnancies'].mode()[0]
preg_too_high = df['Pregnancies'] > 13
df['Pregnancies'] = np.where(preg_too_high, preg_mode, df['Pregnancies'])

In [None]:
# Separate the target column (Outcome) from the predictor columns.
df_target = df['Outcome']
df_features = df.drop(columns='Outcome')

In [None]:
# Cast the target back to an integer dtype (it was stored as object earlier in the notebook).
df_target = df_target.astype('int')

In [None]:
# Keep only the numeric predictor columns and show which ones remain.
numeric_kinds = [np.number]
df_features = df_features.select_dtypes(include=numeric_kinds)
df_features.columns

In [None]:
# Standardise every feature and wrap the result in a DataFrame with the original column names.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split.
sc = StandardScaler()
sc.fit(df_features)
num_scaled = sc.transform(df_features)
X = pd.DataFrame(data=num_scaled, columns=df_features.columns)
X.head()


In [None]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Fitting StandardScaler on the full dataset before splitting lets the
# test rows influence the mean/std used to transform the training data
# (data leakage) and makes the test metrics optimistic.
# test_size and random_state are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df_features, df_target, test_size=0.3, random_state=7
)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
def get_test_report(model):
    """Return the classification report of `model` on the global test split.

    The model predicts on ``X_test`` and the predictions are compared with
    ``y_test`` by ``classification_report``.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)
    return report

In [None]:
def plot_confusion_matrix(model):
    """Show the confusion matrix of `model` on the global test split as a heatmap.

    Rows are labelled with the actual class and columns with the predicted
    class; each cell is annotated with its integer count.
    """
    predicted = model.predict(X_test)
    counts = confusion_matrix(y_test, predicted)
    labelled_counts = pd.DataFrame(
        data=counts,
        index=['Actual:0', 'Actual:1'],
        columns=['Predicted:0', 'Predicted:1'],
    )
    sns.heatmap(
        labelled_counts,
        annot=True,
        fmt='d',
        cbar=False,
        linewidths=0.1,
        annot_kws={'size': 25},
    )
    # Same tick font size on both axes.
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=20)
    plt.show()


In [None]:
def plot_ROC(model):
    """Plot the ROC curve of `model` on the global test split, annotated with the AUC.

    `model` must provide ``predict_proba``; the second column of its output is
    used as the score passed to ``roc_curve`` and ``roc_auc_score``.
    """
    y_pred_prob = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    auc_score = round(roc_auc_score(y_test, y_pred_prob), 4)

    plt.plot(fpr, tpr)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1],'r--')
    plt.title('ROC curve for diabetes Prediction Classifier', fontsize = 15)
    plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
    plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
    # Pass a real string: a tuple here is rendered as its repr,
    # e.g. "('AUC Score:', 0.83)", on the plot.
    plt.text(x = 0.02, y = 0.9, s = f'AUC Score: {auc_score}')
    plt.grid(True)
    plt.show()

### Logistic Regression

In [None]:
# Logistic regression with default hyper-parameters, fitted on the training split.
lgst = LogisticRegression()
lgst_model = lgst.fit(X=X_train, y=y_train)

In [None]:
# Classification report of the logistic regression model on the test split.
test_report=get_test_report(lgst_model)
print(test_report)

In [None]:
# Confusion matrix of the logistic regression model on the test split.
plot_confusion_matrix(lgst_model)

In [None]:
# ROC curve of the logistic regression model on the test split.
plot_ROC(lgst_model)

### RandomForestClassifier

In [None]:
# Random forest with 150 trees and the Gini split criterion.
# random_state is fixed so the fitted forest (and every metric derived from it)
# is reproducible across runs, as in the AdaBoost / GradientBoosting cells.
rfc = RandomForestClassifier(n_estimators=150, criterion='gini', random_state=7)
rfc_model = rfc.fit(X_train,y_train)

In [None]:
# Classification report of the random forest model on the test split.
rfc_report = get_test_report(rfc_model)
print(rfc_report)

In [None]:
# Confusion matrix of the random forest model on the test split.
plot_confusion_matrix(rfc_model)

In [None]:
# ROC curve of the random forest model on the test split.
plot_ROC(rfc_model)

### DecisionTree Classifier

In [None]:
# Decision tree with default hyper-parameters.
# random_state is fixed so the fitted tree is reproducible across runs,
# as in the AdaBoost / GradientBoosting cells.
dtc = DecisionTreeClassifier(random_state=7)
dtc_model = dtc.fit(X_train,y_train)

In [None]:
# Classification report of the decision tree model on the test split.
dtc_report = get_test_report(dtc_model)
print(dtc_report)

In [None]:
# Confusion matrix of the decision tree model on the test split.
plot_confusion_matrix(dtc_model)

In [None]:
# ROC curve of the decision tree model on the test split.
plot_ROC(dtc_model)

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# k-nearest-neighbours classifier using 28 neighbours, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=28)
knn_model = knn.fit(X=X_train, y=y_train)

In [None]:
# Classification report of the KNN model on the test split.
knn_report=get_test_report(knn_model)
print(knn_report)

In [None]:
# Confusion matrix of the KNN model on the test split.
plot_confusion_matrix(knn_model)

In [None]:
# ROC curve of the KNN model on the test split.
plot_ROC(knn_model)

### Gaussian NB

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
gnb = GaussianNB()
gnb_model = gnb.fit(X=X_train, y=y_train)

In [None]:
# Classification report of the Gaussian naive Bayes model on the test split.
gnb_report = get_test_report(gnb_model)
print(gnb_report)

In [None]:
# Confusion matrix of the Gaussian naive Bayes model on the test split.
plot_confusion_matrix(gnb_model)

In [None]:
# ROC curve of the Gaussian naive Bayes model on the test split.
plot_ROC(gnb_model)

### AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

In [None]:
# AdaBoost with 100 estimators and a fixed seed, fitted on the training split.
abc_params = {'n_estimators': 100, 'random_state': 47}
abc = AdaBoostClassifier(**abc_params)
abc_model = abc.fit(X=X_train, y=y_train)

In [None]:
# Classification report of the AdaBoost model on the test split.
abc_report = get_test_report(abc_model)
print(abc_report)

In [None]:
# Confusion matrix of the AdaBoost model on the test split.
plot_confusion_matrix(abc_model)

In [None]:
# ROC curve of the AdaBoost model on the test split.
plot_ROC(abc_model)

### GradientBoostingClassifier

In [None]:
# Gradient boosting: 150 estimators, maximum tree depth 10, fixed seed.
gbc_params = {'n_estimators': 150, 'max_depth': 10, 'random_state': 10}
gbc = GradientBoostingClassifier(**gbc_params)
gbc_model = gbc.fit(X=X_train, y=y_train)

In [None]:
# Classification report of the gradient boosting model on the test split.
gbc_report = get_test_report(gbc_model)
print(gbc_report)

In [None]:
# Confusion matrix of the gradient boosting model on the test split.
plot_confusion_matrix(gbc_model)

In [None]:
# ROC curve of the gradient boosting model on the test split.
plot_ROC(gbc_model)

### XGB Classifier

In [None]:
# XGBoost classifier with max_depth 10 and gamma 1, fitted on the training split.
xgb_params = {'max_depth': 10, 'gamma': 1}
xgb = XGBClassifier(**xgb_params)
xgb_model = xgb.fit(X_train, y_train)

In [None]:
# Classification report of the XGBoost model on the test split.
xgb_report=get_test_report(xgb_model)
print(xgb_report)

In [None]:
# Confusion matrix of the XGBoost model on the test split.
plot_confusion_matrix(xgb_model)

In [None]:
# ROC curve of the XGBoost model on the test split.
plot_ROC(xgb_model)