In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import cufflinks as cf
import plotly.express as px
%matplotlib inline

In [None]:
# Load the diabetes CSV from the Kaggle input directory into the working frame.
csv_path = "../input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(csv_path)


In [None]:
# Preview the first five rows (five is head()'s default).
df.head()

In [None]:
# Print dtypes / non-null counts, then show the number of missing values per column.
df.info()
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame.
summary_stats = df.describe()
summary_stats

In [None]:
# Heatmap of the null mask: every missing cell would stand out against the rest.
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Set up Plotly's notebook mode and put cufflinks in offline mode before the
# `.iplot` calls in the following cells. The two calls are kept in this order.
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
# Interactive scatter of Age (y) against BloodPressure (x), markers only.
df.iplot(
    kind='scatter',
    mode='markers',
    x='BloodPressure',
    y='Age',
    xTitle='BP',
    yTitle='Age',
    size=5,
)

In [None]:
# estimator=np.std: each bar is the standard deviation (not the mean) of
# Glucose within an Outcome group.
sns.barplot(data=df, x='Outcome', y='Glucose', estimator=np.std)

In [None]:
# estimator=np.std: each bar is the standard deviation (not the mean) of
# Pregnancies within an Outcome group.
sns.barplot(data=df, x='Outcome', y='Pregnancies', estimator=np.std)

In [None]:
# Interactive box plot of the Glucose column.
glucose = df['Glucose']
glucose.iplot(kind='box')

In [None]:
# Interactive box plot built from the whole DataFrame rather than a single column.
df.iplot(kind='box')

In [None]:
# Interactive histogram of Age using 20 bins.
age = df['Age']
age.iplot(kind='hist', bins=20)

In [None]:
# Interactive histogram of Glucose using 20 bins.
glucose_values = df['Glucose']
glucose_values.iplot(kind='hist', bins=20)

In [None]:
# Joint plot of BMI (x axis) against Age (y axis).
sns.jointplot(data=df, x='BMI', y='Age')

In [None]:
# Donut chart of the class balance.
# `px.pie(labels=...)` expects a dict that renames columns, so the list passed
# originally was ignored and the slices were labelled 0 / 1. Map the Outcome
# codes to readable names on a copy (df itself is left untouched) and use
# that column for the slice names.
outcome_labelled = df.assign(Outcome=df['Outcome'].map({0: 'Healthy', 1: 'Diabetic'}))
px.pie(outcome_labelled, names='Outcome', hole=0.5)

In [None]:
# Annotated heatmap of the pairwise column correlations.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")
plt.show()

In [None]:
# Overlay the Age distributions of the two Outcome groups on one figure
# (green: Outcome == 0, grey: Outcome == 1).
plt.figure(figsize=(10, 10))
for outcome, colour in ((0, 'green'), (1, 'grey')):
    ages = df.loc[df['Outcome'] == outcome, 'Age']
    sns.distplot(ages, color=colour, kde=True)
plt.show()

In [None]:
def replace_outliers_with_median(frame, column, whisker=1.5):
    """Replace IQR outliers of ``frame[column]`` with the column median.

    A value counts as an outlier when it lies below ``Q1 - whisker * IQR`` or
    above ``Q3 + whisker * IQR``. Quartiles and median are computed once, on
    the column as it is before any value is replaced.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column is overwritten in place.
    column : str
        Name of a numeric column of ``frame``.
    whisker : float, default 1.5
        Multiple of the IQR used for the lower and upper fences.

    Returns
    -------
    int
        Number of values that were replaced.
    """
    values = frame[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - whisker * iqr, q3 + whisker * iqr
    is_outlier = (values < lower) | (values > upper)
    # One vectorised mask instead of a Python loop that called Series.replace
    # once per outlying row; mask() also upcasts an integer column when the
    # median is fractional.
    frame[column] = values.mask(is_outlier, np.median(values))
    return int(is_outlier.sum())


# Same columns, in the same order, as the original one-cell-per-column code.
OUTLIER_COLUMNS = ['Age', 'Pregnancies', 'Glucose', 'BloodPressure',
                   'BMI', 'Insulin', 'SkinThickness']
for column in OUTLIER_COLUMNS:
    replace_outliers_with_median(df, column)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score

# plot_roc_curve / plot_precision_recall_curve were removed in scikit-learn 1.2.
# Import them separately so that a newer scikit-learn does not fail the whole
# cell; on older versions both names are still made available as before.
try:
    from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve
except ImportError:
    pass

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
sfit = scaler.fit(df.drop('Outcome',axis=1))

In [None]:
StandardScaler(copy=True,with_mean=True,with_std=True)

In [None]:
scaled_features=scaler.transform(df.drop('Outcome',axis=1))

In [None]:
df_new = pd.DataFrame(scaled_features,columns = df.columns[:-1])

In [None]:
df_new

In [None]:
from sklearn.model_selection import train_test_split
X = df_new
y = df['Outcome']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=101)
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Class predictions of the logistic-regression model for the test split.
predictions = logmodel.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the logistic-regression predictions.
logmodel_report = classification_report(y_test, predictions)
print(logmodel_report)

In [None]:
# Confusion matrix of the logistic-regression predictions (rows: true class).
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# Misclassification rate on the test split for K = 1..39.
# NOTE(review): K is compared on the same split that is later used to report
# performance — consider a validation split or cross-validation instead.
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append((pred_i != y_test).mean())

In [None]:
# Error rate against K for the values collected in error_rate.
plt.figure(figsize=(10, 6))
plt.plot(
    range(1, 40), error_rate,
    color='blue', linestyle='-.',
    marker='o', markerfacecolor='red', markersize=10,
)
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# Refit KNN with K=13 and print its confusion matrix and per-class report.
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
pred = knn.predict(X_test)
knn_cm = confusion_matrix(y_test, pred)
knn_report = classification_report(y_test, pred)
print(knn_cm)
print(knn_report)

In [None]:
# Seed the tree so that Restart & Run All reproduces the same model; 0 matches
# the random_state used for the DecisionTree entry in the comparison list.
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(X_train,y_train)

In [None]:

# Class predictions of the decision tree for the test split.
dtree_pred = dtree.predict(X=X_test)

In [None]:

# Confusion matrix, a blank separator, then the per-class report for the tree.
dtree_cm = confusion_matrix(y_test, dtree_pred)
dtree_report = classification_report(y_test, dtree_pred)
print(dtree_cm)
print('\n')
print(dtree_report)

In [None]:
# 200-tree random forest; seeded so that repeated runs give the same model
# (0 matches the random_state used in the comparison list).
rfc = RandomForestClassifier(n_estimators=200, random_state=0)
rfc.fit(X_train,y_train)

In [None]:
# Class predictions of the random forest for the test split.
rfc_pred = rfc.predict(X=X_test)


In [None]:
# Confusion matrix, a blank separator, then the per-class report for the forest.
rfc_cm = confusion_matrix(y_test, rfc_pred)
rfc_report = classification_report(y_test, rfc_pred)
print(rfc_cm)
print('\n')
print(rfc_report)

In [None]:
# Support-vector classifier with default settings, fitted on the training split.
# NOTE(review): the name `model` is rebound to a list of estimators further down.
model = SVC()
model.fit(X=X_train, y=y_train)

In [None]:
# Score the SVC on the test split: confusion matrix, then per-class report.
svc_pred = model.predict(X_test)
svc_cm = confusion_matrix(y_test, svc_pred)
svc_report = classification_report(y_test, svc_pred)
print(svc_cm)
print(svc_report)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [None]:
# Candidate classifiers for the side-by-side comparison, as [name, estimator] pairs.
# Built in a single idempotent cell: the original appended one model per cell
# and then removed KNN with model.pop(1), so re-running any of those cells
# duplicated or dropped entries. KNN stays excluded, which keeps the final
# list identical to what the original cells produced on a clean run.
model = [
    ['LogisticRegression', LogisticRegression(random_state=0)],
    ['GaussianNB', GaussianNB()],
    ['Extra Tree', ExtraTreesClassifier(random_state=0)],
    ['SVM', SVC(random_state=0)],
    ['RandomForest', RandomForestClassifier(random_state=0)],
    ['DecisionTree', DecisionTreeClassifier(random_state=0)],
]
model

In [None]:
# Fit every candidate, show its confusion matrix and collect one summary row
# per model: [name, test accuracy %, CV mean %, CV std %, precision, recall, F1].
list1 = []
for name, mod in model:
    mod.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # 10-fold cross-validated accuracy, computed on the training split only.
    accuracy = cross_val_score(estimator=mod, X=X_train, y=y_train, cv=10)
    # Test-set accuracy is computed once and reused for printing and the table.
    test_accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(name, ':')
    plt.figure(figsize=(4, 4))
    sns.heatmap(
        cm,
        cmap='Greens',
        annot=True,
        fmt='d',
        linewidths=5,
        cbar=False,
        annot_kws={'fontsize': 15},
        yticklabels=['Healthy', 'Diabetes'],
        xticklabels=['Predicted Healthy', 'Predicted Diabetes'],
    )
    plt.yticks(rotation=0)
    plt.show()
    print('')
    print('Accuracy score:', test_accuracy)  # label typo ("Accracy") fixed
    print('')
    print('Standard Deviation: {:.2f}%'.format(accuracy.std()*100))
    print('')
    print('Precision:{:.2f}'.format(precision))
    print('')
    print('Recall:{:.2f}'.format(recall))
    print('')
    print('F1:{:.2f}'.format(f1))
    print('------------------------------------')
    print('')

    list1.append([
        name,
        test_accuracy * 100,
        accuracy.mean() * 100,
        accuracy.std() * 100,
        precision,
        recall,
        f1,
    ])


In [None]:
# Tabulate the per-model rows collected in list1.
summary_columns = [
    'Model',
    'Accuracy',
    'Accuracy mean',
    'Standard deviation',
    'Precision',
    'Recall',
    'F1',
]
df_acc = pd.DataFrame(list1, columns=summary_columns)

In [None]:
# Rank the models by test accuracy (best first) and shade the table with a
# green gradient.
dff = df_acc.sort_values(by='Accuracy', ascending=False)
cm = sns.light_palette("green", as_cmap=True)
s = dff.style.background_gradient(cmap=cm)
s