# Pima Indians Diabetes Database

**TASK : Predict the onset of diabetes based on diagnostic measures**

![](http://www.diabeteshealth.com/wp-content/uploads/2016/12/diabetes.jpg)

***Diabetes is a disease that occurs when your blood glucose, also called blood sugar, is too high. Blood glucose is your main source of energy and comes from the food you eat. Insulin, a hormone made by the pancreas, helps glucose from food get into your cells to be used for energy.***

**Early Signs of Diabetes**

* Hunger and fatigue. Your body converts the food you eat into glucose that your cells use for energy.
* Peeing more often and being thirstier
* Dry mouth and itchy skin
* Blurred vision


# IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy as sp
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


# LOADING THE DATASET

In [None]:
# Load the Pima Indians Diabetes CSV from the Kaggle input directory.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)



In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)


In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()      # description of dataset


In [None]:
# Print column names, non-null counts, dtypes and memory usage.
data.info()

In [None]:
data.shape    # (n_rows, n_columns) -- 768 rows and 9 columns

In [None]:
# Class balance of the target: how many rows fall in each 'Outcome' class.
# Calling value_counts() on the whole DataFrame only counts duplicated rows
# across all columns, which is uninformative for this dataset.
data['Outcome'].value_counts()

In [None]:
# Data type of every column.
data.dtypes

In [None]:
# Column labels of the dataset.
data.columns

***Checking Null Values***

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# True for every column that contains at least one missing value.
data.isna().any()

In [None]:
# True for every column that consists entirely of missing values.
data.isna().all()

# Exploratory Data Analysis

In [None]:
# Pairwise correlation between all columns (pandas default method).
data.corr()

In [None]:
# Annotated heatmap of the column-to-column correlation matrix.
corr_matrix = data.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True)



In [None]:
# One histogram per column, laid out on a single 18x12 figure.
hist_size = (18, 12)
data.hist(figsize=hist_size)
plt.show()



In [None]:

# 2x3 grid of box plots, one per selected feature, to eyeball outliers.
plt.figure(figsize=(14, 10))
sns.set_style(style='whitegrid')
boxplot_columns = ['Glucose', 'BloodPressure', 'Insulin', 'BMI', 'Age', 'SkinThickness']
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(2, 3, position)
    last_axes = sns.boxplot(x=column, data=data)
# Keep the final Axes as the cell's value, as the unrolled version did.
last_axes







In [None]:
 
# Columns shown in the pair plot; 'Outcome' is included so it can colour the points.
mean_col = ['Glucose','BloodPressure','Insulin','Age','Outcome','BMI']

# `palette` only takes effect when `hue` is set, so colour by the target class;
# without `hue` the 'Accent' palette was silently ignored.
sns.pairplot(data[mean_col], hue='Outcome', palette='Accent')


In [None]:
# Distribution of Insulin within each Outcome class.
sns.boxplot(data=data, x='Outcome', y='Insulin')



In [None]:
# Scatter of Glucose against BMI with a fitted regression line.
sns.regplot(data=data, x='BMI', y='Glucose')

In [None]:
# Figure-level relational plot of Glucose against BMI.
sns.relplot(data=data, x='BMI', y='Glucose')

In [None]:
# Scatter of Insulin against Glucose.
sns.scatterplot(data=data, x='Glucose', y='Insulin')

In [None]:
# Joint distribution of SkinThickness and Insulin with marginal plots.
sns.jointplot(data=data, x='SkinThickness', y='Insulin')

In [None]:
# Pairwise relationships between all columns, coloured by Outcome.
sns.pairplot(data=data, hue='Outcome')


In [None]:
# Line plot of Insulin as a function of Glucose.
sns.lineplot(data=data, x='Glucose', y='Insulin')

In [None]:
# Swarm plot of Insulin for each Glucose value.
sns.swarmplot(data=data, x='Glucose', y='Insulin')

In [None]:
# Select the ggplot style *before* drawing so it applies to this figure.
# It used to be set after plt.show(), and the next cell resets the style to
# "default", so it never affected any plot.
plt.style.use("ggplot")

# Bar chart of Insulin by SkinThickness for a ten-row sample (rows 170-179).
sns.barplot(x="SkinThickness", y="Insulin", data=data[170:180])
plt.title("SkinThickness vs Insulin",fontsize=15)
plt.xlabel("SkinThickness")
plt.ylabel("Insulin")
plt.show()


In [None]:
# Switch back to matplotlib's default style for a compact bar chart of
# Insulin by Glucose over the same ten-row sample (rows 170-179).
plt.style.use("default")
plt.figure(figsize=(5, 5))
sample_rows = data.iloc[170:180]
sns.barplot(data=sample_rows, x="Glucose", y="Insulin")
plt.title("Glucose vs Insulin", fontsize=15)
plt.xlabel("Glucose")
plt.ylabel("Insulin")
plt.show()


# TRAINING AND TESTING DATA

In [None]:
from sklearn.model_selection import train_test_split

# The label to predict is 'Outcome'; the features are every other column.
y = data['Outcome']
x = data.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)


In [None]:
# Row counts of the four split parts, one per line.
print(len(x_train), len(x_test), len(y_train), len(y_test), sep="\n")


# MODELS

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, fitted on the training split.
reg = LogisticRegression().fit(x_train, y_train)
reg

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = reg.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", reg.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)


In [None]:
# Test-set accuracy of the logistic regression, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

**So we get an accuracy score of 82.46 % using Logistic Regression**

# 2. KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 7 closest training samples.
knn = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)
knn


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = knn.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", knn.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)

In [None]:
# Test-set accuracy of the k-nearest-neighbours model, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)



**So we get an accuracy score of 75.97 % using KNeighborsClassifier**

# 3. SVC

In [None]:
from sklearn.svm import SVC

# Support vector classifier with library defaults, fitted on the training split.
svc = SVC().fit(x_train, y_train)
svc


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = svc.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", svc.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)


In [None]:


# Test-set accuracy of the support vector classifier, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)


**So we get an accuracy score of 79.22 % using SVC**

# 4. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library defaults, fitted on the training split.
gnb = GaussianNB().fit(x_train, y_train)
gnb


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = gnb.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", gnb.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)


In [None]:
# Report accuracy on the held-out test split, like every other model in this
# notebook. This cell previously printed gnb.score(x_train, y_train), i.e. the
# training accuracy, under the label "Accuracy Score".
# NOTE(review): re-run and refresh the percentage quoted in the markdown below.
print(accuracy_score(y_test,y_pred)*100)

**So we get an accuracy score of 75.73 % using Naive Bayes**

# 5. DECISION TREE CLASSIFIER

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree, capped at depth 6 and seeded for repeatability.
dtree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    random_state=123,
).fit(x_train, y_train)
dtree


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = dtree.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", dtree.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)

In [None]:
# Test-set accuracy of the decision tree, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)



**So we get an accuracy score of 73.37 % using DecisionTreeClassifier**

# 6.  RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its predictions (and the accuracy reported for it) are
# reproducible; an unseeded forest yields a slightly different score per run.
# The seed matches the one used for the train/test split.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(x_train, y_train)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = rfc.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", rfc.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)


In [None]:
# Test-set accuracy of the random forest, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

**So we get an accuracy score of 81.18 % using RandomForestClassifier**

# 7. AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Use AdaBoost's default base learner. The `base_estimator` keyword passed
# here before was renamed to `estimator` (deprecated in scikit-learn 1.2,
# removed in 1.4) and raises a TypeError on current releases; None was already
# the default, so omitting it behaves the same on every version.
adb = AdaBoostClassifier()
adb.fit(x_train, y_train)




In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = adb.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", adb.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)


In [None]:
# Test-set accuracy of the AdaBoost model, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

**So we get an accuracy score of 77.92 % using AdaBoostClassifier**

# 8. Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults, fitted on the training split.
gbc = GradientBoostingClassifier().fit(x_train, y_train)
gbc


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = gbc.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", gbc.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)

In [None]:
# Test-set accuracy of the gradient boosting model, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

**So we get an accuracy score of 81.81 % using GradientBoostingClassifier**

# 9. XGBClassifier

In [None]:
from xgboost import XGBClassifier

# 'Outcome' is a class label, so train with the binary logistic objective.
# The 'reg:linear' objective used before is a deprecated squared-error
# *regression* objective and is not appropriate for a classifier.
# NOTE(review): re-run and refresh the accuracy quoted in the markdown below.
xgb = XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,  # L1 regularisation; the wrapper's documented name for `alpha`
    n_estimators=10,
)

xgb.fit(x_train, y_train)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = xgb.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", xgb.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)

In [None]:
# Keep the test accuracy (percent) in the variable and then print it.
# Assigning the result of print() left the variable set to None.
# The variable name is kept as-is so any later reference still resolves.
xbg_accuracy = accuracy_score(y_test, y_pred) * 100
print(xbg_accuracy)

**So we get an accuracy score of 78.57 % using XGBClassifier**

# 10. ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble of 100 trees, seeded for repeatable results.
etc = ExtraTreesClassifier(random_state=0, n_estimators=100).fit(x_train, y_train)
etc

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)

# Predict on the held-out split and print the evaluation summary.
y_pred = etc.predict(x_test)
for caption, metric in (
    ("Classification Report is:\n", classification_report(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Training Score:\n", etc.score(x_train, y_train) * 100),
    ("Mean Squared Error:\n", mean_squared_error(y_test, y_pred)),
    ("R2 score is:\n", r2_score(y_test, y_pred)),
):
    print(caption, metric)

In [None]:
# Test-set accuracy of the extra-trees model, in percent.
print(accuracy_score(y_true=y_test, y_pred=y_pred) * 100)

**So we get an accuracy score of 80.51 % using ExtraTreesClassifier**

In [None]:
# Side-by-side table of the true test labels and the most recent predictions
# (y_pred holds the output of whichever model was evaluated last).
# Stored under its own name: assigning it to `data` replaced the loaded
# dataset, so re-running any earlier cell afterwards would fail.
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison


***So now we conclude the accuracy of different models:***

* Logistic Regression = 82.46 %
* KNeighbors Classifier = 75.97 %
* SVC = 79.22 %
* Naive Bayes = 75.73 %
* Decision Tree Classifier = 73.37 %
* Random Forest Classifier = 81.18 %
* Ada Boost Classifier = 77.92 %
* Gradient Boosting Classifier = 81.81 %
* XGB Classifier = 78.57 %
* Extra Trees Classifier = 80.51 %





# **If you liked this notebook, please UPVOTE it.**