# **Pima Indians Diabetes**
## Predict the onset of diabetes based on diagnostic measures

## **Importing libraries and dataset**

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Pima Indians Diabetes CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the first rows of the raw frame.
data.head()

## Initial Data Exploration

In [None]:
# List the column names of the raw frame.
data.columns

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

In [None]:
# Print the frame's concise summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Columns in which a recorded 0 is replaced by NaN (treated as a missing value).
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a deep copy so the raw `data` frame stays untouched.
diabetes_data_copy = data.copy(deep=True)
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
diabetes_data_copy[zero_as_missing_cols] = (
    diabetes_data_copy[zero_as_missing_cols].replace(0, np.nan)
)

# Show the count of NaNs per column.
print(diabetes_data_copy.isnull().sum())

## **Exploratory Data Analysis**


### histogram

In [None]:
# Draw one histogram per column of the raw frame; the returned axes are kept in `p`.
p= data.hist(figsize=(20,20))

## Pair plot for data

In [None]:
# Pairwise plots of all columns, colored by the Outcome label.
sns.pairplot(data,hue= 'Outcome')

In [None]:
# Pairwise correlation matrix of all columns, displayed as a color-graded table.
corr = data.corr()
corr.style.background_gradient()

In [None]:
# Heatmap of the correlation matrix with the coefficient written in each cell.
p = sns.heatmap(corr, annot = True)

Every feature is positively correlated with the outcome, and Glucose has the strongest correlation.

## Scaling data

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the target column.
features = data.drop(columns=["Outcome"])

# Standardize the predictors. Column names and index are taken from the frame
# itself rather than a hand-written list, so the labels cannot drift from the data.
# NOTE(review): the scaler is fit on the full dataset before the train/test split
# later in the notebook, so test-set statistics influence the training features.
sc_X = StandardScaler()
scaled_data_X = pd.DataFrame(
    sc_X.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
scaled_data_Y = data.Outcome
scaled_data_X.head()

In [None]:
# Histograms of the standardized features.
scaled_data_X.hist(figsize=(20,10))

# Modelling

### **Splitting into train and test data**

In [None]:
from sklearn.metrics import accuracy_score, precision_score

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data_X,
    scaled_data_Y,
    train_size=0.8,
    random_state=42,
)

## **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one KNN model per neighbor count from 1 to 14 and print its test-split accuracy.
for i in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    test_accuracy = knn.score(X_test, y_test)
    print(i, test_accuracy)

## **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and therefore its score) repeatable
# across runs, in line with the other seeded models in this notebook.
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
print(dt.score(X_test, y_test))

## **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 275 trees; min_samples_leaf is passed as a float (0.12).
rfc = RandomForestClassifier(n_estimators=275, min_samples_leaf=0.12, random_state=2)
rfc = rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): pass the true labels first,
# consistent with the other accuracy_score calls in this notebook.
print(accuracy_score(y_test, y_pred))

## **Bagging Classifier**

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Bag 200 default-configured KNN classifiers and print accuracy on the test split.
bagging = BaggingClassifier(
    KNeighborsClassifier(),
    n_estimators=200,
    random_state=42222,
).fit(X_train, y_train)
print(bagging.score(X_test, y_test))

## **AdaBoost**

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 200 estimators; print accuracy on the test split.
ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=1,
    random_state=2,
).fit(X_train, y_train)
print(ada.score(X_test, y_test))

## **Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 300 estimators; print accuracy on the test split.
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=1,
    random_state=2,
).fit(X_train, y_train)
print(gb.score(X_test, y_test))

## **Voting Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Base learners for the voting ensemble, keyed by display name.
# Note: `dt` and `knn` rebind names that earlier cells also assigned.
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
lr = LogisticRegression(random_state=1)
knn = KNeighborsClassifier()
Classifiers = {
    'LogisticRegression': lr,
    'KNearestNeighbors': knn,
    'DecisionTreeClassifier': dt,
}

# Fit each base learner on its own and print its test accuracy as a percentage.
for clf_name, clf in Classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy_pct = 100 * accuracy_score(y_test, y_pred)
    print('{:s}:{:.3f}'.format(clf_name, accuracy_pct))

In [None]:
# VotingClassifier documents `estimators` as a list of (name, estimator) tuples.
# A dict_items view is not a list and fails parameter validation in recent
# scikit-learn releases, so materialize it explicitly.
vc = VotingClassifier(estimators=list(Classifiers.items()))
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
# Test accuracy of the ensemble, as a percentage.
print('Voting Classifier:{:.3f}'.format(100 * accuracy_score(y_test, y_pred)))

### **The Voting Classifier has the highest accuracy: 76.6%**

In [None]:
# Collect the ensemble's test-set predictions in a one-column frame
# and display 15 randomly chosen rows.
vc_ans = vc.predict(X_test)
answer = pd.Series(vc_ans, name="Voting Classifier").to_frame()
answer.sample(15)