In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np     
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels as stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of each file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
)
# Preview the first rows to confirm the file loaded as expected.
df.head()

## Exploratory Data Analysis & Data Cleaning

#### **checking the datatypes**

In [None]:
# Show the dtype pandas inferred for every column of df.
df.dtypes   # the dataset contains continuous / numeric data

#### column list of the dataframe

In [None]:
# Column names of the dataframe as a plain Python list.
list(df.columns)

In [None]:
# Call info() with parentheses: without them the cell only echoes the
# bound-method repr instead of printing the summary (index range, per-column
# non-null counts, dtypes and memory usage).
df.info()

#### Checking null and missing values

In [None]:
# Per column: True if at least one value is null, False otherwise.
df.isnull().any() # checking null values

In [None]:
# Per column: True if at least one value is NaN / missing, False otherwise.
# pandas documents isnull() as an alias of isna(), so this matches the
# isnull() check.
df.isna().any()  # checking NaN / missing values

### Univariate Analysis: Measures of the Central tendency and the Dispersion

In [None]:
# Descriptive statistics for each column, covering the central tendency and
# dispersion measures this section discusses.
df.describe() # summary statistics of the dataframe

### Count plot & Histograms 

In [None]:
# Draw a histogram for each column of df on one 20x15 figure.
df.hist(figsize=(20,15))

#### Number of Pregnancies vs Outcome

In [None]:
# Count of records per number of pregnancies, split by Outcome.
# The frame is passed via `data=` and the columns are named by keyword:
# seaborn >= 0.12 makes every countplot argument after `data` keyword-only,
# so a positional Series would be interpreted as `data` rather than `x`.
sns.countplot(data=df, x="Pregnancies", hue="Outcome")
plt.title('Number of Pregnancies vs Outcome')

In [None]:
# Distribution of Pregnancies per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="Pregnancies",
    hue="Outcome",
    kde=True,
    palette="ch:s=.25,rot=-.25",
)

#### Glucose vs Outcome

In [None]:
# Count of records per Glucose value, split by Outcome.
# The frame is passed via `data=` and the columns are named by keyword:
# seaborn >= 0.12 makes every countplot argument after `data` keyword-only,
# so a positional Series would be interpreted as `data` rather than `x`.
sns.countplot(data=df, x="Glucose", hue="Outcome")
plt.title('Number of Glucose vs Outcome')

In [None]:
# Distribution of Glucose per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="Glucose",
    hue="Outcome",
    kde=True,
    palette="dark",
)

#### BloodPressure vs Outcome

In [None]:
# Distribution of BloodPressure per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="BloodPressure",
    hue="Outcome",
    kde=True,
    palette="flare",
)

#### SkinThickness vs Outcome

In [None]:
# Distribution of SkinThickness per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="SkinThickness",
    hue="Outcome",
    kde=True,
    palette="dark",
)

#### Insulin vs Outcome

In [None]:
# Distribution of Insulin per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="Insulin",
    hue="Outcome",
    kde=True,
    palette="dark",
)

#### BMI vs Outcome

In [None]:
# Distribution of BMI per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="BMI",
    hue="Outcome",
    kde=True,
    palette="dark",
)

#### Age vs Outcome

In [None]:
# Distribution of Age per Outcome class, with a KDE overlay.
plt.figure(figsize=(10, 10))
sns.histplot(
    data=df,
    x="Age",
    hue="Outcome",
    kde=True,
    palette="Set1",
)

## Bivariate Analysis : Finding relationship between two independent variables

#### Checking Multicollinearity

Multicollinearity:
    occurs when an independent variable can be predicted from another independent variable. Here it is visualized with scatterplots and lmplots, and a correlation map is plotted to show the strength of the relationships among the variables.

Here, BloodPressure depends on the Age variable; similarly, Glucose level and SkinThickness depend on Age.

#### Age & Bloodpressure

In [None]:
# Age against BloodPressure, points coloured by Outcome.
sns.scatterplot(
    x="Age",
    y="BloodPressure",
    hue="Outcome",
    data=df,
)

In [None]:
# Regression of BloodPressure on Age, one panel (and colour) per Outcome.
sns.lmplot(
    x="Age",
    y="BloodPressure",
    hue="Outcome",
    col="Outcome",
    palette="Set2",
    data=df,
)

#### Age and Glucose Level

In [None]:
# Age against Glucose, points coloured by Outcome.
sns.scatterplot(
    x="Age",
    y="Glucose",
    hue="Outcome",
    data=df,
)

In [None]:
# Regression of Glucose on Age, one fitted line per Outcome class.
sns.lmplot(
    x="Age",
    y="Glucose",
    hue="Outcome",
    palette="Set1",
    data=df,
)

#### Age and Skin thickness

In [None]:
# Age against SkinThickness, points coloured by Outcome.
sns.scatterplot(
    x="Age",
    y="SkinThickness",
    hue="Outcome",
    data=df,
)

In [None]:
# Regression of SkinThickness on Age, one fitted line per Outcome class.
sns.lmplot(
    x="Age",
    y="SkinThickness",
    hue="Outcome",
    palette="Set1",
    data=df,
)

#### Pairplot

In [None]:
# Pairwise relationships between all columns, coloured by Outcome.
# pairplot is a figure-level function: it builds its own figure and its size
# is set through `height` (per facet), so a preceding plt.figure(...) call
# would only leave a stray empty figure in the output.
sns.set_theme(style="darkgrid")
sns.pairplot(df, hue="Outcome", height=7)

### Correlation Map

In [None]:
# Diverging colour map for the heatmap.
cmap = sns.diverging_palette(
    260, -10, s=50, l=75, n=6, as_cmap=True
)
# Pairwise correlation between every pair of columns.
corrmat = df.corr()

# Draw the annotated correlation matrix on a wide figure.
plt.subplots(figsize=(20, 10))
sns.heatmap(
    corrmat,
    annot=True,
    square=True,
    cmap=cmap,
)

To reduce **multicollinearity** among the independent variables, the columns that depend on Age are combined with it: BloodPressure, Glucose and SkinThickness are each added to Age, and the sums are stored as new columns in the dataframe.

In [None]:
# Build three combined features by adding Age to each column paired with it.
for combined, partner in (
    ("AgeBP", "BloodPressure"),
    ("AgeGL", "Glucose"),
    ("AgeTH", "SkinThickness"),
):
    df[combined] = df["Age"] + df[partner]

# Display the frame with the new columns appended.
df

#### Dropping unnecessary columns

In [None]:
# Remove the original columns that were folded into the combined Age features.
df = df.drop(
    columns=['Age', 'BloodPressure', 'SkinThickness', 'Glucose']
)
df

## Outliers Detection 

no outliers

In [None]:
# Box plot of Pregnancies to inspect its spread and any outlying points.
a1 = sns.boxplot(data=df, x='Pregnancies')

In [None]:
# Box plot of Insulin to inspect its spread and any outlying points.
a2 = sns.boxplot(data=df, x='Insulin')

In [None]:
# Box plot of DiabetesPedigreeFunction to inspect its spread and any outlying points.
a3 = sns.boxplot(data=df, x='DiabetesPedigreeFunction')

In [None]:
# Box plot of the combined AgeBP feature.
a4 = sns.boxplot(data=df, x='AgeBP')

In [None]:
# Box plot of the combined AgeGL feature.
a5 = sns.boxplot(data=df, x='AgeGL')

In [None]:
# Box plot of the combined AgeTH feature.
a6 = sns.boxplot(data=df, x='AgeTH')

#### Recheck Correlation

In [None]:
# Heatmap of absolute correlations after the feature changes; taking the
# magnitude puts positive and negative correlations on the same scale.
plt.figure(figsize=(10, 10))
sns.heatmap(
    df.corr().abs(),
    cmap='viridis_r',
    annot=True,
    fmt="0.2f",
)

## Feature Engineering

In [None]:
# Target vector: the Outcome column.
y = df["Outcome"]
# Feature matrix: every remaining column.
X = df.drop(columns=["Outcome"])

In [None]:
# Display the feature matrix (df without the "Outcome" column).
X

In [None]:
# Display the target vector (the "Outcome" column of df).
y

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

### Feature Scaling : Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Display df. The scaled values were assigned to X_train / X_test, so df
# itself still holds the unscaled values.
df

**Accuracy results of different models on train and test dataset**

## Logistic Regression

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the scaled training data.
lr = LogisticRegression(random_state=21).fit(X_train, y_train)

# Report accuracy (in percent) on both splits. A bare `lr.score(...)` in the
# middle of a cell is evaluated but never displayed, so accuracy is reported
# only through the prints below.
print("Train Set Accuracy:"+str(accuracy_score(y_train,lr.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,lr.predict(X_test))*100))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 150 trees with depth capped at 25, seeded for repeatability.
RF = RandomForestClassifier(
    n_estimators=150,
    max_depth=25,
    random_state=25,
)
RF.fit(X_train, y_train)

# Report accuracy (in percent) on the training split, then on the test split.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(split_name + " Set Accuracy:" + str(accuracy_score(labels, RF.predict(features)) * 100))

## Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings.
svm = SVC()
svm.fit(X_train, y_train)

# Report accuracy (in percent) on the training split, then on the test split.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(split_name + " Set Accuracy:" + str(accuracy_score(labels, svm.predict(features)) * 100))

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree with depth capped at 25.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split; without a seed the fitted tree, and so the accuracies below,
# can differ between runs.
DTC = DecisionTreeClassifier(criterion='entropy', max_depth=25, random_state=25)
DTC.fit(X_train,y_train)

# Report accuracy (in percent) on both splits.
print("Train Set Accuracy:"+str(accuracy_score(y_train,DTC.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,DTC.predict(X_test))*100))

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with individual trees allowed to grow to depth 15.
# random_state is fixed because it seeds each tree estimator and the feature
# permutation at each split; without it the reported accuracies can differ
# between runs.
GD = GradientBoostingClassifier(max_depth=15, random_state=25)
GD.fit(X_train,y_train)

# Report accuracy (in percent) on both splits.
print("Train Set Accuracy:"+str(accuracy_score(y_train,GD.predict(X_train))*100))
print("Test Set Accuracy:"+str(accuracy_score(y_test,GD.predict(X_test))*100))