### Import all the important libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather the full path of every file below the Kaggle input directory, then list them.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score

import warnings
# Silence every warning for the rest of the notebook. Note that this also hides
# deprecation warnings emitted by the plotting and modelling libraries used below.
warnings.filterwarnings('ignore')

#### Read the dataset

In [None]:
# Load the diabetes CSV shipped with the Kaggle dataset into a DataFrame.
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
train = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to confirm the file was parsed as expected.
train.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
train.shape

##### OK, so the dataset is very small.

Let's check whether any null values are present.

In [None]:
# Number of missing values per column, largest count first.
(
    train
    .isna()
    .sum()
    .sort_values(ascending=False)
)

OK, so there are no null values.

Let's look at some statistical information about the features.

In [None]:
# Summary statistics, transposed so that each feature is one row.
train.describe().T

In [None]:
# Number of rows for each value of the target column.
train['Outcome'].value_counts()

In [None]:
# Bar chart of the target classes. The column is named via the `x` keyword because
# newer seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x='Outcome', data=train);

#### The dataset looks roughly balanced.

Let's analyze how the diabetic patients are distributed over age.

In [None]:
# Count of diabetic patients (Outcome == 1) at each age. The values are passed as `x`
# explicitly because newer seaborn releases treat the first positional argument as `data`.
plt.figure(figsize=(20, 10))
sns.countplot(x=train.loc[train['Outcome'] == 1, 'Age']);

In [None]:
# Coarse three-bin histogram of the ages of diabetic patients (Outcome == 1).
diabetic_ages = train.loc[train['Outcome'] == 1, 'Age']
plt.figure(figsize=(20, 10))
sns.histplot(diabetic_ages, bins=3);

So, from the plots above, it seems that most of the diabetic patients are between 20 and 40 years old.

Let's look at some other plots for all the features.

In [None]:
def plotGraph(col, data=None):
    """Draw a distribution plot, a histogram and a box plot for each given column.

    Parameters
    ----------
    col : iterable of str
        Names of the columns to plot; one three-panel figure is produced per name.
    data : pandas.DataFrame, optional
        Frame the columns are read from. When omitted, the notebook-level ``train``
        frame is used, so existing ``plotGraph(columns)`` calls keep working.
    """
    if data is None:
        data = train

    for ele in col:

        print('Plots for : ', ele)
        fig, (ax_dist, ax_hist, ax_box) = plt.subplots(1, 3, figsize=(30, 10))

        # Distribution plot: sns.distplot is deprecated, so draw its equivalent,
        # a density-normalised histogram with a KDE curve, through histplot.
        sns.histplot(data[ele], kde=True, stat='density', ax=ax_dist)
        ax_dist.set_title('Distribution Plot')

        # Histogram
        sns.histplot(data[ele], ax=ax_hist)
        ax_hist.set_title('Histogram plot')

        # Box plot: pass the values as `x` explicitly; newer seaborn releases treat
        # the first positional argument as `data`.
        sns.boxplot(x=data[ele], ax=ax_box)
        ax_box.set_title('Box Plot')

        plt.show()
        # Release the figure so plotting many columns does not accumulate open figures.
        plt.close(fig)

In [None]:
#plotGraph(train.columns)

Some of the features are skewed and have a few outliers.
However, I am going to use a RandomForest for training, and tree-based models are largely insensitive to skew and outliers in the features, so we are fine.

In [None]:
# Annotated heatmap of the pairwise correlations; the upper triangle is masked out
# so only the lower triangle is drawn.
corr = train.corr()
plt.figure(figsize=(9, 9))
sns.heatmap(corr, annot=True, mask=np.triu(corr))
plt.ylim(9, 0);

### So, we can see that there is a good positive correlation between Age and Pregnancies

### There is also a good negative correlation between Age and SkinThickness

So, we can drop these two columns ('Pregnancies', 'SkinThickness').

In [None]:
# Remove the two columns judged redundant above. A non-inplace drop with
# errors='ignore' keeps this cell safe to re-run: the original in-place drop raised
# KeyError on a second execution because the columns were already gone.
train = train.drop(columns=['Pregnancies', 'SkinThickness'], errors='ignore')

Now let's prepare the data for the models.

In [None]:
# Target vector: the 'Outcome' column. This must run before the cell that drops
# 'Outcome' from `train`.
y = train['Outcome']

In [None]:
# Keep only the feature columns. A non-inplace drop with errors='ignore' makes the cell
# idempotent: the original in-place drop raised KeyError when the cell was run twice.
train = train.drop(columns=['Outcome'], errors='ignore')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=15,
)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
def print_performance(yt, clf, x=None):
    """Print ROC AUC, classification report and confusion matrix for a fitted classifier.

    Parameters
    ----------
    yt : array-like
        True labels of the evaluation rows.
    clf : fitted classifier
        Must provide ``predict``; ``predict_proba`` or ``decision_function`` is used
        for the ROC AUC when the classifier exposes one of them.
    x : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``x_test`` so that
        existing ``print_performance(y_test, classifier)`` calls are unchanged.
    """
    if x is None:
        x = x_test

    y_pred = clf.predict(x)

    # ROC AUC is a ranking metric and should be fed continuous scores. Feeding it the
    # thresholded 0/1 predictions (as before) collapses the curve to a single point.
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(x)[:, 1]
    elif hasattr(clf, 'decision_function'):
        y_score = clf.decision_function(x)
    else:
        # No score output available: fall back to the hard labels.
        y_score = y_pred

    print('ROC_AUC value : ', roc_auc_score(yt, y_score), '\n')
    print('classification_report : ', '\n', classification_report(yt, y_pred))
    print('Confusion_matrics : ', '\n', confusion_matrix(yt, y_pred))

    confu_matric(yt, y_pred)

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

def confu_matric(y_test, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as an annotated heatmap."""
    matrix_frame = pd.DataFrame(confusion_matrix(y_test, y_pred))
    sns.heatmap(matrix_frame, annot=True, cmap="YlGnBu", fmt='g')
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

In [None]:
# Random forest: 250 trees with a fixed seed, fitted on the scaled training split
# and scored on the held-out split.
classifier = RandomForestClassifier(n_estimators=250, random_state=15).fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# Decision tree. The seed is fixed, like for the other seeded models in this notebook,
# so the fitted tree and its scores are the same on every run.
classifier = DecisionTreeClassifier(random_state=15)
classifier.fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# Logistic regression with a fixed seed, fitted and then scored on the held-out split.
classifier = LogisticRegression(random_state=15).fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# K nearest neighbours: 5 neighbours, Minkowski distance with p=2.
knn_params = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_params).fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# Support vector classifier with an RBF kernel and a fixed seed.
classifier = SVC(kernel='rbf', random_state=15).fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# Gaussian naive Bayes with default settings.
classifier = GaussianNB().fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# Support vector classifier with a linear kernel and a fixed seed.
classifier = SVC(kernel='linear', random_state=15).fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# Gradient boosting: 100 stages, learning rate 1.0, trees up to depth 13, fixed seed.
classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=13,
    random_state=15,
).fit(x_train, y_train)
print_performance(y_test, classifier)

In [None]:
# XGBoost classifier: 200 boosting rounds with a fixed seed.
classifier = XGBClassifier(n_estimators=200, random_state=15).fit(x_train, y_train)
print_performance(y_test, classifier)

Let's try a neural network.

In [None]:
import tensorflow as tf

In [None]:
# Seed TensorFlow's global random generator so that repeated runs of this cell are
# more repeatable (the sklearn models above are seeded with the same value).
tf.random.set_seed(15)

# Initialize ANN
ann = tf.keras.models.Sequential()

# Adding the first hidden layer (the input size is inferred when the model is fitted)
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

# Adding 2nd hidden layer
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

# Adding output layer: a single sigmoid unit for the binary outcome
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile ANN
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model
ann.fit(x_train, y_train, batch_size=16, epochs=50)

# Make predictions on x_test: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) column to a 1-D label vector so it has the same shape as y_test.
y_pred = np.where(ann.predict(x_test) > 0.5, 1, 0).ravel()

In [None]:
# Evaluate the network on the held-out split; the two returned values are unpacked
# as the loss and the accuracy metric set in compile().
test_loss, test_accuracy = ann.evaluate(x_test, y_test, batch_size=16)
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Confusion matrix for the ANN predictions, drawn with the existing confu_matric
# helper rather than a second copy of the same plotting code.
confu_matric(y_test, y_pred)

That's it for now.

Thank you.....!