## Importing Necessary Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score 

import warnings
warnings.filterwarnings('ignore')

#### Load the Data

In [None]:
# Load the diabetes CSV into a DataFrame (path is relative to the notebook's
# working directory).
df = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv"
)

## EDA

In [None]:
# Preview the first five rows (head() defaults to n=5) as a sanity check on the load.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

**We have 768 observations and 9 columns (8 features plus the `Outcome` target).**

In [None]:
# Count the rows in each Outcome class to see how balanced the target is.
df["Outcome"].value_counts()

In [None]:
# Count NaN entries per column. Only real NaN values are counted here;
# a literal 0 stored in a column is not treated as missing by isnull().
df.isnull().sum()

**As we can see, there are no missing values.**

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(), ax=ax, cmap='plasma', linewidths=2, annot=True)
plt.show()

* **Pregnancies, Glucose, BMI, and Age are positively correlated with Outcome (the target variable).**
* **SkinThickness and Insulin are correlated with each other.**
* **Age and Pregnancies are correlated.**

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

**Important Observation(s):**<br>
* It seems like null values are present in the form of zeros, because almost all the features have a minimum value of 0.<br>
* It's not possible for Glucose, BloodPressure, SkinThickness, Insulin, or BMI to be zero, so we have to handle this.<br>
    **Let's replace these zeros with NaN and check how many are present in each feature**.

In [None]:
# Replace zero values with NaN in the columns where a zero cannot be a real
# measurement, so that they are treated as missing data from here on.
features_with_zero = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Assign the result back to the frame instead of calling
# `df[col].replace(..., inplace=True)`: that chained form mutates a temporary
# Series and does not update `df` when pandas Copy-on-Write is active.
df[features_with_zero] = df[features_with_zero].replace(0, np.nan)

In [None]:
# Preview the frame again to confirm the zero placeholders now show as NaN.
df.head()

In [None]:
# Re-count NaN entries per column now that zero placeholders were converted.
df.isnull().sum()

* **We can see there are a lot of null values in the SkinThickness and Insulin columns.**

## Data Cleaning

In [None]:
# Impute each missing value with the mean of its own column. DataFrame.mean()
# skips NaN by default, so each mean is taken over the observed values only.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split below, so test rows influence the imputed values —
# consider imputing after the split.
column_means = df[features_with_zero].mean()

# fillna() with a Series matches its index labels to column names, so each
# column receives its own mean. The result is assigned back rather than using
# a chained `df[col].replace(..., inplace=True)`, which does not update `df`
# when pandas Copy-on-Write is active.
df[features_with_zero] = df[features_with_zero].fillna(column_means)

In [None]:
# Summary statistics again, to compare against the pre-imputation values.
df.describe()

**Now we can see that the means of Insulin and SkinThickness have increased.**

## Data PreProcessing

#### Separating dependent and independent features

In [None]:
# The target is the Outcome column; every remaining column is a predictor.
y = df['Outcome']
X = df.drop(columns='Outcome')

#### Splitting the Dataset into Training and Test Set

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=50, test_size=0.2
)

#### Feature Scaling

In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both the training and the test partition.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

## Model Building

#### Helper Function

In [None]:
#Helper functions to check the performance of different classifiers

#function to display confusion matrix
def displayConfusionMatrix(y_test, y_pred):
    """Plot the confusion matrix as a heatmap titled with the accuracy.

    Parameters:
    y_test (array-like): true labels
    y_pred (array-like): predicted labels

    Returns:
    acc_score (float): accuracy of `y_pred` measured against `y_test`
    """
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    # heatmap() returns the Axes it drew on; title and label that Axes directly.
    heatmap_ax = sns.heatmap(matrix, annot=True, fmt="d", cmap='plasma')
    heatmap_ax.set_title(f"Accuracy: {accuracy:0.3f}")
    heatmap_ax.set_xlabel("Predicted labels")
    heatmap_ax.set_ylabel("Actual labels")
    plt.show()

    return accuracy

# function to fit a model and display the classification report of a classifier
def model(classifier, X_train=X_train, y_train=y_train,
              X_test=X_test, y_test=y_test):
    """Train `classifier` and report how it performs on the test set.

    The confusion-matrix heatmap is shown first, then the classification
    report is printed.

    Note: the default data arguments are bound when this function is defined,
    so reassigning the notebook-level `X_train`, `y_train`, `X_test` or
    `y_test` afterwards does not change what the defaults refer to.

    Parameters:
    classifier : classifier object implementing `fit` and `predict`.
    X_train (array-like): 2D array of input features of the training set.
    y_train (array-like): target values of the training set.
    X_test  (array-like): 2D array of input features of the test set.
    y_test  (array-like): target values of the test set.

    Returns:
    acc_score (float): accuracy of the fitted classifier on the test set.
    """
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    accuracy = displayConfusionMatrix(y_test, predicted)
    report = classification_report(y_test, predicted)
    print(report)

    return accuracy

### Logistic Regression

In [None]:
# Logistic regression, seeded so the fit is repeatable.
log = LogisticRegression(random_state=50)
log_accuracy = model(classifier=log)

### KNN

In [None]:
# k-nearest neighbours using the 7 closest training points.
knn = KNeighborsClassifier(n_neighbors=7)
knn_accuracy = model(classifier=knn)

### Linear-Support Vector Machine(SVM)

In [None]:
# Support vector classifier with a linear kernel.
linear_svm = SVC(random_state=50, kernel="linear")
lsvm_accuracy = model(classifier=linear_svm)

### Radial-Support Vector Machine(SVM)

In [None]:
# Support vector classifier with a radial basis function (RBF) kernel.
radial_svm = SVC(random_state=50, kernel="rbf")
rsvm_accuracy = model(classifier=radial_svm)

### Naive Bayes

In [None]:
# Gaussian naive Bayes with its default settings.
nb = GaussianNB()
nb_accuracy = model(classifier=nb)

### Decision Tree

In [None]:
# Single decision tree using the entropy split criterion, seeded for repeatability.
dt = DecisionTreeClassifier(random_state=50, criterion="entropy")
dt_accuracy = model(classifier=dt)

### Random Forest

In [None]:
# Random forest of 150 entropy-criterion trees, seeded for repeatability.
rf = RandomForestClassifier(
    random_state=50,
    criterion="entropy",
    n_estimators=150,
)
rf_accuracy = model(classifier=rf)

### XGBoost

In [None]:
# Gradient-boosted trees; verbosity=0 keeps XGBoost's own logging silent.
xgb = XGBClassifier(verbosity=0, use_label_encoder=False)
xgb_accuracy = model(classifier=xgb)

### Model Comparison

In [None]:
# Collect every classifier's hold-out accuracy into a single table.
model_names = [
    "Logistic Regression", "KNN", "SVM-Linear", "SVM-RBF",
    "Naive Bayes", "Decision Tree", "Random Forest", "XGBoost",
]
accuracy_scores = [
    log_accuracy, knn_accuracy, lsvm_accuracy, rsvm_accuracy,
    nb_accuracy, dt_accuracy, rf_accuracy, xgb_accuracy,
]
models = pd.DataFrame({'Model': model_names, 'Accuracy Score': accuracy_scores})

# Best-scoring model first; ignore_index renumbers the rows from 0.
models.sort_values(by='Accuracy Score', ascending=False, ignore_index=True)

* **It seems like Logistic Regression and XGBoost perform best!**
* **But before we jump to any conclusions, let's perform K-fold cross validation.**

### K-Fold Cross Validation

#### Helper Function

In [None]:
def perform_kfold(clf, X_train=X_train, y_train=y_train, cv=10):
    """Cross-validate `clf` on the given data and report its accuracy spread.

    The mean and standard deviation of the per-fold scores are printed as
    percentages and also returned as fractions.

    Note: the default data arguments are bound when this function is defined,
    so reassigning the notebook-level `X_train` or `y_train` afterwards does
    not change what the defaults refer to.

    Parameters:
    clf : classifier object implementing the `fit` method.
    X_train (array-like): 2D array of input features of the training set.
    y_train (array-like): target values of the training set.
    cv : cross-validation setting, forwarded unchanged to `cross_val_score`
         (default 10, the value this notebook previously hard-coded).

    Returns:
    mean_score (float): mean of the per-fold scores.
    std_score  (float): standard deviation of the per-fold scores.
    """
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=cv)

    mean_score = scores.mean()
    std_score = scores.std()
    print(f"Mean Accuracy: {mean_score*100:0.3f} %")
    print(f"Standard Deviation: {std_score*100:0.3f} %")

    return mean_score, std_score

#### Logistic Regression

In [None]:
# Cross-validated accuracy (mean, std) for logistic regression.
log_macc, log_std = perform_kfold(clf=log)

#### KNN

In [None]:
# Cross-validated accuracy (mean, std) for k-nearest neighbours.
knn_macc, knn_std = perform_kfold(clf=knn)

#### Linear-Support Vector Machine(SVM)

In [None]:
# Cross-validated accuracy (mean, std) for the linear-kernel SVM.
lsvm_macc, lsvm_std = perform_kfold(clf=linear_svm)

#### Radial-Support Vector Machine(SVM)

In [None]:
# Cross-validated accuracy (mean, std) for the RBF-kernel SVM.
rsvm_macc, rsvm_std = perform_kfold(clf=radial_svm)

#### Naive Bayes

In [None]:
# Cross-validated accuracy (mean, std) for Gaussian naive Bayes.
nb_macc, nb_std = perform_kfold(clf=nb)

#### Decision Tree

In [None]:
# Cross-validated accuracy (mean, std) for the decision tree.
dt_macc, dt_std = perform_kfold(clf=dt)

#### Random Forest

In [None]:
# Cross-validated accuracy (mean, std) for the random forest.
rf_macc, rf_std = perform_kfold(clf=rf)

#### XGBoost

In [None]:
# Cross-validated accuracy (mean, std) for XGBoost.
xgb_macc, xgb_std = perform_kfold(clf=xgb)

#### Comparing Models after K-fold cross validation

In [None]:
# Collect every classifier's cross-validated mean accuracy and spread into
# a single table.
cv_model_names = [
    "Logistic Regression", "KNN", "SVM-Linear", "SVM-RBF",
    "Naive Bayes", "Decision Tree", "Random Forest", "XGBoost",
]
cv_mean_scores = [
    log_macc, knn_macc, lsvm_macc, rsvm_macc,
    nb_macc, dt_macc, rf_macc, xgb_macc,
]
cv_std_scores = [
    log_std, knn_std, lsvm_std, rsvm_std,
    nb_std, dt_std, rf_std, xgb_std,
]
cross_validated_models = pd.DataFrame({
    "Model": cv_model_names,
    "Mean Accuracy Score": cv_mean_scores,
    "Standard Deviation": cv_std_scores,
})

# Highest mean accuracy first; ignore_index renumbers the rows from 0.
cross_validated_models.sort_values(
    by='Mean Accuracy Score', ascending=False, ignore_index=True
)

* **So, it turns out Logistic Regression is the winner after K-fold cross-validation.**