# Diabetes Prediction 
## [Youtube Explanation Video](https://youtu.be/TtHrmEeMKeY)


## Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Importing Dataset

In [None]:
# Load the CSV into a DataFrame. The relative path expects the file under
# ../input/ (looks like a Kaggle dataset mount — confirm when running elsewhere).
dataset = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

## Exploratory Data Analysis

In [None]:
# preview the first rows of the dataset
dataset.head()

In [None]:
# (number of rows, number of columns)
dataset.shape

- We have 768 rows and 9 columns (8 input features plus the `Outcome` target).

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap
corr_matrix = dataset.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, linewidths=2)
plt.show()

- `Glucose`, `BMI`, `Age`, `Pregnancies` have noticeable positive correlation with `Outcome`

In [None]:
# check for null values: one boolean per column, True if the column has any NaN
dataset.isna().any()

In [None]:
# overview of the columns: dtype and non-null count for each one
dataset.info()

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) per column
dataset.describe()

- Important Observation(s):
    - It seems like null values are present in the form of `zeros`.
    - It's impossible for `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin` or `BMI` to be _zero_. So, we have to handle this.
- Let's get a sense of how many zero values are present in each column.

In [None]:
# Replace zero values with np.nan so they are counted as missing values
# in the null-count plot and by the later imputation step.
zero_not_accepted = ["Glucose", "BloodPressure", "SkinThickness",
                     "Insulin", "BMI"]
for col in zero_not_accepted:
    # Assign the result back instead of calling `.replace(..., inplace=True)`
    # on `dataset[col]`: the in-place call on a column selection is chained
    # assignment, which warns on recent pandas 2.x and no longer updates
    # `dataset` once Copy-on-Write is enabled.
    dataset[col] = dataset[col].replace(0, np.nan)

# check that zeros were replaced in the required columns
dataset.head(n=10)

In [None]:
# Visualising null values: one horizontal bar per column, labelled with its NaN count
null_counts = dataset.isna().sum()
plt.figure(figsize=(9,5))
ax = sns.barplot(x=null_counts, y=dataset.columns, orient='h')
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_centre = bar.get_y() + bar.get_height()/2
    # write the count just right of the bar end, vertically centred on the bar
    ax.annotate(text=f"{bar_length:.0f}",
                xy=(bar_length, bar_centre),
                xytext=(5, 0), textcoords='offset points',
                ha="left", va="center")
plt.grid(False)
plt.show()

- We can see there are a lot of null values in the `SkinThickness` and `Insulin` columns.
- So, after Imputation the `mean` will change drastically.

In [None]:
# Impute the column mean in place of the null values.
# `Series.mean()` skips NaN by default, so each mean comes from the observed
# values only. The result is assigned back rather than using
# `dataset[col].replace(..., inplace=True)`, which is chained assignment and
# stops modifying `dataset` under pandas Copy-on-Write.
for col in zero_not_accepted:
    dataset[col] = dataset[col].fillna(dataset[col].mean())

In [None]:
# summary statistics again, this time after the mean imputation above
dataset.describe()

- And as we thought, the new _means_ of `Insulin` and `SkinThickness` have increased drastically.

In [None]:
# Plot pairwise relationships in the dataset, coloured by `Outcome`.
# `sns.pairplot` is a figure-level function that builds its own figure, so no
# `plt.figure(...)` call precedes it: that call only produced an extra, empty
# figure and had no effect on the size of the pair plot.
sns.pairplot(data=dataset, hue="Outcome", diag_kind="hist")
plt.show()

In [None]:
# distribution of outcomes: number of rows per class, used to judge class imbalance
dataset["Outcome"].value_counts()

## Data Preprocessing
- There are mainly 3 things to be performed
    1. Extracting input and output features.
    2. Splitting the dataset into Training and Testing set.
    3. Feature Scaling.

### Extracting input(independent) and output(dependent) feature

In [None]:
# extracting input and output features as NumPy arrays
# X: every column except the last one; y: the last column
# (presumably `Outcome` — this relies on the target staying the last column)
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# inspect the input feature matrix
print(X)

In [None]:
# inspect the target vector
print(y)

## Handling Class Imbalance Problem using Over Sampling

In [None]:
# Resample the whole dataset with SMOTETomek (fixed seed for reproducibility).
# NOTE(review): this runs BEFORE the train/test split further down, so
# synthetic samples derived from rows that later land in the test set can end
# up in the training set (data leakage) and inflate the reported scores —
# consider resampling only the training split.
from imblearn.combine import SMOTETomek
smk = SMOTETomek(random_state=0)
X_r, y_r = smk.fit_resample(X, y)

In [None]:
# Alternative kept for reference (disabled): plain random over-sampling
# instead of SMOTETomek.
# from imblearn.over_sampling import RandomOverSampler
# om = RandomOverSampler(random_state=0)
# X_r, y_r = om.fit_resample(X, y)

In [None]:
#Check if over sampling worked: class frequencies before vs. after resampling
from collections import Counter
counts_before = Counter(y)
counts_after = Counter(y_r)
print(f"Initial counts: {counts_before}")
print(f"Resampled Counts: {counts_after}")

In [None]:
# shapes of the resampled feature matrix and label vector
print(X_r.shape, y_r.shape)

In [None]:
#point X and y at the resampled data so every later step works on it
X, y = X_r, y_r

### Splitting the Dataset into Training and Test Set

In [None]:
#split the dataset in Training set and Test set
# test_size=0.2 holds out 20% of the rows for testing;
# random_state=0 fixes the shuffle so the split is reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2,
                                                   random_state=0)

In [None]:
# shape and contents of the training features
print(X_train.shape)
print(X_train)

In [None]:
# shape and contents of the training labels
print(y_train.shape)
print(y_train)

In [None]:
# shape and contents of the test features
print(X_test.shape)
print(X_test)

In [None]:
# shape and contents of the test labels
print(y_test.shape)
print(y_test)

### Feature Scaling

As all the input features are numerical values, we will perform Standardization (Z-score normalization):

> $x' = (x - \mu)\div\sigma$

$\mu$ - mean, $\sigma$ - standard deviation


In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# fit the scaler on the training set only, then scale the training set with it
X_train = sc.fit_transform(X_train)
# the test set is transformed with the statistics learned from the training set
# (transform, not fit_transform), so nothing is re-fitted on test data
X_test = sc.transform(X_test)

In [None]:
# Sanity check of the scaling: notice the mean ~ 0 and std ~ 1 for all the
# input features. Column names are taken from the dataset, minus the last
# (target) column.
pd.DataFrame(X_train, columns=dataset.columns[:-1]).describe()

## Training Various Classification models from `sklearn`

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
#Helper functions to judge different classifiers

#function to display an exquisite confusion matrix
def disp_cm(y_test, y_pred)->float:
    """Draw the confusion matrix of the predictions as an annotated heatmap.

    The accuracy, formatted to three decimals, is used as the plot title.

    Parameters:
    y_test (array-like): list of true labels
    y_pred (array-like): list of predicted labels

    Returns:
    acc_score (float): Accuracy score
    """
    conf_mat = confusion_matrix(y_test, y_pred)
    acc_score = accuracy_score(y_test, y_pred)

    # integer cell annotations: the entries are sample counts
    sns.heatmap(conf_mat, annot=True, fmt="d")
    plt.xlabel("Predicted labels")
    plt.ylabel("Actual labels")
    plt.title(f"Accuracy: {acc_score:0.3f}")
    plt.show()

    return acc_score
#function to generate performance report of a classifier
def judge_clf(classifier, X_train=None, y_train=None,
              X_test=None, y_test=None)->float:
    """Fits the `classifier` to `X_train`, `y_train` and generates a
    classification report using `X_test` and `y_test`.

    Any dataset argument left as `None` falls back to the notebook-level
    variable of the same name, looked up when the function is *called*.
    Binding the arrays as default values instead would freeze whatever they
    were when this cell was executed, so re-running the split/scaling cells
    would silently keep evaluating on stale data.

    Parameters:
    classifier : classifier obj implementing 'fit' and 'predict' methods.
    X_train (array-like, optional): 2D-array of input features of Training Set.
    y_train (array-like, optional): list of target features of Training Set.
    X_test  (array-like, optional): 2D-array of input features of Testing Set.
    y_test  (array-like, optional): list of target features of Testing Set.

    Returns:
    acc_score (float): Accuracy score

    Raises:
    KeyError: if a dataset argument is omitted and no notebook-level variable
        of that name exists.
    """
    # Resolve omitted datasets from the current global namespace (late binding).
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars["X_train"]
    if y_train is None:
        y_train = notebook_vars["y_train"]
    if X_test is None:
        X_test = notebook_vars["X_test"]
    if y_test is None:
        y_test = notebook_vars["y_test"]

    # note: the classifier object passed in is fitted in place
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc_score = disp_cm(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    return acc_score

### Logistic Regression

In [None]:
# Logistic regression with default hyper-parameters; judge_clf fits it on the
# training set and reports its accuracy on the test set.
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(random_state=0)
log_acc = judge_clf(log_clf)

### KNN - KNearestNeighbours

In [None]:
# 5 nearest neighbours; the Minkowski metric with p=2 is the Euclidean distance
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
knn_acc = judge_clf(knn_clf)

### Linear SVM - Linear Support Vector Machine

In [None]:
# support vector classifier with a linear kernel
from sklearn.svm import SVC
lsvm = SVC(kernel="linear", random_state=0)
lsvm_acc = judge_clf(lsvm)

### Kernel SVM - Kernel Support Vector Machine
- Kernel used: RadialBasisFunction

In [None]:
# support vector classifier with an RBF kernel
# (SVC is imported in the Linear SVM cell, which must have been run first)
ksvm = SVC(kernel="rbf", random_state=0)
ksvm_acc = judge_clf(ksvm)

### Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
naiveb_acc = judge_clf(nb_clf)

### Decision Trees

In [None]:
# single decision tree that picks its splits by entropy; fixed seed
from sklearn.tree import DecisionTreeClassifier
dtree_clf = DecisionTreeClassifier(criterion="entropy",
                                             random_state=0
                                  )
dtree_acc = judge_clf(dtree_clf)

### Random Forest

In [None]:
# random forest of 100 entropy-based trees; fixed seed
from sklearn.ensemble import RandomForestClassifier
rfor_clf = RandomForestClassifier(n_estimators=100,
                                            criterion="entropy",
                                            random_state=0)
rfor_acc = judge_clf(rfor_clf)

### XGBoost

In [None]:
# gradient-boosted trees with otherwise default settings; verbosity=0 silences
# xgboost's log output
# NOTE(review): `use_label_encoder` looks deprecated in newer xgboost
# releases — confirm against the installed version
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(use_label_encoder=False,
                       verbosity=0)
xgb_acc = judge_clf(xgb_clf)

### MLP (Multi-Layer-Perceptron) classifier of `sklearn`

In [None]:
# Silence sklearn's ConvergenceWarning for the rest of the session.
# NOTE(review): this hides the warning rather than fixing its cause, so a
# model that stops before converging will no longer be reported.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# multi-layer perceptron with sklearn's default architecture; fixed seed
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(random_state=0)
mlp_acc = judge_clf(mlp_clf)

### Comparing Models

In [None]:
# Collect the test-set accuracy of every classifier in one table and show it
# sorted from best to worst. The order of the names must match the order of
# the scores. (Displayed label fixed: "Radom Forest" -> "Random Forest".)
models = pd.DataFrame({
    'Model': ["Logistic Regr", "KNN", "SVM-Linear", "SVM-RBF",
             "Naive Bayes", "Decision-Tree", "Random Forest", "XGB",
             "MLP"],
    'Accuracy Score': [log_acc, knn_acc, lsvm_acc, ksvm_acc,
                       naiveb_acc, dtree_acc, rfor_acc, xgb_acc,
                       mlp_acc]
})

models.sort_values(by = 'Accuracy Score', ascending = False, ignore_index=True)

- It seems like `Logistic Regression` performs best!
- But before we jump into any conclusions let's perform K-fold cross validation.

## K-Fold Cross Validation
- we'll apply k-fold cross validation on our _training_ set.

In [None]:
from typing import Tuple
from sklearn.model_selection import cross_val_score
def perform_kfold(clf, X_train=None, y_train=None, cv=10)->Tuple[float, float]:
    """Performs k-fold cross validation on given data(X_train, y_train) using
    the `clf` (aka classifier) and prints the mean and standard deviation of
    the fold scores.

    `X_train` / `y_train` left as `None` fall back to the notebook-level
    variables of the same name, looked up when the function is *called*.
    Binding the arrays as default values instead would freeze whatever they
    were when this cell was executed, so re-running the split/scaling cells
    would silently keep validating on stale data.

    Parameters:
    clf : classifier obj implementing 'fit' method.
    X_train (array-like, optional): 2D-array of input features of Training Set.
    y_train (array-like, optional): list of target features of Training Set.
    cv (int): number of folds, passed through to `cross_val_score`
        (default 10).

    Returns:
    mean_score (float): Mean of Accuracy scores after operation.
    std_score  (float): Standard Deviation of Accuracy scores.

    Raises:
    KeyError: if a dataset argument is omitted and no notebook-level variable
        of that name exists.
    """
    # Resolve omitted datasets from the current global namespace (late binding).
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars["X_train"]
    if y_train is None:
        y_train = notebook_vars["y_train"]

    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                            cv=cv #number of folds
                            )

    mean_score = scores.mean()
    std_score = scores.std()
    print(f"Mean Accuracy: {mean_score*100:0.3f} %")
    print(f"Standard Deviation: {std_score*100:0.3f} %")

    return mean_score, std_score
    

### Logistic Regression

In [None]:
# cross-validated mean accuracy and standard deviation for logistic regression
log_macc, log_std = perform_kfold(log_clf)

### KNN 

In [None]:
# cross-validated mean accuracy and standard deviation for KNN
knn_macc, knn_std = perform_kfold(knn_clf)

### SVM with Linear kernel

In [None]:
# cross-validated mean accuracy and standard deviation for the linear-kernel SVM
lsvm_macc, lsvm_std = perform_kfold(lsvm)

### SVM with Non-Linear kernel
- Kernel used: RadialBasisFunction

In [None]:
# cross-validated mean accuracy and standard deviation for the RBF-kernel SVM
ksvm_macc, ksvm_std = perform_kfold(ksvm)

### Naive Bayes

In [None]:
# cross-validated mean accuracy and standard deviation for Gaussian naive Bayes
naiveb_macc, naiveb_std = perform_kfold(nb_clf)

### Decision Tree

In [None]:
# cross-validated mean accuracy and standard deviation for the decision tree
dtree_macc, dtree_std = perform_kfold(dtree_clf)

### Random Forest

In [None]:
# cross-validated mean accuracy and standard deviation for the random forest
rfor_macc, rfor_std = perform_kfold(rfor_clf)

### XGBoost

In [None]:
# cross-validated mean accuracy and standard deviation for XGBoost
xgb_macc, xgb_std = perform_kfold(xgb_clf)

### MLP Classifier

In [None]:
# cross-validated mean accuracy and standard deviation for the MLP classifier
mlp_macc, mlp_std = perform_kfold(mlp_clf)

### Comparing Models after K-fold cross validation

In [None]:
# Collect the cross-validation results (mean accuracy and standard deviation)
# of every classifier in one table and show it sorted by mean accuracy, best
# first. The order of the names must match the order of the scores.
# (Displayed labels fixed: "Radom Forest" -> "Random Forest",
# "Stadard Dev" -> "Standard Dev".)
models_1 = pd.DataFrame({
    "Model": ["Logistic Regr", "KNN", "SVM-Linear", "SVM-RBF",
             "Naive Bayes", "Decision-Tree", "Random Forest", "XGB",
             "MLP"],
    "Mean Accuracy Score": [log_macc, knn_macc, lsvm_macc, ksvm_macc,
                       naiveb_macc, dtree_macc, rfor_macc, xgb_macc,
                       mlp_macc],
    "Standard Dev": [log_std, knn_std, lsvm_std, ksvm_std,
                       naiveb_std, dtree_std, rfor_std, xgb_std,
                       mlp_std]
})

models_1.sort_values(by = 'Mean Accuracy Score', ascending = False,
                     ignore_index=True)

- So, it turns out `MLPClassifier` and `Random Forest` are the winners after K-fold cross-validation, as both reach a similar mean accuracy.

>Note: The standard deviation has been reduced after oversampling!