# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.metrics import plot_confusion_matrix
from scipy.stats import norm, boxcox
from collections import Counter
from scipy import stats
from pandas_profiling import ProfileReport
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# Loading Dataset

In [None]:
# Location of the Pima Indians Diabetes CSV, relative to the notebook.
DATASET_CSV = '../input/pima-indians-diabetes-database/diabetes.csv'
dataset = pd.read_csv(DATASET_CSV)

# Exploratory Data Analysis


## 1) Using Manual Methods

In [None]:
# Preview the first rows to check that the CSV was parsed into sensible columns.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()


In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# True if at least one cell in the frame is NaN.
# NOTE(review): this only detects NaN; values such as 0 used as a placeholder
# for a missing measurement would not be caught -- TODO confirm against the data.
dataset.isnull().values.any()


## Plotting Count for Each Outcome Class

In [None]:
# Class balance: number of rows for each value of the Outcome column.
sns.set_style('whitegrid')
outcome_fig, outcome_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x="Outcome", palette='husl', ax=outcome_ax);

## Finding Correlation among the variables

In [None]:
# Pairwise correlations between all columns, computed once and reused below.
correlations = dataset.corr()
# np.triu keeps the upper triangle (diagonal included) and zeroes the rest;
# seaborn hides every cell whose mask entry is non-zero, so only the part
# below the diagonal is drawn.
matrix = np.triu(correlations)
plt.figure(figsize=(20, 17))
sns.heatmap(correlations, mask=matrix, annot=True, linewidth=.8, cmap="rocket");


## Visualising Numerical Data

In [None]:
def boxPlotter(columnName):
    """Draw a box plot of one column, split by the ``Outcome`` class.

    Reads the module-level ``dataset`` frame. The ``Outcome`` column itself is
    skipped, because it is already used as the grouping variable on the x axis.

    Parameters
    ----------
    columnName : str
        Name of the column in ``dataset`` to plot on the y axis.
    """
    if columnName == 'Outcome':
        return
    sns.catplot(data=dataset, x="Outcome", y=columnName, kind="box")


In [None]:
# One box plot per column; boxPlotter itself skips the Outcome column.
for column in dataset.columns:
    boxPlotter(column)

### Number of Pregnancies with Count

In [None]:
# Count how many rows share each value of the Pregnancies column.
# value_counts().reset_index() names its two columns differently before and
# after pandas 2.0, so the names are pinned explicitly here: "index" holds the
# number of pregnancies and "Pregnancies" holds the row count, whichever pandas
# version is installed.
pregnancy_count = (
    dataset["Pregnancies"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="Pregnancies")
)
pregnancy_count

In [None]:
# Bar chart of the counts: x = number of pregnancies, y = number of rows.
plt.figure(figsize=(30, 10))
plt.style.use("ggplot")  # switches the matplotlib style for later figures too
pregnancy_ax = sns.barplot(data=pregnancy_count, x="index", y="Pregnancies")
pregnancy_ax.set_title("TYPE OF PREGNANCIES WITH COUNT", fontsize=20)
pregnancy_ax.set_xlabel("PREGNANCIES", fontsize=20)
pregnancy_ax.set_ylabel("COUNT", fontsize=20)
plt.show()


## Plotting Distribution for each Column in dataset

In [None]:
def distributionPlot(columnName):
    """Overlay the distribution of one column for the two ``Outcome`` classes.

    Reads the module-level ``dataset`` frame and opens a new figure on every
    call. Rows with ``Outcome == 1`` are drawn first in dark turquoise, rows
    with ``Outcome == 0`` second in light coral, and the legend is given the
    labels 'Diabetes' and 'No Diabetes' in that order. The ``Outcome`` column
    itself is skipped.

    Parameters
    ----------
    columnName : str
        Name of the column in ``dataset`` whose distribution is plotted.
    """
    if columnName == 'Outcome':
        return
    positive_rows = dataset.Outcome == 1
    negative_rows = dataset.Outcome == 0
    plt.figure()
    sns.distplot(dataset.loc[positive_rows, columnName],
                 color="darkturquoise", rug=True)
    sns.distplot(dataset.loc[negative_rows, columnName],
                 color="lightcoral", rug=True)
    plt.legend(['Diabetes', 'No Diabetes'])

In [None]:
# One distribution figure per column; distributionPlot skips the Outcome column.
for column in dataset.columns:
    distributionPlot(column)

## Plotting Pairplot

In [None]:
# Scatter-plot matrix of every pair of columns, coloured by the Outcome class.
sns.pairplot(dataset, hue="Outcome", palette="husl");

## Skewness Correction
Some columns in the dataset are noticeably skewed. The cells below apply a Box-Cox transform to those columns to reduce the skewness, and plot each distribution before and after the correction.

In [None]:
def _skewnessPlots(columnName, color, titleSuffix):
    """Draw the distribution plot and normal probability plot for one column."""
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 1)
    sns.distplot(dataset[columnName], fit=norm, color=color)
    plt.title(columnName.capitalize() + titleSuffix, color="black")
    plt.subplot(1, 2, 2)
    stats.probplot(dataset[columnName], plot=plt)
    plt.show()


def skewnessCorrector(columnName):
    """Box-Cox transform one column of ``dataset`` and report before/after.

    For the given column this prints the mean and standard deviation of a
    normal distribution fitted to the data, draws a distribution plot next to
    a normal probability plot, replaces the column in the module-level
    ``dataset`` with its Box-Cox transform, and then repeats the same report
    on the transformed values.

    Parameters
    ----------
    columnName : str
        Name of the column in ``dataset`` to transform.

    Notes
    -----
    The column is overwritten in place, so calling this function twice on the
    same column transforms the already-transformed values again.
    ``scipy.stats.boxcox`` only accepts strictly positive data.
    """
    label = columnName.capitalize()

    print('''Before Correcting''')
    (mu, sigma) = norm.fit(dataset[columnName])
    print("Mu before correcting {} : {}, Sigma before correcting {} : {}".format(
        label, mu, label, sigma))
    _skewnessPlots(columnName, "lightcoral",
                   " Distplot before Skewness Correction")

    # boxcox returns (transformed values, fitted lambda); only the values are kept.
    dataset[columnName], _ = boxcox(dataset[columnName])

    print('''After Correcting''')
    # Re-fit on the transformed column; without this the message below would
    # repeat the pre-transform mean and standard deviation.
    (mu, sigma) = norm.fit(dataset[columnName])
    print("Mu after correcting {} : {}, Sigma after correcting {} : {}".format(
        label, mu, label, sigma))
    _skewnessPlots(columnName, "orange",
                   " Distplot After Skewness Correction")


In [None]:
# Columns that receive the Box-Cox skewness correction.
# skewnessCorrector overwrites each column inside `dataset`, so running this
# cell a second time transforms the already-transformed values again; reload
# the CSV before re-running it.
skewColumnList = ['DiabetesPedigreeFunction', 'Age']
for columns in skewColumnList:
    skewnessCorrector(columns)


## 2) Using Pandas Profiling

In [None]:
!pip install pandas_profiling

In [None]:
# Build the automated profiling report for the whole frame; as the last
# expression of the cell it is displayed as the cell output.
ProfileReport(dataset)

# Data Preprocessing

In [None]:
# Features are every column except the last one; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_frame.values
y = target_column.values

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
# (rows, feature columns) of the feature matrix.
X.shape

In [None]:
# Length of the target vector; its row count should match X.
y.shape

## Splitting Dataset into Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set. random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)


## Standardizing Independent Variables

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# The scaler is fitted on the training rows only and then applied unchanged to
# the test rows, so no statistics of the test set enter the scaling.
# X_train / X_test are overwritten with their scaled versions.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


## Dimensionality Reduction

### Applying PCA

In [None]:
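# Disabled alternative to the LDA step further down: uncomment to reduce the
# features with PCA instead. Enable only one dimensionality-reduction step.
# from sklearn.decomposition import PCA
# pca = PCA(n_components = 2)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)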

### Applying Kernel PCA

In [None]:
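# Disabled alternative to the LDA step further down: uncomment to reduce the
# features with Kernel PCA instead. Enable only one dimensionality-reduction step.
# from sklearn.decomposition import KernelPCA
# kpca = KernelPCA(n_components = 1, kernel = 'rbf')
# X_train = kpca.fit_transform(X_train)
# X_test = kpca.transform(X_test)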

### Applying LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Project the features onto a single discriminant component.
lda = LDA(n_components=1)
# LDA is supervised: it is fitted with the training labels, and the fitted
# projection is then applied to the test rows.
# X_train / X_test are overwritten with their projected versions.
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Training Classifiers on Training Set and drawing Inference

In [None]:
# Mean cross-validation accuracy (in percent) of every model trained so far,
# keyed by the fitted estimator object.
accuracy_scores = {}
def predictor(predictor, params):
    """Train one classifier, print its test-set metrics and cross-validate it.

    The classifier is fitted on the module-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``. The function prints the predictions
    next to the true labels, the confusion matrix, a classification report,
    the test accuracy, and the mean and standard deviation of a 10-fold
    cross-validation on the training set. The mean cross-validation accuracy
    (in percent) is stored in ``accuracy_scores`` under the estimator object.

    Parameters
    ----------
    predictor : str
        Model key: 'lr' (logistic regression), 'svm' (support vector
        machine), 'knn' (k-nearest neighbours), 'dt' (decision tree),
        'nb' (Gaussian naive Bayes) or 'rfc' (random forest).
    params : dict
        Keyword arguments forwarded to the estimator's constructor.

    Raises
    ------
    ValueError
        If ``predictor`` is not one of the supported model keys.
    """
    if predictor == 'lr':
        print('Training Logistic Regression on Training Set')
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(**params)

    elif predictor == 'svm':
        print('Training Support Vector Machine on Training Set')
        from sklearn.svm import SVC
        classifier = SVC(**params)

    elif predictor == 'knn':
        print('Training K-Nearest Neighbours on Training Set')
        from sklearn.neighbors import KNeighborsClassifier
        classifier = KNeighborsClassifier(**params)

    elif predictor == 'dt':
        print('Training Decision Tree Classifier on Training Set')
        from sklearn.tree import DecisionTreeClassifier
        classifier = DecisionTreeClassifier(**params)

    elif predictor == 'nb':
        print('Training Naive Bayes Classifier on Training Set')
        from sklearn.naive_bayes import GaussianNB
        classifier = GaussianNB(**params)

    elif predictor == 'rfc':
        print('Training Random Forest Classifier on Training Set')
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(**params)

    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown predictor {!r}; expected one of "
            "'lr', 'svm', 'knn', 'dt', 'nb', 'rfc'".format(predictor))

    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    from sklearn.model_selection import cross_val_score

    classifier.fit(X_train, y_train)

    print('''Predicting Test Set Result''')
    y_pred = classifier.predict(X_test)
    # Two columns side by side: predicted label, true label.
    result = np.column_stack((y_pred, y_test))
    print(result, '\n')

    print('''Making Confusion Matrix''')
    cm = confusion_matrix(y_test, y_pred)
    print(cm, '\n')
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 in
    # favour of ConfusionMatrixDisplay.from_estimator; it is kept here because
    # the notebook imports it at the top.
    plot_confusion_matrix(classifier, X_test, y_test, cmap="pink")
    # scikit-learn puts true labels on the rows and predictions on the columns,
    # so for the labels 0/1 the matrix reads [[TN, FP], [FN, TP]].
    print('True Positives :', cm[1][1])
    print('False Positives :', cm[0][1])
    print('False Negatives :', cm[1][0])
    print('True Negatives :', cm[0][0], '\n')

    print('''Classification Report''')
    print(classification_report(y_test, y_pred,
          target_names=['0', '1'], zero_division=1))

    print('''Evaluating Model Performance''')
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy, '\n')

    print('''Applying K-Fold Cross validation''')
    accuracies = cross_val_score(
        estimator=classifier, X=X_train, y=y_train, cv=10)
    mean_accuracy = accuracies.mean() * 100
    print("Accuracy: {:.2f} %".format(mean_accuracy))
    accuracy_scores[classifier] = mean_accuracy
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100), '\n')


## Training Logistic Regression on Training Set

In [None]:
# L1-penalised logistic regression; liblinear is a solver that supports L1.
predictor('lr', {'penalty': 'l1', 'solver': 'liblinear'})

## Training SVM on Training Set

In [None]:
# Linear-kernel SVM.
# NOTE(review): per the scikit-learn docs gamma is the coefficient of the
# 'rbf', 'poly' and 'sigmoid' kernels, so it should have no effect here.
predictor('svm', {'C': .25, 'gamma': 0.4,
          'kernel': 'linear', 'random_state': 0})

## Training Kernel SVM on Training Set

In [None]:
# RBF-kernel SVM (listed as "K-SVC" in the result tables further down).
predictor('svm', {'C': 0.25, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 0})

## Training K-Nearest Neighbours on Training Set

In [None]:
# k-nearest neighbours with k = 9; 'uniform' gives every neighbour equal weight.
predictor('knn', {'algorithm': 'auto', 'n_jobs': 1,
          'n_neighbors': 9, 'weights': 'uniform'})

## Training Decision Tree on Training Set

In [None]:
# Decision tree. 'sqrt' is what the 'auto' alias meant for
# DecisionTreeClassifier; 'auto' was deprecated in scikit-learn 1.1 and later
# removed, so the explicit name keeps this cell working on newer versions
# without changing the model.
predictor('dt', {'criterion': 'gini', 'max_features': 'sqrt',
          'splitter': 'best', 'random_state': 0})

## Training Naive Bayes on Training Set

In [None]:
# Gaussian naive Bayes with the estimator's default settings (no overrides).
predictor('nb', {})

## Training Random Forest Classifier on Training Set

In [None]:
# Random forest of 50 trees; random_state makes the result repeatable.
predictor('rfc', {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 50, 'random_state': 0})

### Finding which Classifier performed best

In [None]:
# Pick the estimator whose mean cross-validation accuracy is the highest.
maxKey = max(accuracy_scores, key=accuracy_scores.get)
best_model_message = 'The model with highest K-Fold Validation Accuracy score is  {0} with an accuracy of  {1:.2f}'
print(best_model_message.format(maxKey, accuracy_scores[maxKey]))


## Accuracy Before HyperParamTuning and Without Applying Dimensionality Reduction
- LogisticRegression: 76.87466948704389,
- SVC: 77.20253833950291,
- K-SVC: 76.05499735589636,
- KNeighborsClassifier: 72.31623479640402,
- DecisionTreeClassifier: 68.725542041248,
- GaussianNB: 75.08461131676361,
- RandomForestClassifier: 75.06874669487044

## Accuracy After HyperParamTuning and Without Applying Dimensionality Reduction 
- LogisticRegression: 76.87466948704389,
- SVC: 77.20253833950291,
- K-SVC: 76.05499735589636,
- KNeighborsClassifier: 72.96932839767318,
- DecisionTreeClassifier: 71.64992067689053,
- GaussianNB: 75.08461131676361,
- RandomForestClassifier: 76.53886832363828

## Accuracy After HyperParamTuning and Applying PCA
- LogisticRegression: 71.48863035430988,
- SVC: 71.48598625066103,
- K-SVC: 71.65520888418826,
- KNeighborsClassifier: 67.42728714965625,
- DecisionTreeClassifier: 63.85510312004231,
- GaussianNB: 72.1390798519302,
- RandomForestClassifier: 64.48704389212057

## Accuracy After HyperParamTuning and Applying Kernel PCA
- LogisticRegression: 70.35166578529879,
- SVC: 68.71760973030143,
- K-SVC: 65.31200423056583,
- KNeighborsClassifier: 66.13167636171339,
- DecisionTreeClassifier: 64.68270756213644,
- GaussianNB: 71.82707562136436,
- RandomForestClassifier: 67.26335272342675

## Accuracy After HyperParamTuning and Applying LDA
- LogisticRegression: 77.35854045478582,
- SVC: 77.03331570597568,
- K-SVC: 78.01427815970385,
- KNeighborsClassifier: 76.22157588577473,
- DecisionTreeClassifier: 72.15494447382338,
- GaussianNB: 77.03067160232682,
- RandomForestClassifier: 72.15494447382338

## Accuracy After Applying LDA and Again HyperParamTuning
- LogisticRegression: 77.35854045478582,
- SVC: 77.03331570597568,
- K-SVC: 78.33685880486514,
- KNeighborsClassifier: 76.55208884188261,
- DecisionTreeClassifier: 72.15494447382338,
- GaussianNB: 77.03067160232682,
- RandomForestClassifier: 72.15494447382338

## Plotting Bar Chart for Accuracies of different classifiers

In [None]:
# Horizontal bar chart of the mean cross-validation accuracy of every model.
model_accuracies = list(accuracy_scores.values())
model_names = ['LogisticRegression', 'SVC',
               'K-SVC', 'KNN', 'Decisiontree', 'GaussianNB', 'RandomForest']
# The labels are paired with the scores purely by position, which is only valid
# when every training cell above ran exactly once and in order. Re-running a
# training cell adds another entry to accuracy_scores, so check before plotting.
if len(model_accuracies) != len(model_names):
    raise ValueError(
        "accuracy_scores holds {} entries but {} model names are listed; "
        "restart the kernel and run every training cell exactly once".format(
            len(model_accuracies), len(model_names)))
plt.figure(figsize=(12, 6))
accuracy_ax = sns.barplot(x=model_accuracies, y=model_names, palette='mako')
accuracy_ax.set_xlabel('Mean 10-fold cross-validation accuracy (%)')
accuracy_ax.set_ylabel('Classifier');

# Summary
- K-SVC (SVM with an RBF kernel) performed best on this dataset, with a cross-validation accuracy of 78.34%.
- Logistic Regression was just behind, with an accuracy of 77.36%.
# **Please give your feedback by commenting below.**