In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

# Pima Indians Diabetes Database

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. 
The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. 
Several constraints were placed on the selection of these instances from a larger database. 
In particular, all patients here are females at least 21 years old of Pima Indian heritage.

# Visualization of the data

In [None]:
# Source dataset: https://www.kaggle.com/uciml/pima-indians-diabetes-database
DATA_PATH = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(DATA_PATH)
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

The first thing to note is the size of the dataset (here: 768 rows).
Next, we look for missing values. Here, we can see that every column is fully populated.
Finally, we can see the type of each column.
This will help us if we later normalize the data.

In [None]:
# Summary statistics per column, transposed so that each feature is one row.
df.describe().T

# Permutation importance

Here, we are going to see which columns matter most for the prediction.
In order to do that, we are going to use a random forest classifier.
We select one column and randomly shuffle its values, then we measure the accuracy of our model again.
A feature is considered "important" if the accuracy of our model drops.
Conversely, a feature is considered "unimportant" if the accuracy of our model is barely affected.

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Features: every column except the last one.
X = df.iloc[:, :-1]
# Target: the last column (Outcome).
y = df.iloc[:, -1]

# Hold out 20% of the rows so importance is measured on data the model has not seen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit on the training split only; fitting on all of X would score the forest
# on rows it has already memorised and make the split above pointless.
my_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Permutation importance is evaluated on the held-out split.
perm = PermutationImportance(my_model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())

We can see that "Glucose", "BMI", "Age" and "DiabetesPedigreeFunction" are the 4 most important features in our data.

In order to see the relationship between "Glucose" and "Outcome", we can plot the distribution of the glucose values for each outcome (diabetes or no diabetes).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one shaded density curve of Glucose per outcome on the same axes:
# red for Outcome == 0, blue for Outcome == 1.
g = None
for outcome, colour in ((0, "Red"), (1, "Blue")):
    rows = (df["Outcome"] == outcome) & (df["Glucose"].notnull())
    # ax=None on the first pass lets seaborn use the current axes.
    g = sns.kdeplot(df["Glucose"][rows], color=colour, ax=g, shade=True)

g.set_xlabel("Glucose")
g.set_ylabel("Frequency")
# Legend entries follow the drawing order above.
g = g.legend(["No Diabete", "Diabete"])

With this graph, we can see that the higher the glucose value, the more likely the person is to have diabetes.

In [None]:
# Pairwise scatter plots coloured by Outcome, with KDE curves on the diagonal.
# `size` was the deprecated alias of `height` and is rejected by current seaborn
# releases; `height=3` alone gives the same facet size.
g = sns.pairplot(df, hue="Outcome", palette="Set2", diag_kind="kde", height=3)

With this graph, we can visualize our data pair by pair, split by the two values of the Outcome.
With the "Set2" palette, the first colour (green) is Outcome 0, people without diabetes, and the second colour (orange) is Outcome 1, people with diabetes; the legend of the plot confirms the mapping.
When we visualize our data this way, we can see whether some combinations of features are useful for describing this problem.

# Preparation of the pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column selector usable as a Pipeline step; written because (per the original
# author) Scikit-Learn did not handle DataFrames directly.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the requested columns of a DataFrame."""

    def __init__(self, attribute_names):
        # Stored under the same name as the parameter so BaseEstimator can
        # introspect it.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored column names."""
        wanted = self.attribute_names
        return X[wanted]

In our pipeline, we select all the features we want to take.

We could add an imputer, which replaces missing values with a default value.
However, we have seen that no values are missing in our data, so we don't need it.


Then, we apply a normalization method, here the "Standard Scaler".
It standardizes features by removing the mean and scaling to unit variance.

The standard score of a sample $x$ is calculated as:

$$z = \frac{x - u}{s}$$

where $u$ is the mean of the training samples and $s$ is their standard deviation.

We could also use another approach to normalize our data, such as MinMaxScaler.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MinMaxScaler

# Columns fed to the models ("Outcome" is the target and is left out).
NUMERIC_FEATURES = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Pregnancies",
]

# Two steps: pick the columns, then standardize them.
# No imputer step, because no values are missing; one could be inserted between
# the two steps, e.g. ("imputer", SimpleImputer(strategy="median")).
preprocess_pipeline = Pipeline(steps=[
    ("select_numeric", DataFrameSelector(NUMERIC_FEATURES)),
    ("Standardization", StandardScaler()),
])



## Split our data in two sets (training and test)

Here we split our data in two sets. One for the training and another for the test.

In [None]:
# Create two sets: 80% for training, 20% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the scaling statistics (mean / standard deviation) on the training split only...
X_train = preprocess_pipeline.fit_transform(X_train)
# ...and reuse them on the test split. Calling fit_transform here would re-fit
# the scaler on the test data, so the two splits would be scaled differently
# and test-set statistics would leak into the evaluation.
X_test = preprocess_pipeline.transform(X_test)

# Use some machine learning algorithm

## Support Vector Machine

First of all, we are going to run a SVM with no optimization. 

In [None]:
from sklearn.svm import SVC

# Baseline SVM with untuned parameters; fit() returns the estimator itself.
svm_clf = SVC(gamma="auto").fit(X_train, y_train)
svm_clf

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline SVM on the training split.
svm_scores = cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10)
np.mean(svm_scores)

### Optimization of the SVM

In order to optimize our SVM, we need to define some parameters.
Here, we will use a rbf kernel.

#### Parameter : C

C is the penalty parameter, which represents the misclassification or error term. It tells the SVM optimisation how much error is tolerable. This is how you control the trade-off between a smooth decision boundary and classifying the training points correctly.

Below, we have an example of different values of the C parameter.
We can see that when C is high the model classifies all the data points correctly, but there is also a risk of overfitting.

![image](https://miro.medium.com/max/800/0*08KrYhXpVQdUXWrX)


#### Parameter : Gamma

It defines how far the influence of a single training example reaches when the separation boundary is computed.
When gamma is high, only nearby points have a strong influence; when gamma is low, far-away points are also taken into account to determine the decision boundary.

![image](https://miro.medium.com/max/1370/1*6HVomcqW7BWuZ2vvGOEptw.png)


In order to optimize our model, we need to specify some candidate values. The GridSearchCV class tests all the possible combinations of these parameters. This allows us to obtain the best parameters at the end.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for an RBF-kernel SVM; every combination is tried.
parameters = dict(
    kernel=['rbf'],
    C=[0.001, 0.01, 0.1, 1, 10, 15, 20],
    gamma=[0.001, 0.01, 0.1, 1, 10],
)

# Exhaustive search with 10-fold cross-validation, using all available cores.
search = GridSearchCV(estimator=SVC(), param_grid=parameters, cv=10, n_jobs=-1)
svm_clf = search.fit(X_train, y_train)

We are now going to display the best combination of parameters.

In [None]:
# Parameter combination with the best mean cross-validated score.
svm_clf.best_params_

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Score the tuned SVM on the held-out split: (accuracy, F1).
y_pred = svm_clf.predict(X_test)
tuple(metric(y_test, y_pred) for metric in (accuracy_score, f1_score))

In [None]:
# Confusion matrix of the current test-set predictions held in y_pred.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.model_selection import cross_val_score

# svm_clf is the GridSearchCV object here, so the whole parameter search is
# re-run inside each of the 10 outer folds.
svm_scores = cross_val_score(estimator=svm_clf, X=X_train, y=y_train, cv=10)
np.mean(svm_scores)

# Random Forest Classifier

In [None]:
# Baseline random forest (100 trees), scored with 10-fold cross-validation.
forest_clf = RandomForestClassifier(random_state=42, n_estimators=100)
forest_scores = cross_val_score(estimator=forest_clf, X=X_train, y=y_train, cv=10)
np.mean(forest_scores)

In [None]:
# Try forests of 10, 20, ..., 190 trees.
parameters = {'n_estimators': list(range(10, 200, 10))}

# Exhaustive search with 10-fold cross-validation, using all available cores.
search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
)
forest_clf = search.fit(X_train, y_train)

# Best-scoring parameter combination.
print(forest_clf.best_params_)

In [None]:
# forest_clf is the GridSearchCV object that was already fitted on
# (X_train, y_train) when it was created; fitting it again would repeat the
# entire grid search for an identical result (fixed random_state), so we
# predict with it directly.
y_pred = forest_clf.predict(X_test)

# Confusion matrix of the tuned forest on the held-out split.
cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n', cm)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Building the display
# from confusion_matrix + ConfusionMatrixDisplay (both available since 0.22)
# draws the same figure on old and new releases.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix


class_names = ["No Diabete", "Diabet"]

np.set_printoptions(precision=2)

# (title, normalize) pairs: raw counts first, then rows normalized over the
# true class.
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]

# Predict once; both matrices are computed from the same predictions.
forest_test_pred = forest_clf.predict(X_test)

for title, normalize in titles_options:
    matrix = confusion_matrix(y_test, forest_test_pred, normalize=normalize)
    disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                  display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()


# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the training split, predict the held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Test-set accuracy and F1, followed by the confusion matrix.
cm = confusion_matrix(y_test, y_pred)
print(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred))
print('Confusion matrix\n', cm)

# Mean accuracy over 10-fold cross-validation on the training split.
gnb_scores = cross_val_score(estimator=gnb, X=X_train, y=y_train, cv=10)
print(np.mean(gnb_scores))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline KNN with 5 neighbours.
neigh_clf = KNeighborsClassifier(n_neighbors=5)
y_pred = neigh_clf.fit(X_train, y_train).predict(X_test)

# Recompute the confusion matrix for THESE predictions; without this line the
# printout below would show the matrix left over from an earlier model.
cm = confusion_matrix(y_test, y_pred)

print(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred))
print('Confusion matrix\n', cm)

## Optimize KNN

We are going to search the best parameter for the KNN algorithm.
Here, we are going to focus on the number of neighbors.

In [None]:
# Search the number of neighbours from 1 to 49 with 5-fold cross-validation.
knn2 = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 50)}
knn_gscv = GridSearchCV(estimator=knn2, param_grid=param_grid, cv=5)
knn_gscv = knn_gscv.fit(X_train, y_train)

In [None]:
# Best parameter combination, then its mean cross-validated score.
print(knn_gscv.best_params_, knn_gscv.best_score_, sep="\n")

Now, we can use the best parameter for our system.

In [None]:
# Take the neighbour count from the grid search instead of hard-coding it, so
# this cell stays correct if the data, split or search grid changes.
best_k = int(knn_gscv.best_params_["n_neighbors"])
neigh_clf = KNeighborsClassifier(n_neighbors=best_k)
y_pred = neigh_clf.fit(X_train, y_train).predict(X_test)

# Recompute the confusion matrix for the tuned KNN; without this line the
# printout below would show the matrix left over from an earlier model.
cm = confusion_matrix(y_test, y_pred)

print(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred))
print('Confusion matrix\n', cm)

# Mean accuracy over 10-fold cross-validation on the training split.
knn_scores = cross_val_score(neigh_clf, X_train, y_train, cv=10)
print(knn_scores.mean())

# Classifiers Comparison

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2. The curves are built from
# roc_curve / auc / RocCurveDisplay (all available since 0.22) so the cell runs
# on old and new releases alike.
from sklearn.metrics import RocCurveDisplay, auc, roc_curve


def _plot_roc(clf, X, y, name, ax):
    """Draw the ROC curve of a fitted binary classifier on ``ax``.

    Uses the positive-class probability when the classifier exposes
    ``predict_proba`` and falls back to ``decision_function`` otherwise
    (e.g. an SVC trained without probability estimates).

    Returns the RocCurveDisplay that was drawn.
    """
    if hasattr(clf, "predict_proba"):
        scores = clf.predict_proba(X)[:, 1]
    else:
        scores = clf.decision_function(X)
    fpr, tpr, _ = roc_curve(y, scores)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc(fpr, tpr),
                              estimator_name=name)
    return display.plot(ax=ax, alpha=0.8)


ax = plt.gca()
# Explicit names: three of these models are GridSearchCV wrappers and would
# otherwise all carry the same "GridSearchCV" legend label.
forest_clf_roc_curve = _plot_roc(forest_clf, X_test, y_test, "Random Forest", ax)
svm_clf_roc_curve = _plot_roc(svm_clf, X_test, y_test, "SVM", ax)
bay_clf_curve = _plot_roc(gnb, X_test, y_test, "Naive Bayes", ax)
knn_clf_curve = _plot_roc(knn_gscv, X_test, y_test, "KNN", ax)
plt.show()

Based on the AUC (Area Under the Curve), we can see that the SVM is better than the others.