In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the Pima Indians diabetes dataset into a DataFrame.
DATA_PATH = "../input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Display the column labels of the loaded frame.
df.columns

Toutes les colonnes sont pertinentes, nous n'en supprimerons donc aucune.

In [None]:
# Count the non-NA entries in each column to check for explicit missing values.
df.count()

Il n'y a pas de valeurs manquantes explicites, mais d'après la description du jeu de données, il y a des valeurs manquantes implicites : dans certaines colonnes, un 0 signifie en réalité NaN.  
La première chose à faire est donc d'identifier les colonnes concernées.

In [None]:
# For every column, count how many entries are exactly 0.
zero_check_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
num_missing = df[zero_check_columns].eq(0).sum()
print(num_missing)

Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI et Outcome contiennent des 0.  
Pour Pregnancies et Outcome, cela est normal ; pour les autres colonnes, ça ne l'est pas (une valeur nulle signifierait que le sujet est mort).  
Nous allons donc remplacer ces 0 par des NaN, puis remplacer ces NaN par la moyenne de la colonne correspondante.


In [None]:
from numpy import nan

# In these measurement columns a 0 is replaced by NaN so that it is
# handled as a missing value by the imputation step.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, nan)

In [None]:
# Fill every NaN with the mean of its column.
# NOTE(review): the means are computed on the whole frame, i.e. before the
# train/test split performed further down.
df = df.fillna(df.mean())

In [None]:
# Inspect the first 100 rows after imputation.
df.head(n=100)

### Création des jeux d'apprentissage et de test 

In [None]:
# Separate the target column from the feature columns.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training set and predict the test labels.
# The features are not scaled, so the default iteration limit (100) is
# typically too low for the solver to converge; raise it to avoid a
# ConvergenceWarning and a partially optimised model.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

In [None]:
# Accuracy of the logistic regression on the held-out test set.
lr_score = accuracy_score(y_test, y_lr)
print(lr_score)

In [None]:
# Per-class precision / recall / F1 for the logistic regression.
lr_report = classification_report(y_test, y_lr)
print(lr_report)

In [None]:
# Predicted class probabilities for each test row (one column per class).
probas = lr.predict_proba(X_test)
print(probas)

In [None]:
# Put the predicted probabilities next to the true label, aligned by position.
true_labels = np.array(y_test)
dfprobas = pd.DataFrame(probas, columns=['proba_0', 'proba_1']).assign(y=true_labels)
dfprobas

In [None]:
# Distribution of the predicted probability of class 1, split by true label.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat="density" is its documented replacement.
# proba_1 is used for both groups (it equals 1 - proba_0 up to rounding).
plt.figure(figsize=(10,10))
sns.histplot(dfprobas.proba_1[dfprobas.y==0], bins=50, kde=True, stat="density",
             color="C0", label="y = 0")
sns.histplot(dfprobas.proba_1[dfprobas.y==1], bins=50, kde=True, stat="density",
             color="C1", label="y = 1")
plt.xlabel("predicted probability of class 1")
plt.legend();

In [None]:
# ROC curve points and area under the curve, computed from the class-1 scores.
positive_scores = probas[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
# Draw the ROC curve together with the two reference curves.
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Receiver Operating Characteristic')
ax.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')        # worst curve
ax.plot([0, 0, 1], [0, 1, 1], 'g:')   # best curve
ax.set_xlim([-0.1, 1.2])
ax.set_ylim([-0.1, 1.2])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')

## Random Forests

In [None]:
from sklearn import ensemble

# Baseline random forest with default hyper-parameters.
# A fixed random_state makes the forest (and therefore the reported score)
# reproducible across runs; without it every execution gives a different result.
rf = ensemble.RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
rf_score = accuracy_score(y_test, y_rf)
print(rf_score)


In [None]:
# Per-class precision / recall / F1 for the baseline random forest.
rf_report = classification_report(y_test, y_rf)
print(rf_report)

### Ajustement des hyperparamètres

In [None]:
# Random forest with hand-picked hyper-parameters.
# A fixed random_state keeps the result reproducible.
rf1 = ensemble.RandomForestClassifier(
    n_estimators=20, min_samples_leaf=15, max_features=5, random_state=1
)
rf1.fit(X_train, y_train)
# Predict with the model that was just fitted (rf1); the previous code
# predicted with the baseline `rf`, so the score never reflected these settings.
y_rf1 = rf1.predict(X_test)
rf1_score = accuracy_score(y_test, y_rf1)
print(rf1_score)

In [None]:
# Per-class precision / recall / F1 for the predictions stored in y_rf1.
rf1_report = classification_report(y_test, y_rf1)
print(rf1_report)

In [None]:
from sklearn.model_selection import validation_curve

# Training / validation score of the forest as a function of n_estimators.
params = np.arange(1, 300, step=30)
# param_name and param_range must be passed by keyword: recent scikit-learn
# releases reject them as positional arguments.
# Only the training split is used so that the held-out test rows do not
# influence hyper-parameter exploration.
train_score, val_score = validation_curve(
    rf,
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=params,
    cv=7,
)
plt.figure(figsize=(12,12))
# One row per parameter value, one column per CV fold: take the median over folds.
plt.plot(params, np.median(train_score, 1), color='blue', label='training score')
plt.plot(params, np.median(val_score, 1), color='red', label='validation score')
plt.legend(loc='best')
plt.ylim(0, 1)
plt.xlabel('n_estimators')
plt.ylabel('score');

In [None]:
from sklearn import model_selection

In [None]:
# Exhaustive search over forest size and minimum leaf size.
# NOTE(review): leaf sizes of 300 and 500 are probably too large for a
# training fold to be split at all — consider a finer grid of small values.
param_grid = {
              'n_estimators': [1,100,500],
              'min_samples_leaf': [1,100,300,500]
             }
# Seed the estimator so the selected parameters are reproducible across runs.
estimator = ensemble.RandomForestClassifier(random_state=1)
rf_gs = model_selection.GridSearchCV(estimator, param_grid)

In [None]:
# Run the grid search on the training split and report the winning combination.
best_params = rf_gs.fit(X_train, y_train).best_params_
print(best_params)

In [None]:
# Score the estimator selected by the grid search on the test set.
rf2 = rf_gs.best_estimator_
y_rf2 = rf2.predict(X_test)
rf2_score = accuracy_score(y_true=y_test, y_pred=y_rf2)
print(rf2_score)

In [None]:
# Per-class precision / recall / F1 for the grid-searched forest.
rf2_report = classification_report(y_test, y_rf2)
print(rf2_report)

### Importance des caractéristiques

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending
# so the most important feature ends up at the top.
importances = rf2.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))
sorted_names = X_train.columns[indices]
plt.figure(figsize=(8, 5))
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, sorted_names)
plt.title('Importance des caracteristiques')

### XGBoost

In [None]:
import xgboost as XGB

# Gradient-boosted trees with default settings.
xgb  = XGB.XGBClassifier()
xgb.fit(X_train, y_train)
y_xgb = xgb.predict(X_test)
# Store the accuracy under its own name: reusing `rf2_score` here silently
# overwrote the random-forest score computed earlier.
xgb_score = accuracy_score(y_test, y_xgb)
print(xgb_score)

In [None]:
# Confusion matrix followed by the per-class report for XGBoost.
cm = confusion_matrix(y_test, y_xgb)
xgb_report = classification_report(y_test, y_xgb)
print(cm, xgb_report, sep="\n")

###  Support Vector Machines

In [None]:
from sklearn import svm

# Support vector classifier with default settings.
clf = svm.SVC()
clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
# Store the accuracy under its own name: reusing `rf2_score` here silently
# overwrote the random-forest score computed earlier.
svm_score = accuracy_score(y_test, y_clf)
print(svm_score)

In [None]:
# Confusion matrix followed by the per-class report for the current y_clf.
cm = confusion_matrix(y_test, y_clf)
clf_report = classification_report(y_test, y_clf)
print(cm, clf_report, sep="\n")

### Nearest Centroid Classifier

In [None]:
# Import from the public package: the private module path
# `sklearn.neighbors.nearest_centroid` no longer exists in current
# scikit-learn releases and raises ImportError.
from sklearn.neighbors import NearestCentroid

# Nearest-centroid classifier with default settings.
clf = NearestCentroid()
clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
# Store the accuracy under its own name: reusing `rf2_score` here silently
# overwrote the random-forest score computed earlier.
nc_score = accuracy_score(y_test, y_clf)
print(nc_score)

In [None]:
# Confusion matrix followed by the per-class report for the current y_clf.
cm = confusion_matrix(y_test, y_clf)
centroid_report = classification_report(y_test, y_clf)
print(cm, centroid_report, sep="\n")