In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [None]:
#modelos
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
#plotting
import seaborn as sns
import matplotlib.pyplot as plt
#dados
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score

## Lendo o dataset

In [None]:
# Load the diabetes CSV from the Kaggle input directory.
df = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

# Split the target column ("Outcome") from the predictor columns.
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Print the column index, then preview the first ten rows.
print(df.columns)
df.head(10)

## Análise de dados

In [None]:
# Summary statistics for each numeric column of the dataset.
df.describe()

### Distribuição da classe (Outcome)

In [None]:
# Histogram (no KDE) of the target column.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider histplot/countplot once the seaborn version is confirmed.
sns.distplot(a=df.Outcome, kde=False)

In [None]:
# Share of negatives (0) vs. positives (1) in the target column.
outcome_counts = df['Outcome'].value_counts()
outcome_counts / df['Outcome'].count()

### Correlação

In [None]:
# Annotated heatmap of the pairwise correlations between all columns (target included).
correlations = df.corr()
sns.heatmap(correlations, annot=True, fmt=".2f")

### Pares de dados
Separando diabéticos de não-diabéticos

In [None]:
# Scatter matrix of every column pair, coloured by Outcome.
sns.pairplot(data=df, hue="Outcome", palette="husl")

### Glucose - Outcome
Nota-se que os Outcomes positivos estão concentrados mais acima do que os Outcomes negativos

In [None]:
# One point per row: Glucose on the y axis, grouped by Outcome on the x axis.
sns.swarmplot(x='Outcome', y='Glucose', data=df)

### Glucose - BMI
Há uma certa separação entre direita e esquerda, e valores mais acima são em maioria de pessoas com Outcome positivo, especialmente em Glucose = 0 (provavelmente outliers?).

In [None]:
# Glucose against BMI, coloured by Outcome.
sns.scatterplot(x='Glucose', y='BMI', hue='Outcome', data=df)

### Age - Pregnancies
Apesar de haver uma correlação moderada entre a idade e o número de gestações, não é possível identificar nenhum padrão em relação ao Outcome

In [None]:
# Scatter plus fitted regression line of Age as a function of Pregnancies.
sns.regplot(x='Pregnancies', y='Age', data=df)

In [None]:
# Same pair of variables, now coloured by Outcome to look for class structure.
sns.scatterplot(x='Pregnancies', y='Age', hue='Outcome', data=df)

## Encontrando outliers
Algumas colunas têm valores 0 que não fazem sentido no mundo real, como o BMI (Índice de Massa Corporal)

### BMI

In [None]:
# Box plot of BMI to expose values far from the bulk of the data.
sns.boxplot(data=df, x='BMI')

In [None]:
# Hand-picked BMI limits: rows below 15 or above 50 are treated as outliers.
bmi = df['BMI']
bmiout = df[(bmi < 15) | (bmi > 50)]
# Scatter matrix of the outlier rows only, coloured by Outcome.
sns.pairplot(data=bmiout, hue='Outcome', palette='husl', diag_kind='hist')

In [None]:
# Correlations computed on the BMI outlier rows only.
bmi_outlier_corr = bmiout.corr()
sns.heatmap(bmi_outlier_corr, annot=True, fmt='.2f')

### Glucose

In [None]:
# Box plot of Glucose.
sns.boxplot(data=df, x='Glucose')

In [None]:
# IQR rule for Glucose: rows more than 1.5 * IQR outside the quartiles are outliers.
glucose = df['Glucose']
q1, q3 = glucose.quantile(0.25), glucose.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
gluout = df[(glucose < q1 - 1.5 * iqr) | (glucose > q3 + 1.5 * iqr)]

### Insulin

In [None]:
# Box plot of Insulin.
sns.boxplot(data=df, x='Insulin')

In [None]:
# IQR rule for Insulin.
insulin = df['Insulin']
q1, q3 = insulin.quantile(0.25), insulin.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
# Only the upper fence is applied for this column.
insout = df[insulin > q3 + 1.5 * iqr]
sns.pairplot(data=insout, hue='Outcome', palette='husl', diag_kind='hist')

In [None]:
# Correlations computed on the Insulin outlier rows only.
insulin_outlier_corr = insout.corr()
sns.heatmap(insulin_outlier_corr, annot=True, fmt='.2f')

### Skin Thickness

In [None]:
# Box plot of SkinThickness.
sns.boxplot(data=df, x='SkinThickness')

In [None]:
# IQR rule for SkinThickness.
skin = df['SkinThickness']
q1, q3 = skin.quantile(0.25), skin.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
sknout = df[(skin < q1 - 1.5 * iqr) | (skin > q3 + 1.5 * iqr)]

### Pregnancies

In [None]:
# Box plot of Pregnancies.
sns.boxplot(data=df, x='Pregnancies')

In [None]:
# IQR rule for Pregnancies.
pregnancies = df['Pregnancies']
q1, q3 = pregnancies.quantile(0.25), pregnancies.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
prgout = df[(pregnancies < q1 - 1.5 * iqr) | (pregnancies > q3 + 1.5 * iqr)]

### Blood Pressure

In [None]:
# Box plot of BloodPressure.
sns.boxplot(data=df, x='BloodPressure')

In [None]:
# IQR rule for BloodPressure.
pressure = df['BloodPressure']
q1, q3 = pressure.quantile(0.25), pressure.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
blpout = df[(pressure < q1 - 1.5 * iqr) | (pressure > q3 + 1.5 * iqr)]
sns.pairplot(data=blpout, hue='Outcome', palette='husl', diag_kind='hist')

In [None]:
# Correlations computed on the BloodPressure outlier rows only.
pressure_outlier_corr = blpout.corr()
sns.heatmap(pressure_outlier_corr, annot=True, fmt='.2f')

### Diabetes Pedigree Function

In [None]:
# Box plot of DiabetesPedigreeFunction.
sns.boxplot(data=df, x='DiabetesPedigreeFunction')

In [None]:
# IQR rule for DiabetesPedigreeFunction.
pedigree = df['DiabetesPedigreeFunction']
q1, q3 = pedigree.quantile(0.25), pedigree.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
dpfout = df[(pedigree < q1 - 1.5 * iqr) | (pedigree > q3 + 1.5 * iqr)]
sns.pairplot(data=dpfout, hue='Outcome', palette='husl', diag_kind='hist')

In [None]:
# Correlations computed on the DiabetesPedigreeFunction outlier rows only.
pedigree_outlier_corr = dpfout.corr()
sns.heatmap(pedigree_outlier_corr, annot=True, fmt='.2f')

### Age

In [None]:
# Box plot of Age.
sns.boxplot(data=df, x='Age')

In [None]:
# IQR rule for Age.
age = df['Age']
q1, q3 = age.quantile(0.25), age.quantile(0.75)
iqr = q3 - q1

# Upper fence, printed for reference.
print(q3 + 1.5 * iqr)
ageout = df[(age < q1 - 1.5 * iqr) | (age > q3 + 1.5 * iqr)]
sns.pairplot(data=ageout, hue='Outcome', palette='husl', diag_kind='hist')

In [None]:
# Correlations computed on the Age outlier rows only.
age_outlier_corr = ageout.corr()
sns.heatmap(age_outlier_corr, annot=True, fmt='.2f')

### Números de outliers e porcentagens por atributo

In [None]:
# Outlier frame collected for each attribute in the cells above.
outlier_frames = {
    'Pregnancies': prgout,
    'Glucose': gluout,
    'Blood Pressure': blpout,
    'Skin Thickness': sknout,
    'Insulin': insout,
    'BMI': bmiout,
    'Diabetes Pedigree Function': dpfout,
    'Age': ageout,
}
# count().max() is the largest per-column non-null count, used here as the row count.
outlier_count = {label: frame.count().max() for label, frame in outlier_frames.items()}
# Descending order: ascending sort, then reversed.
outlier_count = dict(sorted(outlier_count.items(), key=lambda pair: pair[1])[::-1])
outlier_count

In [None]:
# Fraction of the dataset flagged as outliers per attribute, rounded to 4 decimals.
total_rows = df.count().max()
outlier_percent = {key: round(outlier_count[key] / total_rows, 4) for key in outlier_count}

# Descending order: ascending sort, then reversed.
outlier_percent = dict(sorted(outlier_percent.items(), key=lambda pair: pair[1])[::-1])
outlier_percent

In [None]:
# Count missing values per column.
df.isna().sum()

### Cálculo de VIF

In [None]:
def calc_vif(tabela):
    """Compute the variance inflation factor of every column of ``tabela``.

    Parameters
    ----------
    tabela : pandas.DataFrame
        Table whose columns are passed, as a plain array, to
        ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with the column name in ``'variaveis'``
        and its factor in ``'vif'``.
    """
    valores = tabela.values
    fatores = [variance_inflation_factor(valores, idx) for idx in range(tabela.shape[1])]
    return pd.DataFrame({'variaveis': tabela.columns, 'vif': fatores})

# VIF measures collinearity among explanatory variables, so it is computed on the
# predictors X (df without the target column "Outcome") rather than on the full frame.
vif = calc_vif(X)

In [None]:
# Show the factors from highest to lowest.
vif.sort_values(by='vif', ascending=False)

### Importância das features
Foi feito um ranqueamento entre os atributos com base em um algoritmo de Random Forest

In [None]:
# Fit a random forest on the full predictor matrix purely to rank the features.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X, y)

features = forest.feature_importances_

# Pair each importance with the column it belongs to. The names come from X (the frame
# the forest was fitted on), not df, so the pairing does not depend on where the
# target column sits in df.
features_and_names = dict(
    sorted(zip(X.columns, features), key=lambda item: item[1], reverse=True)
)

In [None]:
# Display the feature -> importance mapping built in the previous cell (highest first).
features_and_names

Nota-se que os atributos mais importantes são Glucose, BMI, Age e DiabetesPedigreeFunction, enquanto Insulin e SkinThickness são os que menos influenciam.

## Distribuições

### Glucose (Outcomes positivos e negativos)
Nota-se uma considerável distinção entre a distribuição dos positivos e dos negativos

In [None]:
# Overlaid Glucose histograms (no KDE): negatives first, then positives.
is_negative = df['Outcome'] == 0
is_positive = df['Outcome'] == 1
sns.distplot(a=df.loc[is_negative, 'Glucose'], kde=False)
sns.distplot(a=df.loc[is_positive, 'Glucose'], kde=False)

### BMI (Outcomes positivos e negativos)
A distribuição dos positivos se "camufla" no meio da distribuição dos negativos

In [None]:
# Overlaid BMI histograms (no KDE): negatives first, then positives.
is_negative = df['Outcome'] == 0
is_positive = df['Outcome'] == 1
sns.distplot(a=df.loc[is_negative, 'BMI'], kde=False)
sns.distplot(a=df.loc[is_positive, 'BMI'], kde=False)

### Distribuição geral de cada atributo

In [None]:
# 3x3 grid, one panel per column of df, filled row by row.
fig, ax = plt.subplots(3, 3, figsize=(30, 15))
for i, variable in enumerate(df.columns):
    row, col = divmod(i, 3)
    sns.distplot(df[variable], ax=ax[row][col])
plt.show()

## Machine Learning
Dado que os Outcomes são valores 0 ou 1, foi decidido que classificadores são mais adequados para este problema.

### Modelos testados:
- Random Forest Classifier
- XGB Classifier
- Logistic Regression
- K Neighbors Classifier
- Decision Tree Classifier

In [None]:
# One result dictionary per metric, keyed by algorithm name and filled in by the cells below.
stats_auc, stats_ks, stats_recall = {}, {}, {}

### Random Forest Classifier

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# This split is reused by every model below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

model = RandomForestClassifier(random_state=0, n_estimators=100)
model.fit(X_train, y_train)

In [None]:
# Hard 0/1 predictions: used for recall and for the label-distribution KS test below.
preds = model.predict(X_test)
# Positive-class probabilities (column 1 — assumes model.classes_ is [0, 1]).
# roc_curve needs a continuous score: with hard labels the curve collapses to a
# single operating point and the resulting "AUC" is not a ranking metric.
scores = model.predict_proba(X_test)[:, 1]

# Quick side-by-side look at the first predictions and true labels.
print(preds[:20])
print(y_test.head(20))

fpr, tpr, _ = roc_curve(y_test, scores)
alg = 'Random Forest'
a = auc(fpr, tpr)
# Two-sample KS test between predicted and true labels: a high p-value means the two
# label distributions are similar; it does not measure class separation.
ks = stats.ks_2samp(preds, y_test)
rec = recall_score(y_test, preds)
stats_auc[alg] = a
stats_ks[alg] = ks.pvalue
stats_recall[alg] = rec
print("AUC:", a)
print("KS statistic:", ks.statistic, "pvalue:", ks.pvalue)
print("Recall:", rec)

In [None]:
# Distribution of the predicted class probabilities (all predict_proba columns).
probs = model.predict_proba(X_test)
sns.distplot(a=probs)

### XGB Classifier

In [None]:
# Gradient-boosted trees: 100 estimators, learning rate 0.05, fixed seed.
xgb = XGBClassifier(
    random_state=0,
    n_estimators=100,
    learning_rate=0.05,
)
xgb.fit(X_train, y_train)

In [None]:
# Hard 0/1 predictions: used for recall and for the label-distribution KS test below.
preds = xgb.predict(X_test)
# Positive-class probabilities (column 1 — assumes xgb.classes_ is [0, 1]).
# roc_curve needs a continuous score: with hard labels the curve collapses to a
# single operating point and the resulting "AUC" is not a ranking metric.
scores = xgb.predict_proba(X_test)[:, 1]

# Quick side-by-side look at the first predictions and true labels.
print(preds[:20])
print(y_test.head(20))

fpr, tpr, _ = roc_curve(y_test, scores)
alg = 'XGB'
a = auc(fpr, tpr)
# Two-sample KS test between predicted and true labels: a high p-value means the two
# label distributions are similar; it does not measure class separation.
ks = stats.ks_2samp(preds, y_test)
rec = recall_score(y_test, preds)
stats_auc[alg] = a
stats_ks[alg] = ks.pvalue
stats_recall[alg] = rec
print("AUC:", a)
print("KS statistic:", ks.statistic, "pvalue:", ks.pvalue)
print("Recall:", rec)

In [None]:
# Distribution of the predicted class probabilities (all predict_proba columns).
probs = xgb.predict_proba(X_test)
sns.distplot(a=probs)

### Logistic Regression

In [None]:
# Logistic regression with a very high iteration cap and a fixed seed.
lr = LogisticRegression(max_iter=1000000, random_state=0)
lr.fit(X_train, y_train)

In [None]:
# Hard 0/1 predictions: used for recall and for the label-distribution KS test below.
preds = lr.predict(X_test)
# Positive-class probabilities (column 1 — assumes lr.classes_ is [0, 1]).
# roc_curve needs a continuous score: with hard labels the curve collapses to a
# single operating point and the resulting "AUC" is not a ranking metric.
scores = lr.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, scores)
alg = 'Logistic Regression'
a = auc(fpr, tpr)
# Two-sample KS test between predicted and true labels: a high p-value means the two
# label distributions are similar; it does not measure class separation.
ks = stats.ks_2samp(preds, y_test)
rec = recall_score(y_test, preds)
stats_auc[alg] = a
stats_ks[alg] = ks.pvalue
stats_recall[alg] = rec
print("AUC:", a)
print("KS statistic:", ks.statistic, "pvalue:", ks.pvalue)
print("Recall:", rec)

In [None]:
# Distribution of the hard 0/1 predictions from the cell above.
# NOTE(review): the other models plot predict_proba output here, not predicted labels — confirm which was intended.
sns.distplot(a=preds)

### K Neighbors Classifier

In [None]:
# k-nearest neighbours with k = 13.
N_NEIGHBORS = 13
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train, y_train)

In [None]:
# Hard 0/1 predictions: used for recall and for the label-distribution KS test below.
preds = knn.predict(X_test)
# Positive-class probabilities (column 1 — assumes knn.classes_ is [0, 1]).
# roc_curve needs a continuous score: with hard labels the curve collapses to a
# single operating point and the resulting "AUC" is not a ranking metric.
scores = knn.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, scores)
alg = 'KNN'
a = auc(fpr, tpr)
# Two-sample KS test between predicted and true labels: a high p-value means the two
# label distributions are similar; it does not measure class separation.
ks = stats.ks_2samp(preds, y_test)
rec = recall_score(y_test, preds)
stats_auc[alg] = a
stats_ks[alg] = ks.pvalue
stats_recall[alg] = rec
print("AUC:", a)
print("KS statistic:", ks.statistic, "pvalue:", ks.pvalue)
print("Recall:", rec)

In [None]:
# Distribution of the predicted class probabilities (all predict_proba columns).
probs = knn.predict_proba(X_test)
sns.distplot(a=probs)

### Decision Tree Classifier

In [None]:
# Single decision tree with default settings and a fixed seed.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X=X_train, y=y_train)

In [None]:
# Hard 0/1 predictions: used for recall and for the label-distribution KS test below.
preds = tree.predict(X_test)
# Positive-class probabilities (column 1 — assumes tree.classes_ is [0, 1]).
# roc_curve needs a continuous score: with hard labels the curve collapses to a
# single operating point and the resulting "AUC" is not a ranking metric.
scores = tree.predict_proba(X_test)[:, 1]

# Quick side-by-side look at the first predictions and true labels.
print(preds[:20])
print(y_test.head(20))

fpr, tpr, _ = roc_curve(y_test, scores)
alg = 'Decision Tree'
a = auc(fpr, tpr)
# Two-sample KS test between predicted and true labels: a high p-value means the two
# label distributions are similar; it does not measure class separation.
# Argument order (preds, y_test) matches the other evaluation cells.
ks = stats.ks_2samp(preds, y_test)
rec = recall_score(y_test, preds)
stats_auc[alg] = a
stats_ks[alg] = ks.pvalue
stats_recall[alg] = rec
print("AUC:", a)
print("KS statistic:", ks.statistic, "pvalue:", ks.pvalue)
print("Recall:", rec)

In [None]:
# Distribution of the predicted class probabilities (all predict_proba columns).
probs = tree.predict_proba(X_test)
sns.distplot(a=probs)

## Resultados

Foram utilizadas 3 métricas: AUC, o p-valor do teste de Kolmogorov-Smirnov (comparando as predições com os rótulos reais) e o Recall da classe positiva. Todos os resultados estão em ordem decrescente.

### AUC

In [None]:
# Rank the algorithms by AUC, highest first (ascending sort, then reversed).
stats_auc = dict(sorted(stats_auc.items(), key=lambda pair: pair[1])[::-1])
stats_auc

### Kolmogorov-Smirnov

In [None]:
# Rank the algorithms by the stored KS p-value, highest first (ascending sort, then reversed).
stats_ks = dict(sorted(stats_ks.items(), key=lambda pair: pair[1])[::-1])
stats_ks

### Recall médio

In [None]:
# Rank the algorithms by recall, highest first (ascending sort, then reversed).
stats_recall = dict(sorted(stats_recall.items(), key=lambda pair: pair[1])[::-1])
stats_recall

Em termos de distribuição das predições, o modelo que mais se aproximou da distribuição real do dataset foi a Logistic Regression. <br><br>
Quanto ao resultado AUC, os valores mais altos foram do XGB e Logistic Regression, aproximando-se de 0.75. <br><br>
No teste KS (p-valor), a Decision Tree, o XGB e o KNN tiveram resultados próximos de 1.0, enquanto a Logistic Regression e a Random Forest tiveram resultados mais baixos, com 0.77 e 0.51, respectivamente. <br><br>
No recall médio, o XGB Classifier mostrou os maiores resultados, com 0.66, enquanto todos os outros encontram-se abaixo de 0.60.