In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount recursively and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pylab
# Default figure size (width, height in inches) for figures that do not set their own.
pylab.rcParams['figure.figsize'] = (15.0, 10.0)

import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from numpy/pandas/seaborn/sklearn;
# consider filtering only the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes CSV from the Kaggle input mount, then preview the first rows.
df = pd.read_csv(
    "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
)
df.head()

In [None]:
# Concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# One 10-bin histogram per numeric column, to inspect each distribution.
df.hist(bins=10)

In [None]:
# Number of missing (NaN) entries in each column.
# NOTE(review): only NaN is detected here; if this dataset encodes missing
# measurements as 0 (verify against the data dictionary) they are not counted.
df.isna().sum()

In [None]:
# Vertical box plot for every column of df on one shared y axis; the columns are
# plotted on their raw scales, so wide-ranged columns dominate the picture.
ax_box = sns.boxplot(data=df, orient='v', palette="pastel")
ax_box.set_title("Distribuição das características dos registros")

In [None]:
# Scatter-plot matrix over every pair of columns, with points colored by Outcome.
sns.pairplot(df, hue="Outcome", palette="Set1")

In [None]:
# Box plot of Age for each Outcome class.
ax = sns.boxplot(x="Outcome", y="Age", data=df)

ax.figure.set_size_inches(14, 8)
ax.set_title("Distribuição das Idades entre Diabéticos e não diabeticos", fontsize=14)
# The x axis holds the diagnosis class, not a count of patients.
ax.set_xlabel("Diagnóstico", fontsize=14)
ax.set_ylabel("Idade", fontsize=14)
# Boxes are drawn in sorted category order: Outcome 0 first, then Outcome 1.
# Per the dataset description Outcome == 1 is a positive diabetes diagnosis, so the
# first tick must be the non-diabetic label (the labels were previously swapped).
ax.set_xticklabels(["Nao Diabeticos", "Diabeticos "], fontsize=14)
ax

In [None]:
# Pairwise correlation between all columns of df (pandas default method).
df_correlacao = df.corr()

# Boolean mask that hides the upper triangle (diagonal included) so each pair is
# shown only once. The `np.bool` alias was removed in NumPy 1.24 and raises
# AttributeError there; the builtin `bool` is the exact same dtype.
mask = np.triu(np.ones_like(df_correlacao, dtype=bool))
plt.figure(figsize=(12, 8))
heatmap = sns.heatmap(df_correlacao, annot=True, cmap='RdBu', fmt='.2f', mask=mask, square=True, linecolor="white")
heatmap.set_title("Correlação entre as características")

In [None]:
# Target vector: the Outcome column.
y = df["Outcome"]
# Feature matrix: every other column (df itself is left unmodified).
x = df.drop(columns=["Outcome"])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split
# and the cross-validation further down, so held-out rows influence the scaling;
# consider fitting it on the training data only (e.g. inside a Pipeline).
scaler = MinMaxScaler(feature_range=(0, 1))

# fit_transform returns a bare array whose columns keep the order of `x`, so the
# labels must be taken from `x` itself. The previous hand-written list did not follow
# that order, which silently attached wrong names to the scaled columns and to every
# downstream feature-importance output.
colunas = list(x.columns)
X = pd.DataFrame(scaler.fit_transform(x), columns=colunas, index=x.index)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Cross-validation settings.
seed = 7                # fixed seed so that results are repeatable
num_folds = 10          # number of cross-validation folds
num_instances = len(X)  # number of rows in the feature matrix

# Hold out 30% of the rows for testing; the split has its own fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Univariate selection: keep the 5 features with the highest chi-squared score
# against the target. chi2 needs non-negative inputs, which X satisfies because it
# was MinMax-scaled to [0, 1].
teste = SelectKBest(score_func=chi2, k=5)
fit = teste.fit(X, y)

# Show each score next to its feature name instead of as an anonymous array.
print(pd.Series(fit.scores_, index=X.columns))

# transform() drops the column labels; get_support() is the boolean mask of the
# columns that were kept, so it restores the names of the selected features.
features = pd.DataFrame(
    fit.transform(X),
    columns=X.columns[fit.get_support()],
    index=X.index,
)
print(features)

Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and predict the held-out rows.
# random_state pins the tree's internal randomness (feature permutation at each
# split), so the accuracy and feature importances below are the same on every run.
arvore = DecisionTreeClassifier(random_state=seed)
arvore.fit(x_train, y_train)
result = arvore.predict(x_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Print, in order: test accuracy, the confusion matrix, and the per-class
# precision / recall / F1 report for the decision tree's test predictions.
for report in (
    arvore.score(x_test, y_test),
    confusion_matrix(y_test, result),
    metrics.classification_report(y_test, result),
):
    print(report)

**Encontrando as features mais importantes**

In [None]:
# Importance of each feature in the fitted tree, in the column order of the training data.
arvore.feature_importances_

In [None]:
# Attach the feature names (columns of X) to the importances.
feature_imp = pd.Series(arvore.feature_importances_, index=X.columns)

In [None]:
# Most important features first.
feature_imp.sort_values(ascending=False)

**Visualizando as features mais importantes**

In [None]:
# Horizontal bar chart of the tree's feature importances, largest at the top.
feature_imp_sort = feature_imp.sort_values(ascending=False)
fig_imp, ax_imp = plt.subplots(figsize=(12, 6))
sns.barplot(x=feature_imp_sort, y=feature_imp_sort.index, ax=ax_imp)
ax_imp.set_title("Importância de Features")
plt.show()

**Testando os modelos**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix


In [None]:
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

# Cross-validation settings.
num_folds = 5
num_instances = len(X)
seed = 7

# Candidate models as (display name, unfitted estimator) pairs.
modelos = [
    ("LogisticRegression", LogisticRegression()),
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=0, max_depth=30, max_features=5)),
    ("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=17, p=1)),
    # random_state fixes the weight initialisation and shuffling; without it the
    # MLP row of the results table changed on every run.
    ("MLPClassifier", MLPClassifier(hidden_layer_sizes=(20, 10, 20), max_iter=3000, random_state=seed)),
    # NOTE(review): gamma is not used by the linear kernel; kept for compatibility.
    ("SuportVectorMachine", SVC(kernel='linear', gamma='auto')),
    ("NaiveBayes", GaussianNB()),
    ("RandomForestClassifier", RandomForestClassifier(n_estimators=200, criterion='entropy', n_jobs=-1, max_depth=100,
                                                      bootstrap=True, random_state=0)),
]

# One splitter shared by all models: with an integer random_state every call to
# split() yields the same folds, so the models are scored on identical partitions.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Evaluate each model with macro-averaged F1 over the folds.
# NOTE(review): X was scaled on the full dataset before this point, so the
# validation folds are not fully independent of the preprocessing.
resultados = []
nomes = []

for nome, modelo in modelos:
    cv_results = cross_val_score(modelo, X, y, cv=kfold, scoring='f1_macro')
    resultados.append(cv_results)
    nomes.append(nome)
    msg = "%s - F1 Score: %f, Desvio Padrão: %f" % (nome, cv_results.mean(), cv_results.std())
    print(msg)

**Transformando esses resultados em um DataFrame**

In [None]:
# One row per model, one column per cross-validation fold. The column names are
# derived from num_folds so this cell keeps working when the fold count changes
# (a hard-coded list of five names raises on any other value).
colunas_resultado = ["Resultado%d" % i for i in range(1, num_folds + 1)]
df_resultado = pd.DataFrame(resultados, columns=colunas_resultado)
df_resultado

In [None]:
# Single-column frame holding the model names, in evaluation order.
df_nomes = pd.DataFrame({"Nomes": nomes})
df_nomes

**Agora podemos juntar o DataFrame de nomes e os resultados após cada treinamento**

In [None]:
# Place the names and the per-fold scores side by side, aligned on the row index.
nome_resultado = pd.concat(objs=[df_nomes, df_resultado], axis="columns")
nome_resultado

**Criando uma coluna com a média dos resultados obtidos em cada treinamento**

In [None]:
# Row-wise mean over every per-fold score column ("Resultado1", "Resultado2", ...).
# Selecting the columns by pattern instead of naming five of them and dividing by 5
# keeps the average correct for any number of folds.
colunas_folds = nome_resultado.filter(regex=r"^Resultado\d+$").columns
nome_resultado["Media"] = nome_resultado[colunas_folds].mean(axis=1)
nome_resultado.sort_values(by="Media", ascending=False)

In [None]:
# Models ordered from best to worst mean F1.
ranking = nome_resultado.sort_values(by="Media", ascending=False)

fig = plt.figure(figsize=(18, 6))
splot = sns.barplot(x="Nomes", y="Media", data=ranking)
# Write each bar's height (6 decimals) centered just above its top edge.
for bar in splot.patches:
    height = bar.get_height()
    center_x = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        '%.6f' % height,
        (center_x, height),
        ha='center', va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )
splot.set_title("Média F1 dos modelos treinados")