In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
sns.set_theme(context = 'paper')

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records into a DataFrame and preview the first rows.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

About the data:

age: Age of the patient

anaemia: Whether the patient's haemoglobin was below the normal range

creatinine_phosphokinase: The level of the creatine phosphokinase (CPK) enzyme in the blood, in mcg/L

diabetes: If the patient was diabetic

ejection_fraction: Ejection fraction is a measurement of how much blood the left ventricle pumps out with each contraction

high_blood_pressure: If the patient had hypertension

platelets: Platelet count of blood in kiloplatelets/mL

serum_creatinine: The level of serum creatinine in the blood in mg/dL

serum_sodium: The level of serum sodium in the blood in mEq/L

sex: The sex of the patient

smoking: Whether the patient currently smokes or has ever smoked in the past

time: The length of the patient's follow-up period, in days

DEATH_EVENT: Whether the patient died during the follow-up period (1 = died, 0 = survived)

# **Explorando o Dataset**

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

In [None]:
# Look for duplicates: flag every row that repeats an earlier row, then count the flags.
duplicate_rows = df.duplicated()
duplicate_rows.sum()

In [None]:
# Class balance of the target column.
# The column is passed by keyword (x=..., data=...): a positional Series is
# deprecated in seaborn 0.11 and treated as wide-form `data` in later releases,
# which does not produce one bar per class.
plt.figure(figsize=(10,8))
sns.countplot(x='DEATH_EVENT', data=df)
# DEATH_EVENT marks patients who died during follow-up, so 1 is "Morte" and
# 0 is "Sobreviveu" (the original legend had the two labels swapped).
plt.title("Distribuição de Classes \n 0: Sobreviveu | 1: Morte")
plt.show()

In [None]:
# plt.figure(figsize=(15, 10))
# sns.pairplot(df)
# plt.show()

In [None]:
# Pairwise correlation matrix of every column except `smoking` and
# `high_blood_pressure`, drawn as an annotated heatmap on a fixed [-1, 1] scale.
columns_for_corr = df.drop(columns=['smoking', 'high_blood_pressure'])
corr = columns_for_corr.corr()

plt.figure(figsize=(14, 8))
sns.heatmap(corr, vmin=-1, vmax=1, annot=True)
plt.xticks(size=15, rotation=85)
plt.show()

# **Pré-processamento**

In [None]:
# Separate the labels from the features.
# The feature matrix leaves out the target itself plus `smoking` and
# `high_blood_pressure`; the target is kept as a NumPy array.
excluded_columns = ['DEATH_EVENT', 'smoking', 'high_blood_pressure']
X = df.drop(columns=excluded_columns)
y = np.array(df['DEATH_EVENT'])

In [None]:
# One-hot encode any categorical feature columns and make sure the labels are an array.
# NOTE(review): if every feature is already numeric, get_dummies leaves X unchanged - confirm.
X, y = pd.get_dummies(X), np.array(y)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each feature with min-max scaling (default range [0, 1]).
# NOTE(review): the scaler is fitted on the complete dataset, before any
# train/test partitioning happens further down - confirm this is intended,
# since the held-out rows then influence the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Separando os dados em treino e teste
# from sklearn.model_selection import train_test_split

# Aplicando a função de treino e teste para separar os conjuntos de treino e teste segundo uma porcentagem definida
# X_train, X_test, y_train, y_test = train_test_split (X, y,
#                                                      stratify=y,
#                                                      shuffle=True,
#                                                      test_size=0.2,                                                     
#                                                      random_state=42)

In [None]:
# Metrics and model-selection utilities
from sklearn.metrics import f1_score, make_scorer, accuracy_score, confusion_matrix

from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold

# Logistic regression (a classifier, not linear regression) that selects its
# regularisation strength by internal cross-validation
from sklearn.linear_model import LogisticRegressionCV

# 3 stratified folds repeated 10 times -> 30 train/test partitions.
# The integer random_state makes every call to split() yield the same partitions.
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=10, random_state=42)

X = np.array(X)

lr = LogisticRegressionCV(random_state=42)

# Score the model on all 30 partitions. The original loop iterated over them
# but overwrote the split each time, so only the final partition was ever used.
# cross_validate fits clones of `lr`, leaving `lr` itself unfitted here.
cv_results = cross_validate(
    lr, X, y,
    cv=rskf,
    scoring={'accuracy': 'accuracy', 'f1': make_scorer(f1_score)},
)
cv_acc = cv_results['test_accuracy']
cv_f1 = cv_results['test_f1']
print(f"Acurácia (validação cruzada): {cv_acc.mean()*100:.2f}% (+/- {cv_acc.std()*100:.2f})")
print(f"F1 Score (validação cruzada): {cv_f1.mean()*100:.2f} (+/- {cv_f1.std()*100:.2f})")

# Hold-out partition reused by the evaluation cells below. Taking the last
# split explicitly reproduces exactly what the original loop left behind.
train_index, test_index = list(rskf.split(X, y))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Train the model on the hold-out training portion
lr_model = lr.fit(X_train, y_train)

# Predict the held-out rows
y_pred = lr.predict(X_test)

In [None]:
# Accuracy, F1 and confusion matrix for the most recent predictions on the hold-out rows.
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
mc = confusion_matrix(y_test, y_pred)
print(f"Acurácia: {acc*100:.2f}%", f"F1 Score: {f1*100:.2f}", sep='\n')
print('Matriz de Confusão:\n', mc)

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 5, with a fixed seed.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Fit on the training rows, then predict the held-out rows.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Accuracy, F1 and confusion matrix for the most recent predictions on the hold-out rows.
mc = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

summary_lines = [f"Acurácia: {acc*100:.2f}%", f"F1 Score: {f1*100:.2f}"]
print("\n".join(summary_lines))
print('Matriz de Confusão:\n', mc)

# **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed: fit on the training rows,
# then predict the held-out rows.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [None]:
# Accuracy, F1 and confusion matrix for the most recent predictions on the hold-out rows.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mc = confusion_matrix(y_test, y_pred)

for line in (f"Acurácia: {acc*100:.2f}%", f"F1 Score: {f1*100:.2f}"):
    print(line)
print('Matriz de Confusão:\n', mc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=5 and Minkowski power p=1 (Manhattan distance):
# fit on the training rows, then predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5, p=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Accuracy, F1 and confusion matrix for the most recent predictions on the hold-out rows.
f1 = f1_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
mc = confusion_matrix(y_test, y_pred)

accuracy_line = f"Acurácia: {acc*100:.2f}%"
f1_line = f"F1 Score: {f1*100:.2f}"
print(accuracy_line)
print(f1_line)
print('Matriz de Confusão:\n', mc)