## Importando as bibliotecas/Packages


Qual o problema de negócio?

Construir uma máquina preditiva e fazer uma EDA, com o intuito de prever se o paciente é mais suscetível, ou não, a um possível ataque cardíaco

What is the business problem?

Build a predictive machine and do EDA, in order to predict if the patient is more susceptible, or not, to a possible heart attack

In [None]:
!pip install dataprep;

In [None]:
#Bibliotecas para manipulações dos dados
import numpy as np
import pandas as pd
import warnings
import pickle
warnings.filterwarnings('ignore')

#bibliotecas para a visualização dos dados
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from dataprep.eda import create_report
from dataprep.eda import plot_missing
from dataprep.eda import plot_correlation
from dataprep.eda import plot

#bibliotecas para o pré-processamento de uma máquina preditiva
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


#bibliotecas de algoritmos de classificação
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
import xgboost as xgb


#bibliotecas de validação da maquina preditiva

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from yellowbrick.classifier import ConfusionMatrix


## Descrições das variaveis/Data Dictionary








* **VARIÁVEIS CONTÍNUAS** 

---



age - Idade do paciente

trtbps - Pressão sanguínea em repouso (em mm Hg)

chol - Colesterol em mg / dl obtido por meio do sensor de IMC

thalachh - Frequência cardíaca máxima alcançada

oldpeak - pico anterior

* **VARIÁVEIS CATEGÓRICAS**

---



sex - Sexo do paciente

cp - Tipo de dor torácica ~ 0 = angina típica, 1 = angina atípica, 2 = dor não anginosa, 3 = assintomática


fbs - (açúcar no sangue em jejum> 120 mg / dl) ~ 1 = Verdadeiro, 0 = Falso

restecg - Resultados eletrocardiográficos em repouso ~ 0 = Normal, 1 = anormalidade da onda ST-T, 2 = hipertrofia ventricular esquerda

slp - Declive

caa - Número de vasos cardíacos principais 

thall - resultado do teste de estresse = (0,3)

exng - Angina induzida por exercício = (1 = Sim, 0 = Não)


* **VARIÁVEL ALVO**
---

-output - Variável de destino(1 = alto,0 = baixo)

## Leitura e primeiras visualizações dos dados/Reading the dataset

In [None]:
#lendo os dados da ocorrência de ataque cardíaco


In [None]:
df=pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
#analise dos dados

In [None]:
# Column groups referenced by the EDA cells that follow.
cat_cols = "sex exng caa cp fbs restecg slp thall".split()
con_cols = "age trtbps chol thalachh oldpeak".split()
target_col = ["output"]

# Echo each group; the labels are printed exactly as before.
for label, columns in (
    ("As variáveis categóricas são : ", cat_cols),
    ("As variáveis contínuas são : ", con_cols),
    ("A variável alvo é o :  ", target_col),
):
    print(label, columns)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.info()

## Limpeza dos dados/Data cleaning

In [None]:
#Tratamento dos valores faltantes

In [None]:
df.isnull().sum()

In [None]:
plot_missing(df)

In [None]:
#Tratamento de linhas duplicadas

In [None]:
df.duplicated().sum()

In [None]:
df.loc[df.duplicated(keep=False),:]

In [None]:
df.drop_duplicates(keep='first',inplace=True)

In [None]:
df.shape

## EDA

As variáveis categóricas:
['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']

---



As variáveis contínuas são 
['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

### Contagem e distribuição das variáveis categóricas/Counting and distribution of cat features

In [None]:
# 3x3 grid: the first cell carries the figure title, the remaining eight show
# one count plot per categorical column.
fig = plt.figure(figsize=(18,15))
gs = fig.add_gridspec(3,3)
gs.update(wspace=0.5, hspace=0.25)
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = (
    fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)
)

background_color = "#e0fbfc"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for axis in (ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8):
    axis.set_facecolor(background_color)

# Title panel: no frame, no ticks, only the centred caption.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.text(0.5,0.5,
         'Count plot para \n variáveis categóricas\n_________________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontweight='bold',
         fontfamily='serif',
         color="#000000")

# (axis, column, label x, label y, label text, label font size)
count_specs = [
    (ax1, 'sex',     0.3,  220, 'Sex',     15),
    (ax2, 'exng',    0.3,  220, 'Exng',    14),
    (ax3, 'caa',     1.5,  200, 'Caa',     14),
    (ax4, 'cp',      1.5,  162, 'Cp',      14),
    (ax5, 'fbs',     0.5,  290, 'Fbs',     14),
    (ax6, 'restecg', 0.75, 165, 'Restecg', 14),
    (ax7, 'slp',     0.85, 155, 'Slp',     14),
    (ax8, 'thall',   1.2,  180, 'Thall',   14),
]

for axis, column, label_x, label_y, label, label_size in count_specs:
    # Label is positioned in data coordinates, above the bars.
    axis.text(label_x, label_y, label, fontsize=label_size, fontweight='bold', fontfamily='serif', color="#000000")
    axis.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    sns.countplot(ax=axis,data=df,x=column,palette=color_palette)
    axis.set_xlabel("")
    axis.set_ylabel("")
    # Keep only the bottom spine on the plot panels.
    for side in ("top", "right", "left"):
        axis.spines[side].set_visible(False)

### Contagem e Distribuição das variáveis contínuas/Counting and distribution of con features

In [None]:
# 2x3 grid: a title panel followed by one boxen plot per continuous column.
fig = plt.figure(figsize=(20,16))
gs = fig.add_gridspec(2,3)
gs.update(wspace=0.3, hspace=0.15)
ax0, ax1, ax2, ax3, ax4, ax5 = (
    fig.add_subplot(gs[row, col]) for row in range(2) for col in range(3)
)

background_color = "#e0fbfc"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for axis in (ax0, ax1, ax2, ax3, ax4, ax5):
    axis.set_facecolor(background_color)

# Title panel: frame and ticks removed, caption centred.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.text(0.5,0.5,
         'Verificando a distribuição das\n variáveis contínuas \n________________',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=15, fontweight='bold',
         fontfamily='serif',
         color="#000000")

# (axis, column, label x, label y, label text); colours come from
# color_palette in the same order.
boxen_specs = [
    (ax1, 'age',      -0.08, 81,  'idade'),
    (ax2, 'trtbps',   -0.25, 208, 'Pressão sangínea'),
    (ax3, 'chol',     -0.15, 600, 'Colesterol'),
    (ax4, 'thalachh', -0.40, 210, 'Taxa cardíaca máxima alcançada'),
    (ax5, 'oldpeak',  -0.1,  6.6, 'Oldpeak'),
]

for (axis, column, label_x, label_y, label), shade in zip(boxen_specs, color_palette):
    axis.text(label_x, label_y, label, fontsize=14, fontweight='bold', fontfamily='serif', color="#000000")
    axis.grid(color='#000000', linestyle=':', axis='y',  dashes=(1,5))
    sns.boxenplot(ax=axis,y=df[column],palette=[shade],width=0.6)
    axis.set_xlabel("")
    axis.set_ylabel("")
    # Keep only the bottom spine on the plot panels.
    for side in ("top", "right", "left"):
        axis.spines[side].set_visible(False)

### Distribuição do output/Output distribution

In [None]:
# Two panels side by side: a caption on the left, the target's class counts
# on the right.
fig = plt.figure(figsize=(10,7))
gs = fig.add_gridspec(1,2)
gs.update(wspace=0.5, hspace=0.25)
ax0, ax1 = fig.add_subplot(gs[0,0]), fig.add_subplot(gs[0,1])

background_color = "#e0fbfc"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
for surface in (fig.patch, ax0, ax1):
    surface.set_facecolor(background_color)

# Caption panel: strip the frame and ticks, then centre the text.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
caption_style = dict(horizontalalignment='center',
                     verticalalignment='center',
                     fontsize=18, fontweight='bold',
                     fontfamily='serif',
                     color="#000000")
ax0.text(0.5, 0.5, 'Count plot do \n OUTPUT\n_________________', **caption_style)

# Class counts of the target column.
label_style = dict(fontsize=15, fontweight='bold', fontfamily='serif', color="#000000")
ax1.text(0.3, 190, 'OUTPUT', **label_style)
ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
sns.countplot(ax=ax1,data=df,x='output',palette=color_palette)
ax1.set_xlabel("")
ax1.set_ylabel("")




### Correlação entre as variáveis/Correlation Matrix

In [None]:
# Single-panel figure with the pairwise correlations of the continuous columns.
fig = plt.figure(figsize=(10,10))
gs = fig.add_gridspec(1,1)
gs.update(wspace=0.3, hspace=0.15)
ax0 = fig.add_subplot(gs[0,0])

color_palette = ["#5833ff","#da8829"]
df_corr = df[con_cols].corr().T
heading_style = dict(fontsize=22, fontweight='bold', fontfamily='serif', color="#000000")
ax0.text(1.5, -0.1, "Correlation Matrix", **heading_style)
# ax0 is the figure's only (and current) axes, so the heatmap lands on it.
sns.heatmap(df_corr, ax=ax0, fmt=".1f", annot=True, cmap='YlGnBu')
plt.show()

In [None]:
plot_correlation(df)

### **Relação das variáveis contínuas com o output (1 = alto,0 =baixo)/Relation among con features  to output (1 = high, 0 = low)**

* AGE
* TRTBPS
* CHOL
* THALACHH
* OLDPEAK

In [None]:
# One row per continuous feature: caption panel on the left, the feature's
# distribution split by the target on the right.
fig = plt.figure(figsize=(18,18))
gs = fig.add_gridspec(5,2)
gs.update(wspace=0.5, hspace=0.6)
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9 = (
    fig.add_subplot(gs[row, col]) for row in range(5) for col in range(2)
)
all_axes = [ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]

background_color = "#e0fbfc"

color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for axis in all_axes:
    axis.set_facecolor(background_color)

output_palette = ["#8000ff","#da8829"]
caption_style = dict(horizontalalignment='center',
                     verticalalignment='center',
                     fontsize=18,
                     fontweight='bold',
                     fontfamily='serif',
                     color='#000000')

# (caption axis, plot axis, column, caption text)
panels = [
    (ax0, ax1, 'age',      "Relação da age\ncom\n a variável alvo\n___________"),
    (ax2, ax3, 'trtbps',   "Relação do Trtbps\ncom\n a variável alvo\n___________"),
    (ax4, ax5, 'chol',     "Relação do chol\nem coreelação com\n a variável alvo\n___________"),
    (ax6, ax7, 'thalachh', "Relação do thlachh\ncom\n a variável alvo\n___________"),
    (ax8, ax9, 'oldpeak',  "Relação do oldpeak \ncom\n a variável alvo"),
]

for caption_axis, plot_axis, column, caption in panels:
    # Caption panel: text only, no ticks and no bottom spine.
    caption_axis.text(0.5, 0.5, caption, **caption_style)
    caption_axis.spines["bottom"].set_visible(False)
    caption_axis.set_xticklabels([])
    caption_axis.set_yticklabels([])
    caption_axis.tick_params(left=False, bottom=False)

    plot_axis.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    if column == 'thalachh':
        # thalachh is the one feature shown as a boxen plot per class.
        sns.boxenplot(ax=plot_axis, data=df,x='output',y='thalachh',palette=output_palette)
    else:
        sns.kdeplot(ax=plot_axis, data=df, x=column,hue="output", fill=True,palette=output_palette, alpha=.5, linewidth=0)
    plot_axis.set_xlabel("")
    plot_axis.set_ylabel("")

# Remove the remaining frame lines from every panel.
for axis in all_axes:
    for side in ("top", "left", "right"):
        axis.spines[side].set_visible(False)


### **Relação entre as variáveis categóricas com o output (1 = alto,0 =baixo)/Relation among cat features and output (1 = high, 0 = low)**

*   RESTECG
*   EXNG 
*   SLP
*   FBS
*   THALL 
*   CP  
*   CAA
*   SEX 



In [None]:
# One row per categorical feature: a caption/legend panel on the left and the
# feature-vs-target plot on the right. Only the first 8 of the 11 grid rows
# are populated; the grid size is kept so the panel proportions stay the same.
fig = plt.figure(figsize=(30,25))
gs = fig.add_gridspec(11,2)
gs.update(wspace=0.5, hspace=0.5)
(ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7,
 ax8, ax9, ax10, ax11, ax12, ax13, ax14, ax15) = (
    fig.add_subplot(gs[row, col]) for row in range(8) for col in range(2)
)
all_axes = [ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7,
            ax8, ax9, ax10, ax11, ax12, ax13, ax14, ax15]

background_color = "#e0fbfc"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for axis in all_axes:
    axis.set_facecolor(background_color)

output_palette = ["#8000ff","#da8829"]
caption_style = dict(horizontalalignment='center',
                     verticalalignment='center',
                     fontsize=18,
                     fontweight='bold',
                     fontfamily='serif',
                     color='#000000')
legend_style = dict(horizontalalignment='center',
                    verticalalignment='center',
                    fontsize=14)

# (column, plot kind, caption text, legend text or None)
panels = [
    ('cp', 'kde',
     "Relação do cp \ncom \n a variável alvo\n___________",
     "0 - Típica Angina\n1 - Atípica Angina\n2 - Dor não Anginosa\n3 - Assintomática"),
    ('caa', 'kde',
     "Relação do caa \ncom \n a variável alvo\n___________",
     "Numero de vasos sanguíneos\n0,1,2,3,4\n___________"),
    ('sex', 'count',
     "Relação do sex \ncom \n a variável alvo\n___________",
     "0 - Female\n1 - Male"),
    ('thall', 'kde',
     "Relação do thall \ncom \n a variável alvo\n___________",
     "Resultado do teste\nde estresse\n0, 1, 2, 3"),
    ('fbs', 'count',
     "Relação do fbs \ncom \n a variável alvo\n___________",
     "0 -menor que 120mg de açucar\n1 -maior que 120mg de açucar "),
    ('exng', 'count',
     "Relação do exng \ncom \n a variável alvo\n___________",
     "Angina induzida por exercício\n0 = Não,1 = Sim"),
    ('restecg', 'kde',
     "Relação do Restecg \ncom \n a variável alvo\n___________",
     "Resultados eletrocardiográficos\n(0, 1, 2)"),
    ('slp', 'kde',
     "Relação do Slp \ncom \n a variável alvo",
     "Declive\n(0, 1, 2)"),
]

for row, (column, kind, caption, legend) in enumerate(panels):
    caption_axis = all_axes[2 * row]
    plot_axis = all_axes[2 * row + 1]

    # Caption panel: centred title plus the value legend at its right edge.
    caption_axis.text(0.5, 0.5, caption, **caption_style)
    if legend is not None:
        caption_axis.text(1, .5, legend, **legend_style)
    caption_axis.spines["bottom"].set_visible(False)
    caption_axis.set_xticklabels([])
    caption_axis.set_yticklabels([])
    caption_axis.tick_params(left=False, bottom=False)

    # Plot panel: density or count of the feature, coloured by target class.
    plot_axis.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    if kind == 'count':
        sns.countplot(ax=plot_axis,data=df,x=column,palette=output_palette, hue='output')
    else:
        sns.kdeplot(ax=plot_axis, data=df, x=column,hue="output", fill=True,palette=output_palette, alpha=.5, linewidth=0)
    plot_axis.set_xlabel("")
    plot_axis.set_ylabel("")

# Hide the remaining frame lines on EVERY panel. The previous loop stopped at
# ax11, leaving the restecg and slp rows (ax12-ax15) with visible spines.
for axis in all_axes:
    for side in ("top", "left", "right"):
        axis.spines[side].set_visible(False)

### Relatório/dashboard

In [None]:
create_report(df)

### **INSIGHTS**








**DADOS CATEGÓRICOS**


* Um cp maior do que 0 pode ser um fator de risco

* Temos 2x a mais pessoas com o sex = 1 do que sex = 0

* Temos 5x a mais de pessoas com o fbs = 0 do que fbs = 1

* Apenas cerca de 8 pessoas tem um resultado diferente de 0 e 1 na variável "restecg"

* um caa igual a 0 pode ser um fator de risco

* Um exng igual = 0 pode ser um fator de risco

* O sex = 0 tem 3x a mais de chance de um possível ataque cardíaco do que o sex = 1,porém o sex = 0 tem apenas metade dos dados em comparação do sex = 1

* Um thall igual a 2 pode ser um fator de risco

* Um slp = 2 pode ser um fator risco





---

**CORRELAÇÃO DAS VARIÁVEIS**


* Não temos uma relação forte entre as variáveis contínuas

* oldpeak e slp tem uma boa correlação negativa,de quando o oldpeak abaixa o slp aumenta.

* Quando os valores das variáveis(exng,oldpeak,caa) abaixam,o output tende a ser igual a 1

* Quando os valores das variáveis(cp e thalachh) aumentam,o output tende a ser igual a 1

---




**DADOS CONTÍNUOS**

* Todas as variáveis contínuas têm outliers.

* Temos uma média de 52 a 60 anos.

* Trtbps varia de 120 a 140.

* Chol varia de 220 a 280.

* thalachh varia de 135 a 155.

* Oldpeak varia de 0 a 1.60

* Um thalachh acima de 153 pode ser um fator de risco

* Um oldpeak em torno de 0 pode ser um fator de risco








**CAT FEATURES**


* A cp greater than 0 can be a risk factor

* We have 2x more people with sex = 1 than sex = 0

* We have 5x more people with fbs = 0 than fbs = 1

* Only about 8 people have a result other than 0 and 1 in the "restecg" variable

* a caa equal to 0 can be a risk factor

* An exng equal to = 0 can be a risk factor

* Sex = 0 has 3x more chance of a possible heart attack than Sex = 1, but SEX = 0 has only half the data compared to Sex = 1

* A thall equal to 2 can be a risk factor

* An slp = 2 can be a risk factor





---

**CORRELATION OF VARIABLES**


* We do not have a strong relationship between continuous variables

* oldpeak and slp have a good negative correlation, when oldpeak lowers slp increases.

* When the values of the variables (exng,oldpeak,caa) go down, the output tends to be equal to 1

* When the values of the variables (cp and thalachh) increase, the output tends to be equal to 1

---




**CON FEATURES**

* All continuous variables have outliers.

* We have an average age of 52 to 60 years.

* Trtbps ranges from 120 to 140.

* Chol ranges from 220 to 280.

* thalachh ranges from 135 to 155.

* Oldpeak ranges from 0 to 1.60

* A thalachh above 153 may be a risk factor

* An oldpeak around 0 may be a risk factor

# Aprendizado de máquina/Machine learning

## Pré-processamento/Preprocessing





### Divisão entre previsores e classe/Division between predictors and class

In [None]:
df.columns

In [None]:
# Predictor matrix: every column before the target, which is read from
# position 13 in the next cell. The previous slice 0:12 stopped one column
# short and silently dropped the last predictor (column index 12).
# NOTE(review): accuracies quoted in the section headings were obtained with
# the 12-column matrix and need to be re-run.
X_heart = df.iloc[:, 0:13].values
print(X_heart)

In [None]:
# Target vector: the column at position 13 of df, as a NumPy array
# (presumably 'output' — confirm against df.columns).
y_heart= df.iloc[:,13].values
print(y_heart)

### Escalonamento dos valores/Feature Scaling

In [None]:
scaler = StandardScaler()

In [None]:
# Standardize every predictor column with the StandardScaler created above.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed in the next section, so test-set statistics leak
# into the training features. Fitting on the training split only (and merely
# transforming the test split) would avoid this.
X_heart=scaler.fit_transform(X_heart)

### Divisão entre base de treinamento e base de teste/Train-Test split

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
(
    X_heart_treinamento,
    X_heart_teste,
    y_heart_treinamento,
    y_heart_teste,
) = train_test_split(X_heart, y_heart, test_size=0.2, random_state=0)

### Salvando as variáveis/Saving

In [None]:
# Persist the split as one list, in the order the model sections unpack it:
# train X, train y, test X, test y.
heart_split = [X_heart_treinamento, y_heart_treinamento, X_heart_teste, y_heart_teste]
with open('heart.pkl', mode='wb') as f:
    pickle.dump(heart_split, f)

## Algoritmos de aprendizagem de máquina/Machine Learning Algorithm

### 1-Classificador base - Majority learner-54.45%

---



In [None]:
# Baseline: accuracy (in %) of always predicting the most frequent class.
# Derived from y_heart instead of hard-coded counts (165 of 303), which go
# stale whenever the rows change, e.g. after the drop_duplicates step.
classes, class_counts = np.unique(y_heart, return_counts=True)
majority_learner0 = class_counts.sum()   # total number of samples
majority_learner1 = (class_counts.max() / majority_learner0) * 100
majority_learner1

### 2-Árvores de decisão-77,04%

---



In [None]:
# Reload the saved train/test split (train X, train y, test X, test y).
with open('heart.pkl', 'rb') as f:
    (X_heart_treinamento, y_heart_treinamento,
     X_heart_teste, y_heart_teste) = pickle.load(f)

In [None]:
X_heart_treinamento.shape, y_heart_treinamento.shape

In [None]:
X_heart_teste.shape, y_heart_teste.shape

In [None]:
arvore_heart = DecisionTreeClassifier(random_state=42)
arvore_heart.fit(X_heart_treinamento, y_heart_treinamento)

In [None]:
previsoes = arvore_heart.predict(X_heart_teste)
previsoes

In [None]:
y_heart_teste

In [None]:
accuracy_score(y_heart_teste,previsoes)*100

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_heart_teste, previsoes))

In [None]:
cm_arvore_decisao = ConfusionMatrix(arvore_heart)
cm_arvore_decisao.fit(X_heart_treinamento, y_heart_treinamento)
cm_arvore_decisao.score(X_heart_teste, y_heart_teste)

### 3-Random forest-88,52%

In [None]:
# Reload the saved train/test split (train X, train y, test X, test y).
with open('heart.pkl', 'rb') as f:
    (X_heart_treinamento, y_heart_treinamento,
     X_heart_teste, y_heart_teste) = pickle.load(f)

In [None]:
X_heart_treinamento.shape,X_heart_teste.shape

In [None]:
y_heart_treinamento.shape,y_heart_teste.shape

In [None]:
random_heart = RandomForestClassifier(random_state=42)
random_heart.fit(X_heart_treinamento, y_heart_treinamento)

In [None]:
previsoes = random_heart.predict(X_heart_teste)
previsoes

In [None]:
y_heart_teste

In [None]:
accuracy_score(y_heart_teste,previsoes)*100

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_heart_teste, previsoes))

In [None]:
cm_random = ConfusionMatrix(random_heart)
cm_random.fit(X_heart_treinamento, y_heart_treinamento)
cm_random.score(X_heart_teste, y_heart_teste)

### 4-KNN-91,80%

In [None]:
# Reload the saved train/test split (train X, train y, test X, test y).
with open('heart.pkl', 'rb') as f:
    (X_heart_treinamento, y_heart_treinamento,
     X_heart_teste, y_heart_teste) = pickle.load(f)

In [None]:
X_heart_treinamento.shape, y_heart_treinamento.shape

In [None]:
y_heart_treinamento.shape,y_heart_teste.shape

In [None]:
knn_heart = KNeighborsClassifier()
knn_heart.fit(X_heart_treinamento, y_heart_treinamento)

In [None]:
previsoes = knn_heart.predict(X_heart_teste)
previsoes

In [None]:
y_heart_teste

In [None]:
accuracy_score(y_heart_teste,previsoes)*100

#### Otimização KNN

In [None]:
# Fit one KNN per candidate k and record its misclassification rate.
# NOTE(review): the error is measured on the test split, so any k picked from
# this curve is tuned on test data; cross-validating on the training split
# would keep the test set untouched.
k_values = range(1, 40)
error_rate = []

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k).fit(X_heart_treinamento, y_heart_treinamento)
    misclassified = model.predict(X_heart_teste) != y_heart_teste
    error_rate.append(np.mean(misclassified))

# Error rate as a function of k.
plt.figure(figsize=(10, 8))
plt.plot(k_values, error_rate,
         color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)

plt.title('Taxa de erro vs valor do K')
plt.xlabel('K')
plt.ylabel('Taxa de erro')

In [None]:
# Reload the saved train/test split (train X, train y, test X, test y).
with open('heart.pkl', 'rb') as f:
    (X_heart_treinamento, y_heart_treinamento,
     X_heart_teste, y_heart_teste) = pickle.load(f)

In [None]:
knn_heart = KNeighborsClassifier(n_neighbors=14)
knn_heart.fit(X_heart_treinamento, y_heart_treinamento)

In [None]:
previsoes = knn_heart.predict(X_heart_teste)
previsoes

In [None]:
# Print scikit-learn's classification report for the tuned KNN predictions.
from sklearn.metrics import classification_report
print(classification_report(y_heart_teste, previsoes))

In [None]:
# Yellowbrick confusion matrix for the tuned KNN:
# fit on the training split, then score on the test split.
cm_knn = ConfusionMatrix(knn_heart)
cm_knn.fit(X_heart_treinamento, y_heart_treinamento)
cm_knn.score(X_heart_teste, y_heart_teste)

### 5-Regressão logística-83,60%

In [None]:
# Reload the persisted split from heart.pkl before the logistic regression.
# The pickle holds four objects, in order: X train, y train, X test, y test.
with open('heart.pkl', 'rb') as arquivo:
    conjuntos = pickle.load(arquivo)
X_heart_treinamento, y_heart_treinamento, X_heart_teste, y_heart_teste = conjuntos

In [None]:
# Display the shapes of the training features and training labels.
X_heart_treinamento.shape, y_heart_treinamento.shape

In [None]:
# Display the shapes of the training labels and test labels.
y_heart_treinamento.shape,y_heart_teste.shape

In [None]:
# Logistic regression (fixed seed 42) trained on the training split.
rl_heart = LogisticRegression(random_state=42)
rl_heart.fit(X_heart_treinamento, y_heart_treinamento)

In [None]:
# Predict the test split with the logistic regression; the bare name shows the predictions.
previsoes = rl_heart.predict(X_heart_teste)
previsoes

In [None]:
# Display the true test labels.
y_heart_teste

In [None]:
# Logistic regression accuracy on the test split, expressed as a percentage.
accuracy_score(y_heart_teste,previsoes)*100

In [None]:
# Print scikit-learn's classification report for the logistic regression predictions.
from sklearn.metrics import classification_report
print(classification_report(y_heart_teste, previsoes))

In [None]:
# Yellowbrick confusion matrix for the logistic regression:
# fit on the training split, then score on the test split.
# The visualizer gets its own name (cm_rl, like cm_random / cm_knn) so that
# `rl_heart` keeps referring to the fitted model instead of being overwritten,
# and re-running this cell does not wrap a visualizer inside another one.
cm_rl = ConfusionMatrix(rl_heart)
cm_rl.fit(X_heart_treinamento, y_heart_treinamento)
cm_rl.score(X_heart_teste, y_heart_teste)

### 6-SVC-88,52%

In [None]:
# Reload the persisted split from heart.pkl before the SVC experiment.
# The pickle holds four objects, in order: X train, y train, X test, y test.
with open('heart.pkl', 'rb') as arquivo:
    conjuntos = pickle.load(arquivo)
X_heart_treinamento, y_heart_treinamento, X_heart_teste, y_heart_teste = conjuntos

In [None]:
# Display the shapes of the training features and training labels.
X_heart_treinamento.shape, y_heart_treinamento.shape

In [None]:
# Display the shapes of the training labels and test labels.
y_heart_treinamento.shape,y_heart_teste.shape

In [None]:
# Support vector classifier (fixed seed 0, otherwise library defaults) trained on the training split.
svc_heart = SVC(random_state=0)
svc_heart.fit(X_heart_treinamento, y_heart_treinamento)

In [None]:
# Predict the test split with the SVC; the bare name shows the predictions.
previsoes = svc_heart.predict(X_heart_teste)
previsoes

In [None]:
# Display the true test labels.
y_heart_teste

In [None]:
# SVC accuracy on the test split, expressed as a percentage
# (scaled by 100 to match the accuracy cells of the other models).
accuracy_score(y_heart_teste,previsoes)*100

In [None]:
# Yellowbrick confusion matrix for the SVC:
# fit on the training split, then score on the test split.
# The visualizer gets its own name (cm_svc, like cm_random / cm_knn) so that
# `svc_heart` keeps referring to the fitted model instead of being overwritten,
# and re-running this cell does not wrap a visualizer inside another one.
cm_svc = ConfusionMatrix(svc_heart)
cm_svc.fit(X_heart_treinamento, y_heart_treinamento)
cm_svc.score(X_heart_teste, y_heart_teste)

### 7-XGBOOST-85,24%

In [None]:
pip install xgboost

In [None]:
def xgb_classifier(n_estimators, max_depth, reg_alpha,
                   reg_lambda, min_child_weight, num_boost_round,
                   gamma):
    """Objective function for the hyperparameter search: cross-validated AUC.

    Runs a 3-fold stratified ``xgb.cv`` on the global training split
    (``X_heart_treinamento`` / ``y_heart_treinamento``) and returns the mean
    test-fold AUC from the last row of the CV history.

    Args:
        n_estimators: Kept in the signature so existing callers that pass it
            keep working. The native ``xgb.cv`` API has no such parameter
            (the number of trees is ``num_boost_round``), so it does not
            influence the score.
        max_depth: Maximum tree depth; truncated to ``int``.
        reg_alpha: L1 regularisation weight.
        reg_lambda: L2 regularisation weight.
        min_child_weight: Minimum child weight; truncated to ``int``.
        num_boost_round: Maximum number of boosting rounds; truncated to
            ``int``. Early stopping may end training sooner.
        gamma: Minimum loss reduction required to split a node.

    Returns:
        The ``test-auc-mean`` value of the last row returned by ``xgb.cv``.
    """
    # Only parameters that XGBoost's native API understands are passed.
    # The previous dict also carried LightGBM names ("is_unbalance",
    # "num_threads", "subsample_freq") plus "n_estimators"/"num_boost_round",
    # all of which XGBoost ignores; "nthread" is the native thread setting.
    params = {
        "booster": "gbtree",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "max_depth": int(max_depth),
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
        "gamma": gamma,
        "nthread": 20,
        "min_child_weight": int(min_child_weight),
        "learning_rate": 0.01,
        "seed": 42,
        "verbosity": 0,
    }
    train_data = xgb.DMatrix(X_heart_treinamento, label=y_heart_treinamento)
    # The tuned round count is now actually used; it was previously
    # hard-coded to 1000, making the searched value a no-op.
    cv_result = xgb.cv(
        params,
        train_data,
        num_boost_round=int(num_boost_round),
        early_stopping_rounds=100,
        stratified=True,
        nfold=3,
    )
    return cv_result['test-auc-mean'].iloc[-1]

In [None]:
pip install bayesian-optimization


In [None]:
from bayes_opt import BayesianOptimization

# Bayesian search over the arguments of xgb_classifier: each entry gives the
# (lower, upper) range explored for the argument of the same name.
xgbBO = BayesianOptimization(
    f=xgb_classifier,
    pbounds={
        "n_estimators": (10, 200),
        "max_depth": (4, 40),
        "reg_alpha": (0.0, 0.2),
        "reg_lambda": (0.0, 0.1),
        "min_child_weight": (1, 10),
        "num_boost_round": (100, 1000),
        "gamma": (0, 10),
    },
)

# init_points=20 initial probes, then n_iter=50 optimisation iterations.
xgbBO.maximize(init_points=20, n_iter=50)

In [None]:
# Display the optimizer's `max` attribute (best result found by the search).
xgbBO.max

In [None]:
# Hyperparameter values kept for the final XGBoost model
# (presumably copied from the optimizer's best result; confirm against xgbBO.max).
parametros = dict(
    gamma=0.14999791713495747,
    max_depth=40,
    min_child_weight=9,
    n_estimators=98,
    num_boost_round=685,
    reg_alpha=0.06268011871402068,
    reg_lambda=0.021174755161898484,
)

In [None]:
# Final XGBoost classifier built with the tuned hyperparameter values.
# Changes from the previous version of this cell:
#   * Only `xgbc_heart` is bound. The chained assignment also rebound
#     `xgb_classifier`, replacing the objective function with this model and
#     breaking any re-run of the hyperparameter search.
#   * `num_threads` / `subsample_freq` are not XGBClassifier parameters (they
#     are LightGBM names), so they are dropped; the thread count is given via
#     `n_jobs` and the seed via `random_state`.
xgbc_heart = XGBClassifier(
    booster='gbtree',
    eval_metric="auc",
    n_jobs=20,
    random_state=42,
    verbosity=0,
    max_depth=40,
    gamma=0.14999791713495747,
    min_child_weight=9,
    n_estimators=98,
    reg_alpha=0.06268011871402068,
    reg_lambda=0.021174755161898484,
)
xgbc_heart.fit(X_heart_treinamento,y_heart_treinamento)

In [None]:
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# Predict the test split with the XGBoost model; the bare name shows the predictions.
previsoes = xgbc_heart.predict(X_heart_teste)
previsoes

In [None]:
# Display the true test labels.
y_heart_teste


In [None]:
# XGBoost accuracy on the test split, expressed as a percentage
# (scaled by 100 to match the accuracy cells of the other models).
accuracy_score(y_heart_teste,previsoes)*100

In [None]:
# Yellowbrick confusion matrix for the XGBoost model:
# fit on the training split, then score on the test split.
# The visualizer gets its own name (cm_xgb, like cm_random / cm_knn) so that
# `xgbc_heart` keeps referring to the fitted model instead of being overwritten,
# and re-running this cell does not wrap a visualizer inside another one.
cm_xgb = ConfusionMatrix(xgbc_heart)
cm_xgb.fit(X_heart_treinamento, y_heart_treinamento)
cm_xgb.score(X_heart_teste, y_heart_teste)