### Init

In [94]:
!pip install imbalanced-learn



In [95]:
!pip install plotly



In [96]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [97]:
%matplotlib inline
import plotly.io as pio
pio.renderers.default = 'iframe'

### Get and format data

In [98]:
df_raw = pd.read_csv("heart.csv")

df_raw.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [99]:
df = df_raw.copy()
df = df.rename(columns={"Residence_type": "residence_type"})


df = df.drop(columns=['id'], axis=1)

# Male = 0, Female = 1
df['gender'] = np.where(df['gender'] == 'Male', 0, 1)

# Yes = 0, No = 1
df['ever_married'] = np.where(df['ever_married'] == 'Yes', 0, 1)

# Urban = 0, Rural = 1
df['residence_type'] = np.where(df['residence_type'] == 'Urban', 0, 1)

# LabelEncoder
# work_type ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
# smoking_status ['never smoked' 'formerly smoked' 'Unknown' 'smokes']
le = LabelEncoder()
df['work_type'] = le.fit_transform(df['work_type'])
df['smoking_status'] = le.fit_transform(df['smoking_status'])

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,28.0,0,0,0,2,0,79.53,31.1,2,0
1,0,33.0,0,0,0,2,1,78.44,23.9,1,0
2,1,42.0,0,0,0,2,1,103.0,40.3,0,0
3,0,56.0,0,0,0,2,0,64.87,28.8,2,0
4,1,24.0,0,0,1,2,1,73.36,28.8,2,0


### Get train and test

In [100]:
data = df.drop(columns=['stroke'], axis=1)
target = df[['stroke']]

In [101]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)

In [102]:
sm = SMOTE(random_state=1)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

### Pre processing

In [103]:
std_scaler = StandardScaler()
#x_train_res =  pd.DataFrame(std_scaler.fit_transform(x_train_res), 
#                            index=x_train_res.index, columns=x_train_res.columns)

x_train_res.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status
0,1,40.0,0,0,0,2,0,90.67,32.1,0
1,0,10.0,0,0,1,4,1,83.03,18.6,0
2,1,22.0,0,0,1,2,1,107.22,25.3,2
3,1,40.0,0,0,0,2,0,58.96,20.9,0
4,1,13.0,0,0,1,4,1,80.35,36.5,2


In [104]:
n_fatores = x_train_res.shape[1]
pca = PCA(n_components=n_fatores)

pca.fit(x_train_res)

In [105]:
pca.explained_variance_ratio_

array([7.75893042e-01, 2.08750309e-01, 1.43740344e-02, 4.29721335e-04,
       2.84285965e-04, 1.11713887e-04, 1.01686522e-04, 2.82618190e-05,
       1.87675468e-05, 8.17695179e-06])

In [106]:
fatores = [f'F{i+1}' for i in range(n_fatores) ]
fatores

['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10']

In [112]:
# plot da variancia por fator
fig = px.bar(x=fatores, y=pca.explained_variance_ratio_, text=np.around(pca.explained_variance_ratio_, decimals=2),
            title= "Screen Plot")
fig.update_layout(yaxis={'title': 'Procentagem de variância explicada','tickfont':{'size':15}},
                 xaxis={'title': 'Fatores','tickfont':{'size':15}},
                 title={'font':{'size': 25}})
fig.show()

In [108]:
autovalores = pca.explained_variance_ratio_ * n_fatores
autovalores

array([7.75893042e+00, 2.08750309e+00, 1.43740344e-01, 4.29721335e-03,
       2.84285965e-03, 1.11713887e-03, 1.01686522e-03, 2.82618190e-04,
       1.87675468e-04, 8.17695179e-05])

In [109]:
# Com o método de raízes latentes, defina quantos fatores serão utilizados para criar 
# o ranking e imprima o Screenplot.
# http://www.leg.ufpr.br/lib/exe/fetch.php/disciplinas:ppgea2012:pca_teoria.pdf
# Seleção do número de componentes
# * Kaiser ou Método da Raiz Latente: autovalores maiores que 1 (Johnson = 0,7);
# Selecionamos apenas fatores que possuem autovalor > 1, pois assim já teremos um PCA que 'explica' 
# em torno de 98% do conjunto de dados

fatores_selecionados = ['Fator selecionado' if autovalor > 1 else 'Fator não selecionado' for autovalor in autovalores]
fatores_selecionados

['Fator selecionado',
 'Fator selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado']

In [114]:
pca = PCA(n_components=2)

x_train_pca = pca.fit_transform(x_train_res)

### Processing

In [110]:
# tratamento de dados / std sclaer, etc
# pca
# grid search c/ validação cruzada passando a metrica correta, retornando modelos treinados e aplicando automaticamente o pruning
# para as figuras de merito usar funcão classification report do melhor modelo

In [111]:
# decision tree classifier
# https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py