### Init

In [109]:
!pip install imbalanced-learn



In [110]:
!pip install plotly



In [170]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer

In [112]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import plotly.io as pio
# Show plotly figures through the 'iframe' renderer.
pio.renderers.default = 'iframe'

### Get and format data

In [113]:
# Load the raw dataset from the CSV file in the working directory.
df_raw = pd.read_csv("heart.csv")

# Preview the first rows.
df_raw.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [114]:
# Work on a copy so df_raw stays untouched, normalise the casing of
# "Residence_type" and drop the `id` column.
df = (
    df_raw
    .copy()
    .rename(columns={"Residence_type": "residence_type"})
    .drop(columns=['id'])
)

# Binary encodings. Note that any value other than the one tested
# (including missing values) is mapped to 1.
# gender: Male = 0, everything else = 1
df['gender'] = np.where(df['gender'] == 'Male', 0, 1)

# ever_married: Yes = 0, everything else = 1
df['ever_married'] = np.where(df['ever_married'] == 'Yes', 0, 1)

# residence_type: Urban = 0, everything else = 1
df['residence_type'] = np.where(df['residence_type'] == 'Urban', 0, 1)

# Label encoding of the multi-category columns. LabelEncoder numbers the
# classes in sorted order, so (assuming only the values below are present):
#   work_type:      Govt_job=0, Never_worked=1, Private=2, Self-employed=3, children=4
#   smoking_status: Unknown=0, formerly smoked=1, never smoked=2, smokes=3
# Each column gets its own encoder so both fitted mappings remain available
# (e.g. for inverse_transform); refitting a single encoder would discard the
# work_type mapping.
le_work_type = LabelEncoder()
le_smoking_status = LabelEncoder()
df['work_type'] = le_work_type.fit_transform(df['work_type'])
df['smoking_status'] = le_smoking_status.fit_transform(df['smoking_status'])

# Backward compatibility: `le` still refers to the encoder fitted last.
le = le_smoking_status

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,28.0,0,0,0,2,0,79.53,31.1,2,0
1,0,33.0,0,0,0,2,1,78.44,23.9,1,0
2,1,42.0,0,0,0,2,1,103.0,40.3,0,0
3,0,56.0,0,0,0,2,0,64.87,28.8,2,0
4,1,24.0,0,0,1,2,1,73.36,28.8,2,0


### Get train and test

In [115]:
# Features: every column except the label. Target: the `stroke` column,
# kept as a single-column DataFrame.
target = df.loc[:, ['stroke']]
data = df.drop(columns=['stroke'])

In [116]:
# Hold out 20% of the rows for testing; the seed makes the split repeatable.
# NOTE(review): the split is not stratified on the target — with a rare
# positive class, consider passing stratify=target.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)

In [117]:
# Oversample with SMOTE on the training split only; the test split is left as is.
# NOTE(review): SMOTE creates synthetic rows by interpolation, so the
# integer-encoded categorical columns may receive non-integer values —
# confirm whether SMOTENC would be more appropriate here.
sm = SMOTE(random_state=1)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

### Pre processing

In [118]:
# Fit the scaler on the resampled training features and keep the result as a
# DataFrame with the same index and column labels.
std_scaler = StandardScaler()
scaled_train = std_scaler.fit_transform(x_train_res)
x_train_res = pd.DataFrame(
    scaled_train,
    index=x_train_res.index,
    columns=x_train_res.columns,
)

x_train_res.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status
0,1.025548,-0.649553,-0.216724,-0.138696,-0.4531,0.015847,-0.801519,-0.173837,0.532449,-1.381864
1,-0.975089,-2.008879,-0.216724,-0.138696,2.20702,2.203883,1.247631,-0.359487,-1.749565,-1.381864
2,1.025548,-1.465148,-0.216724,-0.138696,2.20702,0.015847,1.247631,0.228325,-0.61701,0.726023
3,1.025548,-0.649553,-0.216724,-0.138696,-0.4531,0.015847,-0.801519,-0.944384,-1.360777,-1.381864
4,1.025548,-1.872946,-0.216724,-0.138696,2.20702,2.203883,1.247631,-0.424611,1.276217,0.726023


In [119]:
# Fit a PCA that keeps as many components as there are feature columns,
# so the variance contribution of every factor can be inspected.
_, n_fatores = x_train_res.shape
pca = PCA(n_components=n_fatores)

pca.fit(x_train_res)

In [120]:
# Fraction of the total variance explained by each principal component.
pca.explained_variance_ratio_

array([0.23801727, 0.12658837, 0.10694918, 0.09605588, 0.09480419,
       0.08867193, 0.08544016, 0.07114052, 0.06537144, 0.02696106])

In [121]:
# Axis labels for the factors: F1, F2, ..., F<n_fatores>.
fatores = [f'F{numero}' for numero in range(1, n_fatores + 1)]
fatores

['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10']

In [122]:
# Scree plot: fraction of the total variance explained by each factor.
# (Title corrected from "Screen Plot" and axis label typo fixed.)
fig = px.bar(
    x=fatores,
    y=pca.explained_variance_ratio_,
    text=np.around(pca.explained_variance_ratio_, decimals=2),
    title="Scree Plot",
)
fig.update_layout(
    yaxis={'title': 'Porcentagem de variância explicada', 'tickfont': {'size': 15}},
    xaxis={'title': 'Fatores', 'tickfont': {'size': 15}},
    title={'font': {'size': 25}},
)
fig.show()

In [123]:
# Rescale the explained-variance ratios so they sum to the number of factors.
# NOTE(review): for standardised inputs this presumably corresponds to the
# eigenvalues of the correlation matrix, which is what the eigenvalue > 1
# rule is applied to — confirm.
autovalores = pca.explained_variance_ratio_ * n_fatores
autovalores

array([2.3801727 , 1.26588372, 1.06949182, 0.96055881, 0.94804191,
       0.88671934, 0.85440158, 0.71140519, 0.65371436, 0.26961057])

In [124]:
# Using the latent-root method, decide how many factors will be used to build
# the ranking and print the scree plot.
# Reference: http://www.leg.ufpr.br/lib/exe/fetch.php/disciplinas:ppgea2012:pca_teoria.pdf
# Choosing the number of components:
# * Kaiser / latent-root method: eigenvalues greater than 1 (Johnson = 0.7).
# Only factors with eigenvalue > 1 are flagged as selected.
# NOTE(review): an earlier version of this comment claimed the selected factors
# explain around 98% of the data; the three factors with eigenvalue > 1
# actually account for roughly 47% of the variance (0.238 + 0.127 + 0.107).

fatores_selecionados = ['Fator selecionado' if autovalor > 1 else 'Fator não selecionado' for autovalor in autovalores]
fatores_selecionados

['Fator selecionado',
 'Fator selecionado',
 'Fator selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado',
 'Fator não selecionado']

In [125]:
# Cumulative fraction of variance explained by the first k factors.
# np.cumsum computes the running total in a single pass instead of
# re-summing every prefix; .tolist() keeps the result a plain Python list.
varianca_acumulada = np.cumsum(pca.explained_variance_ratio_).tolist()
varianca_acumulada

[0.2380172699672567,
 0.36460564242805105,
 0.47155482450805825,
 0.5676107055543025,
 0.662414896669617,
 0.7510868310302363,
 0.8365269887643685,
 0.9076675074942089,
 0.9730389430178454,
 1.0000000000000002]

In [126]:
# Bar chart of the cumulative explained variance per factor.
# When all factors are kept, the last bar should reach 1.
# The y-axis label now states that the values are cumulative (it previously
# reused the per-factor label, typo included).
fig = px.bar(
    x=fatores,
    y=varianca_acumulada,
    text=np.around(varianca_acumulada, decimals=2),
    title="Scree Plot",
)
fig.update_layout(
    yaxis={'title': 'Porcentagem de variância explicada acumulada', 'tickfont': {'size': 15}},
    xaxis={'title': 'Fatores', 'tickfont': {'size': 15}},
    title={'font': {'size': 25}},
)
fig.show()

### Processing

In [171]:
# Keep the first 7 principal components; according to the cumulative variance
# computed earlier in this notebook they retain about 84% of the variance.
n_fatores = 7

# random_state is fixed because, with svd_solver='auto', PCA may pick a
# randomized SVD solver; the seed keeps the projection reproducible.
pca = PCA(n_components=n_fatores, random_state=1)

x_train_pca = pca.fit_transform(x_train_res)

x_train_pca.shape

(23472, 7)

In [172]:
# Base estimator for the grid search. random_state is fixed because the tree
# builder permutes features randomly at each split (even with the default
# splitter), so an unseeded tree is not reproducible across runs.
dt = DecisionTreeClassifier(random_state=1)

In [173]:
# Scorer that computes the weighted-average F1 score, for use with GridSearchCV.
f1 = make_scorer(f1_score, average = 'weighted')

In [177]:
# Hyper-parameter grid for the decision tree.
# 'log_loss' is not searched: for DecisionTreeClassifier it denotes the same
# Shannon information-gain criterion as 'entropy', so including both only
# repeats work.
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function used to measure the quality of a split
    'max_depth': range(1, 10, 2),  # Maximum depth of the tree (1, 3, 5, 7, 9)
    'ccp_alpha': np.linspace(0, 0.3, num=5),  # Complexity parameter for minimal cost-complexity pruning
    'min_impurity_decrease': [0.0, 0.1, 0.2],  # A node is split only if the split decreases impurity by at least this value
}

# Exhaustive search with 10-fold cross-validation, scored with the weighted F1 scorer.
# NOTE(review): the training data was oversampled, scaled and projected before
# cross-validation, so validation folds are not independent of the training
# folds and the CV scores are likely optimistic. Consider wrapping
# SMOTE / scaler / PCA / tree in an imblearn Pipeline so each step is fitted
# per fold.
grid_dt = GridSearchCV(dt, param_grid, scoring=f1, cv=10)
grid_dt.fit(x_train_pca, y_train_res)


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted sa

In [178]:
# Report the best hyper-parameters found by the grid search.
# The label previously said "KNN", but the tuned model is a decision tree.
print(f"Melhores parâmetros Decision Tree: {grid_dt.best_params_}")

Melhores parâmetros KNN: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': 9, 'min_impurity_decrease': 0.0}


### Testing

In [179]:
# Apply the scaler and the PCA that were fitted on the training data to the
# test features (transform only, no refitting).
scaled_test = std_scaler.transform(x_test)
x_test_std_scaler = pd.DataFrame(
    scaled_test,
    index=x_test.index,
    columns=x_test.columns,
)
x_test_pca = pca.transform(x_test_std_scaler)

In [180]:
# Predict the test labels with the fitted grid-search object.
y_pred = grid_dt.predict(x_test_pca)

In [181]:
# Score on the test split. The grid search was created with scoring=f1, so this
# is the weighted F1 score rather than accuracy.
grid_dt.score(x_test_pca, y_test)

0.8606776969754334

In [182]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.82      0.89      2936
           1       0.09      0.42      0.15       125

    accuracy                           0.81      3061
   macro avg       0.53      0.62      0.52      3061
weighted avg       0.94      0.81      0.86      3061



In [183]:
# Split treino e teste
# Smote (apenas treino)
# Std Scaler
# PCA 7 componentes
# Treino
# Predict no modelo

In [184]:
# tratamento de dados / std scaler, etc
# pca
# grid search c/ validação cruzada passando a métrica correta, retornando modelos treinados e aplicando automaticamente o pruning
# para as figuras de mérito usar a função classification_report do melhor modelo

In [185]:
# decision tree classifier
# https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py