### Init

In [17]:
!pip install imbalanced-learn



In [18]:
!pip install plotly



In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer

In [20]:
%matplotlib inline
import plotly.io as pio
pio.renderers.default = 'iframe'

### Get and format data

In [21]:
df_raw = pd.read_csv("heart.csv")

df_raw.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [22]:
df = df_raw.copy()
df = df.rename(columns={"Residence_type": "residence_type"})


df = df.drop(columns=['id'], axis=1)

# Male = 0, Female = 1
df['gender'] = np.where(df['gender'] == 'Male', 0, 1)

# Yes = 0, No = 1
df['ever_married'] = np.where(df['ever_married'] == 'Yes', 0, 1)

# Urban = 0, Rural = 1
df['residence_type'] = np.where(df['residence_type'] == 'Urban', 0, 1)

# LabelEncoder
# work_type ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
# smoking_status ['never smoked' 'formerly smoked' 'Unknown' 'smokes']
le = LabelEncoder()
df['work_type'] = le.fit_transform(df['work_type'])
df['smoking_status'] = le.fit_transform(df['smoking_status'])

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,28.0,0,0,0,2,0,79.53,31.1,2,0
1,0,33.0,0,0,0,2,1,78.44,23.9,1,0
2,1,42.0,0,0,0,2,1,103.0,40.3,0,0
3,0,56.0,0,0,0,2,0,64.87,28.8,2,0
4,1,24.0,0,0,1,2,1,73.36,28.8,2,0


### Get train and test

In [23]:
data = df.drop(columns=['stroke'], axis=1)
target = df[['stroke']]

In [24]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1)

In [25]:
sm = SMOTE(random_state=1)
#x_train, y_train = sm.fit_resample(x_train, y_train)

### Pre processing

### Processing

In [26]:
dt = DecisionTreeClassifier()

In [27]:
f1 = make_scorer(f1_score, average = 'weighted')

In [92]:
'''param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],  # Criterion to measure the quality of a split
    'splitter': ['best', 'random'],  # Strategy used to choose the split at each node
    'max_depth': range(5, 50, 5),  # Maximum depth of the tree
    'min_samples_split': [2, 10, 20],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 5, 10],  # Minimum number of samples required to be at a leaf node
    'max_features': [None, 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'max_leaf_nodes': [None, 10, 20, 30],  # Grow a tree with `max_leaf_nodes` in best-first fashion
    'min_impurity_decrease': [0.0, 0.1, 0.2], # A node will be split if this split induces a decrease of the impurity greater than or equal to this value
}'''

param_grid = {
    #'max_depth': range(1, 10, 2),  # Maximum depth of the tree
    #'ccp_alpha': np.linspace(0, 0.1, num=5),  # Number of features to consider when looking for the best split
    #'min_samples_leaf': range(1, 10, 2),  # Minimum number of samples required to be at a leaf node
    #'max_leaf_nodes': [None, 5, 10, 15, 20, 25, 30],  # Grow a tree with `max_leaf_nodes` in best-first fashion
    'criterion': ['gini', 'entropy', 'log_loss'],

}

grid_dt = GridSearchCV(dt, param_grid, scoring='recall', cv=5)
grid_dt.fit(x_train, y_train)

In [93]:
print(f"Melhores parâmetros KNN: {grid_dt.best_params_}")

Melhores parâmetros KNN: {'criterion': 'gini'}


### Testing

In [94]:
y_pred = grid_dt.predict(x_test)

In [95]:
grid_dt.score(x_test, y_test)

0.216

In [96]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2936
           1       0.19      0.22      0.20       125

    accuracy                           0.93      3061
   macro avg       0.58      0.59      0.58      3061
weighted avg       0.93      0.93      0.93      3061



In [97]:
# Split treino e teste.
# Smote (apenas treino)
# Std Scaler
# PCA 7 componentes
# Treino
# Predict no modelo

In [98]:
# tratamento de dados / std sclaer, etc
# pca
# grid search c/ validação cruzada passando a metrica correta, retornando modelos treinados e aplicando automaticamente o pruning
# para as figuras de merito usar funcão classification report do melhor modelo

In [99]:
# decision tree classifier
# https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py