# Prática de AutoML usando a biblioteca PyCaret

## Instalação do PyCaret

In [1]:
!pip install -q pycaret

## Importação de bibliotecas

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score

from pycaret import classification
from pycaret.classification import setup

## Carregamento dos dados

In [3]:
# Location of the raw CSV file on GitHub.
url_arquivo = 'https://raw.githubusercontent.com/regispires/minicurso-ia-wtisc-2023/main/datasets/healthcare-dataset-stroke-data.csv'
# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url_arquivo)

## Visualização dos dados

In [4]:
# Display the DataFrame as the cell output; pandas renders a truncated preview
# (first and last rows) rather than all 5110 rows.
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
# Summarise the columns: non-null counts and dtypes. In the recorded output
# 'bmi' is the only column with missing values (4909 non-null out of 5110).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


## Remoção do atributo 'id', que não é útil, pois é único para cada amostra

In [6]:
# Rebind instead of mutating in place so this cell can be re-run safely:
# errors='ignore' turns the drop into a no-op when 'id' is already gone, whereas
# the in-place version raised KeyError on a second execution.
df = df.drop(columns='id', errors='ignore')

## Definição de features (atributos, características, X) e rótulos (targets, y)

In [7]:
# Target vector: the 'stroke' column.
y = df['stroke']
# Feature matrix: every remaining column.
X = df.drop(columns='stroke')

## Divisão do conjunto de dados em conjunto de treino e conjunto de teste
A divisão é feita de forma estratificada (_stratify=y_) para que o mesmo percentual de cada classe existente no conjunto de dados completo também ocorra nos conjuntos de treino e teste.

In [8]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Show the shape of each partition as a tuple of shapes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4088, 10), (1022, 10), (4088,), (1022,))

### Percentual de cada classe no conjunto de dados completo

In [9]:
# Relative frequency of each class in the full dataset; the recorded output shows
# a strong imbalance (about 95% class 0 versus about 5% class 1).
y.value_counts(normalize=True)

0    0.951272
1    0.048728
Name: stroke, dtype: float64

In [10]:
# Re-attach the target column to the training features, producing the single
# frame that is later handed to PyCaret's setup() together with the target name.
train_data = pd.concat([X_train, y_train], axis='columns')
train_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
845,Female,48.0,0,0,Yes,Private,Urban,69.21,33.1,never smoked,0
3744,Male,15.0,0,0,No,Private,Rural,122.25,21.0,never smoked,0
4183,Female,67.0,0,0,Yes,Self-employed,Rural,110.42,24.9,never smoked,0
3409,Male,44.0,0,0,Yes,Private,Urban,65.41,24.8,smokes,0
284,Male,14.0,0,0,No,Govt_job,Urban,82.34,31.6,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...
1434,Female,45.0,0,0,Yes,Private,Urban,92.86,35.1,formerly smoked,0
461,Female,16.0,0,0,No,children,Rural,113.47,19.5,Unknown,0
1052,Female,61.0,0,0,Yes,Private,Rural,78.65,36.2,formerly smoked,0
1757,Male,31.0,0,0,Yes,Private,Urban,74.05,26.0,Unknown,0


## Configuração do PyCaret para criação de modelos de classificação

In [11]:
# Initialise the PyCaret classification experiment on the training frame only;
# the held-out test set is never passed to PyCaret here.
s = classification.setup(
    data=train_data,
    target='stroke',
    session_id=123,      # fixed seed for reproducible runs
    normalize=True,      # rescale the features
    fix_imbalance=True,  # rebalance classes (transformed train set grows to 5444 rows in the recorded output)
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,stroke
2,Target type,Binary
3,Original data shape,"(4088, 11)"
4,Transformed data shape,"(6671, 20)"
5,Transformed train set shape,"(5444, 20)"
6,Transformed test set shape,"(1227, 20)"
7,Ordinal features,2
8,Numeric features,5
9,Categorical features,5


## Criação e avaliação dos modelos pelo PyCaret

In [12]:
# Let PyCaret train and score its candidate classifiers and rank the leaderboard
# by F1 (accuracy would be misleading on such an imbalanced target).
# NOTE(review): best_model is presumably the top-ranked estimator, i.e. Logistic
# Regression in the recorded output - confirm against the PyCaret docs.
best_model = classification.compare_models(sort='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7449,0.8276,0.7626,0.1325,0.2257,0.1558,0.2429,1.335
lda,Linear Discriminant Analysis,0.7361,0.8304,0.7841,0.1314,0.2249,0.1545,0.2457,0.373
svm,SVM - Linear Kernel,0.7372,0.0,0.7769,0.1311,0.2242,0.1538,0.2435,0.511
ridge,Ridge Classifier,0.734,0.0,0.7769,0.1292,0.2214,0.1506,0.2407,0.276
dt,Decision Tree Classifier,0.9105,0.5674,0.1874,0.1665,0.174,0.1278,0.1288,0.37
knn,K Neighbors Classifier,0.8539,0.6332,0.2934,0.1102,0.1598,0.0969,0.1117,0.841
nb,Naive Bayes,0.1975,0.7842,0.9929,0.0568,0.1075,0.0172,0.0895,0.701
xgboost,Extreme Gradient Boosting,0.9423,0.8014,0.0654,0.251,0.1026,0.0812,0.1032,1.364
qda,Quadratic Discriminant Analysis,0.7672,0.6005,0.2648,0.0705,0.087,0.0266,0.039,0.337
lightgbm,Light Gradient Boosting Machine,0.9434,0.8088,0.0429,0.17,0.068,0.0503,0.0642,0.732


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

## Predição dos rótulos para os dados do conjunto de teste

In [13]:
# Score the held-out features with the selected model. The returned frame holds
# the input columns plus 'prediction_label' and 'prediction_score'.
result = classification.predict_model(best_model, data=X_test)
result

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,prediction_label,prediction_score
3725,Male,63.0,0,0,Yes,Private,Rural,78.230003,34.799999,never smoked,0,0.5748
4481,Female,43.0,0,0,Yes,Private,Urban,86.669998,33.299999,never smoked,0,0.8184
1545,Female,23.0,0,0,No,Private,Urban,126.669998,28.700001,smokes,0,0.8776
1820,Female,21.0,0,0,No,Private,Urban,208.169998,24.900000,never smoked,0,0.9186
1262,Male,67.0,0,0,Yes,Private,Rural,113.339996,26.299999,formerly smoked,1,0.5917
...,...,...,...,...,...,...,...,...,...,...,...,...
1042,Female,82.0,0,0,No,Self-employed,Urban,82.209999,26.000000,never smoked,1,0.8878
1171,Female,34.0,0,0,No,Govt_job,Rural,120.059998,33.000000,never smoked,0,0.8712
3199,Female,50.0,0,0,Yes,Govt_job,Urban,92.150002,20.799999,never smoked,0,0.7417
3208,Male,15.0,0,0,No,children,Rural,62.570000,32.299999,never smoked,0,0.9588


In [14]:
# Keep only the predicted class labels as a Series, indexed like X_test.
y_pred = result.loc[:, 'prediction_label']
y_pred

3725    0
4481    0
1545    0
1820    0
1262    1
       ..
1042    1
1171    0
3199    0
3208    0
1819    0
Name: prediction_label, Length: 1022, dtype: int64

In [15]:
# Number of test samples assigned to each predicted class.
y_pred.value_counts()

0    740
1    282
Name: prediction_label, dtype: int64

## Cálculo da acurácia do modelo para o conjunto de teste usando a biblioteca scikit-learn

In [16]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7534246575342466

## Cálculo da métrica _F1 Score_ para o conjunto de teste usando a biblioteca scikit-learn

In [17]:
# F1 score on the test set, computed with f1_score's default settings.
f1_score(y_true=y_test, y_pred=y_pred)

0.24096385542168677

## Mostrando um relatório com algumas métricas de classificação sobre o conjunto de teste

In [18]:
# Per-class precision, recall, F1 and support for the test set; print() is needed
# because the report is a pre-formatted multi-line string.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.75      0.85       972
           1       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.78      0.55      1022
weighted avg       0.95      0.75      0.82      1022

