# PyCaret: Porto Seguro (Classification) - Final

Based on: https://github.com/pycaret/pycaret/blob/master/tutorials/Tutorial%20-%20Binary%20Classification.ipynb

Version used for final presentation.

Used "APC" as the evaluation metric (to align with H2O) - a.k.a. "average_precision", i.e. the area under the precision-recall curve.

See: https://github.com/pycaret/pycaret/issues/806

"The area under the Precision-Recall curve (AUPRC) is very useful metric for class imbalance. Here is the source (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html)."

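Early stopping was not used: PyCaret only supports it in "tune_model" (hyperparameter tuning, HPT) for a single given model, not when comparing models. The relevant "tune_model" parameter is documented as: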

```
early_stopping: bool or str or object, default = False
        Use early stopping to stop fitting to a hyperparameter configuration
        if it performs poorly. Ignored when ``search_library`` is scikit-learn,
        or if the estimator does not have 'partial_fit' attribute. If False or
        None, early stopping will not be used. Can be either an object accepted
        by the search library or one of the following:
    
        - 'asha' for Asynchronous Successive Halving Algorithm
        - 'hyperband' for Hyperband
        - 'median' for Median Stopping Rule
        - If False or None, early stopping will not be used.
```

In [1]:
# Name of the target (label) column; it just happens to be called "target" in this dataset.
label = "target"

# Evaluation metric used to rank models. "APC" is the display name under which
# average precision is registered with add_metric in the Model section.
metric = "APC"

# Training budget handed to compare_models(budget_time=...); 60 is intended
# as a 1-hour cap (confirm with timer).
time_limit = 60

# Early-stopping algorithm name. No cell shown in this notebook reads it:
# early stopping only applies to tune_model (see the notes above).
stopping = "hyperband"

## Import

In [13]:
# ! pip3 uninstall snappy
# ! pip3 uninstall python-snappy
# ! pip3 install --user python-snappy  # Successfully installed python-snappy-0.6.1

In [20]:
import snappy

In [21]:
# Import packages

from pycaret.classification import *
# from sklearn.metrics import average_precision_score  # added for AUCPR (?)

# import numpy as np
# import pandas as pd
# import pycaret

AttributeError: module 'snappy' has no attribute 'compress'

In [7]:
from sklearn.metrics import average_precision_score  # added for AUCPR (?)

In [6]:
import numpy as np

In [5]:
import pandas as pd

In [4]:
import pycaret

In [None]:
# Load the training and test splits from CSV files in the working directory
# (training file first, then the test file).
train_data, test_data = [
    pd.read_csv(csv_path) for csv_path in ("porto_train.csv", "porto_test.csv")
]

In [None]:
# Show the (rows, columns) dimensions of the training frame as the cell output
train_data.shape

In [None]:
# Show the (rows, columns) dimensions of the test frame as the cell output
test_data.shape

## Tidy

In [None]:
# The value -1 marks a missing entry in these files; swap it for NaN in both
# frames so it is treated as a missing value instead of a real number.
train_data, test_data = [
    frame.replace(to_replace=-1, value=np.nan) for frame in (train_data, test_data)
]

## Transform

In [None]:
# Drop the variables that are not used as predictors: "id" from both frames,
# plus "fold", which is removed from the training frame only.
train_data = train_data.drop(columns=["id", "fold"])
test_data = test_data.drop(columns=["id"])

In [None]:
# Collect the categorical variables (to be cast to the "category" dtype in the
# next cell): every training column whose name contains the substring 'cat'.
cat_vars = list(filter(lambda name: 'cat' in name, train_data.columns))
cat_vars

In [None]:
# Cast every column listed in cat_vars to the "category" dtype, first in the
# test frame and then in the training frame.
#
# "target" is deliberately NOT added to cat_vars; doing so failed with:
#   TypeError: 'Categorical' with dtype category does not support reduction 'mean'
#   (triggered by: cat_vars = cat_vars + ["target"])
for frame in (test_data, train_data):
    for col in cat_vars:
        frame[col] = frame[col].astype('category')

In [None]:
# Create separate test data to demonstrate how to make predictions on new examples at inference time:
# y_test = test_data[label]  # values to predict

## Visualize

(skipped)

## Model

In [None]:
%%time

# Initialise the PyCaret experiment on the training frame, predicting `label`.
# session_id is held at a fixed value (123) so reruns use the same session.
s = setup(data=train_data, target=label, session_id=123)

# Register scikit-learn's average_precision_score as a custom metric with
# id "apc" and display name "APC" (the name stored in `metric`). It is scored
# on "pred_proba" rather than on the default prediction target.
add_metric(
    id="apc",
    name="APC",
    score_func=average_precision_score,
    target="pred_proba",
)

In [None]:
# Display all the models available to this experiment as the cell output
models()

In [None]:
%%time

# Compare the baseline models, ranked by the chosen evaluation metric and
# capped by the time_limit budget; keep the top-ranked model as `best`.
best = compare_models(
    sort=metric,
    budget_time=time_limit,
)

In [None]:
# Score the new (test) data with the best model from the comparison above
predictions = predict_model(estimator=best, data=test_data)

### AutoML

This function returns the best model out of all trained models in the current setup based on the optimize parameter. Metrics evaluated can be accessed using the `get_metrics` function.

In [None]:
%%time

# Return (and display) the best model among all models trained in the current
# setup, selected by the evaluation metric chosen in the configuration cell.
automl(optimize=metric)

## Communicate

In [None]:
# Confusion matrix for the best model
plot_model(estimator=best, plot='confusion_matrix')

In [None]:
# AUC plot for the best model
plot_model(estimator=best, plot='auc')

In [None]:
# Precision-recall curve for the best model (the curve behind the "APC" metric)
plot_model(estimator=best, plot='pr')

In [None]:
# Feature importance for the best model
plot_model(estimator=best, plot='feature')

In [None]:
# Classification report for the best model
plot_model(estimator=best, plot='class_report')