# AutoGluon: Porto Seguro (Classification) # 3

Based on: https://www.linkedin.com/learning/automl-build-production-ready-models-quickly/

In [1]:
# Run configuration read by the cells below
label = "target"  # name of the column to predict (it just happens to be called "target" in this dataset)
metric = "roc_auc"  # evaluation metric handed to TabularPredictor
time_limit = 60 * 60  # training budget in seconds: 1 hour max (confirm with timer)

## Import

In [2]:
# Import packages

from autogluon.tabular import TabularDataset
from autogluon.tabular import TabularPredictor

import numpy as np
import pandas as pd

In [3]:
# Load the train and test splits from CSV files in the working directory
train_data, test_data = (
    TabularDataset(csv_path) for csv_path in ("porto_train.csv", "porto_test.csv")
)

In [4]:
# Dimensions of the training frame as (rows, columns)
train_data.shape

(476170, 60)

In [5]:
# Dimensions of the test frame as (rows, columns)
test_data.shape

(119042, 59)

## Tidy

In [6]:
# Recode the value -1 as a missing value (NaN) in both frames
train_data = train_data.replace(to_replace=-1, value=np.nan)
test_data = test_data.replace(to_replace=-1, value=np.nan)

## Transform

In [7]:
# Drop the variables that are not used as predictors:
# "id" and "fold" from the training frame, "id" from the test frame
train_data = train_data.drop(columns=["id", "fold"])
test_data = test_data.drop(columns=["id"])

In [8]:
# Collect the names of the categorical variables, i.e. every training column
# whose name contains "cat" (the dtype conversion itself happens in the next cell)
cat_vars = [name for name in train_data.columns if 'cat' in name]
cat_vars

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [9]:
# Cast the categorical columns to the pandas "category" dtype.
#
# Only the feature columns are cast in the test frame. Filtering out `label`
# keeps this cell idempotent: `cat_vars` is extended with the label below, so
# without the filter a re-run would also cast the test label and append the
# label to `cat_vars` a second time.
feature_cat_vars = [col for col in cat_vars if col != label]

for col in feature_cat_vars:
    test_data[col] = test_data[col].astype('category')

# In the training frame the label column is cast as well. Use the `label`
# config variable rather than a hard-coded column name so the two cannot drift.
cat_vars = feature_cat_vars + [label]

for col in cat_vars:
    train_data[col] = train_data[col].astype('category')

In [10]:
# Keep the true label values of the test frame so predictions can be scored against them later
y_test = test_data.loc[:, label]

## Visualize

(skipped)

## Model

In [11]:
%%time

predictor = TabularPredictor(
    label=label, 
    eval_metric=metric
).fit(
    train_data,
    time_limit=time_limit
)

No path specified. Models will be saved in: "AutogluonModels/ag-20230713_022955/"
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20230713_022955/"
AutoGluon Version:  0.7.0
Python Version:     3.9.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu May 4 15:21:22 UTC 2023
Train Data Rows:    476170
Train Data Columns: 57
Label Column: target
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    71251.76 MB
	Tr

CPU times: user 6h 31min 7s, sys: 3min 2s, total: 6h 34min 9s
Wall time: 13min 14s


In [12]:
# Use the trained predictor to compute predicted class probabilities for the test data
y_pred = predictor.predict_proba(data=test_data)

In [13]:
# Score the predicted probabilities against the true test labels.
# auxiliary_metrics=True reports further metrics (accuracy, f1, ...) next to the eval metric.
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

  _warn_prf(average, modifier, msg_start, len(result))
Evaluation: roc_auc on test data: 0.6318303042376591
Evaluations on test data:
{
    "roc_auc": 0.6318303042376591,
    "accuracy": 0.9635842811780716,
    "balanced_accuracy": 0.5,
    "mcc": 0.0,
    "f1": 0.0,
    "precision": 0.0,
    "recall": 0.0
}


In [14]:
%%time

# Evaluate the performance of each individual trained model on test data:
predictor.leaderboard(test_data, silent=True)

CPU times: user 16min 37s, sys: 7.98 s, total: 16min 45s
Wall time: 28.1 s


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.635058,0.602004,0.276031,0.026782,50.866588,0.276031,0.026782,50.866588,1,True,7
1,XGBoost,0.634824,0.60147,0.901016,0.037667,8.10966,0.901016,0.037667,8.10966,1,True,11
2,WeightedEnsemble_L2,0.63183,0.61235,4.955228,0.358054,705.195761,0.009349,0.00117,1.383225,2,True,14
3,NeuralNetFastAI,0.623164,0.590777,1.451422,0.0802,354.257713,1.451422,0.0802,354.257713,1,True,10
4,LightGBMXT,0.621983,0.591881,0.201733,0.014593,4.68571,0.201733,0.014593,4.68571,1,True,3
5,LightGBMLarge,0.621455,0.587142,0.1607,0.013649,3.417172,0.1607,0.013649,3.417172,1,True,13
6,NeuralNetTorch,0.62074,0.594228,1.161072,0.062192,277.335234,1.161072,0.062192,277.335234,1,True,12
7,LightGBM,0.617411,0.581836,0.148699,0.022847,2.287585,0.148699,0.022847,2.287585,1,True,4
8,ExtraTreesEntr,0.612614,0.60184,1.156338,0.150044,13.24334,1.156338,0.150044,13.24334,1,True,9
9,ExtraTreesGini,0.609756,0.585093,2.085407,0.14587,12.932863,2.085407,0.14587,12.932863,1,True,8


In [15]:
# Name of the model the predictor reports as its best one
predictor.get_model_best()

'WeightedEnsemble_L2'

In [16]:
%%time

test_leaderboard = predictor.leaderboard(
    test_data, 
    extra_metrics=["accuracy", "balanced_accuracy"], 
    silent=True, 
    extra_info=True
)

CPU times: user 16min 56s, sys: 11.4 s, total: 17min 7s
Wall time: 41.8 s


In [17]:
# Transpose for readability: one column per model, one row per metric/attribute
test_leaderboard.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
model,CatBoost,XGBoost,WeightedEnsemble_L2,NeuralNetFastAI,LightGBMXT,LightGBMLarge,NeuralNetTorch,LightGBM,ExtraTreesEntr,ExtraTreesGini,RandomForestEntr,RandomForestGini,KNeighborsUnif,KNeighborsDist
score_test,0.635058,0.634824,0.63183,0.623164,0.621983,0.621455,0.62074,0.617411,0.612614,0.609756,0.607943,0.606315,0.508486,0.507837
accuracy,0.963609,0.963601,0.963584,0.963542,0.963584,0.963584,0.963584,0.963584,0.963584,0.963584,0.963584,0.963584,0.963257,0.963257
balanced_accuracy,0.500457,0.500231,0.5,0.499978,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500111,0.49983,0.49983
score_val,0.602004,0.60147,0.61235,0.590777,0.591881,0.587142,0.594228,0.581836,0.60184,0.585093,0.581938,0.582968,0.503897,0.504697
pred_time_test,0.261465,0.907897,5.116733,1.445062,0.192555,0.158141,1.235246,0.139672,1.257729,1.412738,1.140813,1.357284,8.246748,8.940786
pred_time_val,0.026782,0.037667,0.358054,0.0802,0.014593,0.013649,0.062192,0.022847,0.150044,0.14587,0.158861,0.157297,0.992754,0.810642
fit_time,50.866588,8.10966,705.195761,354.257713,4.68571,3.417172,277.335234,2.287585,13.24334,12.932863,21.681599,19.928732,3.993556,0.727796
pred_time_test_marginal,0.261465,0.907897,0.009333,1.445062,0.192555,0.158141,1.235246,0.139672,1.257729,1.412738,1.140813,1.357284,8.246748,8.940786
pred_time_val_marginal,0.026782,0.037667,0.00117,0.0802,0.014593,0.013649,0.062192,0.022847,0.150044,0.14587,0.158861,0.157297,0.992754,0.810642
