# AutoGluon: Porto Seguro (Classification) # 1

Based on: https://auto.gluon.ai/0.1.0/tutorials/tabular_prediction/tabular-quickstart.html

In [1]:
# Experiment settings shared by every model fit in this notebook
time_limit = 3600    # training budget in seconds (1 hour); checked against the cell timer
metric = "roc_auc"   # evaluation metric passed to AutoGluon
label = "target"     # name of the column to predict (called "target" in this dataset)

## Import

In [2]:
# Import packages

from autogluon.tabular import TabularDataset
from autogluon.tabular import TabularPredictor

import numpy as np
import pandas as pd

In [3]:
# Read the train and test CSV files as AutoGluon tabular datasets
train_data, test_data = (
    TabularDataset(csv_path) for csv_path in ("porto_train.csv", "porto_test.csv")
)

In [4]:
# (rows, columns) of the training data as loaded, before any columns are dropped
train_data.shape

(476170, 60)

In [5]:
# (rows, columns) of the test data as loaded, before any columns are dropped
test_data.shape

(119042, 59)

## Tidy

In [6]:
# Treat the -1 sentinel as a missing value (NaN) in both data sets
train_data, test_data = (
    frame.replace(-1, np.nan) for frame in (train_data, test_data)
)

## Transform

In [7]:
# Remove the columns that are not used as predictors
train_data = train_data.drop(columns=["id", "fold"])
test_data = test_data.drop(columns=["id"])

In [8]:
# Collect the categorical variables: every training column whose name contains 'cat'
# (the conversion to "category" dtype happens in the next cell)
cat_vars = list(filter(lambda name: 'cat' in name, train_data.columns))
cat_vars

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [9]:
# Cast the categorical predictors to pandas "category" dtype.
# The label is filtered out first so this cell is safe to re-run: the original
# version appended "target" to `cat_vars` on every run, which made a second run
# cast the test label as well.
predictor_cat_vars = [col for col in cat_vars if col != label]

# Test set: predictors only (the test label keeps its original dtype).
test_data = test_data.astype({col: "category" for col in predictor_cat_vars})

# Training set: predictors plus the label. Use the `label` setting instead of a
# hard-coded column name so the two cannot drift apart.
cat_vars = predictor_cat_vars + [label]
train_data = train_data.astype({col: "category" for col in cat_vars})

In [10]:
# Keep the true test labels aside so predictions can be scored against them
y_test = test_data.loc[:, label]

## Visualize

(skipped)

## Model 1

In [11]:
%timeit

predictor = TabularPredictor(
    label=label, 
    eval_metric=metric
).fit(
    train_data, 
    time_limit=time_limit
)

No path specified. Models will be saved in: "AutogluonModels/ag-20230709_041912/"
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20230709_041912/"
AutoGluon Version:  0.7.0
Python Version:     3.9.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu May 4 15:21:22 UTC 2023
Train Data Rows:    476170
Train Data Columns: 57
Label Column: target
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    54314.7 MB
	Tra

In [12]:
# Predict class probabilities for the test data (scoring happens in the next cell)
y_pred = predictor.predict_proba(data=test_data)

In [13]:
# Score the predicted probabilities against the true test labels,
# reporting the auxiliary metrics alongside the main evaluation metric
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

  _warn_prf(average, modifier, msg_start, len(result))
Evaluation: roc_auc on test data: 0.6318303042376591
Evaluations on test data:
{
    "roc_auc": 0.6318303042376591,
    "accuracy": 0.9635842811780716,
    "balanced_accuracy": 0.5,
    "mcc": 0.0,
    "f1": 0.0,
    "precision": 0.0,
    "recall": 0.0
}


In [19]:
# Evaluate the performance of each individual trained model on test data:
# NOTE(review): this cell's execution count (19) is later than the Model 2 cells, and the
# saved table lists the same models, scores and fit times as the Model 2 leaderboard, so it
# appears to have been run after `predictor` was rebound by the Model 2 fit. Re-run it
# before the Model 2 fit to get Model 1's leaderboard.
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.637412,0.643819,41.09484,107.64513,3184.220309,0.016129,0.088491,109.933267,2,True,14
1,CatBoost_BAG_L1,0.636831,0.64117,1.771478,1.641047,526.517942,1.771478,1.641047,526.517942,1,True,7
2,XGBoost_BAG_L1,0.635973,0.63835,7.723609,4.710784,21.222854,7.723609,4.710784,21.222854,1,True,11
3,NeuralNetFastAI_BAG_L1,0.633692,0.6337,11.07057,8.57473,1404.43992,11.07057,8.57473,1404.43992,1,True,10
4,LightGBMXT_BAG_L1,0.62595,0.632746,1.868313,1.751455,13.131827,1.868313,1.751455,13.131827,1,True,3
5,LightGBMLarge_BAG_L1,0.62545,0.63107,1.680449,1.98663,21.514956,1.680449,1.98663,21.514956,1,True,13
6,LightGBM_BAG_L1,0.624337,0.631292,1.626274,1.626623,12.25913,1.626274,1.626623,12.25913,1,True,4
7,NeuralNetTorch_BAG_L1,0.623265,0.624779,10.279494,8.108717,1017.867617,10.279494,8.108717,1017.867617,1,True,12
8,RandomForestEntr_BAG_L1,0.612346,0.603324,1.565257,27.163695,22.053309,1.565257,27.163695,22.053309,1,True,6
9,ExtraTreesGini_BAG_L1,0.611302,0.601947,1.644346,24.210879,13.278623,1.644346,24.210879,13.278623,1,True,8


## Model 2

In [15]:
%timeit

predictor = TabularPredictor(label, eval_metric=metric).fit(train_data, time_limit=time_limit, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels/ag-20230709_043303/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20230709_043303/"
AutoGluon Version:  0.7.0
Python Version:     3.9.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu May 4 15:21:22 UTC 2023
Train Data Rows:    476170
Train Data Columns: 57
Label Column: target
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Gener

In [16]:
# Predict class probabilities for the test data with the Model 2 predictor
# (scoring happens in the next cell)
y_pred = predictor.predict_proba(data=test_data)

In [17]:
# Score the Model 2 probabilities against the true test labels,
# reporting the auxiliary metrics alongside the main evaluation metric
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

  _warn_prf(average, modifier, msg_start, len(result))
Evaluation: roc_auc on test data: 0.6374115862058618
Evaluations on test data:
{
    "roc_auc": 0.6374115862058618,
    "accuracy": 0.9635842811780716,
    "balanced_accuracy": 0.5,
    "mcc": 0.0,
    "f1": 0.0,
    "precision": 0.0,
    "recall": 0.0
}


In [18]:
# Per-model leaderboard for the Model 2 predictor, scored on the test data
predictor.leaderboard(data=test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.637412,0.643819,41.714066,107.64513,3184.220309,0.017151,0.088491,109.933267,2,True,14
1,CatBoost_BAG_L1,0.636831,0.64117,1.440531,1.641047,526.517942,1.440531,1.641047,526.517942,1,True,7
2,XGBoost_BAG_L1,0.635973,0.63835,7.920245,4.710784,21.222854,7.920245,4.710784,21.222854,1,True,11
3,NeuralNetFastAI_BAG_L1,0.633692,0.6337,10.987027,8.57473,1404.43992,10.987027,8.57473,1404.43992,1,True,10
4,LightGBMXT_BAG_L1,0.62595,0.632746,1.557794,1.751455,13.131827,1.557794,1.751455,13.131827,1,True,3
5,LightGBMLarge_BAG_L1,0.62545,0.63107,1.914848,1.98663,21.514956,1.914848,1.98663,21.514956,1,True,13
6,LightGBM_BAG_L1,0.624337,0.631292,1.503453,1.626623,12.25913,1.503453,1.626623,12.25913,1,True,4
7,NeuralNetTorch_BAG_L1,0.623265,0.624779,10.898428,8.108717,1017.867617,10.898428,8.108717,1017.867617,1,True,12
8,RandomForestEntr_BAG_L1,0.612346,0.603324,1.766314,27.163695,22.053309,1.766314,27.163695,22.053309,1,True,6
9,ExtraTreesGini_BAG_L1,0.611302,0.601947,2.383173,24.210879,13.278623,2.383173,24.210879,13.278623,1,True,8


## Communicate

(skipped)