# AutoML: Build Production-Ready Models Quickly!

## Classification with Porto Seguro - Part 2

https://www.linkedin.com/learning/automl-build-production-ready-models-quickly/

In [1]:
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
# from sklearn.model_selection import train_test_split

In [2]:
# Load the separate Porto Seguro train and test CSV files from the working directory.
train_data, test_data = (
    TabularDataset(csv_name) for csv_name in ("porto_train.csv", "porto_test.csv")
)

In [4]:
# Preview the first five rows, transposed so that each column is shown as a row.
train_data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
id,9.0,13.0,16.0,17.0,20.0
target,0.0,0.0,0.0,0.0,0.0
ps_ind_01,1.0,5.0,0.0,0.0,2.0
ps_ind_02_cat,1.0,4.0,1.0,2.0,1.0
ps_ind_03,7.0,9.0,2.0,0.0,3.0
ps_ind_04_cat,0.0,1.0,0.0,1.0,1.0
ps_ind_05_cat,0.0,0.0,0.0,0.0,0.0
ps_ind_06_bin,0.0,0.0,1.0,1.0,0.0
ps_ind_07_bin,0.0,0.0,0.0,0.0,1.0
ps_ind_08_bin,1.0,1.0,0.0,0.0,0.0


In [4]:
# train, test = train_test_split(train_data, test_size=0.3, shuffle=True)

In [5]:
# Turn every -1 into NaN in both splits
# (presumably -1 is this dataset's missing-value sentinel -- TODO confirm).
train_data, test_data = (
    frame.replace(to_replace=-1, value=np.nan) for frame in (train_data, test_data)
)

In [6]:
# Drop the columns that should not reach the model: "id" from both splits,
# and "fold", which is only dropped from the training split.
train_data = train_data.drop(columns=["id", "fold"])
test_data = test_data.drop(columns=["id"])

In [7]:
# Collect, in column order, every training column whose name contains 'cat'.
cat_vars = list(filter(lambda name: 'cat' in name, train_data.columns))
cat_vars

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [8]:
# Cast the 'cat' feature columns of the test split to pandas' category dtype.
test_data = test_data.astype({name: 'category' for name in cat_vars})

# From this point on cat_vars also lists the label column.
cat_vars = cat_vars + ["target"]

# Cast the same features, plus the label, in the training split.
train_data = train_data.astype({name: 'category' for name in cat_vars})

In [9]:
# Run configuration shared by the cells below.
LABEL = "target"                      # column to predict (earlier alternative: "Claim")
EVAL_METRIC = "roc_auc"               # earlier alternative: "f1"
SAVE_PATH = "AutoGluonModels_porto2"  # Trained models will be saved here
                                      # (earlier alternative: "AutoGluonModels_improved")

## Improving your AutoGluon Model

### 1. Via Feature Engineering

In [10]:
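# NOTE: example only -- these columns and the `train`/`test` frames are not defined
# in this notebook (they appear to come from a different insurance dataset),
# so no feature engineering is applied here.
# train["age_of_building"] = train["YearOfObservation"] - train["Date_of_Occupancy"]
# test["age_of_building"] = test["YearOfObservation"] - test["Date_of_Occupancy"]
# train["YearOfObservation"] = train["YearOfObservation"].astype("category")
# test["YearOfObservation"] = test["YearOfObservation"].astype("category")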

### 2. Use the Right Preset & refit_full

In [11]:
# Train with the 'best_quality' preset. time_limit was raised from 100 to 3600
# for this run, and refit_full='best' asks AutoGluon to refit the best model
# after training (suffix "_FULL" in the leaderboard).
predictor = TabularPredictor(
    label=LABEL,
    path=SAVE_PATH,
    eval_metric=EVAL_METRIC,
).fit(
    train_data,
    presets=['best_quality'],
    time_limit=3600,
    refit_full='best',
)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutoGluonModels_porto2/"
AutoGluon Version:  0.7.0
Python Version:     3.9.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu May 4 15:21:22 UTC 2023
Train Data Rows:    476170
Train Data Columns: 57
Label Column: target
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory: 

	Fitting 8 child models (S2F1 - S2F8) | Fitting with ParallelLocalFoldFittingStrategy
	0.633	 = Validation score   (roc_auc)
	24.51s	 = Training   runtime
	3.1s	 = Validation runtime
Completed 2/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 1086.94s of remaining time.
	0.6444	 = Validation score   (roc_auc)
	107.38s	 = Training   runtime
	0.09s	 = Validation runtime
AutoGluon training complete, total runtime = 2621.23s ... Best model: "WeightedEnsemble_L2"
Automatically performing refit_full as a post-fit operation (due to `.fit(..., refit_full=True)`
Refitting models via `predictor.refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix "_FULL" and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `predictor.fit` call.
	To learn more, refer to the `.refit_full` method docstring which explains how "_FU

In [12]:
# Leaderboard with extended per-model info; first five rows, transposed for readability.
predictor.leaderboard(silent=True, extra_info=True).head(5).transpose()

Unnamed: 0,0,1,2,3,4
model,WeightedEnsemble_L2,CatBoost_BAG_L1,XGBoost_BAG_L1,NeuralNetFastAI_BAG_L1,LightGBMXT_BAG_L1
score_val,0.64438,0.641441,0.640715,0.638533,0.63473
pred_time_val,92.404387,2.336168,7.818072,10.105675,2.945169
fit_time,2252.750872,614.037724,24.023688,810.439626,18.635457
pred_time_val_marginal,0.087173,2.336168,7.818072,10.105675,2.945169
fit_time_marginal,107.377994,614.037724,24.023688,810.439626,18.635457
stack_level,2,1,1,1,1
can_infer,True,True,True,True,True
fit_order,14,7,11,10,3
num_features,7,57,57,57,57


### 3. Via Custom Metrics

In [13]:
# Cost charged for each confusion-matrix outcome when scoring predictions.
tp_cost = 1_000   # Admin fees spent to investigate potential insurance claim
fp_cost = 1_000   # Admin fees spent to investigate potential insurance claim
fn_cost = 10_000  # Average insurance claim made
tn_cost = 0

In [14]:
def cost_fn(y_true, y_prob, threshold=0.7):
    """Return the total operating cost of acting on predicted probabilities.

    A case is treated as flagged when its predicted probability is at least
    ``threshold``. Each case then contributes the module-level cost of its
    outcome: ``tp_cost``, ``fp_cost``, ``tn_cost`` or ``fn_cost``.

    Parameters
    ----------
    y_true : array-like
        True labels; only the values 1 and 0 contribute to the cost.
    y_prob : array-like
        Predicted scores compared element-wise against ``threshold``.
    threshold : float, default 0.7
        Cut-off at or above which a case counts as flagged. The default keeps
        the original hard-coded behaviour.

    Returns
    -------
    The sum of the per-case costs (lower is better).
    """
    # NOTE(review): the captured leaderboard reports the same operating_cost for
    # every model, which suggests no prediction reaches 0.7 -- consider passing
    # a lower threshold.
    flagged = y_prob >= threshold
    not_flagged = y_prob < threshold  # kept as its own comparison so NaN scores match neither mask
    is_claim = y_true == 1
    is_no_claim = y_true == 0

    tp = np.where(flagged & is_claim, tp_cost, 0)
    fp = np.where(flagged & is_no_claim, fp_cost, 0)
    tn = np.where(not_flagged & is_no_claim, tn_cost, 0)
    fn = np.where(not_flagged & is_claim, fn_cost, 0)
    return np.sum([tp, fp, tn, fn])

In [15]:
# Wrap cost_fn as an AutoGluon metric. It is a cost, so lower is better and the
# best achievable value is 0; optimum is passed explicitly so the scorer does
# not fall back to its default optimum when computing error.
my_scorer = make_scorer(
    name="operating_cost",
    score_func=cost_fn,
    optimum=0,
    greater_is_better=False,
    needs_proba=True,  # cost_fn thresholds predicted probabilities, not class labels
)

In [17]:
# Score the models on the test split, adding the custom cost metric as an extra
# column; first five rows, transposed for readability.
predictor.leaderboard(test_data, silent=True, extra_metrics=[my_scorer]).head(5).transpose()

Unnamed: 0,0,1,2,3,4
model,WeightedEnsemble_L2,CatBoost_BAG_L1,WeightedEnsemble_L2_FULL,XGBoost_BAG_L1,CatBoost_BAG_L1_FULL
score_test,0.63742,0.63694,0.636556,0.636139,0.636111
operating_cost,-43350000,-43350000,-43350000,-43350000,-43350000
score_val,0.64438,0.641441,,0.640715,
pred_time_test,60.105205,2.578153,6.399577,14.968619,0.176755
pred_time_val,92.404387,2.336168,,7.818072,
fit_time,2252.750872,614.037724,321.823792,24.023688,59.295773
pred_time_test_marginal,0.012863,2.578153,0.014527,14.968619,0.176755
pred_time_val_marginal,0.087173,2.336168,,7.818072,
fit_time_marginal,107.377994,614.037724,107.377994,24.023688,59.295773
