In [1]:
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
from sklearn.model_selection import train_test_split

In [2]:
# Location of the raw training table, relative to this notebook.
TRAIN_DATA_PATH = '../01-machine-learning-recap/data/train_data.csv'
train_data = TabularDataset(TRAIN_DATA_PATH)

In [3]:
# Preview the first five rows to check the columns and their raw values.
train_data.head(n=5)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [4]:
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so the scores reported further down can be compared across
# kernel restarts instead of changing with every run.
train, test = train_test_split(
    train_data,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [5]:
# Notebook-wide settings used by the predictor cells below.
LABEL = "Claim"                         # Target column to predict
EVAL_METRIC = "f1"                      # Metric AutoGluon scores models with
SAVE_PATH = "AutoGluonModels_improved"  # Trained models will be saved here

## Improving your AutoGluon Model

### 1. Via Feature Engineering

In [6]:
def add_building_features(df):
    """Return a copy of ``df`` with the engineered building features.

    Adds ``age_of_building`` (observation year minus year of occupancy) and
    recasts ``YearOfObservation`` as a categorical column.

    The input frame is not modified: ``assign`` builds a new frame, which
    avoids writing into the slices returned by ``train_test_split``. The year
    is cast to float before subtracting so the cell can be re-run safely even
    after ``YearOfObservation`` has already been converted to a category.
    """
    observation_year = df["YearOfObservation"].astype("float")
    return df.assign(
        age_of_building=observation_year - df["Date_of_Occupancy"],
        YearOfObservation=df["YearOfObservation"].astype("category"),
    )


# Apply the identical transformation to both splits so they stay in sync.
train = add_building_features(train)
test = add_building_features(test)

### 2. Use the Right Preset & `refit_full`

In [7]:
# Configure the predictor: target column, model output directory, and the
# metric used to score models.
predictor = TabularPredictor(
    label=LABEL,
    path=SAVE_PATH,
    eval_metric=EVAL_METRIC,
)

In [11]:
# Train with the "best_quality" preset under a 100-second budget, then refit
# the best model on the combined train and validation data (refit_full).
fit_options = dict(
    presets=["best_quality"],
    time_limit=100,
    refit_full="best",
)
predictor = predictor.fit(train, **fit_options)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 100s
AutoGluon will save models to "AutoGluonModels_improved/"
AutoGluon Version:  0.8.2
Python Version:     3.9.7
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.1.0: Sun Oct  9 20:15:09 PDT 2022; root:xnu-8792.41.9~2/RELEASE_ARM64_T6000
Disk Space Avail:   189.54 GB / 494.38 GB (38.3%)
Train Data Rows:    5012
Train Data Columns: 14
Label Column: Claim
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0

	0.39s	 = Training   runtime
	0.19s	 = Validation runtime
Completed 1/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 99.7s of the -0.27s of remaining time.
	0.3489	 = Validation score   (f1)
	0.92s	 = Training   runtime
	0.01s	 = Validation runtime
AutoGluon training complete, total runtime = 101.2s ... Best model: "WeightedEnsemble_L2"
Automatically performing refit_full as a post-fit operation (due to `.fit(..., refit_full=True)`
Refitting models via `predictor.refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix "_FULL" and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `predictor.fit` call.
	To learn more, refer to the `.refit_full` method docstring which explains how "_FULL" models differ from normal models.
Fitting 1 L1 models ...
Fitting model: LightGBMXT_BAG_L1_FULL ...
	5.23s	 = Training   runtime
Fitt

In [12]:
# Validation leaderboard with the extended per-model details; show the top rows.
leaderboard_val = predictor.leaderboard(extra_info=True, silent=True)
leaderboard_val.head()

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,num_features,...,hyperparameters,hyperparameters_fit,ag_args_fit,features,compile_time,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
0,LightGBMXT_BAG_L1,0.348921,0.086342,84.461349,0.086342,84.461349,1,True,3,12,...,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Geo_Code, Insured_Period, Building_Type, age_...",,"{'learning_rate': 0.05, 'extra_trees': True}",{'num_boost_round': 325},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],[WeightedEnsemble_L2]
1,WeightedEnsemble_L2,0.348921,0.094679,85.37702,0.008337,0.91567,2,True,6,1,...,"{'use_orig_features': False, 'max_base_models'...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L1],,{'ensemble_size': 100},{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L1],[]
2,LightGBM_BAG_L1,0.315315,0.050487,10.859208,0.050487,10.859208,1,True,4,12,...,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Geo_Code, Insured_Period, Building_Type, age_...",,{'learning_rate': 0.05},{'num_boost_round': 52},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],[]
3,RandomForestGini_BAG_L1,0.311321,0.186257,0.388448,0.186257,0.388448,1,True,5,12,...,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Geo_Code, Insured_Period, Building_Type, age_...",,"{'n_estimators': 300, 'max_leaf_nodes': 15000,...",{'n_estimators': 300},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],[]
4,KNeighborsDist_BAG_L1,0.298897,0.010243,0.16523,0.010243,0.16523,1,True,2,5,...,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Insured_Period, age_of_building, Building_Typ...",,{'weights': 'distance'},{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],[]


### 3. Via Custom Metrics

In [13]:
# Cost attached to each outcome of the claim prediction.
tp_cost = 1_000   # Admin fees spent to investigate a potential insurance claim
fp_cost = 1_000   # Admin fees spent to investigate a potential insurance claim
fn_cost = 10_000  # Average insurance claim made
tn_cost = 0       # No cost is assigned to this outcome

In [14]:
def cost_fn(y_true, y_prob, threshold=0.7):
    """Total operating cost of acting on predicted claim probabilities.

    A row is treated as flagged when its probability is at least
    ``threshold``. Each row then contributes the module-level cost of its
    outcome (``tp_cost``, ``fp_cost``, ``tn_cost`` or ``fn_cost``), and the
    contributions are summed.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; only the values 0 and 1 contribute to the cost.
    y_prob : array-like
        Predicted probabilities, compared against ``threshold``.
    threshold : float, default 0.7
        Probability at or above which a row counts as flagged.

    Returns
    -------
    numpy scalar
        Sum of the per-row costs.
    """
    # Compare by position: without this, two pandas objects with different
    # indexes would be aligned on their labels instead of row order.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    flagged = y_prob >= threshold
    not_flagged = y_prob < threshold
    is_claim = y_true == 1
    is_no_claim = y_true == 0

    outcome_costs = (
        (flagged & is_claim, tp_cost),         # true positive
        (flagged & is_no_claim, fp_cost),      # false positive
        (not_flagged & is_no_claim, tn_cost),  # true negative
        (not_flagged & is_claim, fn_cost),     # false negative
    )
    return np.sum([np.where(mask, cost, 0) for mask, cost in outcome_costs])

In [17]:
# Wrap cost_fn as an AutoGluon scorer.
# - needs_proba=True: the scorer is given predicted probabilities, which
#   cost_fn thresholds itself.
# - greater_is_better=False: a lower cost is better, so AutoGluon reports the
#   negated cost (hence the negative values in the leaderboard).
# - optimum=0: the best possible operating cost is zero. make_scorer defaults
#   to optimum=1, which is wrong for a cost metric and skews the scorer's
#   error computation.
op_scorer = make_scorer(
    name="operating_cost",
    score_func=cost_fn,
    optimum=0,
    greater_is_better=False,
    needs_proba=True,
)

In [18]:
# Score every model on the held-out split and report the custom operating-cost
# metric alongside the default columns; show the top rows.
leaderboard_test = predictor.leaderboard(
    test,
    extra_metrics=[op_scorer],
    silent=True,
)
leaderboard_test.head()

Unnamed: 0,model,score_test,operating_cost,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestGini_BAG_L1,0.353261,-4517000,0.311321,0.095755,0.186257,0.388448,0.095755,0.186257,0.388448,1,True,5
1,LightGBMXT_BAG_L1,0.337143,-4534000,0.348921,0.136519,0.086342,84.461349,0.136519,0.086342,84.461349,1,True,3
2,WeightedEnsemble_L2,0.337143,-4534000,0.348921,0.138251,0.094679,85.37702,0.001732,0.008337,0.91567,2,True,6
3,LightGBMXT_BAG_L1_FULL,0.332386,-4514000,,0.018675,,5.234116,0.018675,,5.234116,1,True,7
4,WeightedEnsemble_L2_FULL,0.332386,-4514000,,0.020068,,6.149786,0.001393,,0.91567,2,True,8
