In [1]:
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw training CSV into an AutoGluon TabularDataset.
train_data = TabularDataset(
    '../01-machine-learning-recap/data/train_data.csv'
)

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
train_data.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [4]:
# Hold out 30% of the rows for testing. The split is shuffled, so the seed is
# pinned: without it every "Restart & Run All" yields a different partition
# and the scores reported further down are not reproducible.
train, test = train_test_split(
    train_data,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [5]:
# Settings handed to TabularPredictor further down.
LABEL = "Claim"                          # Name of the target column
SAVE_PATH = "AutoGluonModels_improved"   # Trained models will be saved here
EVAL_METRIC = "f1"                       # Metric used to evaluate the models

## Improving your AutoGluon Model

### 1. Via Feature Engineering

In [6]:
def add_building_features(df):
    """Return a new frame with the engineered building features added.

    Two changes are made relative to ``df``:

    * ``age_of_building`` is added as
      ``YearOfObservation - Date_of_Occupancy``.
    * ``YearOfObservation`` is cast to the ``category`` dtype.

    Both values are computed from the incoming frame before ``assign`` is
    called, so the age is always derived from the numeric year. ``assign``
    returns a new object, which avoids writing columns into the frames
    returned by ``train_test_split`` (a common source of
    ``SettingWithCopyWarning``).

    Parameters
    ----------
    df : DataFrame
        Must contain the ``YearOfObservation`` and ``Date_of_Occupancy``
        columns, with ``YearOfObservation`` still numeric.

    Returns
    -------
    DataFrame
        A new frame; ``df`` itself is left unmodified.
    """
    return df.assign(
        age_of_building=df["YearOfObservation"] - df["Date_of_Occupancy"],
        YearOfObservation=df["YearOfObservation"].astype("category"),
    )


# Apply the identical transformation to both splits so they cannot drift apart.
train = add_building_features(train)
test = add_building_features(test)

### 2. Use the Right Preset & `refit_full`

In [7]:
# Build the predictor from the configuration constants defined earlier.
predictor = TabularPredictor(
    label=LABEL,
    eval_metric=EVAL_METRIC,
    path=SAVE_PATH,
)

In [9]:
# Display the top rows of the leaderboard, including the extra per-model info.
(
    predictor
    .leaderboard(silent=True, extra_info=True)
    .head()
)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,num_features,...,child_model_type,hyperparameters,hyperparameters_fit,ag_args_fit,features,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
0,LightGBMXT_BAG_L2,0.39143,0.698611,92.147948,0.07278,26.811869,2,True,10,21,...,LGBModel,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Residential, Insured_Period, Garden, NumberOf...","{'learning_rate': 0.05, 'extra_trees': True}",{'num_boost_round': 503},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[RandomForestEntr_BAG_L1, CatBoost_BAG_L1, Ran...",[WeightedEnsemble_L3]
1,WeightedEnsemble_L3,0.39143,0.704338,92.686797,0.005727,0.538849,3,True,13,1,...,GreedyWeightedEnsembleModel,"{'use_orig_features': False, 'max_base_models'...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBMXT_BAG_L2],{'ensemble_size': 100},{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[RandomForestEntr_BAG_L1, CatBoost_BAG_L1, Ran...",[]
2,LightGBM_BAG_L1,0.368,0.047821,23.201585,0.047821,23.201585,1,True,4,13,...,LGBModel,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Residential, Date_of_Occupancy, Building_Type...",{'learning_rate': 0.05},{'num_boost_round': 286},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],"[WeightedEnsemble_L3, RandomForestGini_BAG_L2,..."
3,WeightedEnsemble_L2,0.368,0.053262,24.874285,0.005441,1.6727,2,True,9,1,...,GreedyWeightedEnsembleModel,"{'use_orig_features': False, 'max_base_models'...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBM_BAG_L1],{'ensemble_size': 100},{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBM_BAG_L1],[]
4,LightGBMXT_BAG_L1,0.351415,0.064293,29.157067,0.064293,29.157067,1,True,3,13,...,LGBModel,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Residential, Date_of_Occupancy, Building_Type...","{'learning_rate': 0.05, 'extra_trees': True}",{'num_boost_round': 372},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],"[LightGBMXT_BAG_L2, LightGBM_BAG_L2, WeightedE..."


### 3. Via Custom Metrics

In [12]:
# Cost attached to each confusion-matrix outcome.
tp_cost = 1_000   # Admin fees spent to investigate potential insurance claim
fp_cost = 1_000   # Admin fees spent to investigate potential insurance claim
tn_cost = 0
fn_cost = 10_000  # Average insurance claim made

In [11]:
def cost_fn(y_true, y_prob, threshold=0.7, costs=None):
    """Total cost of the predictions under a per-outcome cost table.

    A row is treated as a predicted claim when ``y_prob >= threshold``. Each
    row is then classified as TP / FP / TN / FN against ``y_true`` and charged
    the matching cost; the function returns the sum over all rows. Rows whose
    label is neither 0 nor 1 contribute nothing.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, compared against 0 and 1.
    y_prob : array-like
        Predicted scores, same shape as ``y_true``.
    threshold : float, default 0.7
        Decision threshold applied to ``y_prob``.
    costs : dict, optional
        Mapping with the keys ``"tp"``, ``"fp"``, ``"tn"`` and ``"fn"``.
        When omitted, the notebook-level ``tp_cost``, ``fp_cost``, ``tn_cost``
        and ``fn_cost`` values are looked up at call time.

    Returns
    -------
    numpy integer (or float if the costs are floats)
        The summed cost. Lower is better, so a scorer wrapping this function
        should be configured accordingly.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` do not have the same shape.
    """
    if costs is None:
        costs = {"tp": tp_cost, "fp": fp_cost, "tn": tn_cost, "fn": fn_cost}

    # Work on plain arrays so the comparison is positional; two pandas Series
    # with different indexes would otherwise be aligned by label.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    if y_true.shape != y_prob.shape:
        # Without this check (n,) vs (n, 1) would broadcast to (n, n) and
        # silently produce a meaningless total.
        raise ValueError(
            f"y_true and y_prob must have the same shape, "
            f"got {y_true.shape} and {y_prob.shape}"
        )

    flagged = y_prob >= threshold
    is_claim = y_true == 1
    is_no_claim = y_true == 0

    tp = np.where(flagged & is_claim, costs["tp"], 0)
    fp = np.where(flagged & is_no_claim, costs["fp"], 0)
    tn = np.where(~flagged & is_no_claim, costs["tn"], 0)
    fn = np.where(~flagged & is_claim, costs["fn"], 0)
    return np.sum([tp, fp, tn, fn])