In [2]:
import datetime as dt
import json
import os, sys
import pickle as pkl

import autogluon as ag
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

In [6]:
# Load the frame used for hyper-parameter tuning from the artifacts folder.
data = pd.read_parquet(
    "./artifacts/train_df_for_hypertune.parquet",
)

In [7]:
# Column names referenced by the cells below.
target = "target_v1"                          # label column given to TabularPredictor
target_indeterminate = "indeterminate_v1"     # not referenced by the cells visible in this notebook
weight_col = "weight"                         # per-row sample-weight column

In [8]:
seed = 42

# One stratified 75/25 train/validation split of the full frame.
# (An earlier variant first carved out a 20% test set; it is disabled, so no
# `test_df` exists in this notebook.)
train_df, valid_df = train_test_split(
    data,
    train_size=0.75,
    stratify=data[target],
    random_state=seed,
)

In [9]:
# Sanity-check the sizes of the two splits.
(train_df.shape, valid_df.shape)

((1706685, 244), (568895, 244))

In [10]:
# Feature-selection rankings: first CSV column is used as the index.
ranking = pd.read_csv("./artifacts/fsel-ranking.csv", index_col=0)
# Add the row-wise mean of all ranking columns as an aggregate rank.
ranking = ranking.assign(rank_mean=ranking.mean(axis=1).to_numpy())

In [11]:
def get_top_k_features(feature_ranking, rank_col, k):
    """
    Return the index labels of the `k` rows with the smallest `rank_col` value.

    @param feature_ranking  DataFrame whose index holds the feature names
    @param rank_col         name of the column to rank by (ascending, smaller is better)
    @param k                number of labels to keep
    @returns list of index labels, ordered from smallest to largest rank value
    """
    ordered = feature_ranking[rank_col].sort_values()
    best_k = ordered.iloc[:k]
    return best_k.index.tolist()

In [12]:
# NOTE(review): the variable name says "top 100" but only the 32 best-ranked
# features are requested, and the result is not used by the cells below.
top_100_fts = get_top_k_features(feature_ranking=ranking, rank_col="rank_mean", k=32)

# NOTE(review): "candidatte" looks like a typo; the path is left as-is because it
# must match the file name on disk.
with open("./artifacts/candidatte_features.json") as f:
    combined_fts = json.load(f)["combined_features"]

In [13]:
features = combined_fts
id_col = "id"
meta_cols = [target, id_col]

# Each dataset keeps the selected features plus the label and sample-weight columns.
# No test dataset is built because the test split is disabled in this notebook.
train_data = TabularDataset(train_df[[*features, target, weight_col]])
valid_data = TabularDataset(valid_df[[*features, target, weight_col]])

In [53]:
metric = 'roc_auc'   # alternatives: 'f1', 'average_precision'
auto_stack = True
path = './artifacts/autogluon-prescreen'
os.makedirs(path, exist_ok=True)
hp_tune = True
time_limits = 120*60  # seconds: 120 min = 2 hours (passed to fit() as time_limit)

# GBM
# Non-default hyperparameter values / search space for LightGBM gradient boosted trees.
gbm_options = {
    'objective' : 'binary',
    'metric' : 'auc',
    'boosting': 'gbdt',
    # The original dict listed 'tree_learner' twice ('serial', then 'feature');
    # only the last value ever took effect, so it is stated once here.
    'tree_learner': 'feature',
    'boost_from_average': 'false',
    'num_leaves' : ag.core.space.Int(lower=5, upper=100, default=10),
    'lambda_l1': ag.core.space.Real(lower=0, upper=30, default=5),
    'lambda_l2': ag.core.space.Real(lower=0, upper=30, default=5),
    'min_data_in_leaf': ag.core.space.Int(lower=10, upper=200, default=30),
    'max_depth' : ag.core.space.Int(lower=2, upper=7, default=3),
    'num_boost_round': ag.core.space.Int(lower=100, upper=2000, default=1000),
    'learning_rate': ag.core.space.Real(1e-3, 1e-1, default=1e-2),
    'feature_fraction' : ag.core.space.Real(0.1, 0.8, default=0.5),
    'early_stopping_round': 200,
    'seed': seed,
    'seed_value': seed
}

cat_options = {}  # CAT (currently not passed to fit())

# Second search space; it is passed to fit() under the 'XT' key.
# NOTE(review): the variable name and the 'lambda'/'alpha'/'n_estimators' keys
# suggest XGBoost, while the remaining keys mirror the LightGBM space above —
# confirm which model family this is meant for and whether it accepts these keys.
xgb_options = {
    'objective' : 'binary',
    'metric' : 'auc',
    'boosting': 'gbdt',
    # Same duplicate-key cleanup as in gbm_options: 'feature' was the effective value.
    'tree_learner': 'feature',
    'boost_from_average': 'false',
    'num_leaves' : ag.core.space.Int(lower=5, upper=100, default=10),
    'lambda': ag.core.space.Real(lower=0, upper=30, default=5),
    'alpha': ag.core.space.Real(lower=0, upper=30, default=5),
    'min_data_in_leaf': ag.core.space.Int(lower=10, upper=200, default=30),
    'max_depth' : ag.core.space.Int(lower=2, upper=7, default=3),
    'n_estimators': ag.core.space.Int(lower=100, upper=2000, default=1000),
    'learning_rate': ag.core.space.Real(1e-3, 1e-1, default=1e-2),
    'feature_fraction' : ag.core.space.Real(0.1, 0.8, default=0.5),
    'early_stopping_round': 200,
    'seed': seed,
    'seed_value': seed
}  # XT

# try TRANSF later

In [54]:
# Binary classifier scored with `metric`; `weight_col` supplies sample weights.
# Per the fit log, evaluation metrics ignore the weights unless
# weight_evaluation=True is also specified.
tp = TabularPredictor(
    label=target,
    problem_type="binary",
    eval_metric=metric,
    sample_weight=weight_col,
    path=path,
)



In [55]:
# Hyperparameter search over the configured spaces, validated on the held-out split.
# 'CAT' (cat_options) is currently left out of the search.
# NOTE(review): `xgb_options` is registered under the 'XT' key — confirm that is
# the intended model family.
predictor = tp.fit(
    train_data=train_data,
    tuning_data=valid_data,
    time_limit=time_limits,
    hyperparameters={
        'GBM': gbm_options,
        'XT': xgb_options,
    },
    hyperparameter_tune_kwargs='bayesopt',
)

Values in column 'weight' used as sample weights instead of predictive features. Evaluation metrics will ignore sample weights, specify weight_evaluation=True to instead report weighted metrics.
Beginning AutoGluon training ... Time limit = 7200s
AutoGluon will save models to "./artifacts/autogluon-prescreen/"
AutoGluon Version:  0.3.1
Train Data Rows:    1706685
Train Data Columns: 33
Tuning Data Rows:    568895
Tuning Data Columns: 33
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    297281.67 MB
	Train Data (Original)  Memory Usage: 582.55 MB (0.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Sta

Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.553224	train_set's binary_logloss: 0.681828	valid_set's auc: 0.552421	valid_set's binary_logloss: 0.681978
Did not meet early stopping. Best iteration is:
[1000]	train_set's auc: 0.553224	train_set's binary_logloss: 0.681828	valid_set's auc: 0.552421	valid_set's binary_logloss: 0.681978




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[514]	train_set's auc: 0.553517	train_set's binary_logloss: 0.681783	valid_set's auc: 0.551981	valid_set's binary_logloss: 0.682036




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[296]	train_set's auc: 0.552941	train_set's binary_logloss: 0.681871	valid_set's auc: 0.55276	valid_set's binary_logloss: 0.681934




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[243]	train_set's auc: 0.553108	train_set's binary_logloss: 0.681846	valid_set's auc: 0.552236	valid_set's binary_logloss: 0.682003




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[355]	train_set's auc: 0.55311	train_set's binary_logloss: 0.681841	valid_set's auc: 0.552621	valid_set's binary_logloss: 0.681956




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.553048	train_set's binary_logloss: 0.681859	valid_set's auc: 0.552583	valid_set's binary_logloss: 0.681967
Early stopping, best iteration is:
[1106]	train_set's auc: 0.55317	train_set's binary_logloss: 0.681834	valid_set's auc: 0.552588	valid_set's binary_logloss: 0.681961




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[162]	train_set's auc: 0.552719	train_set's binary_logloss: 0.68192	valid_set's auc: 0.551841	valid_set's binary_logloss: 0.682073




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[396]	train_set's auc: 0.553308	train_set's binary_logloss: 0.681795	valid_set's auc: 0.552538	valid_set's binary_logloss: 0.681955




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[743]	train_set's auc: 0.553293	train_set's binary_logloss: 0.681814	valid_set's auc: 0.552554	valid_set's binary_logloss: 0.681955




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.552948	train_set's binary_logloss: 0.681882	valid_set's auc: 0.552618	valid_set's binary_logloss: 0.681969
Early stopping, best iteration is:
[1210]	train_set's auc: 0.55319	train_set's binary_logloss: 0.68183	valid_set's auc: 0.552643	valid_set's binary_logloss: 0.681951




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.551847	train_set's binary_logloss: 0.682361	valid_set's auc: 0.551478	valid_set's binary_logloss: 0.682418
Did not meet early stopping. Best iteration is:
[1957]	train_set's auc: 0.552973	train_set's binary_logloss: 0.681911	valid_set's auc: 0.552025	valid_set's binary_logloss: 0.682061




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.552827	train_set's binary_logloss: 0.681904	valid_set's auc: 0.552679	valid_set's binary_logloss: 0.681965
Early stopping, best iteration is:
[1141]	train_set's auc: 0.552995	train_set's binary_logloss: 0.681869	valid_set's auc: 0.552708	valid_set's binary_logloss: 0.681953




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.553386	train_set's binary_logloss: 0.681789	valid_set's auc: 0.552561	valid_set's binary_logloss: 0.681954
Early stopping, best iteration is:
[807]	train_set's auc: 0.553129	train_set's binary_logloss: 0.681842	valid_set's auc: 0.552614	valid_set's binary_logloss: 0.681958




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.553285	train_set's binary_logloss: 0.681808	valid_set's auc: 0.552661	valid_set's binary_logloss: 0.681943
Early stopping, best iteration is:
[968]	train_set's auc: 0.553252	train_set's binary_logloss: 0.681816	valid_set's auc: 0.552668	valid_set's binary_logloss: 0.681944




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[509]	train_set's auc: 0.553185	train_set's binary_logloss: 0.681824	valid_set's auc: 0.552598	valid_set's binary_logloss: 0.68195




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.553277	train_set's binary_logloss: 0.681812	valid_set's auc: 0.552498	valid_set's binary_logloss: 0.681955
Early stopping, best iteration is:
[1063]	train_set's auc: 0.55336	train_set's binary_logloss: 0.681794	valid_set's auc: 0.552504	valid_set's binary_logloss: 0.68195




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[310]	train_set's auc: 0.55372	train_set's binary_logloss: 0.68179	valid_set's auc: 0.551332	valid_set's binary_logloss: 0.682128




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[292]	train_set's auc: 0.55322	train_set's binary_logloss: 0.681826	valid_set's auc: 0.55252	valid_set's binary_logloss: 0.681959




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.551631	train_set's binary_logloss: 0.682152	valid_set's auc: 0.552029	valid_set's binary_logloss: 0.682123
Did not meet early stopping. Best iteration is:
[1029]	train_set's auc: 0.551672	train_set's binary_logloss: 0.682144	valid_set's auc: 0.552061	valid_set's binary_logloss: 0.682115




Training until validation scores don't improve for 200 rounds




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[608]	train_set's auc: 0.553176	train_set's binary_logloss: 0.681832	valid_set's auc: 0.552634	valid_set's binary_logloss: 0.681954




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[726]	train_set's auc: 0.553095	train_set's binary_logloss: 0.68185	valid_set's auc: 0.552579	valid_set's binary_logloss: 0.681968




Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[602]	train_set's auc: 0.55397	train_set's binary_logloss: 0.681692	valid_set's auc: 0.55196	valid_set's binary_logloss: 0.682012




Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[978]	train_set's auc: 0.55295	train_set's binary_logloss: 0.681886	valid_set's auc: 0.552482	valid_set's binary_logloss: 0.681982




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[199]	train_set's auc: 0.553391	train_set's binary_logloss: 0.681791	valid_set's auc: 0.552374	valid_set's binary_logloss: 0.681973




Training until validation scores don't improve for 200 rounds
[1000]	train_set's auc: 0.553261	train_set's binary_logloss: 0.681814	valid_set's auc: 0.552688	valid_set's binary_logloss: 0.68194
Early stopping, best iteration is:
[995]	train_set's auc: 0.553254	train_set's binary_logloss: 0.681815	valid_set's auc: 0.552691	valid_set's binary_logloss: 0.68194




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[395]	train_set's auc: 0.55303	train_set's binary_logloss: 0.681857	valid_set's auc: 0.552539	valid_set's binary_logloss: 0.681976


	Time limit exceeded
Fitted model: LightGBM/T0 ...
	0.5524	 = Validation score   (roc_auc)
	128.1s	 = Training   runtime
	0.7s	 = Validation runtime
Fitted model: LightGBM/T1 ...
	0.551	 = Validation score   (roc_auc)
	24.68s	 = Training   runtime
	0.27s	 = Validation runtime
Fitted model: LightGBM/T2 ...
	0.552	 = Validation score   (roc_auc)
	96.26s	 = Training   runtime
	0.62s	 = Validation runtime
Fitted model: LightGBM/T3 ...
	0.5486	 = Validation score   (roc_auc)
	28.84s	 = Training   runtime
	0.24s	 = Validation runtime
Fitted model: LightGBM/T4 ...
	0.5505	 = Validation score   (roc_auc)
	30.7s	 = Training   runtime
	0.26s	 = Validation runtime
Fitted model: LightGBM/T5 ...
	0.5509	 = Validation score   (roc_auc)
	33.48s	 = Training   runtime
	0.25s	 = Validation runtime
Fitted model: LightGBM/T6 ...
	0.5528	 = Validation score   (roc_auc)
	61.62s	 = Training   runtime
	0.32s	 = Validation runtime
Fitted model: LightGBM/T7 ...
	0.5522	 = Validation score   (roc_auc)
	59.24s	 =

In [56]:
# Print the fit summary (with plot) and build a leaderboard, with extra model
# info, evaluated on the validation data.
summary = predictor.fit_summary(verbosity=2, show_plot=True);
lb = predictor.leaderboard(data=valid_data, extra_info=True, silent=True)

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.552807       1.045775  300.554168                0.173214          89.710923            2       True         35
1           LightGBM/T6   0.552760       0.321375   61.623524                0.321375          61.623524            1       True          7
2          LightGBM/T17   0.552708       0.596124  165.350447                0.596124         165.350447            1       True         18
3          LightGBM/T32   0.552691       0.551186  149.219721                0.551186         149.219721            1       True         33
4          LightGBM/T19   0.552668       0.532320  145.746557                0.532320         145.746557            1       True         20
5          LightGBM/T15   0.552643       0.602852  177.758395                0.602852         177.

In [57]:
# Persist the leaderboard under a name suffixed with the current Unix time in
# whole seconds, so repeated searches do not overwrite each other.
# Requires `import datetime as dt` in the notebook's import cell.
time_str = str(int(dt.datetime.now().timestamp()))
lb.to_csv(f"./artifacts/autogluon_search_{time_str}.csv")

In [58]:
def get_best_hyperparams(predictor, model_type, leader_board=None):
    """
    Get the hyperparams of the best model of <model_type> from AutoGluon predictor

    The leaderboard is scanned top to bottom and the first row whose model name
    contains `model_type` wins, so the leaderboard is assumed to be ordered
    best-first.

    @param predictor     fitted predictor; only used to build a leaderboard
                         (with extra info) when `leader_board` is None
    @param model_type    substring to look for in the 'model' column, e.g. 'LightGBM'
    @param leader_board  optional leaderboard DataFrame with 'model' and
                         'hyperparameters' columns
    @returns (model rank, model name, model_params); when nothing matches:
             (nan, "No such model found", None), so callers can always unpack
             three values
    """
    import numpy as np

    if leader_board is None:
        leader_board = predictor.leaderboard(extra_info=True, silent=True)

    # Iterate the leaderboard that was passed in / just built, not a notebook global.
    candidates = zip(
        leader_board.index,
        leader_board['model'],
        leader_board['hyperparameters'],
    )
    for rank, name, params in candidates:
        if model_type in name:
            return (rank, name, params)
    return (np.nan, "No such model found", None)

# Look up the best LightGBM trial on the validation leaderboard and show its
# name followed by its hyperparameters.
model_rank, model_name, lgbm_params = get_best_hyperparams(
    predictor,
    model_type='LightGBM',
    leader_board=lb,
)
display(model_name)
lgbm_params

'LightGBM/T6'

{'num_boost_round': 1724,
 'num_threads': 96,
 'learning_rate': 0.09461497491247566,
 'objective': 'binary',
 'verbose': -1,
 'boosting_type': 'gbdt',
 'two_round': True,
 'metric': 'auc',
 'boosting': 'gbdt',
 'tree_learner': 'feature',
 'boost_from_average': 'false',
 'num_leaves': 68,
 'lambda_l1': 20.282603527974402,
 'lambda_l2': 1.0736675984836697,
 'min_data_in_leaf': 50,
 'max_depth': 2,
 'feature_fraction': 0.15988559572182998,
 'early_stopping_round': 200,
 'seed': 42,
 'seed_value': 42}