### Optimize ACH with AutoGluon
---
- See example [here](https://github.com/aws/amazon-sagemaker-examples/tree/master/advanced_functionality/autogluon-tabular).
- [Quick Start](https://autogluon.mxnet.io/tutorials/tabular_prediction/tabular-quickstart.html)
- [In-Depth](https://autogluon.mxnet.io/tutorials/tabular_prediction/tabular-indepth.html#model-distillation)
- [Tabular-fit api](https://autogluon.mxnet.io/api/autogluon.task.html#autogluon.task.TabularPrediction.fit)

In [None]:
# !pip uninstall rdsutils --yes
# !pip install -i https://repository.sofi.com/artifactory/api/pypi/pypi/simple rdsutils --no-cache-dir

#### Installation

In [None]:
# !python3 -m pip install --upgrade pip
# !python3 -m pip install --upgrade "mxnet<2.0.0"
# !python3 -m pip install autogluon
# !pip install bokeh==2.0.1

#### Import modules

In [None]:
import os, sys
import pickle as pkl
import pandas as pd
import autogluon as ag
from autogluon import TabularPrediction as task

from rdsutils import datagen
from rdsutils import plot

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings(action='ignore')

#### Load and Clean data

In [None]:
# Load the modeling frame and expose three existing columns under alternate
# names; `ach_target` is the label column used for splitting and training below.
modeling_df = pd.read_parquet(
    '../../artifacts/20201005/modeling_df_w_baseline_preds.parquet'
)
column_aliases = {
    'account_ending_balance': 'real_ending_balance',
    'days_since_first_transaction': 'days_since_first_deposit',
    'ach_target': 'is_returned',
}
for alias_name, source_name in column_aliases.items():
    modeling_df[alias_name] = modeling_df[source_name]

In [None]:
# get boruta features
# Open the artifact in a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle executes arbitrary code on load -- this assumes the
# artifact is a trusted, project-generated file.
with open('../../artifacts/20201005/boruta_features.pkl', 'rb') as boruta_file:
    boruta_features = pkl.load(boruta_file)

#### Train Test Split

In [None]:
# Build a 5-fold generator over the full modeling frame and take only its
# first fold as the (train, test) split.
# NOTE(review): presumably folds are stratified on `ach_target` and grouped so
# that a business account never lands on both sides -- confirm in rdsutils.
modeling_dfs = datagen.GroupKFoldGenerator(modeling_df, 5, 
                              strategize_by='ach_target', 
                              groupby='business_account_number')
train, test = next(modeling_dfs)

In [None]:
# Split the training portion again with a 4-fold generator and take its first
# fold as (train, valid); `train` is rebound to the smaller training subset.
# Re-running this cell alone shrinks `train` further, so re-run the previous
# split cell first.
modeling_dfs_ = datagen.GroupKFoldGenerator(train, 4, 
                              strategize_by='ach_target', 
                              groupby='business_account_number')
train, valid = next(modeling_dfs_)

In [None]:
# Sanity check: show the (rows, columns) of each split next to the full frame.
train.shape, valid.shape, test.shape, modeling_df.shape

#### Set AutoGluon Datasets

In [None]:
# Column roles used by the rest of the notebook.
target_col = 'ach_target'
features = boruta_features
id_col = 'business_account_number'

# Positive-class weight derived from the training labels; passed to the
# boosters as `scale_pos_weight` in the training cell.
pos_wgt_scaling_factor = datagen.get_positive_label_weight(train[target_col])

# Every split keeps the same columns: features, then label, then account id.
# NOTE(review): the id column is part of the data handed to `task.fit`, so it
# may be treated as a feature -- confirm this is intended.
dataset_columns = features + [target_col] + [id_col]
train_data = task.Dataset(df=train[dataset_columns])
valid_data = task.Dataset(df=valid[dataset_columns])
test_data = task.Dataset(df=test[dataset_columns])

In [None]:
# Preview the first rows of the training dataset.
train_data.head()

#### Train Models

`fit` documentation: [here](https://autogluon.mxnet.io/api/autogluon.task.html#autogluon.task.TabularPrediction.fit)

In [None]:
# Training configuration: evaluation metric, output directory, time budget and
# the hyperparameter search spaces for the two model families tuned below.
metric = 'f1'   # 'roc_auc', 'f1', 'average_precision'
path = '../../artifacts/autogluon-ach'
os.makedirs(path, exist_ok=True)


hp_tune = True
time_limits = 10*60  # 10 minutes (value is 600; AutoGluon documents time_limits in seconds)
cat_options = {  # specifies non-default hyperparameter values for CatBoost (passed under the 'CAT' key below)
    'l2_leaf_reg': ag.space.Real(lower=0, upper=20, default=5),
    'min_data_in_leaf': ag.space.Int(lower=10, upper=50, default=30),
    'depth' : ag.space.Int(lower=2, upper=5, default=3),
    'learning_rate': ag.space.Real(1e-3, 1e-1, default=1e-2),
    'bagging_temperature' : ag.space.Real(0, 100, default=0.5),
    # fixed value (not a search space), computed from the training labels
    'scale_pos_weight': pos_wgt_scaling_factor,
}

gbm_options = {  # specifies non-default hyperparameter values for lightGBM gradient boosted trees
    'num_leaves' : ag.space.Int(lower=5, upper=20, default=10),
    'lambda_l1': ag.space.Real(lower=0, upper=20, default=5),
    'lambda_l2': ag.space.Real(lower=0, upper=20, default=5),
    'min_data_in_leaf': ag.space.Int(lower=10, upper=50, default=30),
    'max_depth' : ag.space.Int(lower=2, upper=5, default=3),
    'num_boost_round': ag.space.Int(lower=100, upper=2000, default=500),
    'learning_rate': ag.space.Real(1e-3, 1e-1, default=1e-2),
    'feature_fraction' : ag.space.Real(0.1, 0.8, default=0.5),
    # fixed value (not a search space), computed from the training labels
    'scale_pos_weight': pos_wgt_scaling_factor,
}

# Fit on the training split and use the validation split as tuning data.
# Only the 'CAT' and 'GBM' entries are supplied as hyperparameters.
predictor = task.fit(train_data=train_data, 
                     tuning_data=valid_data,
                     label=target_col, 
                     output_directory=path, 
                     eval_metric=metric,
                     time_limits=time_limits,
                     hyperparameter_tune=hp_tune,
                     hyperparameters={'CAT':cat_options,
                                      'GBM':gbm_options},
                     search_strategy='skopt') 

In [None]:
# Separate the labels from the test features.
# The cell stays safe to re-run: once the label column has been dropped the
# lookup raises KeyError, and the `y_true` from the first run is kept.
# Only KeyError is tolerated; any other failure now surfaces instead of being
# silently swallowed.
try:
    y_true = test_data[target_col]
except KeyError:
    pass  # label column already removed by an earlier run of this cell
else:
    test_data = test_data.drop(labels=[target_col], axis=1)
test_data.head()

In [None]:
# Reload the fitted predictor from disk, predict labels for the test features
# and score them against the held-out labels.
predictor = task.load(path)
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_true=y_true,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

In [None]:
# Display the result returned by `evaluate_predictions`.
perf

In [None]:
# Score the test set with predicted probabilities and store them on `test`.
# NOTE(review): with model=None the predictor chooses the model itself
# (presumably its best one) -- confirm against the predict_proba docs.
model = None  # 'LightGBMClassifier', 'CatboostClassifier', 'weighted_ensemble_k0_l1'
predictor = task.load(path)
y_pred = predictor.predict_proba(test_data, model=model)
# Overwritten by the later cell that predicts with a specific trial.
test['autogluon_pred_1005'] = y_pred

In [None]:
# Print the fit summary; the trailing semicolon suppresses the notebook's
# echo of the return value.
predictor.fit_summary(3);

In [None]:
# Re-score the test set with one specific tuned LightGBM trial.
# NOTE(review): the trial name is hard-coded and depends on the particular
# tuning run -- update it after re-training.
model = 'LightGBMClassifier/trial_18'  # 'LightGBMClassifier', 'CatboostClassifier', 'weighted_ensemble_k0_l1'
predictor = task.load(path)
y_pred = predictor.predict_proba(test_data, model=model)
# Replaces the probabilities stored under the same column by the earlier cell.
test['autogluon_pred_1005'] = y_pred

In [None]:
# Leaderboard of the trained models evaluated on `test`, with extra info
# columns; reused below to look up hyperparameters.
lb = predictor.leaderboard(test, silent=True, extra_info=True)

In [None]:
# Compare the baseline scores with the AutoGluon score on one
# precision-recall plot. Each entry of `preds` is (score series, legend label).
pred_columns = ['deposit_v1_pred', 'boruta_pred_1005', 'autogluon_pred_1005']
preds = [(test[column], column) for column in pred_columns]
title = 'Precision-Recall curve: Baseline Comparison'
plot.plot_pr_curve_mult(
    test[target_col],
    preds,
    title=title,
    colors=['r', 'g', 'b'],
)

In [None]:
# Same comparison on the AUC plot. A dedicated title is used because the
# shared `title` variable reads "Precision-Recall curve", which would
# mislabel this figure.
auc_title = 'ROC-AUC curve: Baseline Comparison'
plot.plot_auc_curve_mult(test[target_col], preds,
                   title=auc_title, colors = ['r', 'g', 'b'])

In [None]:
def get_best_hyperparams(predictor, model_type, leader_board=None):
    """
    Get the hyperparams of the best model of <model_type> from AutoGluon predictor

    Walks the leaderboard from the first row down and returns the first row
    whose 'model' name contains <model_type>. "Best" therefore assumes the
    leaderboard is ordered best-first.

    @param predictor: fitted predictor; only used to build the leaderboard
                      when <leader_board> is not supplied
    @param model_type: substring to look for in the 'model' column
    @param leader_board: optional leaderboard DataFrame with 'model' and
                         'hyperparameters' columns
    @returns model rank, model_params -- the matching row's index label and
             its 'hyperparameters' value, or (nan, "No such model found")
    """
    import numpy as np

    if leader_board is None:
        leader_board = predictor.leaderboard(extra_info=True, silent=True)

    # Iterate the leaderboard selected above, not the notebook-global `lb`.
    for rank, row in leader_board.iterrows():
        if model_type in row['model']:
            return (rank, row['hyperparameters'])
    return (np.nan, "No such model found")

In [None]:
# Look up the first LightGBM entry in the test-set leaderboard `lb` and
# display its hyperparameters.
model_rank, lgbm_params = get_best_hyperparams(predictor, model_type='LightGBM', leader_board=lb)
lgbm_params