# Model Selection
This notebook uses AutoGluon to select the best model for predicting whether a customer will complete an offer, given that they have viewed it.

In [1]:
pip install autogluon bokeh==2.0.1

Collecting bokeh==2.0.1
  Downloading bokeh-2.0.1.tar.gz (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
  Preparing metadata (setup.py) ... [?25done
Collecting jsonschema<4.22,>=4.18 (from autogluon.multimodal==1.2->autogluon)
  Downloading jsonschema-4.21.1-py3-none-any.whl.metadata (7.8 kB)
Collecting omegaconf<2.3.0,>=2.1.1 (from autogluon.multimodal==1.2->autogluon)
  Downloading omegaconf-2.2.3-py3-none-any.whl.metadata (3.9 kB)
Collecting nltk<3.9,>=3.4.5 (from autogluon.multimodal==1.2->autogluon)
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting nvidia-ml-py3==7.352.0 (from autogluon.multimodal==1.2->autogluon)
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting einops<0.9,>=0.7 (from autogluon.tabular[all]==1.2->autogluon)
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting lightgbm

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

In [3]:
import sagemaker

# Resolve the SageMaker context used by the rest of the notebook:
# the execution role, a session, and that session's region and default bucket.
role = sagemaker.get_execution_role()
session = sagemaker.Session()
region, bucket = session.boto_region_name, session.default_bucket()

print(bucket)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker-us-east-1-361673968127


In [5]:
# Read the pre-processed dataset from its Feather file and display it.
processed_feather_path = './data/processed/processed_dataset.feather'
processed_dataset = pd.read_feather(processed_feather_path)
processed_dataset

Unnamed: 0,age,became_member_on,income,membership_days,gender_F,gender_M,gender_O,person,offer_id,offer_viewed,...,reward,difficulty,duration,email,mobile,social,web,offer_bogo,offer_discount,offer_informational
0,55.0,2017-07-15,112000.0,376,True,False,False,0610b486422d4921ae7d2bf64640c50b,9b98b8c7a33c4b65b9aebfe6a799e6d9,False,...,5.0,5.0,7.0,1.0,1.0,0.0,1.0,True,False,False
1,75.0,2017-05-09,100000.0,443,True,False,False,78afa995795e4d85b5d9ceeca43f5fef,5a8bc65990b245e5a138643cd4eb9837,True,...,0.0,0.0,3.0,1.0,1.0,1.0,0.0,False,False,True
2,75.0,2017-05-09,100000.0,443,True,False,False,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,True,...,5.0,5.0,7.0,1.0,1.0,0.0,1.0,True,False,False
3,75.0,2017-05-09,100000.0,443,True,False,False,78afa995795e4d85b5d9ceeca43f5fef,ae264e3637204a6fb9bb56bc8210ddfd,True,...,10.0,10.0,7.0,1.0,1.0,1.0,0.0,True,False,False
4,75.0,2017-05-09,100000.0,443,True,False,False,78afa995795e4d85b5d9ceeca43f5fef,f19421c1d4aa40978ebb69ca19b0e20d,True,...,5.0,5.0,5.0,1.0,1.0,1.0,1.0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47011,83.0,2016-03-07,50000.0,871,True,False,False,9dc1421481194dcd9400aec7c9ae6366,4d5c57ea9a6940dd891ad53e9dbe8da0,True,...,10.0,10.0,5.0,1.0,1.0,1.0,1.0,True,False,False
47012,83.0,2016-03-07,50000.0,871,True,False,False,9dc1421481194dcd9400aec7c9ae6366,9b98b8c7a33c4b65b9aebfe6a799e6d9,True,...,5.0,5.0,7.0,1.0,1.0,0.0,1.0,True,False,False
47013,83.0,2016-03-07,50000.0,871,True,False,False,9dc1421481194dcd9400aec7c9ae6366,ae264e3637204a6fb9bb56bc8210ddfd,True,...,10.0,10.0,7.0,1.0,1.0,1.0,0.0,True,False,False
47014,62.0,2017-07-22,82000.0,369,True,False,False,e4052622e5ba45a8b96b59aba68cf068,2298d6c36e964ae4a3e7e9706d1fb8c2,True,...,3.0,7.0,7.0,1.0,1.0,1.0,1.0,False,True,False


In [8]:
target_column = 'offer_completed_after_view'
non_train_features = ['became_member_on', 'person', 'offer_id', 'offer_viewed']

# Drop rows whose target is missing, then cast the target to bool.
# `.copy()` detaches the filtered frame from the frame it was sliced from, so the
# column assignment below cannot raise pandas' SettingWithCopyWarning.
print(f'Nan features in target: {processed_dataset[target_column].isna().sum()}')
processed_dataset = processed_dataset[processed_dataset[target_column].notna()].copy()
processed_dataset[target_column] = processed_dataset[target_column].astype(bool)

# Remove the columns that are not used as training features.
# `errors='ignore'` keeps the cell re-runnable: on a second run these columns are
# already gone and a plain drop would raise KeyError.
processed_dataset = processed_dataset.drop(columns=non_train_features, errors='ignore')

# Set the target column as the first since it is how Sagemaker training expects it
column_order = [target_column] + [col for col in processed_dataset.columns if col != target_column]
processed_dataset = processed_dataset[column_order]

# Define the train, validation and test size ratios
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15
assert abs(train_ratio + validation_ratio + test_ratio - 1.0) < 1e-9, 'split ratios must sum to 1'

# First split: train (70%) versus a temporary hold-out (30%)
train_dataset, temp_dataset = train_test_split(processed_dataset, test_size=1 - train_ratio, random_state=42)

# Share of the hold-out that goes to the test set (0.5 with the ratios above)
val_test_ratio = test_ratio / (test_ratio + validation_ratio)

# Second split: divide the hold-out into validation (15%) and test (15%)
val_dataset, test_dataset = train_test_split(temp_dataset, test_size=val_test_ratio, random_state=42)

# Every row must land in exactly one of the three splits.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(processed_dataset)

print('Training set:', train_dataset.shape)
print('Validation set:', val_dataset.shape)
print('Test set:', test_dataset.shape)

Nan features in target: 0
Training set: (32855, 17)
Validation set: (7040, 17)
Test set: (7041, 17)


In [9]:
# Save the datasets locally as CSV and upload them to S3
prefix = 'data'
data_dir = './data/processed'

# Make sure the local output directory exists before writing into it.
os.makedirs(data_dir, exist_ok=True)

train_dataset_path = os.path.join(data_dir, 'train.csv')
val_dataset_path = os.path.join(data_dir, 'validation.csv')
test_dataset_path = os.path.join(data_dir, 'test.csv')

# The files are written without index and without header row; the column names
# are printed at the end of this cell so they can be recovered later.
train_dataset.to_csv(train_dataset_path, index=False, header=False)
val_dataset.to_csv(val_dataset_path, index=False, header=False)
test_dataset.to_csv(test_dataset_path, index=False, header=False)

# Upload train.csv, validation.csv and test.csv to S3 under `prefix` using session.upload_data().
train_location = session.upload_data(train_dataset_path, key_prefix=prefix)
val_location = session.upload_data(val_dataset_path, key_prefix=prefix)
test_location = session.upload_data(test_dataset_path, key_prefix=prefix)

# Show where the files ended up (a bare tuple expression in the middle of a cell
# is not displayed, so the locations are printed explicitly).
print(train_location, val_location, test_location, sep='\n')

# Save columns since we will not have them available in S3
print(train_dataset.columns.tolist())

['offer_completed_after_view', 'age', 'income', 'membership_days', 'gender_F', 'gender_M', 'gender_O', 'reward', 'difficulty', 'duration', 'email', 'mobile', 'social', 'web', 'offer_bogo', 'offer_discount', 'offer_informational']


In [10]:
# AutoGluon predictor: configuration is collected in two dicts (constructor
# arguments and fit arguments) and then applied in a single chained call.
predictor_kwargs = dict(
    label='offer_completed_after_view',
    problem_type='binary',
    # Use PR_AUC since it is an unbalanced binary classification problem
    eval_metric='average_precision',
)

fit_kwargs = dict(
    train_data=train_dataset,
    tuning_data=val_dataset,
    presets='best_quality',
    time_limit=60 * 30,  # 30 minutes of time limit
    use_bag_holdout=True,
    verbosity=2,
)

predictor = TabularPredictor(**predictor_kwargs).fit(**fit_kwargs)

No path specified. Models will be saved in: "AutogluonModels/ag-20250329_192515"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Feb 14 16:52:40 UTC 2025
CPU Count:          2
Memory Avail:       1.06 GB / 3.76 GB (28.3%)
Disk Space Avail:   4.81 GB / 4.99 GB (96.4%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to False. Reason: Skip dynamic_stacking when use_bag_holdout is enabled. (use_bag_holdout=True)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "/home/sagemaker-user/AutogluonModels/ag-20250329_192515"
Train Data Rows:    32855
Train Data Columns: 16
Tuning Data Rows:    7040
Tuning Data Columns: 16
Label Column:       o

In [11]:
# Persist the fitted predictor object with pickle so it can be restored later.
# NOTE(review): the fit log reports that AutoGluon also saves its models under
# "AutogluonModels/..."; confirm whether this pickle is self-contained without that directory.
predictor_pickle_path = './data/predictor.pickle'
with open(predictor_pickle_path, 'wb') as pickle_file:
    pickle.dump(predictor, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Restore the predictor from the pickle file.
# Unpickling can execute arbitrary code: only load a file you created yourself.
predictor_pickle_path = './data/predictor.pickle'
with open(predictor_pickle_path, 'rb') as pickle_file:
    predictor = pickle.load(pickle_file)

In [13]:
# View the summary of the fit: prints the per-model table (validation score,
# prediction/fit times, stack level) and keeps the returned summary in `fit_summary`.
# NOTE(review): show_plot=True presumably needs the bokeh package installed at the
# top of the notebook; confirm.
fit_summary = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val        eval_metric  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.628582  average_precision       1.840786  39.645676                0.005607           0.992467            3       True         10
1   RandomForestEntr_BAG_L2   0.614943  average_precision       1.224754  15.568836                0.344004           7.156762            2       True          8
2  RandomForest_r195_BAG_L2   0.614829  average_precision       1.231750  26.302462                0.351001          17.890388            2       True          9
3   RandomForestGini_BAG_L2   0.614351  average_precision       1.140173  13.606060                0.259423           5.193985            2       True          7
4       WeightedEnsemble_L2   0.595407  average_precision       0.790996   8.823151                0.004061           0.559655  



The best models are an ensemble, a neural network, and LightGBM with the extra-trees parameter (extremely randomized trees). For simplicity and interpretability we will use LightGBM, which is also available in SageMaker.