# Model training
This notebook contains the model training and hyperparameter tuning of the LightGBM model using Sagemaker

In [6]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.parameter import ContinuousParameter, CategoricalParameter, IntegerParameter
from sagemaker.tuner import HyperparameterTuner
from sagemaker import image_uris, model_uris, script_uris
import time
import boto3


In [21]:
role = sagemaker.get_execution_role() 
session = sagemaker.Session() 
region = session.boto_region_name
bucket = session.default_bucket()

column_names = ['offer_completed_after_view', 'age', 'income', 'membership_days', 'gender_F', 'gender_M', 'gender_O', 'reward', 'difficulty', 'duration', 'email', 'mobile', 'social', 'web', 'offer_bogo', 'offer_discount', 'offer_informational']
train_location, val_location, test_location = ('s3://sagemaker-us-east-1-361673968127/data/train.csv', 's3://sagemaker-us-east-1-361673968127/data/validation.csv', 's3://ssagemaker-us-east-1-361673968127/data/test.csv')
train_model_id, train_model_version, train_scope = "lightgbm-classification-model", "*", "training"
training_instance_type = "ml.m5.xlarge"

prefix = 'sagemaker-project'
s3_output_location = f's3://{bucket}/{prefix}/output'

In [14]:
pip install "sagemaker==2.189.0"

Collecting sagemaker==2.189.0
  Downloading sagemaker-2.189.0.tar.gz (893 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m893.3/893.3 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
  Preparing metadata (setup.py) ... [?2done
Collecting cloudpickle==2.2.1 (from sagemaker==2.189.0)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting tblib==1.7.0 (from sagemaker==2.189.0)
  Downloading tblib-1.7.0-py2.py3-none-any.whl.metadata (24 kB)
Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) done
[?25h  Created wheel for sagemaker: filename=sagemaker-2.189.0-py2.py3-none-any.whl size=1194972 sha256=737f49e08a3aad8b799a57b624096245f779089af7b457d8e8cfd68ebd681d5c
  Stored in directory: /home/sagemaker-user/.cache/pip/wheels/30/da/20/b1178bdf271b957c3ce2d833a12271282e9571aaa26e817162
Successfully built sag

In [22]:
# Retrieve the docker image
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=train_model_id,
    model_version=train_model_version,
    image_scope=train_scope,
    instance_type=training_instance_type,
)
# Retrieve the training script
train_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope=train_scope
)
# Retrieve the pre-trained model tarball to further fine-tune
train_model_uri = model_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)

In [23]:
# Retrieve the default hyper-parameters for fine-tuning the model
hyperparameters = sagemaker.hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)
hyperparameters["metric"] = "average_precision"
hyperparameters

{'num_boost_round': '5000',
 'early_stopping_rounds': '30',
 'metric': 'average_precision',
 'learning_rate': '0.009',
 'num_leaves': '67',
 'feature_fraction': '0.74',
 'bagging_fraction': '0.53',
 'bagging_freq': '5',
 'max_depth': '11',
 'min_data_in_leaf': '26',
 'max_delta_step': '0.0',
 'lambda_l1': '0.0',
 'lambda_l2': '0.0',
 'boosting': 'gbdt',
 'min_gain_to_split': '0.0',
 'scale_pos_weight': '1.0',
 'tree_learner': 'serial',
 'feature_fraction_bynode': '1.0',
 'is_unbalance': 'False',
 'max_bin': '255',
 'num_threads': '0',
 'verbosity': '1',
 'use_dask': 'False'}

In [24]:
hyperparameter_ranges_lgb = {
    'learning_rate': ContinuousParameter(0.01, 0.2), # Learning rate (Step size shrinkage for updates)
    "num_leaves": IntegerParameter(2, 50), # Maximum number of leaves in a tree
    'feature_fraction': ContinuousParameter(0.5, 1), # Fraction of features used (column sample by tree)
    'bagging_fraction': ContinuousParameter(0.5, 1), # Fraction of data used for bagging (subsample)
    "bagging_freq": IntegerParameter(1, 10),
    "max_depth": IntegerParameter(1, 10), # Maximum tree depth, -1 for no constraints
    "min_data_in_leaf": IntegerParameter(1, 30), # Minimum number of data points in a leaf
    'extra_trees': CategoricalParameter([True, False]) # If True when evaluating node splits LightGBM will check only one randomly-chosen threshold for each feature
}

In [25]:
# Create SageMaker Estimator instance
tabular_estimator = Estimator(
    role=role,
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py", # Default one available in image uri
    instance_count=1,
    volume_size=30,
    instance_type=training_instance_type,
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=s3_output_location,
)

In [19]:
!aws iam attach-role-policy --role-name AmazonSageMaker-ExecutionRole-20250329T195861 --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess



An error occurred (AccessDenied) when calling the AttachRolePolicy operation: User: arn:aws:sts::361673968127:assumed-role/AmazonSageMaker-ExecutionRole-20250329T195861/SageMaker is not authorized to perform: iam:AttachRolePolicy on resource: role AmazonSageMaker-ExecutionRole-20250329T195861 because no identity-based policy allows the iam:AttachRolePolicy action


In [26]:
# Set up the hyperparameter tuner
tuner = HyperparameterTuner(tabular_estimator,
        objective_metric_name='average_precision', # See https://docs.aws.amazon.com/sagemaker/latest/dg/lightgbm-tuning.html
        hyperparameter_ranges=hyperparameter_ranges_lgb,
        metric_definitions=[{"Name": "average_precision", "Regex": "average_precision: ([0-9\\.]+)"}],
        strategy='Bayesian', # Bayesian optimization
        objective_type="Maximize",
        max_jobs=20,
        max_parallel_jobs=3
)

# Start the hyperparameter tuning job
training_job_name = prefix + str(int(time.time()))
tuner.fit({'train': train_location, 'validation': val_location}, logs=True, job_name=training_job_name)
tuner.wait()

.....................................................................................................!
!


In [27]:
tuner.best_estimator(), tuner.best_estimator().hyperparameters()


2025-03-29 22:31:47 Starting - Found matching resource for reuse
2025-03-29 22:31:47 Downloading - Downloading the training image
2025-03-29 22:31:47 Training - Training image download completed. Training in progress.
2025-03-29 22:31:47 Uploading - Uploading generated training model
2025-03-29 22:31:47 Completed - Resource reused by training job: sagemaker-project1743287032-020-52e72e3d

2025-03-29 22:31:47 Starting - Found matching resource for reuse
2025-03-29 22:31:47 Downloading - Downloading the training image
2025-03-29 22:31:47 Training - Training image download completed. Training in progress.
2025-03-29 22:31:47 Uploading - Uploading generated training model
2025-03-29 22:31:47 Completed - Resource reused by training job: sagemaker-project1743287032-020-52e72e3d


(<sagemaker.estimator.Estimator at 0x7f4213ee3410>,
 {'_tuning_objective_metric': 'average_precision',
  'bagging_fraction': '0.8344924836118808',
  'bagging_freq': '9',
  'boosting': '"gbdt"',
  'early_stopping_rounds': '"30"',
  'extra_trees': 'True',
  'feature_fraction': '0.9249693303888324',
  'feature_fraction_bynode': '"1.0"',
  'is_unbalance': '"False"',
  'lambda_l1': '"0.0"',
  'lambda_l2': '"0.0"',
  'learning_rate': '0.19494439569741567',
  'max_bin': '"255"',
  'max_delta_step': '"0.0"',
  'max_depth': '8',
  'metric': '"average_precision"',
  'min_data_in_leaf': '9',
  'min_gain_to_split': '"0.0"',
  'num_boost_round': '"5000"',
  'num_leaves': '35',
  'num_threads': '"0"',
  'sagemaker_container_log_level': '20',
  'sagemaker_job_name': '"sagemaker-project1743287032"',
  'sagemaker_program': '"transfer_learning.py"',
  'sagemaker_region': '"us-east-1"',
  'sagemaker_submit_directory': '"s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/transfer_learn

In [28]:
import boto3
import pandas as pd

# Function to fetch tuner results as a DataFrame
def tuner_results_to_dataframe(tuner):
    sagemaker_client = boto3.client('sagemaker', region_name=region)
    all_jobs_response = sagemaker_client.list_training_jobs_for_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuner.latest_tuning_job.name
    )
    
    all_jobs = all_jobs_response['TrainingJobSummaries']
    
    job_details = []
    for job in all_jobs:
        job_details_response = sagemaker_client.describe_training_job(TrainingJobName=job['TrainingJobName'])
        job_details.append(job_details_response)
    
    df = pd.DataFrame(job_details)
    return df

# Fetch tuner results
tuner_df = tuner_results_to_dataframe(tuner)

# Get best training job details
best_job_name = tuner.best_training_job()
best_job = tuner_df.loc[tuner_df['TrainingJobName'] == best_job_name]

# Get the final metric value
best_job_objective_value = best_job.iloc[0]['FinalMetricDataList'][0]['Value']
best_job_hyperparameters = best_job.iloc[0]['HyperParameters']

print("Best training job:", best_job_name)
print("\nBest hyperparameters:\n", best_job_hyperparameters)
print("\nBest objective value:", best_job_objective_value)

# Expand hyperparameters into columns and keep relevant columns
hyperparameters_expanded = tuner_df['HyperParameters'].apply(pd.Series)
tuner_df_final = pd.concat([tuner_df['TrainingJobName'], hyperparameters_expanded, tuner_df['FinalMetricDataList']], axis=1)

# Extract the objective value from FinalMetricDataList
tuner_df_final['ObjectiveValue'] = tuner_df_final['FinalMetricDataList'].apply(lambda x: x[0]['Value'])

# Calculate time of training and rank by the objective value
tuner_df_final['TrainingTime'] = tuner_df['TrainingEndTime'] - tuner_df['TrainingStartTime']
tuner_df_final = tuner_df_final.sort_values("ObjectiveValue", ascending=False)
tuner_df_final.insert(len(tuner_df_final.columns), 'Rank', range(1, 1+len(tuner_df_final)))

print("\nAll training jobs with expanded hyperparameters, metric value, time, and ranking:\n")
relevant_columns = ['TrainingJobName', 'Rank', 'ObjectiveValue', 'TrainingTime'] + list(hyperparameter_ranges_lgb.keys())

tuner_df_final[relevant_columns].rename(columns={'ObjectiveValue': 'average_precision'})

Best training job: sagemaker-project1743287032-017-47c640b5

Best hyperparameters:
 {'_tuning_objective_metric': 'average_precision', 'bagging_fraction': '0.8344924836118808', 'bagging_freq': '9', 'boosting': '"gbdt"', 'early_stopping_rounds': '"30"', 'extra_trees': 'True', 'feature_fraction': '0.9249693303888324', 'feature_fraction_bynode': '"1.0"', 'is_unbalance': '"False"', 'lambda_l1': '"0.0"', 'lambda_l2': '"0.0"', 'learning_rate': '0.19494439569741567', 'max_bin': '"255"', 'max_delta_step': '"0.0"', 'max_depth': '8', 'metric': '"average_precision"', 'min_data_in_leaf': '9', 'min_gain_to_split': '"0.0"', 'num_boost_round': '"5000"', 'num_leaves': '35', 'num_threads': '"0"', 'sagemaker_container_log_level': '20', 'sagemaker_job_name': '"sagemaker-project1743287032"', 'sagemaker_program': '"transfer_learning.py"', 'sagemaker_region': '"us-east-1"', 'sagemaker_submit_directory': '"s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/transfer_learning/classification/

Unnamed: 0,TrainingJobName,Rank,average_precision,TrainingTime,learning_rate,num_leaves,feature_fraction,bagging_fraction,bagging_freq,max_depth,min_data_in_leaf,extra_trees
3,sagemaker-project1743287032-017-47c640b5,1,0.683142,0 days 00:00:39.874000,0.1949443956974156,35,0.9249693303888324,0.8344924836118808,9,8,9,True
9,sagemaker-project1743287032-011-5a423cd9,2,0.675376,0 days 00:00:38.757000,0.119348840666589,40,0.552251726785043,0.7598716500576683,8,8,13,True
1,sagemaker-project1743287032-019-b109cca6,3,0.675228,0 days 00:00:38.717000,0.1098036621430943,50,0.6414938734454705,1.0,10,8,19,True
2,sagemaker-project1743287032-018-ca6eab52,4,0.674202,0 days 00:00:38.796000,0.1469975626789213,40,0.8752795960758015,0.8407671591252192,10,8,7,False
5,sagemaker-project1743287032-015-5561bbb3,5,0.671373,0 days 00:00:40.777000,0.0359977408753812,50,0.5831017834137551,0.9579489996212394,10,7,14,False
4,sagemaker-project1743287032-016-c0497ce5,6,0.668965,0 days 00:00:33.628000,0.0902199484923885,46,0.5652155009760103,0.8864030291819285,10,7,7,False
6,sagemaker-project1743287032-014-77c126d6,7,0.66121,0 days 00:00:39.042000,0.0235466948205265,50,0.9001386784924692,0.6125689281800397,10,7,6,False
7,sagemaker-project1743287032-013-aaeb84b3,8,0.660321,0 days 00:00:36.171000,0.1713125414812627,21,0.7407900923482639,1.0,6,8,30,True
0,sagemaker-project1743287032-020-52e72e3d,9,0.658445,0 days 00:00:38.908000,0.0789621834169329,27,0.5150868373430718,1.0,7,7,1,True
8,sagemaker-project1743287032-012-9239bdbd,10,0.618088,0 days 00:00:34.308000,0.0479561773932615,22,0.688215628678962,0.8344526239035519,9,2,9,True


Note: The depth and number of leaves of the first configuration looks reasonable in comparison with the others, and the average precision is significantly higher.

In [29]:
best_job['OutputDataConfig'].iloc[0]

{'KmsKeyId': '',
 'S3OutputPath': 's3://sagemaker-us-east-1-361673968127/sagemaker-project/output',
 'CompressionType': 'GZIP'}

In [None]:
# # Deploy the best model from the hyperparameter tuning job
best_model = tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')


2025-03-29 22:31:47 Starting - Found matching resource for reuse
2025-03-29 22:31:47 Downloading - Downloading the training image
2025-03-29 22:31:47 Training - Training image download completed. Training in progress.
2025-03-29 22:31:47 Uploading - Uploading generated training model
2025-03-29 22:31:47 Completed - Resource reused by training job: sagemaker-project1743287032-020-52e72e3d


--