# Experimenting on the Abalone dataset with Amazon SageMaker XGBoost Algorithm

## Setup environment

In [None]:
# Install pinned versions of the SageMaker SDK and the SageMaker Experiments
# SDK used by the cells below (-q keeps pip's output quiet).
!pip install -q sagemaker==2.37.0
!pip install -q sagemaker-experiments==0.1.24

In [None]:
import io
import os
import time
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# Execution role attached to the current SageMaker environment.
role = sagemaker.get_execution_role()

# A single boto3 session backs both the region lookup and the SageMaker
# client, rather than building a second throwaway session just for the region.
sess = boto3.Session()
region = sess.region_name
sm = sess.client('sagemaker')

# Default SageMaker bucket and the key prefix under which this demo stores data.
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/DEMO-xgboost-inference-script-mode'

## Get data

In [None]:
s3 = boto3.client("s3")

# Download the abalone dataset (libsvm format) from the sagemaker-sample-files
# bucket to a local file, then upload it under this demo's "train" prefix.
FILE_DATA = 'abalone'
s3.download_file("sagemaker-sample-files", "datasets/tabular/uci_abalone/abalone.libsvm", FILE_DATA)
sagemaker.Session().upload_data(FILE_DATA, bucket=bucket, key_prefix=f"{prefix}/train")

# The training channel points at the whole "train" prefix, not a single object.
train_input = TrainingInput(f"s3://{bucket}/{prefix}/train/", content_type="text/libsvm")

# NOTE(review): the validation channel reuses the training input, so
# validation metrics are computed on the same data the model is fitted on.
# Confirm this is intended for the demo; a held-out split would be needed
# for a meaningful validation score.
data_inputs = {
    'train': train_input,
    'validation': train_input,
}

## Create an experiment

In [None]:
# The experiment name carries the current Unix timestamp (whole seconds), so
# runs started at different times get different names.
experiment_name = f"abalone-{int(time.time())}"

example_experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name=experiment_name,
    description="finding abalone age",
)
print(example_experiment)

## Train XGBoost models and track with experiments

In [None]:
# Hyperparameters shared by every run; only max_depth varies between trials,
# so the common part is built once instead of on every iteration.
base_hyperparameters = {
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "reg:squarederror",
    "num_round": "50",
    "verbosity": "2",
}

for max_depth in [2, 3, 5]:
    # Create one trial per max_depth value under the experiment.
    trial_name = f"abalone-xgboost-{max_depth}-{int(time.time())}"
    trial = Trial.create(
        trial_name=trial_name,
        experiment_name=example_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )

    # Create the SageMaker estimator for training; abalone.py is the
    # training entry-point script.
    job_name = f"abalone-{int(time.time())}"
    estimator = XGBoost(
        entry_point="abalone.py",
        hyperparameters={"max_depth": max_depth, **base_hyperparameters},
        role=role,
        instance_count=1,
        instance_type="ml.c5.xlarge",
        framework_version="1.2-1",
        disable_profiler=True,
    )

    # Associate the training job with the trial. The display name
    # "Training" is what the trial component will be listed under.
    # wait=False lets the loop continue without blocking on the job.
    estimator.fit(
        inputs=data_inputs,
        job_name=job_name,
        experiment_config={
            "TrialName": trial.trial_name,
            "TrialComponentDisplayName": "Training",
        },
        wait=False,
    )

    # Give it a while before dispatching the next training job. Because the
    # job name is derived only from the whole-second timestamp, this pause
    # also keeps consecutive job names distinct.
    time.sleep(2)

## Analyze experiment results

In [None]:
# Keep only trial components whose display name is exactly "Training".
training_only = {"Name": "DisplayName", "Operator": "Equals", "Value": "Training"}

# Search expression handed to the experiment analytics query.
search_expression = {"Filters": [training_only]}

In [None]:
# Wrap the boto3 session and SageMaker client from the setup cell in a
# SageMaker session for the analytics query.
analytics_session = Session(sess, sm)

# Query the experiment's trial components, restricted by the search
# expression, and request the validation:rmse metric.
trial_component_analytics = ExperimentAnalytics(
    experiment_name=example_experiment.experiment_name,
    search_expression=search_expression,
    metric_names=['validation:rmse'],
    sagemaker_session=analytics_session,
)

In [None]:
# Fetch the analytics results in tabular form. This is left as the cell's
# final bare expression so that the notebook displays the result.
trial_component_analytics.dataframe()