In [1]:
import numpy as np
import pandas as pd

import boto3
import sagemaker

In [2]:
!pip install sagemaker-experiments

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
from sagemaker.sklearn.estimator import SKLearn

from time import strftime, gmtime
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

from sagemaker.model_monitor import DataCaptureConfig, DatasetFormat, DefaultModelMonitor
from sagemaker.predictor import csv_serializer
from sagemaker.s3 import S3Uploader, S3Downloader

In [4]:
sess = boto3.Session()
sm = sess.client("sagemaker")
role = sagemaker.get_execution_role()

sagemaker_session = sagemaker.Session()

## Create Datasets and Load into S3

In [None]:
data = pd.read_csv('data/reactor_performance_data.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def data_preparation(dataframe, feature_list, target_variable,
                     test_size=0.4, random_state=42, 
                     print_shapes=False, standardize=False):
    
    X = dataframe[feature_list]
    y = dataframe[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    if print_shapes == True:
        print('Data Shapes:')
        print('X_train ', X_train.shape)
        print('y_train ', y_train.shape)
        print('X_test ', X_test.shape)
        print('y_test ', y_test.shape)   

    if standardize == False:
        return X_train, X_test, y_train, y_test
    elif standardize == True:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, X_test, y_train, y_test
    
feature_list = ['Fao', 'Fbo', 'P', 'To', 'Cto', 'm', 'Ta']
target_variable = 'Yc'

X_train, X_test, y_train, y_test = data_preparation(data, feature_list, target_variable,
                                                     test_size=0.4, random_state=42, 
                                                     print_shapes=False, standardize=False)

In [None]:
train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv('data/train_data.csv', index=False)

test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('data/test_data.csv', index=False)

In [5]:
account_id = sess.client("sts", region_name=sess.region_name).get_caller_identity()["Account"]
bucket = "sagemaker-studio-{}-{}".format(sess.region_name, account_id)
prefix = "chemreactorml"

In [None]:
# Code is still writing to S3 when bucket exists?
try:
    if sess.region_name == "us-east-1":
        sess.client("s3").create_bucket(Bucket=bucket)
    else:
        sess.client("s3").create_bucket(
            Bucket=bucket, CreateBucketConfiguration={"LocationConstraint": sess.region_name}
        )
except Exception as e:
    print(
        "Looks like you already have a bucket of this name. That's good. Uploading the data files..."
    )

# Return the URLs of the uploaded file, so they can be reviewed or used elsewhere
s3url = sagemaker.s3.S3Uploader.upload("data/train_data.csv", "s3://{}/{}/{}".format(bucket, prefix, "train"))
print(s3url)
s3url = sagemaker.s3.S3Uploader.upload("data/test_data.csv", "s3://{}/{}/{}".format(bucket, prefix, "test"))
print(s3url)

In [6]:
s3_input_train = sagemaker.TrainingInput(
    s3_data="s3://{}/{}/train".format(bucket, prefix), content_type="csv"
)
s3_input_validation = sagemaker.TrainingInput(
    s3_data="s3://{}/{}/test/".format(bucket, prefix), content_type="csv"
)

## Model Training

### Basic Training Run

In [7]:
script_path = 'train.py'

In [8]:
sklearn = SKLearn(
    entry_point=script_path,
    instance_type="ml.c5.xlarge",
    role=role,
    framework_version='0.23-1',
    py_version='py3',
    sagemaker_session=sagemaker_session,
    hyperparameters={"n-estimators": 100},
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.


In [9]:
sklearn.fit({'train': s3_input_train, 'test': s3_input_validation})

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2022-04-13-21-44-38-434


2022-04-13 21:44:38 Starting - Starting the training job...
2022-04-13 21:45:04 Starting - Preparing the instances for trainingProfilerReport-1649886278: InProgress
......
2022-04-13 21:46:07 Downloading - Downloading input data...
2022-04-13 21:46:27 Training - Downloading the training image...
2022-04-13 21:47:04 Training - Training image download completed. Training in progress..[34m2022-04-13 21:47:05,023 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-04-13 21:47:05,026 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-13 21:47:05,035 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-04-13 21:47:05,309 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-13 21:47:05,320 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-13 21:47:05,331 sagemaker

### Train as Experiment

In [52]:
create_date = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

experiment = Experiment.create(
    experiment_name="reactor-performance-prediction-rf-{}".format(create_date),
    description="Using RF model to predict reactor performance",
    sagemaker_boto_client=sm,
)

In [None]:
trial = Trial.create(
    trial_name="rf-model-trial-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime())),
    experiment_name=experiment.experiment_name,
    sagemaker_boto_client=sm,
)

hyperparameters = {"n-estimators": 100}

sklearn = SKLearn(
    entry_point=script_path,
    instance_type="ml.c5.xlarge",
    role=role,
    framework_version='0.23-1',
    py_version='py3',
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparameters,
    output_path="s3://{}/{}/output".format(bucket, prefix),
    base_job_name='reactor-performance-prediction',
    metric_definitions=[{'Name': 'MAE', 'Regex': 'MAE: (.*)'}]
)

experiment_config={
        "ExperimentName": experiment.experiment_name,
        "TrialName": trial.trial_name,
        "TrialComponentDisplayName": "Training",
    }

In [None]:
sklearn.fit({'train': s3_input_train, 'test': s3_input_validation}, 
            experiment_config=experiment_config)

### Train while tracking hyperparameters

In [None]:
num_estimators = [50, 100, 150]

for num in num_estimators:
    trial = Trial.create(
        trial_name="rf-model-trial-{}-num-estimators-{}".format(strftime("%Y-%m-%d-%H-%M-%S", gmtime()), num),
        experiment_name=experiment.experiment_name,
        sagemaker_boto_client=sm,
    )

    hyperparameters = {"n-estimators": num}

    sklearn = SKLearn(
        entry_point=script_path,
        instance_type="ml.c5.xlarge",
        role=role,
        framework_version='0.23-1',
        py_version='py3',
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparameters,
        output_path="s3://{}/{}/output".format(bucket, prefix),
        base_job_name='reactor-performance-prediction',
        metric_definitions=[{'Name': 'MAE', 'Regex': 'MAE: (.*)'}]
    )

    experiment_config={
            "ExperimentName": experiment.experiment_name,
            "TrialName": trial.trial_name,
            "TrialComponentDisplayName": "Training",
        }
    
    sklearn.fit({'train': s3_input_train, 'test': s3_input_validation}, 
            experiment_config=experiment_config)

![trial chart](trial_chart.png "Trial Chart")

## Hyperparameter Tuning

In [None]:
# we use the Hyperparameter Tuner
from sagemaker.tuner import IntegerParameter

# Define exploration boundaries
hyperparameter_ranges = {
    "n-estimators": IntegerParameter(20, 200),
}

# create Optimizer
Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=sklearn,
    hyperparameter_ranges=hyperparameter_ranges,
    base_tuning_job_name="RF-tuner",
    objective_type="Minimize",
    objective_metric_name="MAE",
    metric_definitions=[{'Name': 'MAE', 'Regex': 'MAE: (.*)'}],
    max_jobs=10,
    max_parallel_jobs=2,
)

In [None]:
Optimizer.fit({"train": s3_input_train, "test": s3_input_validation})

In [None]:
# get tuner results in a df
results = Optimizer.analytics().dataframe()
while results.empty:
    time.sleep(1)
    results = Optimizer.analytics().dataframe()
results.head()

## Deploy Model

In [42]:
endpoint_name = "rf-model-trial-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName = {}".format(endpoint_name))

EndpointName = rf-model-trial-2022-04-13-22-16-17


In [44]:
predictor = sklearn.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2022-04-13-22-16-23-943
INFO:sagemaker:Creating endpoint with name rf-model-trial-2022-04-13-22-16-17


----------!

## Send Data to Model

In [47]:
test_data = pd.read_csv('data/test_data.csv')

X_test = test_data.drop('Yc', axis=1)
y_test = test_data['Yc']

In [49]:
test_predictions = predictor.predict(X_test[:10].values)

print(test_predictions)
print(y_test[:10].values)

[0.29148083 0.86474494 0.89719428 0.58604472 0.84153399 0.3472152
 0.82915237 0.88522018 0.33782593 0.67655165]
[0.29324204 0.86783721 0.89651952 0.57084218 0.84409073 0.34799738
 0.829483   0.88796765 0.33927629 0.68364747]


## Clean Up

In [50]:
sagemaker_session.delete_endpoint(predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: rf-model-trial-2022-04-13-22-16-17


In [54]:
def cleanup(experiment):
    """Clean up everything in the given experiment object"""
    for trial_summary in experiment.list_trials():
        trial = Trial.load(trial_name=trial_summary.trial_name)

        for trial_comp_summary in trial.list_trial_components():
            trial_step = TrialComponent.load(
                trial_component_name=trial_comp_summary.trial_component_name
            )
            print("Starting to delete TrialComponent.." + trial_step.trial_component_name)
            sm.disassociate_trial_component(
                TrialComponentName=trial_step.trial_component_name, TrialName=trial.trial_name
            )
            trial_step.delete()
            time.sleep(1)

        trial.delete()

    experiment.delete()


cleanup(experiment)

ResourceNotFound: An error occurred (ResourceNotFound) when calling the ListTrials operation: Experiment 'arn:aws:sagemaker:us-east-1:579156935154:experiment/reactor-performance-prediction-rf-2022-04-13-22-25-01' does not exist.