In [1]:
import numpy as np
import pandas as pd

import boto3
import sagemaker

from sagemaker.sklearn.estimator import SKLearn
from time import strftime, gmtime

In [2]:
# SageMaker-side handles: the SDK session and the execution role that is
# later passed to the estimator.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# boto3-side handles: a session (its region is used for the bucket name
# further down) and a low-level SageMaker client.
boto_session = boto3.Session()
boto_sagemaker = boto_session.client(service_name="sagemaker")

## Create Dataset and Load into S3

In [3]:
# Source dataset; feature and target columns are selected from it in the
# data-preparation cell that follows.
data = pd.read_csv(filepath_or_buffer='data/reactor_performance_data.csv')

In [4]:
# Based on data_preparation.py

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def data_preparation(dataframe, feature_list, target_variable,
                     test_size=0.4, random_state=42,
                     print_shapes=False, standardize=False):
    """Split a dataframe into train/test features and target, optionally standardizing.

    Parameters
    ----------
    dataframe : column-indexable table (called in this notebook with a pandas DataFrame)
        Data containing both the feature columns and the target column.
    feature_list : list
        Column labels selected as the feature matrix ``X``.
    target_variable : str
        Column label selected as the target ``y``.
    test_size, random_state
        Forwarded unchanged to ``train_test_split``.
    print_shapes : bool
        When truthy, print the ``.shape`` of each of the four splits.
    standardize : bool
        When truthy, fit a ``StandardScaler`` on ``X_train`` only and use it
        to transform both ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. When ``standardize`` is truthy
        the two ``X`` entries are the outputs of ``scaler.transform`` rather
        than slices of ``dataframe``.
    """
    X = dataframe[feature_list]
    y = dataframe[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if print_shapes:
        print('Data Shapes:')
        print('X_train ', X_train.shape)
        print('y_train ', y_train.shape)
        print('X_test ', X_test.shape)
        print('y_test ', y_test.shape)

    # Truthiness is used instead of `== True` / `== False` so that a
    # non-bool flag can no longer skip both branches and return None.
    if standardize:
        # The scaler is fitted on the training split only; the test split is
        # transformed with the training statistics.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test
    
# Target column and the feature columns fed to the model.
target_variable = 'Yc'
feature_list = ['Fao', 'Fbo', 'P', 'To', 'Cto', 'm', 'Ta']

# Every option is spelled out by keyword so the split settings are visible
# at the call site.
X_train, X_test, y_train, y_test = data_preparation(
    dataframe=data,
    feature_list=feature_list,
    target_variable=target_variable,
    test_size=0.4,
    random_state=42,
    print_shapes=False,
    standardize=False,
)

In [5]:
# Re-attach the target column to each feature split so that each file holds
# features and target side by side.
train_data = pd.concat(objs=[X_train, y_train], axis="columns")
test_data = pd.concat(objs=[X_test, y_test], axis="columns")

# Write both splits locally without the row index.
train_data.to_csv(path_or_buf='data/train_data.csv', index=False)
test_data.to_csv(path_or_buf='data/test_data.csv', index=False)

In [6]:
# Look up the caller's AWS account id through STS; it becomes part of the
# bucket name below.
sts_client = boto_session.client("sts", region_name=boto_session.region_name)
account_id = sts_client.get_caller_identity()["Account"]

# Key prefix under which this notebook stores its objects, and the bucket
# name built from the session region and account id.
prefix = "chemreactorml"
bucket = "sagemaker-studio-{}-{}".format(boto_session.region_name, account_id)

In [7]:
# Write data to S3
# Make sure the target bucket exists before uploading. The boto3 session
# created at the top of the notebook (`boto_session`) supplies both the S3
# client and the region.
s3_client = boto_session.client("s3")
region = boto_session.region_name
try:
    if region == "us-east-1":
        # us-east-1 is the default location and is created without a
        # CreateBucketConfiguration.
        s3_client.create_bucket(Bucket=bucket)
    else:
        s3_client.create_bucket(
            Bucket=bucket, CreateBucketConfiguration={"LocationConstraint": region}
        )
except (s3_client.exceptions.BucketAlreadyOwnedByYou, s3_client.exceptions.BucketAlreadyExists):
    print(
        "Looks like you already have a bucket of this name. That's good. Uploading the data files..."
    )
except Exception as e:
    # Bucket creation stays best-effort, but the real reason is reported
    # instead of being mislabelled as "bucket already exists". If the bucket
    # is genuinely unusable, the uploads below will raise.
    print("Could not create bucket {}: {!r}. Attempting upload anyway...".format(bucket, e))

# Upload both splits and print the resulting S3 URIs, so they can be reviewed or used elsewhere
s3url = sagemaker.s3.S3Uploader.upload("data/train_data.csv", "s3://{}/{}/{}".format(bucket, prefix, "train"))
print(s3url)
s3url = sagemaker.s3.S3Uploader.upload("data/test_data.csv", "s3://{}/{}/{}".format(bucket, prefix, "test"))
print(s3url)

Looks like you already have a bucket of this name. That's good. Uploading the data files...
s3://sagemaker-studio-us-east-1-579156935154/chemreactorml/train/train_data.csv
s3://sagemaker-studio-us-east-1-579156935154/chemreactorml/test/test_data.csv


In [8]:
# Input channel definitions that point at the uploaded CSV locations in S3;
# both are handed to the estimator's fit() call during model training.
s3_input_train = sagemaker.TrainingInput(
    content_type="csv",
    s3_data="s3://{}/{}/train".format(bucket, prefix),
)

s3_input_validation = sagemaker.TrainingInput(
    content_type="csv",
    s3_data="s3://{}/{}/test/".format(bucket, prefix),
)

## Model Training

In [9]:
# Training script passed to the SKLearn estimator as its entry point.
script_path = 'train.py'

In [10]:
# Estimator describing the scikit-learn training job.
# NOTE(review): the hyperparameters are presumably parsed by train.py
# (e.g. as --n-estimators); confirm against the script.
sklearn = SKLearn(
    entry_point=script_path,
    hyperparameters={"n-estimators": 100},
    framework_version='0.23-1',
    py_version='py3',
    instance_type="ml.c5.xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
)

In [11]:
# Start training with the two S3 channels: 'train' and 'test'.
sklearn.fit(
    inputs={
        'train': s3_input_train,
        'test': s3_input_validation,
    }
)

2022-04-16 22:01:25 Starting - Starting the training job...
2022-04-16 22:01:42 Starting - Preparing the instances for trainingProfilerReport-1650146485: InProgress
......
2022-04-16 22:02:47 Downloading - Downloading input data...
2022-04-16 22:03:23 Training - Downloading the training image...
2022-04-16 22:03:54 Uploading - Uploading generated training model[34m2022-04-16 22:03:40,815 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-04-16 22:03:40,818 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-16 22:03:40,827 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-04-16 22:03:41,100 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-16 22:03:41,113 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-16 22:03:41,126 sagemaker-training-toolkit INFO

## Deploy Model

In [12]:
# The endpoint name carries a UTC timestamp suffix, so each deployment gets
# its own name.
deploy_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "reactor-perf-predict-" + deploy_timestamp
print("EndpointName = {}".format(endpoint_name))
print('\n')

# Deploy the trained estimator behind a single-instance endpoint.
predictor = sklearn.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)
print('\nModel Deployed')

EndpointName = reactor-perf-predict-2022-04-16-22-04-38


------!
Model Deployed


## Batch Inference using the Endpoint

In [13]:
# Upload the new, unscored data to S3 under the "new" key prefix and show
# the resulting URI.
new_data_s3url = sagemaker.s3.S3Uploader.upload(
    local_path="data/new_data.csv",
    desired_s3_uri="s3://{}/{}/{}".format(bucket, prefix, "new"),
)
print(new_data_s3url)

s3://sagemaker-studio-us-east-1-579156935154/chemreactorml/new/new_data.csv


In [14]:
def batch_inference(bucket, key, endpoint, feature_list, print_predictions=True, upload_to_s3=True,
                    output_prefix=None):
    """Score a CSV stored in S3 against a deployed endpoint and optionally save the result.

    Parameters
    ----------
    bucket : str
        S3 bucket holding the input CSV; also the bucket predictions are
        uploaded to.
    key : str
        Object key of the input CSV inside ``bucket``.
    endpoint : str
        Name of the SageMaker endpoint to invoke.
    feature_list : list
        Columns of the CSV sent to the endpoint.
    print_predictions : bool
        When truthy, print the predictions and how many there are.
    upload_to_s3 : bool
        When truthy, write the predictions to a local ``predictions.csv``
        and upload it to ``s3://<bucket>/<output_prefix>/batchpredictions``.
    output_prefix : str, optional
        Key prefix for the uploaded predictions. When omitted, the
        notebook-level ``prefix`` variable is used, which is what this
        function previously always did implicitly.

    Returns
    -------
    None
        Results are reported via ``print`` and, optionally, the S3 upload.
    """
    s3 = boto3.client('s3')
    data_object = s3.get_object(Bucket=bucket, Key=key)
    new_data = pd.read_csv(data_object['Body'])
    s3_url = "s3://{}/{}".format(bucket, key)
    print('New data loaded from', s3_url)

    print('Endpoint name:', endpoint)
    predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint,
                                              serializer=sagemaker.serializers.NumpySerializer(),
                                              deserializer=sagemaker.deserializers.NumpyDeserializer())

    X = new_data[feature_list]
    print(len(X), 'observations found')

    y = predictor.predict(X)
    print('Predictions generated for new data')

    if print_predictions:
        print(y)
        print(len(y), 'predictions')

    if upload_to_s3:
        if output_prefix is None:
            # Backward-compatible fallback to the notebook-level `prefix`.
            output_prefix = prefix

        predictions_file = 'predictions.csv'
        # to_csv returns None when given a path, so its result is not kept.
        pd.DataFrame(y).to_csv(predictions_file, index=False, header=False)
        print('Predictions csv file created')
        predictions_s3url = sagemaker.s3.S3Uploader.upload(
            predictions_file, "s3://{}/{}/{}".format(bucket, output_prefix, "batchpredictions")
        )
        print('Predictions saved in', predictions_s3url)

In [16]:
# Reuse the values produced earlier in the notebook (`bucket`, `prefix`,
# `endpoint_name`, `feature_list`) instead of hard-coding an account-specific
# bucket and a timestamped endpoint name. After a kernel restart the deploy
# cell creates a differently named endpoint, so a pasted name would point at
# an endpoint that no longer exists.
key = '{}/new/new_data.csv'.format(prefix)
endpoint = endpoint_name

batch_inference(bucket,
                key,
                endpoint,
                feature_list=feature_list,
                print_predictions=True,
                upload_to_s3=True)

New data loaded from s3://sagemaker-studio-us-east-1-579156935154/chemreactorml/new/new_data.csv
Endpoint name: reactor-perf-predict-2022-04-16-22-04-38
5000 observations found
Predictions generated for new data
[0.67686802 0.68949476 0.7511981  ... 0.93543934 0.2643853  0.83869721]
5000 predictions
Predictions csv file created
Predictions saved in s3://sagemaker-studio-us-east-1-579156935154/chemreactorml/batchpredictions/predictions.csv


## Clean Up

In [17]:
# Tear down the endpoint that the deploy cell created in this run. Going
# through the `predictor` returned by deploy() targets the live endpoint
# rather than a pasted name; by default this call also removes the
# associated endpoint configuration.
predictor.delete_endpoint()