In [1]:
import os
import time
import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline_context import PipelineSession
import json
from sagemaker import ModelPackage
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.processing import ScriptProcessor
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
sess = boto3.Session()
sm = sess.client("sagemaker")
role = get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

pipeline_session = PipelineSession()

print(bucket)

sagemaker-us-east-1-836402295281


In [4]:
GROUP_NAME = 'GROUP6' # CHANGE THIS TO YOUR FIRST NAME
S3_PATH = f's3://{bucket}/' # S3 path prefix
DATA_PREFIX = f'{GROUP_NAME}/data/' # S3 prefix to store data
MODEL_PREFIX = f'{GROUP_NAME}/model/' # S3 prefix to store the XGBoost training information and model.

BASE_JOB_PROCESSING_NAME = f'{GROUP_NAME}-processing'  # base_job_name for preprocessing
BASE_JOB_TRAINING_NAME = f'{GROUP_NAME}-training'  # base_job_name for training
BASE_JOB_EVALUATION_NAME = f'{GROUP_NAME}-evaluation'  # base_job_name for evaluation

PIPELINE_NAME = f'{GROUP_NAME}-pipeline'  # SageMaker Pipeline name
MODEL_PACKAGE_GROUP_NAME = f'{GROUP_NAME}-ModelPackageGroup'  # Model package group name in the Model Registry

print(f'DATA_PREFIX: {DATA_PREFIX}')
print(f'PIPELINE_NAME: {PIPELINE_NAME}')
print(f'MODEL_PACKAGE_GROUP_NAME: {MODEL_PACKAGE_GROUP_NAME}')

DATA_PREFIX: GROUP6/data/
PIPELINE_NAME: GROUP6-pipeline
MODEL_PACKAGE_GROUP_NAME: GROUP6-ModelPackageGroup


In [24]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat

# Define initial parameters
input_data_path1 = f'{DATA_PREFIX}/US_flights_2023.csv'
input_data_path2 = f'{DATA_PREFIX}/airports_geolocation.csv'
input_data_path3 = f'{DATA_PREFIX}/weather_meteo_by_airport.csv'

# raw input data
input_data1 = ParameterString(name="InputData1", default_value=input_data_path1)
input_data2 = ParameterString(name="InputData2", default_value=input_data_path2)
input_data3 = ParameterString(name="InputData3", default_value=input_data_path3)

# status of newly trained model in registry
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")

# processing step parameters
processing_instance_type = ParameterString(
    name="ProcessingInstanceType", default_value="ml.m5.xlarge"
)
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# training step parameters
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
training_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1)

# model performance step parameters
accuracy_rmse_threshold = ParameterFloat(name="AccuracyRMSEThreshold", default_value=60.0)

# SKLearn version to use
sklearn_framework_version = "1.2-1"

In [6]:
s3_bucket_path = 's3://' + sagemaker.Session().default_bucket() + '/'

s3_bucket_path

's3://sagemaker-us-east-1-836402295281/'

In [7]:
# Specify the local paths to your files and the S3 prefix (directory) to upload to
files = ['US_flights_2023.csv', 'airports_geolocation.csv', 'weather_meteo_by_airport.csv']

# Upload files to S3
for file in files:
    sagemaker_session.upload_data(path=file, key_prefix=DATA_PREFIX)
    
# Specify the paths to your uploaded files
file_paths = [f'{DATA_PREFIX}/US_flights_2023.csv', f'{DATA_PREFIX}/airports_geolocation.csv', f'{DATA_PREFIX}/weather_meteo_by_airport.csv']

In [54]:
# Download test.csv for local testing
s3 = boto3.client('s3')
s3.download_file(bucket,'GROUP6/data/test.csv','test.csv')

### Preprocessing

In [16]:
%%writefile code/preprocess.py
import numpy as np
import pandas as pd
import os
import joblib
from io import StringIO
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import tarfile
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

try:
    from sagemaker_containers.beta.framework import (
        content_types,
        encoders,
        env,
        modules,
        transformer,
        worker,
        server,
    )
except ImportError:
    pass

RANDOM_STATE = 2024
LABEL_COLUMN = 'Dep_Delay'
feature_columns = ['Day_Of_Week', 'Airline', 'Dep_Airport', 'Dep_CityName', 'DepTime_label', 'Distance_type', 'Manufacturer', 'Model', 'Aicraft_age',
                   'STATE', 'LATITUDE', 'LONGITUDE', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres', 'FlightMonth']

one_hot_columns = ['Day_Of_Week', 'Airline', 'Dep_Airport', 'Dep_CityName', 'DepTime_label', 'Distance_type', 'Manufacturer', 'Model', 'STATE', 'FlightMonth']
non_one_hot_columns = ['Aicraft_age', 'LATITUDE', 'LONGITUDE', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']

base_dir = "/opt/ml/processing"
base_output_dir = "/opt/ml/processing/output/"

if __name__ == "__main__":
    logger.debug("Starting preprocessing script")

    # Define the input data path within the processing environment
    input_data_path1 = f'{base_dir}/input/US_flights_2023.csv'
    input_data_path2 = f'{base_dir}/input/airports_geolocation.csv'
    input_data_path3 = f'{base_dir}/input/weather_meteo_by_airport.csv'
    
    # Read the CSV file from the input path
    logger.debug("Reading input data")
    df1 = pd.read_csv(input_data_path1)
    df2 = pd.read_csv(input_data_path2)
    df3 = pd.read_csv(input_data_path3)
    
    # Sample the data to reduce size (e.g., take a 2% sample)
    logger.debug("Sampling data to reduce size")
    df1_sample = df1.sample(frac=0.02, random_state=RANDOM_STATE)

    # Merge sampled data
    logger.debug("Merging dataframes")
    merged_df = pd.merge(df1_sample, df2, left_on='Dep_Airport', right_on='IATA_CODE', how='left')
    
    merged_df['FlightDate'] = pd.to_datetime(merged_df['FlightDate'])
    df3['time'] = pd.to_datetime(df3['time'])
    merged_df['FlightMonth'] = merged_df['FlightDate'].dt.month
    
    df = pd.merge(merged_df, df3, left_on=['Dep_Airport', 'FlightDate'], right_on=['airport_id', 'time'], how='left')

    # Include only relevant columns in feature_data
    feature_data = df[feature_columns]
    label_data = df[LABEL_COLUMN]

    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), one_hot_columns),
            ('scaler', StandardScaler(), non_one_hot_columns)
        ],
        remainder='passthrough'
    )

    logger.debug("Applying transformations to the data")
    feature_data_transformed = preprocessor.fit_transform(feature_data)

    # Ensure transformed features have correct shape
    logger.debug(f"Transformed feature shape: {feature_data_transformed.shape}")

    logger.debug("Splitting data into train, validation, and test sets")
    x_train, x_temp, y_train, y_temp = train_test_split(feature_data_transformed, label_data, test_size=0.2, random_state=42)
    x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

    # Ensure y arrays are 2D
    y_train = y_train.values.reshape(-1, 1)
    y_val = y_val.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    # Verify the shapes
    logger.debug(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
    logger.debug(f"x_val shape: {x_val.shape}, y_val shape: {y_val.shape}")
    logger.debug(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")

    # Print a few rows of each to verify
    logger.debug(f"First 5 rows of x_train: {x_train[:5]}")
    logger.debug(f"First 5 rows of y_train: {y_train[:5]}")
    
    logger.debug("Combining features and labels")
    train_dataset = pd.DataFrame(np.hstack((y_train, x_train.toarray())))
    val_dataset = pd.DataFrame(np.hstack((y_val, x_val.toarray())))
    test_dataset = pd.DataFrame(np.hstack((y_test, x_test.toarray())))

    # Get feature columns after transformation
    transformed_feature_columns = preprocessor.get_feature_names_out()

    train_dataset.columns = [LABEL_COLUMN] + list(transformed_feature_columns)
    val_dataset.columns = [LABEL_COLUMN] + list(transformed_feature_columns)
    test_dataset.columns = [LABEL_COLUMN] + list(transformed_feature_columns)
    
    logger.debug("Creating output directories if they don't exist")
    os.makedirs(f'{base_output_dir}train', exist_ok=True)
    os.makedirs(f'{base_output_dir}validation', exist_ok=True)
    os.makedirs(f'{base_output_dir}test', exist_ok=True)
    os.makedirs(f'{base_output_dir}preprocessor', exist_ok=True)

    logger.debug("Saving datasets to CSV")
    train_dataset.to_csv(f'{base_output_dir}train/train.csv', header=False, index=False)
    val_dataset.to_csv(f'{base_output_dir}validation/validation.csv', header=False, index=False)
    test_dataset.to_csv(f'{base_output_dir}test/test.csv', header=False, index=False)

    logger.debug("Saving preprocessor model")
    joblib.dump(preprocessor, f'{base_output_dir}preprocessor/preprocessor.joblib')
    
    with tarfile.open(f'{base_output_dir}preprocessor/preprocessor.tar.gz', 'w:gz') as tar_handle:
        tar_handle.add(f'{base_output_dir}preprocessor/preprocessor.joblib', arcname='preprocessor.joblib')

    logger.debug("Preprocessing script completed successfully")

def input_fn(input_data, content_type):
    if content_type == "text/csv":
        df = pd.read_csv(StringIO(input_data), header=None)
        df.columns = transformed_feature_columns if len(df.columns) == len(transformed_feature_columns) else [LABEL_COLUMN] + transformed_feature_columns
        return df
    else:
        raise ValueError("{} not supported by script!".format(content_type))

def output_fn(prediction, accept):
    if accept == "application/json":
        instances = [row.tolist() for row in prediction]
        json_output = {"instances": instances}
        return worker.Response(json.dumps(json_output), mimetype=accept)
    elif accept == "text/csv":
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    else:
        print(f"Warning: {accept} accept type is not supported by this script. Defaulting to text/csv.")
        return worker.Response(encoders.encode(prediction, "text/csv"), mimetype="text/csv")

def predict_fn(input_data, model):
    features = model.transform(input_data)
    if LABEL_COLUMN in input_data:
        return np.insert(features, 0, input_data[LABEL_COLUMN], axis=1)
    else:
        return features

def model_fn(model_dir):
    preprocessor = joblib.load(os.path.join(model_dir, "preprocessor.joblib"))
    return preprocessor


Overwriting code/preprocess.py


In [17]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

sklearn_processor = SKLearnProcessor(
    framework_version='1.2-1',
    role=role,
    instance_type='ml.c5.xlarge',
    instance_count=1 
)

sklearn_processor.run(
    code="code/preprocess.py",
    inputs=[
        ProcessingInput(
            source=S3_PATH + DATA_PREFIX,
            destination="/opt/ml/processing/input"
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="scaler_model",
            source="/opt/ml/processing/output/preprocessor",
            destination=S3_PATH + MODEL_PREFIX
        ),
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
            destination=S3_PATH + DATA_PREFIX
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/validation",
            destination=S3_PATH + DATA_PREFIX
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/output/test",
            destination=S3_PATH + DATA_PREFIX
        ),
    ],
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-06-20-13-18-29-805


.............[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mDEBUG:__main__:Starting preprocessing script[0m
[34mDEBUG:__main__:Reading input data[0m
[34mDEBUG:__main__:Sampling data to reduce size[0m
[34mDEBUG:__main__:Merging dataframes[0m
[34mDEBUG:__main__:Applying transformations to the data[0m
[34mDEBUG:__main__:Transformed feature shape: (134868, 813)[0m
[34mDEBUG:__main__:Splitting data into train, validation, and test sets[0m
[34mDEBUG:__main__:x_train shape: (107894, 813), y_train shape: (107894, 1)[0m
[34mDEBUG:__main__:x_val shape: (13487, 813), y_val shape: (13487, 1)[0m
[34mDEBUG:__main__:x_test shape: (13487, 813), y_test shape: (13487, 1)[0m
[34mDEBUG:__main__:First 5 rows of x_train:   (0, 2)#0111.0
  (0, 18)#0111.0
  (0, 120)#0111.0
  (0, 452)#0111.0
  (0, 706)#0111.0
  (0, 712)#0111.0
  (0, 715)#0111.0
  (0, 735)#0111.0
  (0, 758)#0111.0
  (0, 790)#0111.0
  (0, 802)#0111.3382639788382422
  (0, 803)#0110.93544

In [17]:
sklearn_processor.jobs[0].describe()

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-202646161072/GROUP6/data/',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-202646161072/sagemaker-scikit-learn-2024-06-20-02-35-30-277/input/code/preprocess.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'scaler_model',
    'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-202646161072/GROUP6/model/',
     'LocalPath': '/opt/ml/processing/output/preprocessor',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False},
   {'OutputName'

### XGBoost

In [34]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime

sess = sagemaker.Session()

container = retrieve("xgboost", region, "1.7-1")

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    base_job_name=BASE_JOB_TRAINING_NAME + '-xgboost',
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=S3_PATH + MODEL_PREFIX + 'xgboost',
    sagemaker_session=sess,
)

xgb.set_hyperparameters(objective='reg:squarederror',num_round=100)

hyperparameter_ranges = {
    'eta': ContinuousParameter(0.001, 0.2),
    'max_depth': IntegerParameter(3, 50),
    'min_child_weight': IntegerParameter(1, 10),
    'colsample_bytree': ContinuousParameter(0.5, 1)
}

objective_metric_name = 'validation:rmse'

tuner = HyperparameterTuner(estimator=xgb,
                            objective_metric_name=objective_metric_name,
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=4,
                            strategy='Random')

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
tuner.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="xgb-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

# train_args = xgb.fit(
#     inputs={
#         "train": TrainingInput(
#             s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
#             content_type="text/csv",
#         ),
#         "validation": TrainingInput(
#             s3_data=step_process.properties.ProcessingOutputConfig.Outputs["val"].S3Output.S3Uri,
#             content_type="text/csv",
#         )
#     }
# )

# step_train_model = TrainingStep(name="TrainXGBModel1", step_args=train_args)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: xgb-randsearch-20240620-07-03-31


.

KeyboardInterrupt: 

In [25]:
df_tuner = sagemaker.HyperparameterTuningJobAnalytics(
    tuner.latest_tuning_job.job_name
).dataframe()
df_tuner

Unnamed: 0,colsample_bytree,eta,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.941825,0.117626,30.0,7.0,xgb-randsearch-20240620-05-16-16-005-5ab7deb7,Completed,56.492619,2024-06-20 05:37:49+00:00,2024-06-20 05:58:22+00:00,1233.0
1,0.940992,0.001262,26.0,10.0,xgb-randsearch-20240620-05-16-16-004-e652a66e,Completed,55.47694,2024-06-20 05:17:14+00:00,2024-06-20 05:37:22+00:00,1208.0
2,0.81616,0.036128,31.0,3.0,xgb-randsearch-20240620-05-16-16-003-fbd05ca3,Completed,55.984032,2024-06-20 05:17:09+00:00,2024-06-20 05:38:13+00:00,1264.0
3,0.723725,0.003638,49.0,9.0,xgb-randsearch-20240620-05-16-16-002-05678414,Completed,54.787861,2024-06-20 05:17:05+00:00,2024-06-20 05:40:48+00:00,1423.0
4,0.535194,0.016688,42.0,8.0,xgb-randsearch-20240620-05-16-16-001-be493fef,Completed,53.900372,2024-06-20 05:17:02+00:00,2024-06-20 05:35:00+00:00,1078.0


In [26]:
best_model_name = df_tuner.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_model_name

'xgb-randsearch-20240620-05-16-16-001-be493fef'

In [7]:
best_xgb_name = 'xgb-randsearch-20240619-14-39-43-001-2a4b13da'
best_xgb_path = f'{S3_PATH}{MODEL_PREFIX}xgboost/{best_xgb_name}/output/model.tar.gz'
best_xgb_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/xgboost/xgb-randsearch-20240619-14-39-43-001-2a4b13da/output/model.tar.gz'

In [9]:
from sagemaker.image_uris import retrieve
from sagemaker.serverless import ServerlessInferenceConfig

# Retrieve the image URI for the linear-learner algorithm
xgb_image = retrieve(framework='xgboost', region=region, version='1.7-1')

# Create a SageMaker model
model = sagemaker.Model(
    image_uri=xgb_image,
    model_data=best_xgb_path,
    role=role,
    sagemaker_session=sagemaker_session
)

# Deploy the model
model.deploy(
    serverless_inference_config=ServerlessInferenceConfig()
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-06-30-13-31-10-621
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-06-30-13-31-11-338
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-06-30-13-31-11-338


----------------------------*

UnexpectedStatusException: Error hosting endpoint sagemaker-xgboost-2024-06-30-13-31-11-338: Failed. Reason: Image size 11789822572 is greater than supported size 10737418240. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html

In [139]:
# Initialize a predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name='sagemaker-xgboost-2024-06-21-06-26-17-576'
)

test_df = pd.read_csv('test.csv')
payload = test_df.iloc[:10,1:].to_csv(header=False, index=False)
p = predictor.predict(payload, initial_args={"ContentType": "text/csv"})

In [140]:
i = 0
pred_list = []
while i <= len(test_df):
    payload = test_df.iloc[i:i+1000,1:].to_csv(header=False, index=False)
    output = predictor.predict(payload, initial_args={"ContentType": "text/csv"})
    pred = output.decode('utf-8').split('\n')[:-1]
    pred_list += pred
    i += 1000

xgb_predicted = np.array(pred_list)

In [49]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(test_df.iloc[:,0].values,predicted.astype(float))
print(f'XGBoost test RMSE: {np.sqrt(mse)}')

XGBoost test RMSE: 49.99842436026933


In [32]:
%%writefile code/evaluate_xgboost.py

import json
import logging
import math
import pickle
import tarfile

import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import mean_squared_error

from pathlib import Path

label_column = 'Dep_Delay'

model_tar_path = '/opt/ml/processing/model/model.tar.gz'

if __name__ == "__main__":
    ## Your code to perform model evaluation on testing dataset, and 
    ## store the evaluation report
    with tarfile.open(model_tar_path, 'r:gz') as tar:
        tar.extractall(path='./model')
    
    xgb_model = xgboost.Booster()
    xgb_model.load_model('./model/xgboost-model')
    
    test_path = "/opt/ml/processing/test/"
    df = pd.read_csv(test_path + "/test.csv")

    x_test = xgboost.DMatrix(df.iloc[:,1:].values)
    y_test = df.iloc[:,0].values
    y_pred = xgb_model.predict(x_test)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    print("\nTest RMSE :", score)

    # Available metrics to add to model: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html
    report_dict = {
        "regression_metrics": {
            "rmse": {"value": score, "standard_deviation": "NaN"},
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting code/evaluate_xgboost.py


In [33]:
from sagemaker.workflow.properties import PropertyFile
from sagemaker.sklearn.processing import ScriptProcessor

s3_test_data = f'{S3_PATH}{DATA_PREFIX}test.csv'

xgb_eval_image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version='1.7-1'
)

evaluate_model_processor = ScriptProcessor(
    role=role,
    image_uri=xgb_eval_image,
    command=["python3"],
    instance_count=1,
    instance_type='ml.m5.xlarge',
    sagemaker_session=sess,
)

# Create a PropertyFile
# A PropertyFile is used to be able to reference outputs from a processing step, for instance to use in a condition step.
# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)

# eval_args = 
evaluate_model_processor.run(
    inputs=[
        ProcessingInput(
            source=best_model_path,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=s3_test_data,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation", destination=f'{S3_PATH}{GROUP_NAME}/evaluation/xgboost/'),
    ],
    code="code/evaluate_xgboost.py",
)

# step_evaluate_model = ProcessingStep(
#     name="EvaluateXgboost",
#     step_args=eval_args,
#     property_files=[evaluation_report],
# )

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name sagemaker-xgboost-2024-06-20-15-23-50-390


.............[34mTest RMSE : 49.99842436026933[0m



### Linear Regression

In [36]:
# Linear Regression
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.amazon.randomcutforest import RandomCutForest
from time import gmtime, strftime
from sagemaker.image_uris import retrieve

sess = sagemaker.Session()
container = retrieve("linear-learner", region, version="1")

# Setup the Random Forest estimator
lr = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode='File',
    base_job_name=BASE_JOB_TRAINING_NAME + 'linear',
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=S3_PATH + MODEL_PREFIX + 'linear',
    sagemaker_session=sess,
)

lr.set_hyperparameters(
    predictor_type="regressor",
    optimizer="adam",
    mini_batch_size=100,
    use_lr_scheduler=True
)

# Hyperparameters can be tuned as per the specific requirements
hyperparameter_ranges = {
    'learning_rate': ContinuousParameter(0.01, 0.2),
    'wd': ContinuousParameter(0.0, 0.01)
}

objective_metric_name = 'validation:rmse'  # Adjust the metric name based on what is relevant for Random Forest in SageMaker

tuner_lr = HyperparameterTuner(estimator=lr,
                            objective_metric_name=objective_metric_name,
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=5,
                            strategy='Random')

# Define training and validation datasets
# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

tuner_lr.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="lr-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: lr-randsearch-20240620-15-39-11


...

KeyboardInterrupt: 

In [19]:
df_tuner_lr = sagemaker.HyperparameterTuningJobAnalytics(
    tuner_lr.latest_tuning_job.job_name
).dataframe()
df_tuner_lr

Unnamed: 0,learning_rate,wd,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.041387,0.007256,lr-randsearch-20240620-13-40-45-005-03b1ccb1,Completed,54.109787,2024-06-20 13:41:55+00:00,2024-06-20 13:53:37+00:00,702.0
1,0.039622,0.007937,lr-randsearch-20240620-13-40-45-004-4a059713,Completed,54.105003,2024-06-20 13:41:55+00:00,2024-06-20 13:53:36+00:00,701.0
2,0.195047,0.000752,lr-randsearch-20240620-13-40-45-003-6d7fd290,Completed,54.368877,2024-06-20 13:41:39+00:00,2024-06-20 13:55:36+00:00,837.0
3,0.015839,0.008427,lr-randsearch-20240620-13-40-45-002-81033a3e,Completed,54.115536,2024-06-20 13:41:48+00:00,2024-06-20 13:52:23+00:00,635.0
4,0.118939,0.006393,lr-randsearch-20240620-13-40-45-001-d346b7c3,Completed,54.113075,2024-06-20 13:41:59+00:00,2024-06-20 13:54:30+00:00,751.0


In [50]:
best_linear_name = 'lr-randsearch-20240620-13-40-45-004-4a059713'
best_linear_path = f'{S3_PATH}{MODEL_PREFIX}linear/{best_linear_name}/output/model.tar.gz'
best_linear_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/linear/lr-randsearch-20240620-13-40-45-004-4a059713/output/model.tar.gz'

In [None]:
# Retrieve the image URI for the linear-learner algorithm
image_uri = retrieve(framework='linear-learner', region=region, version='1')

# Create a SageMaker model
model = sagemaker.Model(
    image_uri=image_uri,
    model_data=best_linear_path,
    role=role,
    sagemaker_session=sagemaker_session
)

# Deploy the model
model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: linear-learner-2024-06-21-07-03-28-228
INFO:sagemaker:Creating endpoint-config with name linear-learner-2024-06-21-07-03-28-931
INFO:sagemaker:Creating endpoint with name linear-learner-2024-06-21-07-03-28-931


-------

In [143]:
endpoint_name = 'linear-learner-2024-06-21-07-03-28-931'

# Initialize a predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name
)

In [145]:
i = 0
pred_list = []
while i <= len(test_df):
    payload = test_df.iloc[i:i+1000,1:].to_csv(header=False, index=False)
    output = predictor.predict(payload, initial_args={"ContentType": "text/csv"})
    output = json.loads(output.decode('utf-8'))
    pred = [out['score'] for out in output['predictions']]
    pred_list += pred
    i += 1000

predicted_linear = np.array(pred_list).astype(float)

In [60]:
mse = mean_squared_error(test_df.iloc[:,0].values,predicted)
print(f'Linear learner test RMSE: {np.sqrt(mse)}')

Linear learner test RMSE: 50.139236350073425


### Factorization Machine

In [10]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime

fm_container = retrieve('factorization-machines',region)

fm = sagemaker.estimator.Estimator(
    fm_container,
    role,
    base_job_name=BASE_JOB_TRAINING_NAME + '-fm',
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=S3_PATH + MODEL_PREFIX + 'fm',
    sagemaker_session=sagemaker_session,
)

fm.set_hyperparameters(feature_dim=813,
                       num_factors=64,
                       predictor_type='regressor',
                       mini_batch_size=1000)

hyperparameter_ranges = {
    'factors_wd': ContinuousParameter(1e-8, 1e-4),
    'linear_wd': ContinuousParameter(1e-8, 1e-4),
    'bias_wd': ContinuousParameter(1e-8, 1e-4),
    'epochs': IntegerParameter(10, 100)
}

objective_metric_name = 'validation:rmse'

fm_tuner = HyperparameterTuner(estimator=fm,
                            objective_metric_name='test:rmse',
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=5,
                            strategy='Random')

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
fm_tuner.fit(
    inputs={
        'train': train_data,
        'test': validation_data
    },
    job_name="fm-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating hyperparameter tuning job with name: fm-randsearch-20240621-03-28-46


.................................................................................................................................................................................................................!


In [11]:
df_tuner_fm = sagemaker.HyperparameterTuningJobAnalytics(
    fm_tuner.latest_tuning_job.job_name
).dataframe()
df_tuner_fm

Unnamed: 0,bias_wd,epochs,factors_wd,linear_wd,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,9.768333e-07,21.0,4.933619e-05,2e-06,fm-randsearch-20240621-03-28-46-005-29bc72cf,Completed,149.103622,2024-06-21 03:29:41+00:00,2024-06-21 03:39:11+00:00,570.0
1,5.242554e-05,29.0,1.475626e-08,5e-05,fm-randsearch-20240621-03-28-46-004-1f596440,Completed,102.370323,2024-06-21 03:29:41+00:00,2024-06-21 03:40:16+00:00,635.0
2,3.816215e-06,41.0,1.000964e-06,1e-06,fm-randsearch-20240621-03-28-46-003-5c321590,Completed,94.614632,2024-06-21 03:29:41+00:00,2024-06-21 03:42:52+00:00,791.0
3,3.218829e-08,48.0,5.05809e-05,6e-06,fm-randsearch-20240621-03-28-46-002-148f5a50,Completed,80.167862,2024-06-21 03:29:34+00:00,2024-06-21 03:45:11+00:00,937.0
4,2.630834e-07,50.0,1.659995e-08,1.9e-05,fm-randsearch-20240621-03-28-46-001-7acd1054,Completed,78.112427,2024-06-21 03:29:35+00:00,2024-06-21 03:45:12+00:00,937.0


In [64]:
best_fm_name = df_tuner_fm.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_fm_path = f'{S3_PATH}{MODEL_PREFIX}fm/{best_fm_name}/output/model.tar.gz'
best_fm_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/fm/fm-randsearch-20240621-03-28-46-001-7acd1054/output/model.tar.gz'

In [65]:
# Retrieve the image URI for the linear-learner algorithm
image_uri = retrieve(framework='factorization-machines', region=region, version='1')

# Create a SageMaker model
model = sagemaker.Model(
    image_uri=image_uri,
    model_data=best_fm_path,
    role=role,
    sagemaker_session=sagemaker_session
)

# Deploy the model
model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: factorization-machines-2024-06-21-07-39-47-084
INFO:sagemaker:Creating endpoint-config with name factorization-machines-2024-06-21-07-39-47-771
INFO:sagemaker:Creating endpoint with name factorization-machines-2024-06-21-07-39-47-771


-----------!

In [146]:
endpoint_name = 'factorization-machines-2024-06-21-07-39-47-771'

# Initialize a predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name
)

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class FMSerializer(JSONSerializer):
    def serialize(self, data):
        js = {"instances": []}
        for row in data:
            js["instances"].append({"features": row.tolist()})
        return json.dumps(js)

predictor.serializer = FMSerializer()
predictor.deserializer = JSONDeserializer()

In [147]:
i = 0
pred_list = []
while i <= len(test_df):
    payload = test_df.iloc[i:i+1000,1:].values
    output = predictor.predict(payload, initial_args={"ContentType": "application/json"})
    pred = [out['score'] for out in output['predictions']]
    pred_list += pred
    i += 1000

predicted_fm = np.array(pred_list).astype(float)

In [92]:
mse = mean_squared_error(test_df.iloc[:,0].values,predicted)
print(f'Factorization Machines test RMSE: {np.sqrt(mse)}')

Factorization Machines test RMSE: 73.72534226067884


### LightGBM

In [18]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve
from sagemaker import image_uris, model_uris, script_uris
from sagemaker import hyperparameters
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from time import gmtime, strftime

train_model_id, train_model_version, train_scope = "lightgbm-regression-model", "*", "training"
training_instance_type = "ml.m5.xlarge"

# Retrieve the docker image
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=train_model_id,
    model_version=train_model_version,
    image_scope=train_scope,
    instance_type='ml.m5.xlarge'
)

# Retrieve the training script
train_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope=train_scope
)

train_model_uri = model_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)

hyperparameters = hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)


lgbm = sagemaker.estimator.Estimator(
    role=role,
    base_job_name=BASE_JOB_TRAINING_NAME + '-lgbm',
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py",
    instance_count=1,
    instance_type='ml.m5.xlarge',
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=S3_PATH + MODEL_PREFIX + 'lgbm',
)

fm.set_hyperparameters(feature_dim=813,
                       num_factors=64,
                       predictor_type='regressor',
                       mini_batch_size=1000)

hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(1e-4, 1, scaling_type="Logarithmic"),
    "num_boost_round": IntegerParameter(2, 30),
    "early_stopping_rounds": IntegerParameter(2, 30),
    "num_leaves": IntegerParameter(10, 50),
    "feature_fraction": ContinuousParameter(0, 1),
    "bagging_fraction": ContinuousParameter(0, 1),
    "bagging_freq": IntegerParameter(1, 10),
    "max_depth": IntegerParameter(5, 30),
    "min_data_in_leaf": IntegerParameter(5, 50),
}

# objective_metric_name = 'validation:rmse'

# lgbm_tuner = HyperparameterTuner(
#     lgbm,
#     "rmse",
#     hyperparameter_ranges,
#     [{"Name": "rmse", "Regex": "rmse: ([0-9\\.]+)"}],
#     max_jobs=5,
#     max_parallel_jobs=5,
#     objective_type="Minimize",
#     strategy='Random'
# )

# lgbm_tuner.fit({"training": training_dataset_s3_path}, logs=True)

lgbm_tuner = HyperparameterTuner(estimator=lgbm,
                            objective_metric_name='rmse',
                            hyperparameter_ranges=hyperparameter_ranges,
                            metric_definitions=[{"Name": "rmse", "Regex": "rmse: ([0-9\\.]+)"}],
                            objective_type='Minimize',
                            max_jobs=5,
                            max_parallel_jobs=5,
                            strategy='Random')

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
lgbm_tuner.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="lgbm-rand-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker:Creating hyperparameter tuning job with name: lgbm-rand-20240621-04-14-39


...................................!


In [19]:
df_tuner_lgbm = sagemaker.HyperparameterTuningJobAnalytics(
    lgbm_tuner.latest_tuning_job.job_name
).dataframe()
df_tuner_lgbm

Unnamed: 0,bagging_fraction,bagging_freq,early_stopping_rounds,feature_fraction,learning_rate,max_depth,min_data_in_leaf,num_boost_round,num_leaves,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.571666,5.0,8.0,0.161938,0.007978,29.0,30.0,19.0,45.0,lgbm-rand-20240621-04-14-39-005-af474496,Completed,53.001099,2024-06-21 04:15:34+00:00,2024-06-21 04:17:18+00:00,104.0
1,0.560695,9.0,10.0,0.295746,0.000187,28.0,11.0,8.0,17.0,lgbm-rand-20240621-04-14-39-004-543173dd,Completed,53.100601,2024-06-21 04:15:34+00:00,2024-06-21 04:17:12+00:00,98.0
2,0.258755,6.0,14.0,0.381989,0.000261,15.0,17.0,13.0,37.0,lgbm-rand-20240621-04-14-39-003-fccd40f6,Completed,53.098,2024-06-21 04:15:32+00:00,2024-06-21 04:17:15+00:00,103.0
3,0.2462,7.0,25.0,0.923566,0.000251,18.0,27.0,10.0,21.0,lgbm-rand-20240621-04-14-39-002-3f679207,Completed,53.098701,2024-06-21 04:15:30+00:00,2024-06-21 04:17:09+00:00,99.0
4,0.019957,2.0,11.0,0.46404,0.379247,22.0,6.0,16.0,34.0,lgbm-rand-20240621-04-14-39-001-2f125bcc,Completed,53.174,2024-06-21 04:15:27+00:00,2024-06-21 04:17:11+00:00,104.0


In [95]:
best_lgbm_name = df_tuner_lgbm.sort_values('FinalObjectiveValue',ascending=True).iloc[0]['TrainingJobName']
best_lgbm_path = f'{S3_PATH}{MODEL_PREFIX}lgbm/{best_lgbm_name}/output/model.tar.gz'
best_lgbm_path

's3://sagemaker-us-east-1-836402295281/GROUP6/model/lgbm/lgbm-rand-20240621-04-14-39-005-af474496/output/model.tar.gz'

In [96]:
model_id, model_version, scope = "lightgbm-regression-model", "*", "inference"

# Retrieve the inference docker container uri
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope=scope,
    model_id=model_id,
    model_version=model_version,
    instance_type='ml.m5.xlarge',
)
# Retrieve the inference script uri
deploy_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope=scope
)

# Create a SageMaker model
model = sagemaker.Model(
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    model_data=best_lgbm_path,
    role=role,
    entry_point="inference.py",
    sagemaker_session=sagemaker_session
)

# Deploy the model
model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-836402295281/GROUP6/model/lgbm/lgbm-rand-20240621-04-14-39-005-af474496/output/model.tar.gz), script artifact (s3://jumpstart-cache-prod-us-east-1/source-directory-tarballs/lightgbm/inference/regression/v1.2.1/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-836402295281/pytorch-inference-2024-06-21-08-26-13-434/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: sagemaker-jumpstart-2024-06-21-08-26-15-013
INFO:sagemaker:Creating endpoint-config with name sagemaker-jumpstart-2024-06-21-08-26-15-754
INFO:sagemaker:Creating endpoint with name sagemaker-jumpstart-2024-06-21-08-26-15-754


------!

In [148]:
endpoint_name = 'sagemaker-jumpstart-2024-06-21-08-26-15-754'

# Initialize a predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name
)

In [149]:
i = 0
pred_list = []
while i <= len(test_df):
    payload = test_df.iloc[i:i+1000,1:].to_csv(header=False,index=False)
    output = predictor.predict(payload, initial_args={"ContentType": "text/csv"})
    output = json.loads(output.decode('utf-8'))
    pred = [out for out in output['prediction']]
    pred_list += pred
    i += 1000

predicted_lgbm = np.array(pred_list).astype(float)

In [158]:
average = sum([predicted_lgbm,predicted_linear,predicted_fm,xgb_predicted.astype(float)]) / len(xgb_predicted)

In [156]:
xgb_predicted.astype(float)

array([ 1.47925758, 16.74202919,  1.46071875, ..., 11.01685238,
       -1.37145734,  6.0683918 ])

In [159]:
mse = mean_squared_error(test_df.iloc[:,0].values,average)
print(f'LightGBM test RMSE: {np.sqrt(mse)}')

LightGBM test RMSE: 52.26957501090619


In [106]:
runtime = boto3.client('sagemaker-runtime')

In [132]:
from io import StringIO
pd.read_csv(StringIO(test_df.iloc[:10,1:].to_csv(header=False,index=False)),header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,803,804,805,806,807,808,809,810,811,812
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-2.002819,0.695288,1.208326,1.498681,1.006576,-0.310289,-0.103059,-0.317248,1.817442,-0.580364
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.614921,-0.117039,-0.419113,-0.26652,-0.337168,-0.310289,-0.103059,-0.550507,-0.044904,0.905086
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.450429,0.006493,0.68405,0.278295,0.801424,-0.310289,-0.103059,0.346643,-1.002682,-1.315103
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.356379,0.492032,0.170697,0.223814,0.062878,-0.310289,-0.103059,-1.4028,-1.020418,-0.404665
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.935445,0.610784,-1.970095,-1.835588,-1.937352,-0.310289,-0.103059,0.535045,-0.3819,3.348892
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.171078,-1.464266,-0.572027,-0.321001,-0.737214,-0.310289,-0.103059,0.86699,3.857154,-0.181049
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.188101,-1.456071,-0.648483,-0.440861,-0.850048,-0.310289,-0.103059,-1.483543,-1.321941,0.55369
7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.44075,-1.253349,0.356378,0.64877,0.114166,-0.245758,-0.103059,0.436358,0.593615,-0.564391
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.623049,-0.107179,-1.183681,-0.985676,-1.188548,1.457841,-0.103059,1.494995,1.498182,1.927332
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.171078,-1.464266,-0.45188,-0.201142,-0.562835,-0.310289,-0.103059,0.80419,3.094479,-0.037296


In [133]:
response = runtime.invoke_endpoint(
    EndpointName='sagemaker-jumpstart-2024-06-21-08-26-15-754',
    ContentType='text/csv',
    Body=test_df.iloc[:10,1:].to_csv(header=False,index=False)
)['Body'].read().decode()

In [134]:
response

'{"prediction": [11.986738938680132, 11.420523108240525, 12.404603853311844, 11.471671713807924, 11.575639670241783, 12.006904046860821, 11.832086977913049, 11.582282082322754, 11.805968793416355, 11.784793945532899]}'

In [135]:
%%writefile code/lambda_helper.py
import numpy as np
import pandas as pd
from io import StringIO
import json
import boto3

# Initialize boto3 client for SageMaker runtime
runtime = boto3.client('sagemaker-runtime')

# Endpoint names of your four models
xgboost = 'sagemaker-xgboost-2024-06-21-06-26-17-576'
linear = 'linear-learner-2024-06-21-07-03-28-931'
factor = 'factorization-machines-2024-06-21-07-39-47-771'
lgbm = 'sagemaker-jumpstart-2024-06-21-08-26-15-754'

def serialize(self, data):
        js = {"instances": []}
        for row in data:
            js["instances"].append({"features": row.tolist()})
        return json.dumps(js)

def lambda_handler(event, context):
    features = event['features']
    
    responses = []
    
    response_xgb = runtime.invoke_endpoint(
        EndpointName=xgboost,
        ContentType='text/csv',
        Body=features
    )
    result_xgb = response_xgb['Body'].read().decode().split('\n')[:-1]
    responses.append(np.array(result_xgb).astype(float))
    
    response_linear = runtime.invoke_endpoint(
        EndpointName=linear,
        ContentType='text/csv',
        Body=features
    )
    response_linear = json.loads(response_linear['Body'].read().decode())
    result_linear = [out['score'] for out in response_linear['predictions']]
    responses.append(np.array(result_linear).astype(float))
    
    df = pd.read_csv(StrintIO(features),header=None)
    response_fm = runtime.invoke_endpoint(
        EndpointName=factor,
        ContentType='application/json',
        Body=serialize(df)
    )
    response_fm = json.loads(response_fm['Body'].read().decode())
    result_fm = [out['score'] for out in response_fm['predictions']]
    responses.append(np.array(result_fm).astype(float))
    
    response_lgbm = runtime.invoke_endpoint(
        EndpointName=lgbm,
        ContentType='text/csv',
        Body=features
    )
    response_lgbm = json.loads(response_lgbm['Body'].read().decode())
    result_lgbm = [out for out in response_lgbm['prediction']]
    responses.append(np.array(result_lgbm).astype(float))
    
    # Aggregate or process responses as needed
    # For example, you can average the predictions if they are numerical
    aggregated_result = aggregate_responses(responses)
    
    return {
        'statusCode': 200,
        'body': json.dumps(aggregated_result.tolist())
    }

def aggregate_responses(responses):
    # Example aggregation: average predictions
    aggregated = sum(responses) / len(responses)
    return aggregated


Writing code/lambda_helper.py


In [138]:
!zip code/lambda.zip code/lambda_helper.py

/bin/bash: line 1: zip: command not found


In [137]:
# Initialize clients
iam_client = boto3.client('iam')
lambda_client = boto3.client('lambda')

# Role and Lambda function settings
role_name = 'LambdaSageMakerInvokeRole'
function_name = 'MLE-Ensemble'
runtime = 'python3.8'
handler = 'lambda_helper.lambda_handler'

# Read the deployment package
# with open('lambda_function.zip', 'rb') as f:
#     zipped_code = f.read()

# Create the Lambda function
response = lambda_client.create_function(
    FunctionName=function_name,
    Runtime=runtime,
    Role=role,  # Replace with your Lambda role ARN
    Handler=handler,
    Code='code/lambda_helper.py',
    Timeout=300,  # 5 minutes
    MemorySize=128
)

print(f"Created Lambda function with ARN: {response['FunctionArn']}")


ParamValidationError: Parameter validation failed:
Invalid type for parameter Code, value: code/lambda_helper.py, type: <class 'str'>, valid types: <class 'dict'>

### New Design

In [17]:
%%writefile code/train_inference.py

import argparse
import os

from io import StringIO
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

import joblib
import json
import sys
import json
import pickle
import xgboost
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor


if __name__ == "__main__":
    print("Training Started")
    parser = argparse.ArgumentParser()

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--validation", type=str, default=os.environ["SM_CHANNEL_VALIDATION"])
    parser.add_argument("--num_round", type=int, default=6)
    parser.add_argument("--max_depth", type=int, default=5)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--objective", type=str, default="reg:squarederror")
    parser.add_argument("--alpha", type=float, default=1.0)
    args = parser.parse_args()
    print("Got Args: {}".format(args))

    # Load training and validation sets
    train_path = os.path.join(args.train, 'train.csv')
    train_df = pd.read_csv(train_path, header=None)
    
    validation_path = os.path.join(args.validation, 'validation.csv')
    validation_df = pd.read_csv(validation_path, header=None)
    
    # First column is label
    X_train = train_df.iloc[:, 1:].values
    y_train = train_df.iloc[:, 0].values
    
    X_validation = validation_df.iloc[:, 1:].values
    y_validation = validation_df.iloc[:, 0].values
    
    
    """
    Define and Train catboost
    """    
    catboost_hyperparameters = {
        "max_depth": args.max_depth,
        "eta": args.eta,
    }

    cat = CatBoostRegressor(**catboost_hyperparameters)
    cat.fit(X_train,y_train,logging_level='Silent')

    model_catboost = os.path.join(args.model_dir, 'catboost.dump')
    cat.save_model(model_catboost)
    print('Finished training catboost')
    
    
    """
    Train the XGBoost model
    """
    xgb_hyperparameters = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "objective": args.objective,
        "num_boost_round": args.num_round,
    }

    dtrain = xgboost.DMatrix(data=X_train,label=y_train)
    xgb = xgboost.train(
        params=xgb_hyperparameters,
        dtrain=dtrain
    )    

    model_xgb = os.path.join(args.model_dir,"xgboost-model")
    pickle.dump(xgb, open(model_xgb, "wb"))
    print('Finished training xgboost')
    
    
    """
    Train the elastic net model
    """
    en_hyperparameters = {
        'alpha': args.alpha
    }
    
    en = ElasticNet(**en_hyperparameters)
    en.fit(X_train,y_train)
    
    model_en = os.path.join(args.model_dir,'elastic-net')
    pickle.dump(en, open(model_en, 'wb'))
    print('Finished training elastic net')
    
    
    """
    Train the random forest model
    """
    rf_hyperparameters = {
        'max_depth': args.max_depth
    }
    
    rf = RandomForestRegressor(**rf_hyperparameters)
    rf.fit(X_train,y_train)
    
    model_rf = os.path.join(args.model_dir,'random-forest')
    pickle.dump(rf, open(model_rf, 'wb'))
    print('Finished training random forest')
    
    # Calculate validation rmse and generate weights
    predictions = []
    predictions.append(cat.predict(X_validation))

    dval = xgboost.DMatrix(X_validation)
    predictions.append(xgb.predict(dval))
    
    predictions.append(en.predict(X_validation))
    predictions.append(rf.predict(X_validation))
    
    # Calculate RMSEs on validation set
    scores = []
    for pred in predictions:
        scores.append(np.sqrt(mean_squared_error(y_validation,pred)))
    print(f'Validation RMSE for Catboost: {scores[0]:.2f}, XGBoost: {scores[1]:.2f}, Elastic Net: {scores[2]:.2f}, Random Forest: {scores[3]:.2f}')
    
    # Calculate weights by taking reciprocals of test RMSEs
    weights = 1 / np.array(scores)
    weights /= np.sum(weights)
    
    # Save weights to ensemble model directory
    weights_dir = os.path.join(args.model_dir, 'weights.json')
    json.dump(weights.tolist(), open(weights_dir,'w'))
    

def input_fn(input_data, content_type):
    dtype=None
    payload = StringIO(input_data)
    
    return np.genfromtxt(payload, dtype=dtype, delimiter=",")

def model_fn(model_dir):
    """Deserialized and return fitted model

    Note that this should have the same name as the serialized model in the main method
    """
    catboost_model = CatBoostRegressor()
    catboost_model.load_model(os.path.join(model_dir, 'catboost.dump'))
    
    xgb = pickle.load(open(os.path.join(model_dir, 'xgboost-model'), "rb"))
    en = pickle.load(open(os.path.join(model_dir, 'elastic-net'), "rb"))
    rf = pickle.load(open(os.path.join(model_dir, 'random-forest'), "rb"))
    
    weights = [0.25] * 4
    weights_path = os.path.join(model_dir, 'weights.json')
    if os.path.isfile(weights_path):
        weights = json.load(open(weights_path,'r'))
    
    all_model = [catboost_model, xgb, en, rf, weights]
    return all_model


def predict_fn(input_data, model):
    weights = model[-1]

    predictions_cat = model[0].predict(input_data) * weights[0]

    dtest = xgb.DMatrix(input_data)
    predictions_xgb = model[1].predict(dtest) * weights[1]
    
    predictions_en = model[2].predict(input_data) * weights[2]
    predictions_rf = model[3].predict(input_data) * weights[3]
    
    return np.sum(np.array([predictions_cat, predictions_xgb, predictions_en, predictions_rf]), axis=0)

Overwriting code/train_inference.py


In [12]:
%%writefile code/requirements.txt
pandas
catboost
xgboost

Writing code/requirements.txt


In [15]:
from sagemaker.sklearn.estimator import SKLearn

hyperparameters = {}

params = {
    "entry_point": "train_inference.py",
    "source_dir": "code",
    "instance_type": 'ml.m5.xlarge',
    "instance_count": 1,
    "hyperparameters": hyperparameters,
    "role": role,
    "base_job_name": "ensemble-model",
    "framework_version": "1.0-1",
    "metric_definitions":[
       {'Name': 'validation:rmse', 'Regex': 'validation-rmse:(.*?);'}
    ],
    'output_path': S3_PATH + MODEL_PREFIX + 'ensemble'
}

estimator = SKLearn(**params)

In [18]:
from time import gmtime, strftime

# Specify training data location
s3_train_data = S3_PATH + DATA_PREFIX + 'train.csv'
s3_validation_data = S3_PATH + DATA_PREFIX + 'validation.csv'

# generating the session.s3_input() format for fit() accepted by the sdk
train_data = sagemaker.inputs.TrainingInput(
    s3_train_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)
validation_data = sagemaker.inputs.TrainingInput(
    s3_validation_data,
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# Fit the tuner
estimator.fit(
    inputs={
        'train': train_data,
        'validation': validation_data
    },
    job_name="ensemble-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
)

INFO:sagemaker:Creating training-job with name: ensemble-20240701-11-10-16


2024-07-01 11:10:16 Starting - Starting the training job...
2024-07-01 11:10:35 Starting - Preparing the instances for training...
2024-07-01 11:11:03 Downloading - Downloading input data......
2024-07-01 11:12:14 Training - Training image download completed. Training in progress...[34m2024-07-01 11:12:24,192 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-07-01 11:12:24,195 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-07-01 11:12:24,198 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-07-01 11:12:24,213 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-07-01 11:12:24,466 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting catboost (from -r requirements.txt (line 2))
  Down

In [24]:
from sagemaker.serverless.serverless_inference_config import ServerlessInferenceConfig

serverless_config = ServerlessInferenceConfig(
    memory_size_in_mb=4096,
    max_concurrency=1,
)

predictor = estimator.deploy(serverless_inference_config=serverless_config)

INFO:sagemaker:Creating model with name: ensemble-model-2024-06-30-19-36-24-829
INFO:sagemaker:Creating endpoint-config with name ensemble-model-2024-06-30-19-36-24-829
INFO:sagemaker:Creating endpoint with name ensemble-model-2024-06-30-19-36-24-829


------!

In [None]:
test_df = pd.read_csv('test.csv',header=None)

In [26]:
endpoint_name = 'ensemble-model-2024-06-30-19-36-24-829'

# Initialize a predictor
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name
)

In [27]:
pred = predictor.predict(
    test_df.iloc[:10,1:].to_csv(header=False,index=False),
    initial_args={"ContentType": "text/csv"}
)
pred.decode('utf-8')

'[14.01961888060059, 6.464386136834358, 20.18643940917133, 5.616834506258041, 6.13431000677692, 10.912351542143117, 13.04572546573923, 19.169639490005743, 11.554679724879048, 11.797232157711086]'

In [28]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: ensemble-model-2024-06-30-19-36-24-829
INFO:sagemaker:Deleting endpoint with name: ensemble-model-2024-06-30-19-36-24-829


In [19]:
%%writefile code/evaluate_ensemble.py
import os
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

import tarfile
import json
import sys
import json
import pickle
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from pathlib import Path

if __name__ == '__main__':
    model_tar_path = '/opt/ml/processing/model/model.tar.gz'
    ensemble_path = '/opt/ml/processing/ensemble'
    model_dir = os.path.join(ensemble_path,'model')
    with tarfile.open(model_tar_path, 'r:gz') as tar:
        tar.extractall(path=model_dir)
    
    # Load models and weights
    model = []
    catboost_model = CatBoostRegressor()
    catboost_model.load_model(os.path.join(model_dir, 'catboost.dump'))
    model.append(catboost_model)
    
    model.append(pickle.load(open(os.path.join(model_dir, 'xgboost-model'), "rb")))
    model.append(pickle.load(open(os.path.join(model_dir, 'elastic-net'), "rb")))
    model.append(pickle.load(open(os.path.join(model_dir, 'random-forest'), "rb")))
    
    weights = json.load(open(os.path.join(model_dir, 'weights.json'), 'r'))
    
    # Load test file    
    test_path = "/opt/ml/processing/test/"
    test_df = pd.read_csv(test_path + "test.csv", header=None)
    X_test = test_df.iloc[:,1:].values
    y_test = test_df.iloc[:,0].values
    
    # Make predictions on test set
    predictions = []
    predictions.append(model[0].predict(X_test))

    dtest = xgb.DMatrix(X_test)
    predictions.append(model[1].predict(dtest))
    
    predictions.append(model[2].predict(X_test))
    predictions.append(model[3].predict(X_test))
    
    predictions.append(np.sum(np.array(predictions).T * np.array(weights),axis=1))
    
    # Calculate RMSEs on test set
    scores = []
    for pred in predictions:
        scores.append(np.sqrt(mean_squared_error(y_test,pred)))
    print(f'Test RMSE for Catboost: {scores[0]:.2f}, XGBoost: {scores[1]:.2f}, Elastic Net: {scores[2]:.2f}, Random Forest: {scores[3]:.2f}, Ensemble model: {scores[4]:.2f}')
    
    # Save weights and test RMSE
    output_dict = {
        'test_rmse': scores
    }
    
    output_dir = "/opt/ml/processing/evaluation"
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    json.dump(output_dict, open(evaluation_path,'w'))

Overwriting code/evaluate_ensemble.py


In [22]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor

est_cls = sagemaker.sklearn.estimator.SKLearn
sklearn_framework_version = "1.0-1"

processor = FrameworkProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    estimator_cls=est_cls,
    framework_version=sklearn_framework_version,
    base_job_name='ensamble-evaluate',
    sagemaker_session=sagemaker_session,
)

s3_test_data = S3_PATH + DATA_PREFIX + 'test.csv'

processor.run(
    inputs=[
        ProcessingInput(
            source='s3://sagemaker-us-east-1-836402295281/GROUP6/model/ensemble/ensemble-20240701-11-10-16/output/model.tar.gz',
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=s3_test_data,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation",
                         source="/opt/ml/processing/evaluation",
                         destination=S3_PATH + f'{GROUP_NAME}/evaluate/ensemble'
                        ),
    ],
    code="evaluate_ensemble.py",
    source_dir='code'
)

INFO:sagemaker.processing:Uploaded code to s3://sagemaker-us-east-1-836402295281/ensamble-evaluate-2024-07-01-12-15-56-950/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-836402295281/ensamble-evaluate-2024-07-01-12-15-56-950/source/runproc.sh
INFO:sagemaker:Creating processing-job with name ensamble-evaluate-2024-07-01-12-15-56-950


.............[34mFound existing installation: typing 3.7.4.3[0m
[34mUninstalling typing-3.7.4.3:
  Successfully uninstalled typing-3.7.4.3[0m
[34mCollecting catboost (from -r requirements.txt (line 2))
  Downloading catboost-1.2.5-cp38-cp38-manylinux2014_x86_64.whl.metadata (1.2 kB)[0m
[34mCollecting xgboost (from -r requirements.txt (line 3))
  Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)[0m
[34mCollecting graphviz (from catboost->-r requirements.txt (line 2))
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)[0m
[34mCollecting matplotlib (from catboost->-r requirements.txt (line 2))
  Downloading matplotlib-3.7.5-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.7 kB)[0m
[34mCollecting plotly (from catboost->-r requirements.txt (line 2))
  Downloading plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)[0m
[34mCollecting contourpy>=1.0.1 (from matplotlib->catboost->-r requirements.txt (line 2))
  Downloading 

### Pipeline

In [None]:
# Processing Step

from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep

sklearn_processor = SKLearnProcessor(
    framework_version='1.2-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=BASE_JOB_PROCESSING_NAME,
    sagemaker_session=pipeline_session
)

process_args = sklearn_processor.run(
    code="code/preprocess.py",
    inputs=[
        ProcessingInput(
            source=S3_PATH + DATA_PREFIX,
            destination="/opt/ml/processing/input"
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="scaler_model",
            source="/opt/ml/processing/output/preprocessor",
            destination=S3_PATH + MODEL_PREFIX
        ),
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
            destination=S3_PATH + DATA_PREFIX
        ),
        ProcessingOutput(
            output_name="validation",
            source="/opt/ml/processing/output/validation",
            destination=S3_PATH + DATA_PREFIX
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/output/test",
            destination=S3_PATH + DATA_PREFIX
        ),
    ],
)

step_process = ProcessingStep(
    name="PreprocessData",
    step_args=processor_args,
)

In [None]:
# Train and hyperparameter tuning step

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TuningStep

params = {
    "entry_point": "train_inference.py",
    "source_dir": "code",
    "instance_type": training_instance_type,
    "instance_count": training_instance_type,
    "role": role,
    "framework_version": sklearn_framework_version,
    "metric_definitions":[
       {'Name': 'validation:rmse', 'Regex': 'validation-rmse:(.*?);'}
    ],
    'output_path': S3_PATH + MODEL_PREFIX + 'ensemble',
    'base_job_name': BASE_JOB_TRAINING_NAME,
    'sagemaker_session': pipeline_session
}

estimator = SKLearn(**params)

hyperparameters = {
    'num_round': IntegerParameter(5,20),
    'max_depth': IntegerParameter(5,20),
    'eta': ContinuousParameter(0.001,0.2),
    'alpha': ContinuousParameter(0.0,2.0)
}

metric_definitions = [{"Name": "validation:rmse", "Regex": "validation-rmse:([0-9\\.]+)"}]
objective_metric_name = "validation:rmse"

tuner = HyperparameterTuner(
    estimator, 
    objective_metric_name,
    hyperparameter_ranges, 
    metric_definitions,
    max_jobs=5, 
    max_parallel_jobs=5,
    objective_type='Minimize'
)

# Fit the tuner
tune_args = tuner.fit(
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
            content_type="text/csv",
        )
    },
    include_cls_metadata=False
)

step_tune_model = TuningStep(name='TuneEnsemble', step_args=tune_args)

In [None]:
# Evaluation Step

from sagemaker.workflow.properties import PropertyFile
from sagemaker.processing import FrameworkProcessor

est_cls = sagemaker.sklearn.estimator.SKLearn

processor = FrameworkProcessor(
    role=role,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    estimator_cls=est_cls,
    framework_version=sklearn_framework_version,
    base_job_name=BASE_JOB_EVALUATION_NAME,
    sagemaker_session=pipeline_session,
)

# Create a PropertyFile
# A PropertyFile is used to be able to reference outputs from a processing step, for instance to use in a condition step.
# For more information, visit https://docs.aws.amazon.com/sagemaker/latest/dg/build-and-manage-propertyfile.html
evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)

eval_args = processor.run(
    inputs=[
        ProcessingInput(
            source=step_tune_model.get_top_model_s3_uri(
                top_k=0,
                s3_bucket=bucket,
                prefix=MODEL_PREFIX + 'ensemble'
            ),
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
    code="evaluate_ensemble.py",
    source_dir='code'
)

step_evaluate_model = ProcessingStep(
    name="EvaluateModelPerformance",
    step_args=eval_args,
    property_files=[evaluation_report],
)

In [None]:
# Model Step

from sagemaker.model import Model
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import PipelineModel


scaler_model_s3 = "{}/model.tar.gz".format(
    step_process.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
)

scaler_model = SKLearnModel(
    model_data=scaler_model_s3,
    role=role,
    sagemaker_session=pipeline_session,
    entry_point="code/preprocess.py",
    framework_version=sklearn_framework_version,
)

scaler_model.env = {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT":"text/csv"}

ensemble_model = SKLearnModel(
    model_data=step_tune_model.get_top_model_s3_uri(
        top_k=0,
        s3_bucket=bucket,
        prefix=MODEL_PREFIX + 'ensemble'
    ),
    entry_point='train_inference.py',
    source_dir='code'
    sagemaker_session=pipeline_session,
    role=role
)

pipeline_model = PipelineModel(
    models=[scaler_model, ensemble_model], role=role, sagemaker_session=pipeline_session
)

from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.step_collections import RegisterModel


evaluation_s3_uri = "{}/evaluation.json".format(
    step_evaluate_model.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
)

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=evaluation_s3_uri,
        content_type="application/json",
    )
)

register_args = pipeline_model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.large", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=MODEL_PACKAGE_GROUP_NAME,
    model_metrics=model_metrics,
    approval_status=model_approval_status,
)

step_register_pipeline_model = ModelStep(
    name="PipelineModel",
    step_args=register_args,
)

In [None]:
# Condition Step

from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet

# Create accuracy condition to ensure the model meets performance requirements.
# Models with a test accuracy lower than the condition will not be registered with the model registry.
cond_lte = ConditionLessThanOrEqualTo(
    left=JsonGet(
        step_name=step_evaluate_model.name,
        property_file=evaluation_report,
        json_path="test_rmse[4]",
    ),
    right=accuracy_rmse_threshold,
)

# Create a Sagemaker Pipelines ConditionStep, using the condition above.
# Enter the steps to perform if the condition returns True / False.
step_cond = ConditionStep(
    name="RMSE-Lower-Than-Threshold-Condition",
    conditions=[cond_lte],
    if_steps=[step_register_pipeline_model],  # step_register_model, step_register_scaler,
    else_steps=[],
)

In [None]:
from sagemaker.workflow.pipeline import Pipeline

# Create a Sagemaker Pipeline.
# Each parameter for the pipeline must be set as a parameter explicitly when the pipeline is created.
# Also pass in each of the steps created above.
# Note that the order of execution is determined from each step's dependencies on other steps,
# not on the order they are passed in below.
pipeline = Pipeline(
    name=PIPELINE_NAME,
    parameters=[
        training_instance_type,
        processing_instance_type,
        processing_instance_count,
        input_data,
        model_approval_status,
        accuracy_rmse_threshold,
    ],
    steps=[step_process, step_tune_model, step_evaluate_model, step_cond],
)

In [None]:
pipeline.upsert(role_arn=role)

In [None]:
execution = pipeline.start()

In [None]:
execution.wait()