In [60]:
!pip install -r ./requirements.txt


import pandas as pd

import numpy as np

import os
from catboost import CatBoost, CatBoostRegressor, Pool

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
import sagemaker

import os


def preprocess(raw_data, test_size=0.2, random_state=None):
    """Load the raw shipping CSV and split it into train/validation/test frames.

    The identifier/leakage columns ``ProductId``, ``OrderID``,
    ``OnTimeDelivery`` and ``OrderDate`` are removed before splitting.

    Args:
        raw_data: Path or URI of the CSV file; passed unchanged to
            ``pd.read_csv``.
        test_size: Fraction held out at each of the two successive splits
            (first for the test set, then for the validation set carved out
            of the remaining training rows). Defaults to 0.2.
        random_state: Seed forwarded to both ``train_test_split`` calls.
            The default ``None`` keeps the original unseeded shuffling; pass
            an integer to make the splits reproducible.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames.
    """
    dropped_columns = ['ProductId', 'OrderID', 'OnTimeDelivery', 'OrderDate']
    # Non-in-place drop: builds the working frame without mutating an
    # intermediate object.
    features = pd.read_csv(raw_data).drop(columns=dropped_columns)

    remaining, test_part = train_test_split(
        features, test_size=test_size, random_state=random_state
    )
    train_part, validation_part = train_test_split(
        remaining, test_size=test_size, random_state=random_state
    )
    print("Completed running the processing job")
    # Re-wrap each split in a fresh DataFrame object, as the original did.
    return (
        pd.DataFrame(train_part),
        pd.DataFrame(validation_part),
        pd.DataFrame(test_part),
    )



def train(
    train_df,
    validation_df,
    categorical_features_names,
    target = "ExpectedShippingDays",
    iterations=100,
    learning_rate=0.01,
    n_estimators=4000,
):
    """Fit a CatBoost regressor on the training frame, validating on a held-out frame.

    The input frames are NOT modified: the target column is split off into
    new objects, so the same frames can be passed again (e.g. when a cell is
    re-run) without a ``KeyError`` on the already-removed target.

    Args:
        train_df: DataFrame containing the feature columns and ``target``.
        validation_df: DataFrame with the same columns, used as ``eval_set``.
        categorical_features_names: Column names passed to CatBoost as
            ``cat_features``.
        target: Name of the label column.
        iterations: Accepted for backward compatibility but not used; the
            number of boosting rounds is controlled by ``n_estimators``.
            NOTE(review): CatBoost documents ``iterations`` and
            ``n_estimators`` as aliases, so forwarding both would conflict -
            confirm before wiring this through.
        learning_rate: Learning rate passed to ``CatBoostRegressor``.
        n_estimators: Number of boosting rounds passed to
            ``CatBoostRegressor``.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Separate labels from features without touching the caller's frames.
    y_train = train_df[target]
    x_train = train_df.drop(columns=[target])
    y_validation = validation_df[target]
    x_validation = validation_df.drop(columns=[target])

    train_pool = Pool(x_train, label=y_train, cat_features=categorical_features_names)
    val_pool = Pool(x_validation, label=y_validation, cat_features=categorical_features_names)

    model = CatBoostRegressor(
        custom_metric=['R2', 'RMSE'],
        learning_rate=learning_rate,
        n_estimators=n_estimators,
    )
    # verbose=2000 logs every 2000th iteration; plot=True renders the
    # interactive metric widget when run inside a notebook.
    model.fit(train_pool, eval_set=val_pool, verbose=2000, plot=True)
    return model



def evaluate(model, test_df,target = "ExpectedShippingDays",):
    """Score a fitted model on the test frame and return a metrics report.

    The caller's ``test_df`` is NOT modified; the target column is split off
    into new objects so the frame can be evaluated more than once.

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        test_df: DataFrame containing the feature columns and ``target``.
        target: Name of the label column.

    Returns:
        ``{"regression_metrics": {"mse": {"value": ..., "standard_deviation": ...}}}``
        where ``value`` is the mean squared error of the predictions and
        ``standard_deviation`` is the standard deviation of the residuals
        ``y_test - predictions``.
    """
    y_test = test_df[target]
    features = test_df.drop(columns=[target])
    predictions = model.predict(features)

    mse = mean_squared_error(y_test, predictions)
    # Spread of the residuals, reported alongside the MSE.
    std = np.std(y_test - predictions)
    report_dict = {
        "regression_metrics": {
            "mse": {"value": mse, "standard_deviation": std},
        },
    }
    return report_dict


# Local smoke run of the three stages on the CSV in the working directory,
# before the same functions are wrapped into pipeline steps.
csv_path = "canvas-sample-shipping-logs.csv"  # plain literal: no placeholders, so no f-prefix

train_df, val_df, test_df = preprocess(csv_path)
# Columns CatBoost should treat as categorical.
categorical_features_names = ['ShippingPriority' ,'ShippingOrigin', 'InBulkOrder', 'Carrier']
model = train(train_df, val_df, categorical_features_names)
report = evaluate(model, test_df)
print(f"evaluation report: {report}")

Completed running the processing job


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 2.2410334	test: 2.3993268	best: 2.3993268 (0)	total: 2.49ms	remaining: 9.95s
2000:	learn: 0.4448381	test: 0.8959218	best: 0.8959218 (2000)	total: 4.75s	remaining: 4.75s
3999:	learn: 0.2990407	test: 0.8893453	best: 0.8893406 (3998)	total: 9.85s	remaining: 0us

bestTest = 0.8893405541
bestIteration = 3998

Shrink model to first 3999 iterations.
evaluation report: {'regression_metrics': {'mse': {'value': 0.466421757666663, 'standard_deviation': 0.6827846904969538}}}


In [61]:
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.pipeline import Pipeline
from sagemaker import get_execution_role
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)
from sagemaker.workflow.function_step import step

!sudo chmod 777 lost+found


# Build, register and start the "ShippingPipeline" from the three functions
# defined earlier (preprocess -> train -> evaluate).

# Session object handed to the Pipeline below; the region and default bucket
# are read from it.
pipeline_session = PipelineSession()
region = pipeline_session.boto_region_name
default_bucket = pipeline_session.default_bucket()
# S3 location of the dataset inside the session's default bucket.
input_path = f"s3://{default_bucket}/canvas/sample_dataset/canvas-sample-shipping-logs.csv"

# Set path to config file
# (points the SDK at the current working directory; presumably a config.yaml
# there supplies the RemoteFunction settings reported in the log output -
# TODO confirm)
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()


# Columns CatBoost should treat as categorical; forwarded to the train step.
categorical_features_names = ['ShippingPriority' ,'ShippingOrigin', 'InBulkOrder', 'Carrier']
# NOTE(review): instance_count and instance_type are registered as pipeline
# parameters below but are not passed to any step(...) call in this cell.
instance_count = ParameterInteger(
    name="InstanceCount",
    default_value=1
)

instance_type = ParameterString(
    name="InstanceType",
    default_value='ml.m5.large'
)


# Wrap each function as a named pipeline step. Calling the wrapped function
# returns a delayed result; delayed_data[0], [1] and [2] index the three
# values returned by preprocess (train, validation, test).
delayed_data = step(preprocess, name="ShippingPreprocess")(input_path)
delayed_model = step(train, name="ShippingTrain")(train_df=delayed_data[0],
                                                  validation_df=delayed_data[1],
                                                  categorical_features_names=categorical_features_names)
delayed_evaluation_result = step(evaluate, name="ShippingEval")(model=delayed_model,
                                                              test_df=delayed_data[2])

# Only the final step is listed; the log output shows the train and
# preprocess steps being uploaded as well, so upstream steps appear to be
# pulled in through the delayed-result dependencies.
steps = [delayed_evaluation_result]

pipeline = Pipeline(
    name="ShippingPipeline",
    parameters=[
        instance_count,
        instance_type,
    ],
    steps=steps,
    sagemaker_session=pipeline_session
)
# Create or update the pipeline definition under the notebook's execution
# role, then start an execution; `execution` is used by the cells that follow.
role = sagemaker.get_execution_role()
pipeline.upsert(role_arn=role)
execution = pipeline.start()


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


2024-02-11 21:01:59,691 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-21-01-56-816/function
2024-02-11 21:01:59,814 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-21-01-56-816/arguments
2024-02-11 21:02:00,078 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmp01kgfr6p/requirements.txt'
2024-02-11 21:02:00,140 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-21-01-56-816/pre_exec_script_and_dependencies'
2024-02-11 21:02:00,149 sagemaker.remote_function INFO     Copied user workspace to '/tmp/tmprwmmz8i5/temp_workspace/sagemaker_remote_function_workspace'
2024-02-11 21:02:00,282 sagemaker.remote_functi

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


2024-02-11 21:02:03,040 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingTrain/2024-02-11-21-01-56-816/function
2024-02-11 21:02:03,126 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingTrain/2024-02-11-21-01-56-816/arguments
2024-02-11 21:02:03,242 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpodtwx81t/requirements.txt'
2024-02-11 21:02:03,271 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingTrain/2024-02-11-21-01-56-816/pre_exec_script_and_dependencies'


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


2024-02-11 21:02:05,852 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingPreprocess/2024-02-11-21-01-56-816/function
2024-02-11 21:02:05,939 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingPreprocess/2024-02-11-21-01-56-816/arguments
2024-02-11 21:02:06,004 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpu0tg_898/requirements.txt'
2024-02-11 21:02:06,035 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingPreprocess/2024-02-11-21-01-56-816/pre_exec_script_and_dependencies'
2024-02-11 21:02:06,427 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-21-02-0

In [63]:
# Show the steps of the started execution together with their current status.
execution.list_steps()

[{'StepName': 'ShippingPreprocess',
  'StartTime': datetime.datetime(2024, 2, 11, 21, 2, 9, 324000, tzinfo=tzlocal()),
  'StepStatus': 'Starting',
  'Metadata': {},
  'AttemptCount': 1}]

In [None]:
train, val, test = execution.result(step_name="ShippingPreprocess")

In [None]:
train

Unnamed: 0,ActualShippingDays,ExpectedShippingDays,Carrier,YShippingDistance,XShippingDistance,InBulkOrder,ShippingOrigin,ShippingPriority
257,12,12,MicroCarrier,-105,59,Bulk Order,Las Vegas,Air
155,13,13,BigBird,36,-100,Bulk Order,New York City,Standard
191,14,14,BigBird,176,22,Single Order,San Francisco,Express
607,12,13,BigBird,-87,-55,Single Order,Atlanta,Air
643,18,13,BigBird,77,134,Bulk Order,Houston,Standard
...,...,...,...,...,...,...,...,...
239,15,14,GlobalFreight,-167,-13,Bulk Order,New York City,Ground
185,13,14,GlobalFreight,-183,30,Bulk Order,Houston,Standard
625,13,12,Shipper,-251,76,Bulk Order,Seattle,Air
719,10,11,BigBird,156,35,Single Order,Salt Lake City,Express


In [None]:
# Fetch the return value of the ShippingTrain step (the fitted model object).
execution.result(step_name="ShippingTrain")

<catboost.core.CatBoostRegressor at 0x7f59768abb50>

In [None]:
# Fetch the return value of the ShippingEval step (the metrics report dict).
execution.result(step_name="ShippingEval")

{'regression_metrics': {'mse': {'value': 0.41388702907116,
   'standard_deviation': 0.642945277086124}}}

In [None]:
# Read the serialized return value of the ShippingTrain step directly from S3.
# `pickle` is imported here because no earlier cell imports it; without this
# the cell only works when the name happens to be left over in the kernel.
import pickle

from s3fs.core import S3FileSystem

s3_file = S3FileSystem()
bucket = "sagemaker-us-east-1-376337229415"
key= "ShippingPipeline/90cd8j1l6co1/ShippingTrain/results/payload.pkl"
# SECURITY: pickle.load can execute arbitrary code embedded in the payload.
# Only load objects written by pipelines/buckets you control.
# The context manager closes the S3 file handle once the payload is read.
with s3_file.open('{}/{}'.format(bucket, key), "rb") as payload_file:
    data = pickle.load(payload_file)
data

<catboost.core.CatBoostRegressor at 0x7f596d9deec0>

In [81]:
# Collect the execution summaries of the pipeline and keep every field of
# each summary except the execution ARN.
pipeline_execution_summaries = pipeline.list_executions()['PipelineExecutionSummaries']
pipeline_execution_list = [
    {
        field: value
        for field, value in summary.items()
        if field != 'PipelineExecutionArn'
    }
    for summary in pipeline_execution_summaries
]

In [83]:
# Display the first two entries of the trimmed execution summaries.
pipeline_execution_list[:2] 

[{'StartTime': datetime.datetime(2024, 2, 11, 21, 2, 8, 107000, tzinfo=tzlocal()),
  'PipelineExecutionStatus': 'Succeeded',
  'PipelineExecutionDisplayName': 'execution-1707685328158'},
 {'StartTime': datetime.datetime(2024, 2, 11, 20, 19, 11, 983000, tzinfo=tzlocal()),
  'PipelineExecutionStatus': 'Succeeded',
  'PipelineExecutionDisplayName': 'execution-1707682752048'}]