In [12]:
!pip install -r ./requirements.txt


import pandas as pd

import numpy as np

import os
from catboost import CatBoost, CatBoostRegressor, Pool

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
import sagemaker
from s3fs.core import S3FileSystem
import pickle
import os


def preprocess(raw_data, test_size=0.2, validation_size=0.2, random_state=42):
    """Load the shipping-log CSV and split it into train/validation/test frames.

    The columns 'ProductId', 'OrderID', 'OnTimeDelivery' and 'OrderDate' are
    removed before splitting.

    Args:
        raw_data: Path or URL of the CSV file, passed straight to ``pd.read_csv``.
        test_size: Fraction of all rows held out as the test set.
        validation_size: Fraction of the remaining (non-test) rows held out as
            the validation set.
        random_state: Seed forwarded to both ``train_test_split`` calls so the
            split is reproducible. Pass ``None` ` to get a different split on
            every call.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames.
    """
    df = pd.read_csv(raw_data)
    # Non-mutating drop: returns a new frame instead of editing `df` in place.
    df = df.drop(columns=['ProductId', 'OrderID', 'OnTimeDelivery', 'OrderDate'])

    # First carve out the test set, then split what is left into train/validation.
    train_val_part, test_part = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    train_part, validation_part = train_test_split(
        train_val_part, test_size=validation_size, random_state=random_state
    )
    print("Completed running the processing job")
    # Hand back independent copies so callers can modify each split freely.
    return train_part.copy(), validation_part.copy(), test_part.copy()



def train(
    train_df,
    validation_df,
    categorical_features_names,
    target = "ExpectedShippingDays",
    iterations=100,
    learning_rate=0.01,
    n_estimators=4000,
):
    """Fit a CatBoostRegressor on the training frame, evaluated on the validation frame.

    The input DataFrames are not modified: the target column is split off into
    new objects, so the function can be called repeatedly with the same frames.

    Args:
        train_df: Training DataFrame containing the feature columns and `target`.
        validation_df: Validation DataFrame with the same columns as `train_df`.
        categorical_features_names: Column names passed to CatBoost as
            ``cat_features``.
        target: Name of the label column.
        iterations: Accepted for backward compatibility but not used; the model
            is configured through `n_estimators` only.
            NOTE(review): CatBoost documents `iterations` and `n_estimators` as
            aliases, so forwarding both is presumably rejected — confirm before
            wiring this parameter in.
        learning_rate: Learning rate passed to ``CatBoostRegressor``.
        n_estimators: Number of estimators passed to ``CatBoostRegressor``.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Separate features and label without touching the caller's frames
    # (an in-place drop would make a second call fail with a KeyError).
    y_train = train_df[target]
    x_train = train_df.drop(columns=[target])
    y_validation = validation_df[target]
    x_validation = validation_df.drop(columns=[target])

    train_pool = Pool(x_train, label=y_train, cat_features=categorical_features_names)
    val_pool = Pool(x_validation, label=y_validation, cat_features=categorical_features_names)

    model = CatBoostRegressor(
        custom_metric=['R2', 'RMSE'],
        learning_rate=learning_rate,
        n_estimators=n_estimators,
    )
    # verbose=2000 logs progress every 2000 iterations; plot=True asks CatBoost
    # to render its interactive metric widget while fitting.
    model.fit(train_pool, eval_set=val_pool, verbose=2000, plot=True)
    return model



def evaluate(model, test_df, target="ExpectedShippingDays"):
    """Score a fitted model on the held-out test frame.

    The input DataFrame is not modified, so the same `test_df` can be reused
    for further evaluations.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_df: Test DataFrame containing the feature columns and `target`.
        target: Name of the label column.

    Returns:
        A dict of the form
        ``{"regression_metrics": {"mse": {"value": ..., "standard_deviation": ...}}}``
        where ``value`` is the mean squared error and ``standard_deviation`` is
        the standard deviation of the residuals (``y_test - predictions``).
    """
    y_test = test_df[target]
    # Non-mutating drop: leave the caller's frame intact.
    features = test_df.drop(columns=[target])
    predictions = model.predict(features)

    mse = mean_squared_error(y_test, predictions)
    std = np.std(y_test - predictions)
    report_dict = {
        "regression_metrics": {
            "mse": {"value": mse, "standard_deviation": std},
        },
    }
    return report_dict


# Local end-to-end run: preprocess -> train -> evaluate on the CSV in the working directory.
csv_path = "canvas-sample-shipping-logs.csv"

train_df, val_df, test_df = preprocess(csv_path)
# Columns handed to CatBoost as categorical features.
categorical_features_names = ['ShippingPriority', 'ShippingOrigin', 'InBulkOrder', 'Carrier']
model = train(train_df, val_df, categorical_features_names)
report = evaluate(model, test_df)
print(f"evaluation report: {report}")

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0mCompleted running the processing job


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 2.2470706	test: 2.4700825	best: 2.4700825 (0)	total: 1.92ms	remaining: 7.69s
2000:	learn: 0.4195061	test: 0.7770199	best: 0.7729271 (1450)	total: 5.03s	remaining: 5.03s
3999:	learn: 0.2772115	test: 0.7829538	best: 0.7729271 (1450)	total: 10.6s	remaining: 0us

bestTest = 0.7729271004
bestIteration = 1450

Shrink model to first 1451 iterations.
evaluation report: {'regression_metrics': {'mse': {'value': 0.6220480970026173, 'standard_deviation': 0.7812581495477107}}}


In [3]:
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.pipeline import Pipeline
from sagemaker import get_execution_role
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)
from sagemaker.workflow.function_step import step

!sudo chmod 777 lost+found


pipeline_session = PipelineSession()
region = pipeline_session.boto_region_name
default_bucket = pipeline_session.default_bucket()
input_path = f"s3://{default_bucket}/canvas/sample_dataset/canvas-sample-shipping-logs.csv"

# Set path to config file
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()


categorical_features_names = ['ShippingPriority' ,'ShippingOrigin', 'InBulkOrder', 'Carrier']
instance_count = ParameterInteger(
    name="InstanceCount",
    default_value=1
)

instance_type = ParameterString(
    name="InstanceType",
    default_value='ml.m5.large'
)


delayed_data = step(preprocess, name="ShippingPreprocess")(input_path)
delayed_model = step(train, name="ShippingTrain")(train_df=delayed_data[0],
                                                  validation_df=delayed_data[1],
                                                  categorical_features_names=categorical_features_names)
delayed_evaluation_result = step(evaluate, name="ShippingEval")(model=delayed_model,
                                                              test_df=delayed_data[2])

steps = [delayed_evaluation_result]

pipeline = Pipeline(
    name="ShippingPipeline",
    parameters=[
        instance_count,
        instance_type,
    ],
    steps=steps,
    sagemaker_session=pipeline_session
)
role = sagemaker.get_execution_role()
pipeline.upsert(role_arn=role)
execution = pipeline.start()


chmod: cannot access ‘lost+found’: No such file or directory


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


2024-02-11 23:27:48,089 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-23-27-45-725/function
2024-02-11 23:27:48,160 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-23-27-45-725/arguments
2024-02-11 23:27:48,468 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmp6tbt7xtr/requirements.txt'
2024-02-11 23:27:48,501 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-23-27-45-725/pre_exec_script_and_dependencies'
2024-02-11 23:27:48,507 sagemaker.remote_function INFO     Copied user workspace to '/tmp/tmpy_j08ppk/temp_workspace/sagemaker_remote_function_workspace'
2024-02-11 23:27:48,615 sagemaker.remote_functi

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


2024-02-11 23:27:51,338 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingTrain/2024-02-11-23-27-45-725/function
2024-02-11 23:27:51,443 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingTrain/2024-02-11-23-27-45-725/arguments
2024-02-11 23:27:51,558 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmp779_8mp9/requirements.txt'
2024-02-11 23:27:51,586 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingTrain/2024-02-11-23-27-45-725/pre_exec_script_and_dependencies'


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.IncludeLocalWorkDir
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.CustomFileFilter.IgnoreNamePatterns
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


2024-02-11 23:27:53,822 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingPreprocess/2024-02-11-23-27-45-725/function
2024-02-11 23:27:53,927 sagemaker.remote_function INFO     Uploading serialized function arguments to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingPreprocess/2024-02-11-23-27-45-725/arguments
2024-02-11 23:27:54,002 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpg0t7l55b/requirements.txt'
2024-02-11 23:27:54,050 sagemaker.remote_function INFO     Successfully uploaded dependencies and pre execution scripts to 's3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingPreprocess/2024-02-11-23-27-45-725/pre_exec_script_and_dependencies'
2024-02-11 23:27:54,509 sagemaker.remote_function INFO     Uploading serialized function code to s3://sagemaker-us-east-1-376337229415/ShippingPipeline/ShippingEval/2024-02-11-23-27-5

In [4]:
# List the steps of the execution started above together with their current status.
execution.list_steps()

[{'StepName': 'ShippingPreprocess',
  'StartTime': datetime.datetime(2024, 2, 11, 23, 27, 57, 362000, tzinfo=tzlocal()),
  'StepStatus': 'Executing',
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:us-east-1:376337229415:training-job/pipelines-2xiedtcy98hu-ShippingPreprocess-V81jz3pBpV'}},
  'AttemptCount': 1}]

In [5]:
train, val, test = execution.result(step_name="ShippingPreprocess")

In [6]:
train

Unnamed: 0,ActualShippingDays,ExpectedShippingDays,Carrier,YShippingDistance,XShippingDistance,InBulkOrder,ShippingOrigin,ShippingPriority
280,14,13,Shipper,256,-15,Bulk Order,Seattle,Standard
660,18,18,Shipper,-256,-10,Single Order,San Francisco,Air
901,14,13,BigBird,20,-180,Bulk Order,Salt Lake City,Ground
510,10,11,Shipper,-140,17,Bulk Order,Houston,Express
48,28,18,BigBird,150,-63,Single Order,New York City,Ground
...,...,...,...,...,...,...,...,...
984,11,11,BigBird,79,71,Single Order,New York City,Express
789,14,14,BigBird,39,83,Single Order,Atlanta,Express
619,15,14,BigBird,-54,79,Single Order,Atlanta,Express
424,17,15,BigBird,-209,-8,Single Order,Houston,Ground


In [7]:
# Fetch the return value of the ShippingTrain step (shown below as a CatBoostRegressor).
execution.result(step_name="ShippingTrain")

<catboost.core.CatBoostRegressor at 0x7f1670f46890>

In [8]:
# Fetch the metrics dictionary returned by the ShippingEval step.
execution.result(step_name="ShippingEval")

{'regression_metrics': {'mse': {'value': 0.5919529590720277,
   'standard_deviation': 0.7691113944992932}}}

In [13]:

s3_file = S3FileSystem()
# NOTE(review): bucket and key are hard-coded to one account and one pipeline
# execution; they must be updated to read the payload of a different run.
bucket = "sagemaker-us-east-1-376337229415"
key= "ShippingPipeline/90cd8j1l6co1/ShippingTrain/results/payload.pkl"
# SECURITY: pickle.load can execute arbitrary code while deserializing — only
# load payloads from a bucket you trust.
# The context manager closes the S3 file handle once the payload has been read.
with s3_file.open('{}/{}'.format(bucket, key)) as payload_file:
    data = pickle.load(payload_file)
data

<catboost.core.CatBoostRegressor at 0x7f167092ad70>

In [14]:
# Pull the execution summaries for this pipeline, then build a trimmed copy of
# each summary that omits the 'PipelineExecutionArn' entry.
pipeline_execution_summaries = pipeline.list_executions()['PipelineExecutionSummaries']
pipeline_execution_list = [
    {field: value for field, value in summary.items() if field != 'PipelineExecutionArn'}
    for summary in pipeline_execution_summaries
]

In [15]:
# Display the first two entries of the trimmed execution summaries.
pipeline_execution_list[:2] 

[{'StartTime': datetime.datetime(2024, 2, 11, 23, 27, 56, 39000, tzinfo=tzlocal()),
  'PipelineExecutionStatus': 'Succeeded',
  'PipelineExecutionDisplayName': 'execution-1707694076111'},
 {'StartTime': datetime.datetime(2024, 2, 11, 23, 16, 17, 754000, tzinfo=tzlocal()),
  'PipelineExecutionStatus': 'Failed',
  'PipelineExecutionDisplayName': 'execution-1707693377803',
  'PipelineExecutionFailureReason': 'Step failure: One or multiple steps failed.'}]