In [176]:
import boto3
import sagemaker
import sagemaker.session


region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
model_package_group_name = f"MobPricePackageGroupName"

link to tutorial : https://docs.aws.amazon.com/sagemaker/latest/dg/define-pipeline.html

In [177]:
input_data_uri = f"s3://ars-mlops-projects/mobile-price-prediction/data/raw_data/train.csv"

In [178]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)


processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge"
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge"
)
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval"
)
input_data = ParameterString(
    name="InputData",
    default_value=input_data_uri,
)


In [179]:
!mkdir -p mp

In [180]:
%%writefile mp/preprocessing.py

import argparse
import os
import requests
import tempfile
import numpy as np
import pandas as pd
import pathlib
import logging
import boto3


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split


logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

def merge_two_dicts(x, y):
    """Merges two dicts, returning a new copy."""
    z = x.copy()
    z.update(y)
    return z


if __name__ == "__main__":
    logger.debug("Starting preprocessing.")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str, required=True)
    args = parser.parse_args()

    base_dir = "/opt/ml/processing"
    pathlib.Path(f"{base_dir}/data").mkdir(parents=True, exist_ok=True)
    input_data = args.input_data
    bucket = input_data.split("/")[2]
    key = "/".join(input_data.split("/")[3:])

    logger.info("Downloading data from bucket: %s, key: %s", bucket, key)
    fn = f"{base_dir}/data/Bank_Personal_Loan_Modelling.csv"
    s3 = boto3.resource("s3")
    s3.Bucket(bucket).download_file(key, fn)

    logger.debug("Reading downloaded data.")
    df = pd.read_csv(fn)
    os.unlink(fn)
    
    x_new = df.drop('price_range',axis=1)
    y_new = df['price_range']
    print(x_new.shape)
    print(y_new.shape)
    
    x_new['screen_res'] = x_new['px_width'] * x_new['px_height']
    x_new = x_new.drop(labels=['px_height','px_width'],axis=1)
    
    x_train_new,x_test_new,y_train_new,y_test_new = train_test_split(x_new,y_new,test_size=0.3,random_state=42)
    
    train = pd.concat((y_train_new,x_train_new),axis=1)
    test  = pd.concat((y_test_new,x_test_new),axis=1)
    
    train.to_csv(f"{base_dir}/train/train.csv",index=False)
    test.to_csv(f"{base_dir}/test/validation.csv", index=False)


Overwriting mp/preprocessing.py


# Defining a Pipeline

## 3 . Create an instance of an SKLearnProcessor to pass in to the processing step.



In [181]:
from sagemaker.sklearn.processing import SKLearnProcessor

role1 = "arn:aws:iam::832173187970:role/service-role/AmazonSageMaker-ExecutionRole-20211213T210605"
framework_version = "0.23-1"

sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type='ml.m5.xlarge',
    instance_count=processing_instance_count,
    base_job_name="sklearn-mp-process",
    role=role1,
)

4. Create a processing step. This step takes in the SKLearnProcessor, the input and output channels, and the preprocessing.py script that you created. This is very similar to a processor instance's run method in the SageMaker Python SDK. The input_data parameter passed into ProcessingStep is the input data of the step itself. This input data is used by the processor instance when it runs.

Note the  "train, "validation, and "test" named channels specified in the output configuration for the processing job. Step Properties such as these can be used in subsequent steps and resolve to their runtime values at execution

In [182]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
step_process = ProcessingStep(
        name="PreProcessingStep",
        processor=sklearn_processor,
        inputs=[
          ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),  
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test")
        ],
        code=("mp/preprocessing.py"),
        job_arguments=["--input-data", input_data],
    )


## Step 4: Define a Training step


To define a training step

Specify the model path where you want to save the models from training.

In [183]:
model_path = f"s3://ars-mlops-projects/mobile-price-prediction/ModelArtifacts/"

Configure an estimator for the XGBoost algorithm and the input dataset. The training_instance_type is passed into the estimator. A typical training script loads data from the input channels, configures training with hyperparameters, trains a model, and saves a model to model_dir so that it can be hosted later. SageMaker uploads the model to Amazon S3 in the form of a model.tar.gz at the end of the training job.

In [184]:
from sagemaker.estimator import Estimator
training_instance_type = 'ml.m5.xlarge'
image_uri = sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
)
xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
)
xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0
)

Create a TrainingStep using the estimator instance and properties of the ProcessingStep. In particular, pass in the S3Uri of the "train" and "validation" output channel to the TrainingStep. 



In [185]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep
step_train = TrainingStep(
        name="ModelTraining",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv"
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                content_type="text/csv"
            )
        },
    )


### Step 5: Define a Processing Step for Model Evaluation


In [186]:
%%writefile mp/evaluation.py
import json
import pathlib
import pickle
import tarfile
import joblib
import numpy as np
import pandas as pd
import xgboost
import logging

from sklearn import metrics
from sklearn.metrics import mean_squared_error

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


if __name__ == "__main__":
    model_path = f"/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    logger.debug("Reading model")
    model = pickle.load(open("xgboost-model", "rb"))
    logger.debug("Reading test path")
    logger.info("Reading test file")
    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path)
    
    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)
    #df.drop(index=0,axis=0,inplace=True)
    X_test = xgboost.DMatrix(data=df.values,label=y_test)
    
    predictions = model.predict(X_test)
    logger.debug("predictions done.")

    #acc = metrics.accuracy_score(y_test, predictions)
    #std = np.std(y_test - predictions)
    mse = mean_squared_error(y_out,y_test)
    report_dict = {
        "regression_metrics": {
            "mse": {
                "value": mse
            },
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    logger.info("Writing out evaluation report with mse: %f", mse)
    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Overwriting mp/evaluation.py


In [187]:
from sagemaker.processing import ScriptProcessor


script_eval = ScriptProcessor(
    image_uri=image_uri,

    command=["python3"],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name="script-mp-eval",
    role=role,
)

In [188]:
from sagemaker.workflow.properties import PropertyFile

print("Starting Evauluation step")
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

step_eval = ProcessingStep(
    name="ModelEvaluation",
    processor=script_eval,
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model"
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs[
                "test"
            ].S3Output.S3Uri,
            destination="/opt/ml/processing/test"
        )
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
    code="mp/evaluation.py",
    property_files=[evaluation_report],

)

Starting Evauluation step


## Step 6 : Create Model Step

### To define a CreateModelStep for batch transformation
Create a SageMaker model. Pass in the S3ModelArtifacts property from the step_train training step.

In [215]:
from sagemaker.model import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.steps import CreateModelStep
model = Model(image_uri=image_uri, model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,sagemaker_session=sagemaker_session,role=role1)

#Define the model input for your SageMaker model.
inputs = CreateModelInput(instance_type="ml.m5.large",)
step_create_model = CreateModelStep(model=model,inputs=inputs,name='MobilePriceCreateModel',)


### Step 8 : Define a Regsiter Model step to Create Model

In [223]:
from sagemaker.model_metrics import ModelMetrics, MetricsSource
from sagemaker.workflow.step_collections import RegisterModel

step_register = RegisterModel(
    name="MobilePriceRegisterModel",
    estimator=xgb_train,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name
    
)

### Step 10: Create a pipeline

To create a pipeline

Define the following for your pipeline: name, parameters, and steps. Names must be unique within an (account, region) pair.



In [224]:
from sagemaker.workflow.pipeline import Pipeline
pipeline_name = f"MobPricePipeline"
pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type, 
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
        ],
        steps=[step_process, step_train, step_create_model,step_register],
    )


Examine the JSON pipeline definition to ensure that it's well-formed.

In [225]:
import json

json.loads(pipeline.definition())

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://ars-mlops-projects/mobile-price-prediction/data/raw_data/train.csv'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'PreProcessingStep',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.xlarge',
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '720646828776.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
     'Con

# Running a Pipeline

### Step 1: Start the Pipeline
First, you need to start the pipeline.



In [226]:
json.loads(pipeline.definition())

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://ars-mlops-projects/mobile-price-prediction/data/raw_data/train.csv'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'PreProcessingStep',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.m5.xlarge',
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '720646828776.dkr.ecr.ap-south-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
     'Con

2. Submit the pipeline definition to the SageMaker Pipelines service to create a pipeline if it doesn't exist, or update the pipeline if it does. The role passed in is used by SageMaker Pipelines to create all of the jobs defined in the steps.



In [227]:
pipeline.upsert(role_arn=role1)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{'PipelineArn': 'arn:aws:sagemaker:ap-south-1:832173187970:pipeline/mobpricepipeline',
 'ResponseMetadata': {'RequestId': 'c650af02-06d2-4d04-99dc-5fc030ae0202',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c650af02-06d2-4d04-99dc-5fc030ae0202',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '85',
   'date': 'Sat, 20 Aug 2022 15:12:18 GMT'},
  'RetryAttempts': 0}}

3. Start a pipeline execution.

In [228]:
execution = pipeline.start()

In [194]:
import json
import pathlib
import pickle
import tarfile
import joblib
import numpy as np
import pandas as pd
import xgboost
import boto3

In [195]:
with tarfile.open('model/model.tar.gz') as tar:
    tar.extractall(path='/root/AWS Projects/AWS-Projects/Custom Pipelines/mobprice/model/')

In [196]:
model = pickle.load(open("/root/AWS Projects/AWS-Projects/Custom Pipelines/mobprice/model/xgboost-model", "rb"))

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



In [197]:
test_df = pd.read_csv('data/test.csv')
test_df.head(3)

Unnamed: 0,price_range,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,screen_res
0,0,0,2.5,0,3,1,25,0.6,200,2,5,686,8,6,11,1,1,0,339288
1,2,0,0.5,0,7,1,8,0.5,138,8,16,2563,19,17,19,1,0,0,271150
2,1,0,2.9,0,9,0,14,0.4,196,7,18,1316,8,1,8,1,1,0,278936


In [198]:
cols.values

array([0.00000e+00, 0.00000e+00, 2.50000e+00, 0.00000e+00, 3.00000e+00,
       1.00000e+00, 2.50000e+01, 6.00000e-01, 2.00000e+02, 2.00000e+00,
       5.00000e+00, 6.86000e+02, 8.00000e+00, 6.00000e+00, 1.10000e+01,
       1.00000e+00, 1.00000e+00, 0.00000e+00, 3.39288e+05])

In [199]:
df = pd.read_csv('data/test.csv')
df.head(3)

Unnamed: 0,price_range,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,screen_res
0,0,0,2.5,0,3,1,25,0.6,200,2,5,686,8,6,11,1,1,0,339288
1,2,0,0.5,0,7,1,8,0.5,138,8,16,2563,19,17,19,1,0,0,271150
2,1,0,2.9,0,9,0,14,0.4,196,7,18,1316,8,1,8,1,1,0,278936


In [200]:
y_test = df.iloc[:, 0].to_numpy()
df.drop(df.columns[0], axis=1, inplace=True)

In [201]:
df.loc[0].values

array([0.00000e+00, 2.50000e+00, 0.00000e+00, 3.00000e+00, 1.00000e+00,
       2.50000e+01, 6.00000e-01, 2.00000e+02, 2.00000e+00, 5.00000e+00,
       6.86000e+02, 8.00000e+00, 6.00000e+00, 1.10000e+01, 1.00000e+00,
       1.00000e+00, 0.00000e+00, 3.39288e+05])

In [202]:
X_test = xgboost.DMatrix(data=df.values,label=y_test)

In [203]:
y_out = model.predict(X_test)

In [204]:
from sklearn.metrics import mean_squared_error

In [205]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_out,y_test)

3.367345189299442