# Sagemaker Pipeline with Preprocess and Train Steps
The following Python application creates a pipeline that processes data from an S3 bucket, stores the processed data in the same bucket, and then trains a model on that data.

The file at repository/Datasets/pipeline-dataset-dirty/dirty.csv is missing 10 age values, on lines 10-14 and 4980-4984.
As a result of the preprocessing script, the first 5 lines with missing values will end up in the training input and the last 5 will end up in the validation input.

In [None]:
import boto3 
import sagemaker
from sagemaker import image_uris

from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.model import Model
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.pipeline_context import PipelineSession


# Account-specific placeholders: replace both values before running.
s3_bucket = "INSERT_BUCKET"
role = "INSERT_ARN"

# Session handed to the processor, estimator and model defined later on.
# Remote (default):
pipeline_session = PipelineSession()
# Local alternative -- uncomment the three lines below to override the
# remote session with a LocalPipelineSession:
# from sagemaker.workflow.pipeline_context import LocalPipelineSession
# local_pipeline_session = LocalPipelineSession()
# pipeline_session = local_pipeline_session

# Define Parameters
# Look up the XGBoost container image for the region the pipeline session is
# bound to instead of a hard-coded "us-east-1", so the image region matches
# the region in which the session submits its jobs.
xgb_image_uri = image_uris.retrieve(
    framework="xgboost",
    region=pipeline_session.boto_region_name,
    version="1.7-1",
)
# S3 location of the raw (dirty) CSV consumed by the processing step.
input_process_path = f"s3://{s3_bucket}/pipeline-dataset-dirty/dirty.csv"
# S3 prefix passed to the estimator as its output path.
model_path = f"s3://{s3_bucket}/pipeline-model/"



#### Processing Step for Feature Engineering
####  See example at the following URL for a more in-depth demonstration of feature engineering preprocessing
####    https://github.com/aws/amazon-sagemaker-examples/blob/4534bff4b5b5062af5789d98c4ddca01b0cb5d1f/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform.ipynb
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Version string passed to the scikit-learn processor below.
framework_version = "1.2-1"

# Processor that executes the preprocessing script on one ml.m5.xlarge instance.
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=pipeline_session,
    framework_version=framework_version,
    base_job_name="sklearn-process",
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

# The raw CSV is made available under the container's input directory.
processing_inputs = [
    ProcessingInput(source=input_process_path, destination="/opt/ml/processing/input"),
]
# Two named outputs; the training step looks them up by output_name.
processing_outputs = [
    ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
    ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
]

# The value returned by run() is handed to the ProcessingStep as its step_args.
processor_args = sklearn_processor.run(
    code="preprocessing.py",
    inputs=processing_inputs,
    outputs=processing_outputs,
)

step_process = ProcessingStep(step_args=processor_args, name="DataProcess")


#### Train Step
# Estimator built on the XGBoost image; its output path is model_path.
xgb_train = Estimator(
    role=role,
    sagemaker_session=pipeline_session,
    image_uri=xgb_image_uri,
    output_path=model_path,
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

# Hyperparameters collected in one place, then applied to the estimator.
xgb_hyperparameters = {
    "objective": "reg:squarederror",
    "num_round": 50,
    "max_depth": 5,
    "eta": 0.2,
    "subsample": 0.7,
}
xgb_train.set_hyperparameters(**xgb_hyperparameters)

# Build the training step from the arguments returned by fit(), the same way
# the processing step is built from run(). The estimator is bound to a
# pipeline session, so fit() is expected to capture the request as step
# arguments rather than launch a training job immediately. Passing
# `estimator=` to TrainingStep is the deprecated form in the SageMaker SDK.
train_args = xgb_train.fit(
    inputs={
        # Both channels read the CSV outputs of the processing step; the
        # property references are resolved when the pipeline executes.
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["validation"].S3Output.S3Uri,
            content_type="text/csv",
        ),
    }
)
step_train = TrainingStep(name="Train", step_args=train_args)


# Model that pairs the XGBoost image with the artifacts produced by the
# training step.
model = Model(
    role=role,
    sagemaker_session=pipeline_session,
    image_uri=xgb_image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
)

# NOTE(review): accelerator_type asks for an Elastic Inference accelerator;
# confirm this is still wanted/available for this model before relying on it.
create_model_args = model.create(
    instance_type="ml.m5.large",
    accelerator_type="ml.eia1.medium",
)
step_create_model = ModelStep(step_args=create_model_args, name="CreateModel")


pipeline_name = "ADGUPipeline"

# Steps in the order process -> train -> create model.
pipeline_steps = [step_process, step_train, step_create_model]
pipeline = Pipeline(steps=pipeline_steps, name=pipeline_name)

# Submit the pipeline definition under the given role, then start a run.
pipeline.upsert(role_arn=role)
execution = pipeline.start()
# Kept as the final expression so its result is shown when this runs as a
# notebook cell.
execution.describe()