In [17]:
# Importing necessary libraries
import os
import json
import boto3
import pandas as pd
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.properties import PropertyFile
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.model import Model
from sagemaker.inputs import TrainingInput, TransformInput
from sagemaker.lineage.visualizer import LineageTableVisualizer
from sagemaker.estimator import Estimator
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import image_uris
from sagemaker.xgboost.model import XGBoostModel
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.workflow.conditions import ConditionGreaterThan
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet

In [18]:
# Session setup: local scratch directory, AWS region, execution role,
# SageMaker sessions and the default S3 bucket.
LOCAL_DIR = "local_artifacts"
prefix = "student-anxiety-pipeline"

# Region comes from the default boto3 session configuration.
boto_session = boto3.Session()
region = boto_session.region_name
role = sagemaker.get_execution_role()

# `sm_session` is used for the default bucket lookup; `pipeline_session` is
# the session handed to every processor / estimator / model in the cells
# below that build pipeline steps.
sm_session = sagemaker.Session()
pipeline_session = PipelineSession()
bucket = sm_session.default_bucket()

os.makedirs(LOCAL_DIR, exist_ok=True)

print(f"Region: {region}")
print(f"Role: {role}")

Region: us-east-1
Role: arn:aws:iam::303848588930:role/LabRole


In [19]:
# Pipeline parameters.
# The three instance parameters are the ones later listed in the
# `parameters=` argument of the Pipeline definition.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# NOTE(review): the default below still contains the placeholder segment
# "path/to/", and `input_data` is neither passed to any step nor listed in
# the Pipeline's parameters in the cells visible here - confirm whether it
# is meant to feed the preprocessing step.
input_data = ParameterString(
    name="InputData",
    default_value=f"s3://{bucket}/path/to/student_anxiety_raw.csv",
)


In [20]:
# Preprocessing step: runs code/preprocessing.py in a scikit-learn
# processing container and publishes six named outputs.
preprocess_script_path = "code/preprocessing.py"

sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version="1.2-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="student-anxiety-preprocess",
    sagemaker_session=pipeline_session,
)

# (output_name, container directory) pairs, in publication order.
# Note the validation outputs are named "val_*" but live under
# "validation_*" directories inside the container.
preprocess_output_dirs = [
    # SKLearn outputs
    ("train_sklearn", "/opt/ml/processing/train_sklearn"),
    ("val_sklearn", "/opt/ml/processing/validation_sklearn"),
    ("test_sklearn", "/opt/ml/processing/test_sklearn"),
    # XGBoost outputs
    ("train_xgb", "/opt/ml/processing/train_xgb"),
    ("val_xgb", "/opt/ml/processing/validation_xgb"),
    ("test_xgb", "/opt/ml/processing/test_xgb"),
]
preprocess_outputs = [
    ProcessingOutput(output_name=out_name, source=container_dir)
    for out_name, container_dir in preprocess_output_dirs
]

# No ProcessingInput is declared; the script only receives the bucket name
# and region as command-line arguments.
preprocess_step = ProcessingStep(
    name="PreprocessStudentData",
    processor=sklearn_processor,
    code=preprocess_script_path,
    outputs=preprocess_outputs,
    job_arguments=["--BUCKET", bucket, "--AWS_REGION", region],
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [21]:
# Training step for baseline model
# Script-mode scikit-learn job (baseline_lr/train.py) trained on the
# "train_sklearn" / "val_sklearn" outputs of the preprocessing step.
baseline_estimator = SKLearn(
    entry_point="train.py",
    source_dir="baseline_lr",
    role=role,
    framework_version="1.2-1",
    # Use the dedicated training parameter. This was previously wired to
    # processing_instance_type, so overriding TrainingInstanceType had no
    # effect on this job. Both parameters default to ml.m5.xlarge, so the
    # default behaviour is unchanged.
    instance_type=training_instance_type,
    instance_count=1,
    sagemaker_session=pipeline_session
)

# Channel inputs are resolved from the preprocessing step's outputs, which
# also makes this step depend on PreprocessStudentData.
baseline_train_input = TrainingInput(
    s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs["train_sklearn"].S3Output.S3Uri,
    content_type="text/csv"
)
baseline_val_input = TrainingInput(
    s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs["val_sklearn"].S3Output.S3Uri,
    content_type="text/csv"
)

train_step_sklearn = TrainingStep(
    name="TrainBaselineModel",
    estimator=baseline_estimator,
    inputs={"train": baseline_train_input, "validation": baseline_val_input}
)




In [22]:
# Training step for XGBoost model
# Uses the SageMaker XGBoost 1.7-1 image with a multi-class soft-probability
# objective, trained on the "train_xgb" / "val_xgb" preprocessing outputs.
num_classes = 3

xgb_container = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1"
)

xgb_estimator = Estimator(
    image_uri=xgb_container,
    role=role,
    instance_count=1,
    # Use the dedicated training parameter. This was previously wired to
    # processing_instance_type, so overriding TrainingInstanceType had no
    # effect on this job. Both parameters default to ml.m5.xlarge, so the
    # default behaviour is unchanged.
    instance_type=training_instance_type,
    sagemaker_session=pipeline_session,
    hyperparameters={
        "objective": "multi:softprob",
        "num_class": num_classes,
        "num_round": 300,
        "max_depth": 6,
        "eta": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "eval_metric": "mlogloss"
    }
)

# Channel inputs are resolved from the preprocessing step's outputs, which
# also makes this step depend on PreprocessStudentData.
xgb_train_input = TrainingInput(
    s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs["train_xgb"].S3Output.S3Uri,
    content_type="text/csv"
)
xgb_val_input = TrainingInput(
    s3_data=preprocess_step.properties.ProcessingOutputConfig.Outputs["val_xgb"].S3Output.S3Uri,
    content_type="text/csv"
)

train_step_xgb = TrainingStep(
    name="TrainXGBoostModel",
    estimator=xgb_estimator,
    inputs={"train": xgb_train_input, "validation": xgb_val_input}
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [23]:
# Evaluation step: runs code/evaluation.py against both trained models and
# the "test_*" outputs of the preprocessing step.
evaluation_script_path = "code/evaluation.py"

evaluation_processor = SKLearnProcessor(
    role=role,
    framework_version="1.2-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    base_job_name="evaluate-student-models",
    sagemaker_session=pipeline_session,
)

# Exposes evaluation.json from the "evaluation" output so later steps can
# read values from it.
# presumably code/evaluation.py writes that file under
# /opt/ml/processing/output - verify against the script.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json",
)

# Upstream artifacts resolved from earlier steps.
preprocess_outputs_by_name = preprocess_step.properties.ProcessingOutputConfig.Outputs
sklearn_test_uri = preprocess_outputs_by_name["test_sklearn"].S3Output.S3Uri
xgb_test_uri = preprocess_outputs_by_name["test_xgb"].S3Output.S3Uri
sklearn_model_uri = train_step_sklearn.properties.ModelArtifacts.S3ModelArtifacts
xgb_model_uri = train_step_xgb.properties.ModelArtifacts.S3ModelArtifacts

evaluation_inputs = [
    # SKLearn test data
    ProcessingInput(source=sklearn_test_uri, destination="/opt/ml/processing/test_sklearn"),
    # XGBoost test data
    ProcessingInput(source=xgb_test_uri, destination="/opt/ml/processing/test_xgb"),
    # Trained SKLearn model
    ProcessingInput(source=sklearn_model_uri, destination="/opt/ml/processing/sklearn_model"),
    # Trained XGBoost model
    ProcessingInput(source=xgb_model_uri, destination="/opt/ml/processing/xgb_model"),
]

evaluation_outputs = [
    ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/output"),
]

evaluate_step = ProcessingStep(
    name="EvaluateModels",
    processor=evaluation_processor,
    code=evaluation_script_path,
    inputs=evaluation_inputs,
    outputs=evaluation_outputs,
    property_files=[evaluation_report],
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [24]:
# Conditional model registration: each model is registered in its own model
# package group only when its F1 value in the evaluation report is strictly
# greater than `f1_threshold`.
f1_threshold = 0.5


def _csv_registration_kwargs(group_name):
    """Return the `register()` keyword arguments used for both models.

    A new dict with new lists is built on every call so the two
    registrations never share mutable arguments.
    """
    return {
        "content_types": ["text/csv"],
        "response_types": ["text/csv"],
        "inference_instances": ["ml.m5.large"],
        "transform_instances": ["ml.m5.large"],
        "model_package_group_name": group_name,
    }


def _register_if_f1_good(gate_name, f1_json_path, register_step):
    """Build a ConditionStep that runs `register_step` when F1 > f1_threshold.

    The F1 value is read from `evaluation_report` (produced by
    `evaluate_step`) at `f1_json_path`; nothing runs in the else branch.
    """
    reported_f1 = JsonGet(
        step_name=evaluate_step.name,
        property_file=evaluation_report,
        json_path=f1_json_path,
    )
    return ConditionStep(
        name=gate_name,
        conditions=[ConditionGreaterThan(left=reported_f1, right=f1_threshold)],
        if_steps=[register_step],
        else_steps=[],
    )


# SKLearn model
# NOTE(review): entry_point is given without source_dir here, whereas the
# baseline estimator was trained with source_dir="baseline_lr" - confirm the
# intended train.py is the one being picked up.
sklearn_model = SKLearnModel(
    model_data=train_step_sklearn.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    entry_point="train.py",
    framework_version="1.2-1",
    sagemaker_session=pipeline_session,
)
register_sklearn_model = ModelStep(
    name="RegisterSKLearnModel",
    step_args=sklearn_model.register(**_csv_registration_kwargs("StudentAnxiety-SKLearn")),
)
condition_register_sklearn = _register_if_f1_good(
    "RegisterSKLearnIfF1Good",
    "SKLearn.F1",
    register_sklearn_model,
)

# XGBoost model
xgb_model = XGBoostModel(
    model_data=train_step_xgb.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    framework_version="1.7-1",
    sagemaker_session=pipeline_session,
)
register_xgb_model = ModelStep(
    name="RegisterXGBoostModel",
    step_args=xgb_model.register(**_csv_registration_kwargs("StudentAnxiety-XGBoost")),
)
condition_register_xgb = _register_if_f1_good(
    "RegisterXGBIfF1Good",
    "XGBoost.F1",
    register_xgb_model,
)



INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


In [25]:
# Pipeline definition: assembles the parameters and steps built in the
# cells above into a single pipeline named "StudentAnxietyPipeline".
pipeline_parameters = [
    processing_instance_count,
    processing_instance_type,
    training_instance_type,
]

# Listed in their logical order: preprocess -> train (x2) -> evaluate ->
# conditional registration (x2).
pipeline_steps = [
    preprocess_step,
    train_step_sklearn,
    train_step_xgb,
    evaluate_step,
    condition_register_sklearn,
    condition_register_xgb,
]

pipeline = Pipeline(
    name="StudentAnxietyPipeline",
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=pipeline_session,
)

In [26]:
# Create or update the pipeline definition under the execution role,
# then start a run with the default parameter values.
upsert_response = pipeline.upsert(role_arn=role)

# Every run of this cell starts a new pipeline execution.
execution = pipeline.start()
print(f"Pipeline execution started: {execution.arn}")



Pipeline execution started: arn:aws:sagemaker:us-east-1:303848588930:pipeline/StudentAnxietyPipeline/execution/uyiqvk2sgi7r
