# Electronics Sales Pipeline - Multi-Category

Run XGBoost hyperparameter optimization per product category using Optuna.

In [None]:
import os
import sagemaker
from sagemaker.workflow.notebook_job_step import NotebookJobStep
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import LocalPipelineSession, PipelineSession

In [None]:
def define_category_pipeline_steps(
    category_configs: dict,
    image_uri: str,
    notebook_artifacts: str,
    input_notebook_name: str,
    kernel_name: str = "python3",
    instance_type: str = "ml.m5.xlarge",
    role: str = None,
    max_runtime_in_seconds: int = 3600,
    max_retry_attempts: int = 2,
) -> list:
    """
    Define one notebook job step per product category.

    Args:
        category_configs: Mapping of category name to a dict of settings.
            Every setting is stringified and passed to the notebook as a
            job parameter.
        image_uri: Image passed to each ``NotebookJobStep``.
        notebook_artifacts: Passed as ``s3_root_uri`` to each step.
        input_notebook_name: Notebook passed as ``input_notebook``.
        kernel_name: Kernel passed to each step.
        instance_type: Instance type passed to each step.
        role: IAM role for the steps. When ``None`` it is resolved with
            ``sagemaker.get_execution_role()``.
        max_runtime_in_seconds: Runtime limit passed to each step.
        max_retry_attempts: Retry count passed to each step.

    Returns:
        A list of ``NotebookJobStep`` objects, one per entry of
        ``category_configs``, in the mapping's iteration order.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import json

    if role is None:
        role = sagemaker.get_execution_role()

    pipeline_steps = []
    for category, config in category_configs.items():
        nb_job_params = {
            # Serialize the single-item list with json.dumps rather than by
            # hand so quotes or backslashes in a category name are escaped.
            # For plain names the result is unchanged, e.g. '["laptops"]'.
            "categories_to_train": json.dumps(
                [str(category)], ensure_ascii=False
            ),
            # Spread after the key above, so a "categories_to_train" entry in
            # the config overrides the generated value.
            **{k: str(v) for k, v in config.items()},
        }

        # The same identifier is used as step name, job name and display name.
        step_name = f"electronics-train-{category}"

        nb_step = NotebookJobStep(
            name=step_name,
            description=f"XGBoost training for {category}",
            notebook_job_name=step_name,
            image_uri=image_uri,
            kernel_name=kernel_name,
            display_name=step_name,
            role=role,
            s3_root_uri=notebook_artifacts,
            input_notebook=input_notebook_name,
            instance_type=instance_type,
            parameters=nb_job_params,
            max_runtime_in_seconds=max_runtime_in_seconds,
            max_retry_attempts=max_retry_attempts,
        )
        pipeline_steps.append(nb_step)

    return pipeline_steps

## Configuration

In [None]:
# Pipeline identity and the notebook each step executes
pipeline_name = "electronics-sales-pipeline"
train_notebook = "fm_train.ipynb"
kernel_name = "python3"

# AWS settings - UPDATE THESE
region = "us-east-1"
bucket = "your-bucket-name"
instance_type = "ml.m5.xlarge"
notebook_artifacts = f"s3://{bucket}/electronics-training"
# NOTE(review): the account ID embedded in this ARN looks specific to
# us-east-1 - confirm the correct ID before changing `region`.
image_uri = f"arn:aws:sagemaker:{region}:885854791233:image/sagemaker-distribution-cpu"

# MLflow setup: turn on system-metrics logging via the environment
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
# os.environ["MLFLOW_TRACKING_URI"] = "your-mlflow-tracking-uri"

In [None]:
# Training configuration per product category.
# Each entry becomes its own pipeline step. All categories currently share
# the same settings and differ only in experiment name; to tune a single
# category, adjust its entry after this mapping is built.

category_configs = {
    category: {
        "n_trials": 50,
        "n_rows_per_category": 1000,
        "experiment_name": experiment,
        "test_size": 0.25,
        "seed": 42,
    }
    for category, experiment in (
        ("smartphones", "electronics-smartphones"),
        ("laptops", "electronics-laptops"),
        ("tablets", "electronics-tablets"),
        ("accessories", "electronics-accessories"),
    )
}

print(f"Categories: {list(category_configs)}")

## Define Pipeline Steps

In [None]:
# Build one notebook job step per configured category. Keyword arguments
# make the mapping from notebook settings to helper parameters explicit.
pipeline_steps = define_category_pipeline_steps(
    category_configs=category_configs,
    image_uri=image_uri,
    notebook_artifacts=notebook_artifacts,
    input_notebook_name=train_notebook,
    kernel_name=kernel_name,
    instance_type=instance_type,
)

# Summarize what was created so a misconfigured category is easy to spot.
print(f"Created {len(pipeline_steps)} pipeline steps:")
for step in pipeline_steps:
    print(f"  - {step.name}")

## Local Mode Execution (Testing)

In [None]:
# Uncomment to test locally (requires Docker Compose)
# session = LocalPipelineSession()
# pipeline = Pipeline(name=pipeline_name, steps=pipeline_steps, sagemaker_session=session)
# role = sagemaker.get_execution_role()
# pipeline.create(role)
# execution = pipeline.start()

## SageMaker Pipeline Execution

In [None]:
# Session and IAM role used to register and run the pipeline
session = PipelineSession()
role = sagemaker.get_execution_role()

pipeline = Pipeline(name=pipeline_name, steps=pipeline_steps, sagemaker_session=session)

# Register (or update) the pipeline definition, then start a run of it
pipeline.upsert(role_arn=role)
execution = pipeline.start()

print(f"Pipeline: {pipeline_name}")
print(f"Execution: {execution.arn}")

In [None]:
# Monitor execution: as the last expression in the cell, the result of
# describe() is displayed by the notebook. Re-run the cell to refresh it.
execution.describe()

In [None]:
# Wait for completion (optional)
# execution.wait()