# SageMaker Pipeline for ResNet Defect Detection

This notebook creates a SageMaker Pipeline with three steps:
1. Data preprocessing step using the sklearn container
2. Model training step using the PyTorch container
3. Model evaluation step using the sklearn container

In [1]:
import os

import boto3
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.pytorch import PyTorch
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterString, ParameterInteger
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import ProcessingStep, TrainingStep

# Configuration
pipeline_name = "ResNet50-ADL-Pipeline"
# Notebooks do not define __file__, so the helper scripts under ./code are
# resolved relative to the kernel's current working directory.
base_dir = os.getcwd()

# Defect classes to train: a comma-separated list of class names, or "all"
# to train on every discovered class.
# Example list: "lighting_panel,shifted_grab_handle,frosted_window,Diffuser_cover"
DEFECT_CLASSES = "Diffuser_cover"

# Initialize SageMaker pipeline session
pipeline_session = PipelineSession()
bucket_name = pipeline_session.default_bucket()

# The project prefix is the first two segments of the configured default
# bucket prefix. Fail with a clear message when no prefix is configured
# instead of an opaque AttributeError on None.
default_prefix = pipeline_session.default_bucket_prefix
if not default_prefix:
    raise ValueError(
        "No default S3 bucket prefix is configured for this SageMaker session; "
        "cannot derive the project prefix for the shared data locations."
    )
project_prefix = "/".join(default_prefix.split("/")[0:2])
role = sagemaker.get_execution_role()

# All locations below are "<bucket>/<key>" strings WITHOUT the "s3://"
# scheme; the scheme is prepended where each location is used.
shared_root = f"{bucket_name}/{project_prefix}/shared"
print(f"s3://{shared_root}/")
dataset_location = f"{shared_root}/Dataset"
dataprocessing_output = f"{shared_root}/{pipeline_name}/processed-data"
training_output = f"{shared_root}/{pipeline_name}/training-output"

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
s3://amazon-sagemaker-946156973544-ap-southeast-1-6583c3ac0a8a/dzd-4uv7pknhqq5kgx/avt7vgtu28llb5/shared/


In [2]:
# Pipeline parameters: each one can be overridden per execution; the
# default is used when no override is supplied at start time.

# S3 URI of the raw dataset.
input_data_uri = ParameterString(name="InputDataUri", default_value=f"s3://{dataset_location}")

# Instance types for the processing jobs and for the training job.
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
training_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.g4dn.xlarge")

# Training hyperparameters exposed at the pipeline level.
epochs = ParameterInteger(name="Epochs", default_value=50)
batch_size = ParameterInteger(name="BatchSize", default_value=16)

# Defect class selection; defaults to the notebook-level DEFECT_CLASSES.
defect_classes = ParameterString(name="DefectClasses", default_value=DEFECT_CLASSES)

In [3]:
# Step 1: Data Preprocessing using SKLearn container
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    instance_type=processing_instance_type,
    instance_count=1,
    role=role,
    sagemaker_session=pipeline_session
)

# Because the processor is bound to a PipelineSession, run() does not start
# a job; it returns the step arguments that the pipeline executes later.
# This replaces the deprecated ProcessingStep(processor=..., inputs=...,
# outputs=..., code=...) form.
processing_args = sklearn_processor.run(
    code=os.path.join(base_dir, "code", "processing.py"),
    inputs=[
        # Raw dataset, mounted inside the processing container.
        ProcessingInput(
            source=input_data_uri,
            destination="/opt/ml/processing/input"
        )
    ],
    outputs=[
        # Train split written by the script, uploaded to the shared prefix.
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/output/train",
            destination=f"s3://{dataprocessing_output}/train"
        ),
        # Validation split written by the script.
        ProcessingOutput(
            output_name="val_data",
            source="/opt/ml/processing/output/val",
            destination=f"s3://{dataprocessing_output}/val"
        )
    ]
)

processing_step = ProcessingStep(
    name="PreprocessData",
    step_args=processing_args,
    # Reuse a previous successful run with identical arguments for up to 30 days.
    cache_config=sagemaker.workflow.steps.CacheConfig(enable_caching=True, expire_after="30d")
)

sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds


In [4]:
# Step 2: Model Training using PyTorch container
pytorch_estimator = PyTorch(
    entry_point="training_all_folds.py",
    source_dir=os.path.join(base_dir, "code"),
    role=role,
    instance_type=training_instance_type,
    instance_count=1,
    framework_version="2.0.0",
    py_version="py310",
    hyperparameters={
        # Values driven by pipeline parameters (overridable per execution).
        "epochs": epochs,
        "batch_size": batch_size,
        "defect_classes": defect_classes,
        # Fixed values passed straight through to the training script.
        "lr": 0.001,
        "num_classes": 2,
        "drop_rate": 0.4,
        "drop_threshold": 0.8,
        "adl_alpha": 0.1,
        "patience": 5
    },
    output_path=f"s3://{training_output}",
    sagemaker_session=pipeline_session
)

# Because the estimator is bound to a PipelineSession, fit() does not start
# a training job; it returns the step arguments for the pipeline. This
# replaces the deprecated TrainingStep(estimator=..., inputs=...) form.
training_args = pytorch_estimator.fit(
    inputs={
        # Outputs of the preprocessing step; referencing its properties
        # also makes this step depend on PreprocessData.
        "train": TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["train_data"].S3Output.S3Uri
        ),
        "val": TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs["val_data"].S3Output.S3Uri
        ),
        # The raw dataset location is also exposed as its own channel.
        "images": TrainingInput(
            s3_data=input_data_uri
        )
    }
)

training_step = TrainingStep(
    name="TrainModel",
    step_args=training_args
)

sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.TrainingJob.VpcConfig.SecurityGroupIds


In [5]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Step 3: Model evaluation using SKLearn container
evaluation_processor = SKLearnProcessor(
    framework_version="1.2-1",
    instance_type=processing_instance_type,
    instance_count=1,
    role=role,
    sagemaker_session=pipeline_session
)

# Because the processor is bound to a PipelineSession, run() does not start
# a job; it returns the step arguments for the pipeline. This replaces the
# deprecated ProcessingStep(processor=..., inputs=..., outputs=..., code=...)
# form.
evaluation_args = evaluation_processor.run(
    code=os.path.join(base_dir, "code", "evaluation.py"),
    inputs=[
        # Model artifact produced by the training step; referencing its
        # properties also makes this step depend on TrainModel.
        ProcessingInput(
            source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model"
        )
    ],
    outputs=[
        # Whatever the script writes to the output directory is uploaded here.
        ProcessingOutput(
            output_name="evaluation",
            source="/opt/ml/processing/output",
            destination=f"s3://{bucket_name}/{project_prefix}/shared/{pipeline_name}/evaluation"
        )
    ]
)

evaluation_step = ProcessingStep(
    name="EvaluateModel",
    step_args=evaluation_args
)

sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds


In [6]:
# Assemble the pipeline and register it (this cell does not start a run).
pipeline_parameters = [
    input_data_uri,
    processing_instance_type,
    training_instance_type,
    epochs,
    batch_size,
    defect_classes,
]
pipeline_steps = [processing_step, training_step, evaluation_step]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=pipeline_session,
)

# upsert creates the pipeline if it does not exist, otherwise updates it.
pipeline.upsert(role_arn=role)

print(f"Pipeline created: {pipeline.name}")
print(f"Training output location: s3://{training_output}")
definition_json = pipeline.definition()
print(f"Pipeline definition: {definition_json}")

Pipeline created: ResNet50-ADL-Pipeline
Training output location: s3://amazon-sagemaker-946156973544-ap-southeast-1-6583c3ac0a8a/dzd-4uv7pknhqq5kgx/avt7vgtu28llb5/shared/ResNet50-ADL-Pipeline/training-output
Pipeline definition: {"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "InputDataUri", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-946156973544-ap-southeast-1-6583c3ac0a8a/dzd-4uv7pknhqq5kgx/avt7vgtu28llb5/shared/Dataset"}, {"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.g4dn.xlarge"}, {"Name": "Epochs", "Type": "Integer", "DefaultValue": 50}, {"Name": "BatchSize", "Type": "Integer", "DefaultValue": 16}, {"Name": "DefectClasses", "Type": "String", "DefaultValue": "Diffuser_cover"}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "Preproce

In [7]:
# Execute the pipeline
execution = pipeline.start(
    parameters={
        "DefectClasses": DEFECT_CLASSES
    }
)

print(f"Pipeline execution started: {execution.arn}")
print(f"Training defect classes: {DEFECT_CLASSES}")

# Wait for completion.
# execution.wait() defaults to delay=30, max_attempts=60 (about 30 minutes).
# Preprocessing + training + evaluation can run longer than that, and the
# waiter would then raise even though the execution is still in progress.
# Poll once a minute for up to 4 hours instead; adjust to the expected
# training time.
WAIT_POLL_SECONDS = 60
WAIT_MAX_ATTEMPTS = 240
execution.wait(delay=WAIT_POLL_SECONDS, max_attempts=WAIT_MAX_ATTEMPTS)
print("\nPipeline execution completed!")
print(f"Evaluation results: s3://{bucket_name}/{project_prefix}/shared/{pipeline_name}/evaluation")

Pipeline execution started: arn:aws:sagemaker:ap-southeast-1:946156973544:pipeline/ResNet50-ADL-Pipeline/execution/bgmo0qq2mcd9
Training defect classes: Diffuser_cover
