# AR Dataset Preprocessing Pipeline - SageMaker Studio Execution
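Run this notebook in SageMaker Studio. It defines, registers, and starts a SageMaker Pipeline with a single processing step that preprocesses the AR image dataset.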

In [1]:
import sagemaker
import boto3
import os
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import TrainingStep, ProcessingStep, CacheConfig
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
    ParameterBoolean
)

from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.properties import PropertyFile
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.drift_check_baselines import DriftCheckBaselines

from botocore.exceptions import ClientError

sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


In [2]:
# # Upload local dataset to S3
import boto3
import os
import sagemaker

# s3 = boto3.client('s3')
# Relative path of the dataset folder, plus the key prefix referenced by the
# (currently commented-out) upload helper further down in this cell.
s3_dataset_prefix, local_dataset_path = 'ar-dataset', 'shared/Dataset/AR_Train'

# Session-derived settings: default bucket, its default key prefix, and the execution role.
sagemaker_session = sagemaker.Session()
bucket, project_prefix = (
    sagemaker_session.default_bucket(),
    sagemaker_session.default_bucket_prefix,
)
role = sagemaker.get_execution_role()

# def upload_directory_to_s3(local_directory, bucket, s3_prefix):
#     for root, dirs, files in os.walk(local_directory):
#         for file in files:
#             local_path = os.path.join(root, file)
#             relative_path = os.path.relpath(local_path, local_directory)
#             s3_path = os.path.join(s3_prefix, relative_path).replace('\\', '/')
            
#             print(f"Uploading {local_path} to s3://{bucket}/{s3_path}")
#             s3.upload_file(local_path, bucket, s3_path)

# # Upload the dataset
# print("Uploading dataset to S3...")
# upload_directory_to_s3(local_dataset_path, bucket, s3_dataset_prefix)
# print("Upload completed!")

# Update your S3 URI
# Both URIs live under the session's default key prefix with its final
# path segment dropped. The prefix is computed once and reused.
# - Nested double quotes inside a double-quoted f-string only parse on
#   Python 3.12+, so the join is done outside the f-strings.
# - `default_bucket_prefix` may be None when no prefix is configured; treat
#   that as "no prefix" instead of failing on `.split`, and avoid emitting
#   an empty path segment ("s3://bucket//shared") in that case.
_parent_prefix = "/".join((project_prefix or "").split("/")[:-1])
_shared_base = "/".join(part for part in (f"s3://{bucket}", _parent_prefix) if part)

# Root of the shared folder in S3.
SHARE_LOC = f"{_shared_base}/shared"

# S3 prefix of the training dataset (trailing slash kept deliberately).
INPUT_S3_URI = f"{_shared_base}/{local_dataset_path}/"
print(f"Dataset uploaded to: {INPUT_S3_URI}")

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
Dataset uploaded to: s3://amazon-sagemaker-036348883246-ap-southeast-1-e4202aa55412/dzd_4okzm1s1vw7yzl/65zfxlhn9ocbdt/shared/Dataset/AR_Train/


In [3]:
import boto3
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.parameters import ParameterString, ParameterFloat, ParameterInteger
from sagemaker.workflow.pipeline_context import LocalPipelineSession
import os

# Switch to a pipeline-aware session and re-derive role, region and bucket from it.
sagemaker_session = PipelineSession()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()

# Echo the resolved settings, one per line.
for label, value in (
    ("SageMaker role", role),
    ("Default bucket", bucket),
    ("Region", region),
):
    print(f"{label}: {value}")

sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix
SageMaker role: arn:aws:iam::036348883246:role/datazone_usr_role_65zfxlhn9ocbdt_cum5fqsu6crj8h
Default bucket: amazon-sagemaker-036348883246-ap-southeast-1-e4202aa55412
Region: ap-southeast-1


In [4]:
# Location of the preprocessing script under the shared S3 prefix.
# Nothing is uploaded here; the path is only composed and echoed.
preprocessing_script_path = "/".join((SHARE_LOC, "preprocessing_script.py"))
print(preprocessing_script_path)
# # Write the preprocessing script
# preprocessing_code = '''
# #!/usr/bin/env python3
# import argparse
# import os
# import cv2
# import numpy as np
# import pickle
# from glob import glob
# from sklearn.model_selection import train_test_split
# import logging
# import subprocess
# import sys

# # Install required packages
# subprocess.check_call([sys.executable, "-m", "pip", "install", "opencv-python"])
# subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])

# # Set up logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# def load_and_preprocess_data(data_dir, image_size=(224, 224), val_split=0.2):
#     \"\"\"Load and preprocess AR dataset\"\"\"
#     class_names = ['non-defect', 'defect']
#     class_map = {name: i for i, name in enumerate(class_names)}
    
#     search_pattern = os.path.join(data_dir, 'AR_Train', '*', 'lighting_panel', '*.jpg')
#     image_paths = glob(search_pattern)
    
#     if not image_paths:
#         logger.warning(f\"No images found with pattern: {search_pattern}\")
#         return (None, None), (None, None)
    
#     images = []
#     labels = []
    
#     logger.info(f\"Found {len(image_paths)} images. Preprocessing...\")
    
#     for path in image_paths:
#         try:
#             img = cv2.imread(path)
#             if img is None:
#                 continue
            
#             img_resized = cv2.resize(img, (image_size[1], image_size[0]))
#             img_normalized = img_resized.astype(np.float32) / 255.0
            
#             class_name = os.path.basename(os.path.dirname(os.path.dirname(path)))
            
#             if class_name in class_map:
#                 images.append(img_normalized)
#                 labels.append(class_map[class_name])
                
#         except Exception as e:
#             logger.error(f\"Error processing {path}: {e}\")
    
#     if not images:
#         return (None, None), (None, None)
    
#     X = np.array(images)
#     y = np.array(labels)
    
#     X_train, X_val, y_train, y_val = train_test_split(
#         X, y, test_size=val_split, random_state=42, stratify=y
#     )
    
#     logger.info(f\"Training: {len(X_train)}, Validation: {len(X_val)}\")
#     return (X_train, y_train), (X_val, y_val)

# def save_data(data, output_path, filename):
#     \"\"\"Save data using pickle\"\"\"
#     os.makedirs(output_path, exist_ok=True)
#     filepath = os.path.join(output_path, filename)
#     with open(filepath, 'wb') as f:
#         pickle.dump(data, f)
#     logger.info(f\"Saved: {filepath}\")

# if __name__ == \"__main__\":
#     parser = argparse.ArgumentParser()
#     parser.add_argument(\"--image-height\", type=int, default=224)
#     parser.add_argument(\"--image-width\", type=int, default=224)
#     parser.add_argument(\"--val-split\", type=float, default=0.2)
#     args = parser.parse_args()
    
#     input_dir = \"/opt/ml/processing/input\"
#     output_train_dir = \"/opt/ml/processing/output/train\"
#     output_val_dir = \"/opt/ml/processing/output/val\"
    
#     (X_train, y_train), (X_val, y_val) = load_and_preprocess_data(
#         data_dir=input_dir,
#         image_size=(args.image_height, args.image_width),
#         val_split=args.val_split
#     )
    
#     if X_train is not None:
#         save_data({'images': X_train, 'labels': y_train}, output_train_dir, 'train_data.pkl')
#         save_data({'images': X_val, 'labels': y_val}, output_val_dir, 'val_data.pkl')
#         save_data({
#             'image_size': (args.image_height, args.image_width),
#             'train_samples': len(X_train),
#             'val_samples': len(X_val)
#         }, output_train_dir, 'metadata.pkl')
#         logger.info(\"Preprocessing completed successfully!\")
#     else:
#         logger.error(\"No data processed\")
# '''

# with open(preprocessing_script_path, 'w') as f:
#     f.write(preprocessing_code)

# print(f"Created {preprocessing_script_path}")

s3://amazon-sagemaker-036348883246-ap-southeast-1-e4202aa55412/dzd_4okzm1s1vw7yzl/65zfxlhn9ocbdt/shared/preprocessing_script.py


In [5]:
# Input/output locations for the preprocessing job.
# IMPORTANT: Update these paths with your actual locations.
# https://hekynw8s7gjlmop.studio.ap-southeast-1.sagemaker.aws/jupyterlab/default/lab/tree/shared/Dataset

# Where your AR dataset is stored. This is a relative folder name, even
# though the label printed below calls it an "S3 URI".
INPUT_DATA_PATH = "Dataset"

# Where processed data will be saved (beneath the shared prefix).
OUTPUT_S3_URI = SHARE_LOC + "/output"

print(
    "Input S3 URI: " + INPUT_DATA_PATH,
    "Output S3 URI: " + OUTPUT_S3_URI,
    sep="\n",
)

Input S3 URI: Dataset
Output S3 URI: s3://amazon-sagemaker-036348883246-ap-southeast-1-e4202aa55412/dzd_4okzm1s1vw7yzl/65zfxlhn9ocbdt/shared/output


In [6]:
# Create the pipeline
from sagemaker.workflow.functions import Join

# Pipeline parameters. Each one can be overridden per execution through
# pipeline.start(parameters={...}); the defaults come from the cells above.
input_data_path = ParameterString(name="InputDataPath", default_value=INPUT_S3_URI)
output_data_uri = ParameterString(name="OutputDataUri", default_value=OUTPUT_S3_URI)
image_size_height = ParameterInteger(name="ImageSizeHeight", default_value=224)
image_size_width = ParameterInteger(name="ImageSizeWidth", default_value=224)
val_split = ParameterFloat(name="ValidationSplit", default_value=0.2)

# SKLearn processor that runs the preprocessing script on a single instance.
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="ar-preprocessing",
    sagemaker_session=sagemaker_session,
)

# Processing step: one input channel, two output channels (train / val).
processing_step = ProcessingStep(
    name="ARDataPreprocessing",
    processor=sklearn_processor,
    inputs=[
        ProcessingInput(
            # Read from the InputDataPath parameter. Previously the source was
            # the hard-coded literal "Dataset", so the parameter was declared
            # and registered but had no effect on the job.
            source=input_data_path,
            # NOTE(review): the commented-out script in this notebook globs
            # <input>/AR_Train/*/lighting_panel/*.jpg. The default
            # InputDataPath already points at the AR_Train prefix, so it is
            # mounted one level down to keep that layout. Confirm against the
            # deployed preprocessing_script.py.
            destination="/opt/ml/processing/input/AR_Train",
            input_name="local_data",
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name="train_data",
            source="/opt/ml/processing/output/train",
            destination=Join(on="/", values=[output_data_uri, "train"]),
        ),
        ProcessingOutput(
            output_name="val_data",
            source="/opt/ml/processing/output/val",
            destination=Join(on="/", values=[output_data_uri, "val"]),
        ),
    ],
    code=preprocessing_script_path,
    # Container arguments are strings. to_string() asks the pipeline to render
    # the Integer/Float parameters as strings when the step is executed.
    job_arguments=[
        "--image-height", image_size_height.to_string(),
        "--image-width", image_size_width.to_string(),
        "--val-split", val_split.to_string(),
    ],
)

# Assemble the pipeline; every parameter referenced above must be registered here.
pipeline = Pipeline(
    name="ar-preprocessing-pipeline",
    parameters=[
        input_data_path,
        output_data_uri,
        image_size_height,
        image_size_width,
        val_split,
    ],
    steps=[processing_step],
    sagemaker_session=sagemaker_session,
)

print("Pipeline created successfully!")

sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.Subnets
sagemaker.config INFO - Applied value from config key = SageMaker.ProcessingJob.NetworkConfig.VpcConfig.SecurityGroupIds
Pipeline created successfully!


In [7]:
# Print the JSON definition that will be registered with SageMaker.
print("Pipeline Definition:")
print(pipeline.definition())

# Create the pipeline, or update it if one with the same name already exists.
pipeline.upsert(
    role_arn=role,
    description="AR Defect Detection Pipeline using ResNet50-SE",
)

# Start an execution using the parameters' default values; later cells
# use `execution` to wait for and inspect the run.
execution = pipeline.start()

Pipeline Definition:
{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "InputDataPath", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-036348883246-ap-southeast-1-e4202aa55412/dzd_4okzm1s1vw7yzl/65zfxlhn9ocbdt/shared/Dataset/AR_Train/"}, {"Name": "OutputDataUri", "Type": "String", "DefaultValue": "s3://amazon-sagemaker-036348883246-ap-southeast-1-e4202aa55412/dzd_4okzm1s1vw7yzl/65zfxlhn9ocbdt/shared/output"}, {"Name": "ImageSizeHeight", "Type": "Integer", "DefaultValue": 224}, {"Name": "ImageSizeWidth", "Type": "Integer", "DefaultValue": 224}, {"Name": "ValidationSplit", "Type": "Float", "DefaultValue": 0.2}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ARDataPreprocessing", "Type": "Processing", "Arguments": {"ProcessingResources": {"ClusterConfig": {"InstanceType": "ml.t3.medium", "InstanceCount": 1, "VolumeSizeInGB": 30}}, "AppSpecification": {"

In [None]:
# # Create/Update the pipeline
# try:
#     pipeline.upsert(role_arn=role)
#     print("Pipeline upserted successfully!")
# except Exception as e:
#     print(f"Error upserting pipeline: {e}")

In [None]:
# Execute the pipeline
# execution = pipeline.start(
#     parameters={
#         "InputDataPath": INPUT_S3_URI,
#         "OutputDataUri": OUTPUT_S3_URI,
#         "ImageSizeHeight": 128,  # Matching your example
#         "ImageSizeWidth": 128,
#         "ValidationSplit": 0.2
#     }
# )

# print(f"Pipeline execution started!")
# print(f"Execution ARN: {execution.arn}")
# print(f"Execution name: {execution.name}")

In [None]:
# Wait for the execution: poll every 30 s, at most 120 times (up to 1 hour).
execution.wait(delay=30, max_attempts=120)
print("Pipeline execution completed!")

# Fetch the final state once, then report it.
final_status = execution.describe()['PipelineExecutionStatus']
print(f"Status: {final_status}")

In [None]:
# Report each step's name and status, plus the failure reason when one is present.
steps = execution.list_steps()
for step_info in steps:
    report_lines = [
        f"Step: {step_info['StepName']}",
        f"Status: {step_info['StepStatus']}",
    ]
    if 'FailureReason' in step_info:
        report_lines.append(f"Failure Reason: {step_info['FailureReason']}")
    report_lines.append("---")
    print("\n".join(report_lines))