# Data preparation using SageMaker Processing

## Setup environment

In [None]:
import os
import boto3
import sagemaker
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

# SageMaker session; used below to look up the default S3 bucket.
sagemaker_session = sagemaker.Session()
# Execution role of the current notebook environment. It is passed to the
# Processor further down, so the processing job runs with this role.
role = sagemaker.get_execution_role()
# Default bucket of the session; assign your own bucket name here instead if preferred.
bucket = sagemaker_session.default_bucket()

## Build and push container

In [None]:
# Name handed to docker/build_and_push.sh in the next cell
# (presumably also the ECR repository name -- confirm against the script).
image_name = 'data-processing-containers'

In [None]:
# IPython shell escape: runs the build-and-push script with the Python variable
# `image_name` substituted in as its only argument.
! sh ./docker/build_and_push.sh $image_name

In [None]:
# Replace the placeholder with the full URI of the container image to run; it is
# passed as `image_uri` to the Processor below.
# NOTE(review): presumably the ECR image URI printed by build_and_push.sh, i.e.
# <account>.dkr.ecr.<region>.amazonaws.com/<repository>:<tag> -- confirm.
container = '<your-container-image-uri>'

In [None]:
# Processing job definition that runs the custom container image with the
# notebook's execution role.
data_processor = Processor(
    role=role,
    image_uri=container,
    instance_count=1,                 # single instance
    instance_type='ml.m5.xlarge',
    volume_size_in_gb=30,
    max_runtime_in_seconds=1200,      # 20-minute limit
    base_job_name='data-processing',
)

In [None]:
# Directory inside the processing container that is passed to the script as
# --data_dir and declared below as the source of the 'preprocessed' output.
output_prefix = '/opt/ml/processing/output'

# ProcessingOutput.destination has to be a full S3 URI. A bare bucket name is
# not one, so the bucket chosen during setup would not be honoured as the
# upload target. Build the URI explicitly and keep the results under a
# dedicated key prefix instead of the bucket root.
output_s3_uri = f's3://{bucket}/data-processing/preprocessed'

data_processor.run(
    # Command-line arguments forwarded to the container's entry point.
    # NOTE(review): '--task=all' presumably runs every preparation step --
    # confirm against the script shipped in the container.
    arguments=[
        f'--data_dir={output_prefix}',
        '--task=all',
    ],
    outputs=[
        ProcessingOutput(
            output_name='preprocessed',
            source=output_prefix,
            destination=output_s3_uri,
        ),
    ],
)