# Prepare data with SageMaker Processing

## Setup environment

In [1]:
import os
import boto3
import sagemaker
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role() # we are using the notebook instance role for training in this example
bucket = sagemaker_session.default_bucket() # you can specify a bucket name here

## Get data

In [None]:
# !wget https://aws-mlops-workshop.s3-eu-west-1.amazonaws.com/reviews/workshop_data/reviews.csv

In [13]:
prefix = 'data/input'
s3_input = sagemaker_session.upload_data('reviews.csv', bucket, prefix)
print(s3_input)

s3://sagemaker-eu-west-1-850475559451/data/input/reviews.csv


## Build and push container

In [2]:
image_name = 'data-processing-containers'

In [3]:
!sh ./docker/build_and_push.sh $image_name

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Building image with name data-processing-containers
Sending build context to Docker daemon   7.68kB
Step 1/6 : FROM ubuntu:18.04
18.04: Pulling from library/ubuntu

[1B4877105a: Pulling fs layer 
[1Bcaa0f5b9: Pulling fs layer 
[1B811b6c42: Pulling fs layer 
[1BDigest: sha256:3235326357dfb65f1781dbc4df3b834546d8bf914e82cce58e6e6b676e23ce8f[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[4A[1K[K[3A[1K[K[2A[1K[K[1A[1K[K
Status: Downloaded newer image for ubuntu:18.04
 ---> c3c304cb4f22
Step 2/6 : RUN apt-get update && apt-get install -y --no-install-recommends         wget         zip         unzip         git         ca-certificates         curl         python3.6         python3-pip         && rm -rf /var/lib/apt-get/lists/*
 ---> Running in 6d8bbcfd3ee2
Get:1 http://archive.ubuntu.com/ubuntu bionic InRelease [242 kB]
Get:2 http://security.ubuntu.com/ubuntu 

Get:49 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 git amd64 1:2.17.1-1ubuntu0.7 [3915 kB]
Get:50 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 python-pip-whl all 9.0.1-2.3~ubuntu1.18.04.1 [1653 kB]
Get:51 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 python3-lib2to3 all 3.6.9-1~18.04 [77.4 kB]
Get:52 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 python3-distutils all 3.6.9-1~18.04 [144 kB]
Get:53 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 python3-pip all 9.0.1-2.3~ubuntu1.18.04.1 [114 kB]
Get:54 http://archive.ubuntu.com/ubuntu bionic/main amd64 unzip amd64 6.0-21ubuntu1 [167 kB]
Get:55 http://archive.ubuntu.com/ubuntu bionic/main amd64 zip amd64 3.0-11build1 [167 kB]
[91mdebconf: delaying package configuration, since apt-utils is not installed
[0mFetched 23.3 MB in 1s (42.1 MB/s)
Selecting previously unselected package libssl1.1:amd64.
(Reading database ... 4046 files and directories currently installed.)

Selecting previously unselected package libheimntlm0-heimdal:amd64.
Preparing to unpack .../22-libheimntlm0-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libheimntlm0-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected package libgssapi3-heimdal:amd64.
Preparing to unpack .../23-libgssapi3-heimdal_7.5.0+dfsg-1_amd64.deb ...
Unpacking libgssapi3-heimdal:amd64 (7.5.0+dfsg-1) ...
Selecting previously unselected package libsasl2-modules-db:amd64.
Preparing to unpack .../24-libsasl2-modules-db_2.1.27~101-g0780600+dfsg-3ubuntu2.1_amd64.deb ...
Unpacking libsasl2-modules-db:amd64 (2.1.27~101-g0780600+dfsg-3ubuntu2.1) ...
Selecting previously unselected package libsasl2-2:amd64.
Preparing to unpack .../25-libsasl2-2_2.1.27~101-g0780600+dfsg-3ubuntu2.1_amd64.deb ...
Unpacking libsasl2-2:amd64 (2.1.27~101-g0780600+dfsg-3ubuntu2.1) ...
Selecting previously unselected package libldap-common.
Preparing to unpack .../26-libldap-common_2.4.45+dfsg-1ubuntu1.5_all.deb ...
Unpacking libldap-

  Downloading https://files.pythonhosted.org/packages/b8/a6/d1a816b89aa1e9e96bcb298eb1ee1854f21662ebc6d55ffa3d7b3b50122b/joblib-0.15.1-py3-none-any.whl (298kB)
Collecting scipy>=0.19.1 (from scikit-learn->-r /tmp/requirements.txt (line 3))
  Downloading https://files.pythonhosted.org/packages/dc/29/162476fd44203116e7980cfbd9352eef9db37c49445d1fec35509022f6aa/scipy-1.4.1-cp36-cp36m-manylinux1_x86_64.whl (26.1MB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn->-r /tmp/requirements.txt (line 3))
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Collecting six>=1.5 (from python-dateutil>=2.6.1->pandas->-r /tmp/requirements.txt (line 2))
  Downloading https://files.pythonhosted.org/packages/ee/ff/48bde5c0f013094d729fe4b0316ba2a24774b3ff1c52d924a8a4cb04078a/six-1.15.0-py2.py3-none-any.whl
Installing collected packages: numpy, six, python-dateutil, pytz, pandas, joblib, scipy, threa

In [4]:
# Replace by your ECR image ID
container = '850475559451.dkr.ecr.eu-west-1.amazonaws.com/data-processing-containers:latest' 

## Launch data processing job

In [5]:
data_processor = Processor(role=role, 
                           image_uri=container, 
                           instance_count=1, 
                           instance_type='ml.m5.xlarge',
                           volume_size_in_gb=30, 
                           max_runtime_in_seconds=1200,
                           base_job_name='data-processing')

In [None]:
input_folder = '/opt/ml/processing/input'
output_folder = '/opt/ml/processing/output'

data_processor.run(
    arguments= [
        f'--input={input_folder}',
        f'--output={output_folder}'
    ],
    inputs = [
        ProcessingInput(
            input_name='input',
            source=s3_input,
            destination=input_folder
        )
    ],
    outputs= [
        ProcessingOutput(
            output_name='preprocessed',
            source=output_folder,
            destination=bucket
        )
    ]
)


Job Name:  data-processing-2020-06-15-05-01-32-015
Inputs:  [{'InputName': 'input', 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-850475559451/data/input/reviews.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'preprocessed', 'S3Output': {'S3Uri': 's3://sagemaker-eu-west-1-850475559451/data-processing-2020-06-15-05-01-32-015/output/preprocessed', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
...