# Spin up a processing node via SageMaker and run some analysis

## Set up a basic container

In [1]:
%%writefile docker_test3/Dockerfile

# Minimal image for SageMaker Processing jobs: Python 3.7 plus pinned analysis libraries.
FROM python:3.7-slim-buster

# --no-cache-dir keeps pip's download cache out of the image layer (smaller image).
RUN pip3 install --no-cache-dir pandas==0.25.3 scikit-learn==0.21.3 boto3==1.18.21 sagemaker==2.48.1
# Unbuffered stdout/stderr so print() output is emitted immediately.
ENV PYTHONUNBUFFERED=TRUE

# The container runs whatever script path it is given with python3.
ENTRYPOINT ["python3"]


Overwriting docker_test3/Dockerfile


In [3]:
# Show the version of the SageMaker Python SDK installed in this kernel.
import sagemaker
sagemaker.__version__

'2.59.3'

In [4]:
import boto3

# Account id comes from STS; region comes from the default boto3 session.
account_id = (
    boto3.client('sts')
    .get_caller_identity()
    .get('Account')
)
region = boto3.Session().region_name

# Repository name and tag; `tag` carries its leading colon because it is appended verbatim.
ecr_repository = 'sagemaker-processing-container-test3'
tag = ':latest'

# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repository><tag>
processing_repository_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository}{tag}'

processing_repository_uri

'667175760002.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-processing-container-test3:latest'

In [5]:

# Create ECR repository and push docker image
! echo docker build -t $ecr_repository docker_test1

! echo " aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com"
! echo aws ecr create-repository --repository-name $ecr_repository
! echo docker tag {ecr_repository + tag} $processing_repository_uri
! echo docker push $processing_repository_uri

docker build -t sagemaker-processing-container-test3 docker_test1
 aws ecr get-login-password --region eu-central-1 | docker login --username AWS --password-stdin 667175760002.dkr.ecr.eu-central-1.amazonaws.com
aws ecr create-repository --repository-name sagemaker-processing-container-test3
docker tag sagemaker-processing-container-test3:latest 667175760002.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-processing-container-test3:latest
docker push 667175760002.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-processing-container-test3:latest


In [141]:

# Create ECR repository and push docker image
!docker build -t $ecr_repository docker_test
print('------')
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
!aws ecr create-repository --repository-name $ecr_repository
!docker tag {ecr_repository + tag} $processing_repository_uri
!docker push $processing_repository_uri

Sending build context to Docker daemon  2.048kB
Step 1/4 : FROM python:3.7-slim-buster
 ---> 1241d4388782
Step 2/4 : RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3 boto3==1.18.21 sagemaker==2.48.1
 ---> Using cache
 ---> f423fbdd38d3
Step 3/4 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 679c1c3225c3
Step 4/4 : ENTRYPOINT ["python3"]
 ---> Using cache
 ---> c914580b0a98
Successfully built c914580b0a98
Successfully tagged sagemaker-processing-container-test:latest
------
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded

An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'sagemaker-processing-container-test' already exists in the registry with id '667175760002'
The push refers to repository [667175760002.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-processing-container-test]

[1B2262f703: Preparing 
[1B8ac6cb11: Preparing 
[1Baabce64c: Preparing 
[1B125ea8

In [92]:
# Display the image URI, repository name and tag currently bound in the kernel.
processing_repository_uri, ecr_repository, tag

('667175760002.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-processing-container-test:latest',
 'sagemaker-processing-container-test',
 ':latest')

## Build a dummy preprocessing script
* Download from S3
* Upload to S3
* Batch mode?
* Input parameters

In [125]:
%%writefile preprocessing.py

import pandas as pd
import os
import sys
import boto3
import sagemaker
import argparse

parser = argparse.ArgumentParser()

parser.add_argument('--input', nargs='+', default= [])
parser.add_argument('--dryrun', action='store_true')

args, _ = parser.parse_known_args()

todo= []
for i in args.input:
    newfile= '{}.csv'.format(i)
    print('adding output file', newfile)
    todo.append(newfile)

if args.dryrun:
    print('dryrun')
    sys.exit()

#    sys.exit()

os.environ['AWS_DEFAULT_REGION'] = 'eu-central-1'

# sagemaker.Session(boto3.session.Session())

#
dat= pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},
                  index=['falcon', 'dog', 'spider', 'fish'])

# dat.to_csv()

dir_out= "/opt/ml/processing/outputs"
outfiles= ['abc.csv','A2.csv','A3.csv','blahblah.csv']+ todo

# get TCGA files

print("getting TCGA files from S3...")
all_tcga_files= sagemaker.s3.S3Downloader.list('s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/')
all_tcga_files= [i for i in all_tcga_files if i.endswith('.svs')]
dat_TCGA= pd.DataFrame()
dat_TCGA['File']= all_tcga_files
dat_TCGA['Cancer']= dat_TCGA['File'].replace(".*TCGA/", "", regex= True).replace("/.*", '', regex= True)
dat_TCGA['Image']= dat_TCGA['File'].transform(lambda x: os.path.basename(x))


print("generating output files")

ofile= os.path.join(dir_out, 'all_TCGA.csv')

dat_TCGA.to_csv(ofile)

# directly uploading to s3
sagemaker.s3.S3Uploader.upload(ofile, 's3://gmb-ds-dbgap/test_dir/sagemaker_upload')

# write some random files
for i in outfiles:
    fname= os.path.join(dir_out, i)
    print('writing to ', fname)
    dat.to_csv(fname)

# all files under /opt/ml/processing/outputs will be automatically copied to the sagemaker s3 bucket


Overwriting preprocessing.py


In [122]:
# Show the version string reported by the argparse module in this kernel.
import argparse
argparse.__version__

'1.1'

## Submit job to queue

In [103]:
# Create a SageMaker session and look up the execution role used to submit the job.
import sagemaker
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
# Display the role for inspection.
role

'arn:aws:iam::667175760002:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole'

In [126]:
from sagemaker.processing import ScriptProcessor

# Processor that runs a script with python3 inside the custom image on one instance.
script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=processing_repository_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large", # ml.m5.xlarge
)

In [128]:
# ProcessingOutput is imported here because no other cell imports it;
# without this the cell raises NameError on a fresh kernel.
from sagemaker.processing import ProcessingOutput

# Submit preprocessing.py as a processing job. Everything the script writes to
# /opt/ml/processing/outputs is declared as the job output named "outputs".
script_processor.run(
    code="preprocessing.py",
    # inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="outputs", source="/opt/ml/processing/outputs"),
    ],
    # Forwarded to the script; each name after --input adds one extra "<name>.csv" output.
    arguments=["--input", "file1", 'file2', 'file3', 'file4', 'file5'],
)
# Fetch and print the description of the most recently submitted job.
script_processor_job_description = script_processor.jobs[-1].describe()
print(script_processor_job_description)


Job Name:  sagemaker-processing-container-test-2021-09-14-13-07-02-889
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/sagemaker-processing-container-test-2021-09-14-13-07-02-889/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'outputs', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/sagemaker-processing-container-test-2021-09-14-13-07-02-889/output/outputs', 'LocalPath': '/opt/ml/processing/outputs', 'S3UploadMode': 'EndOfJob'}}]
......................[34madding output file file1.csv[0m
[34madding output file file2.csv[0m
[34madding output file file3.csv[0m
[34madding output file file4.csv[0m
[34madding output file file5.csv[0m
[34mgetting TCGA files from S3...[0m
[34mwriting to  /opt/ml/pro

In [97]:
# Inspect a default boto3 session; its repr shows the configured region.
boto3.session.Session()

Session(region_name='eu-central-1')

## Old code

In [109]:
# Upload a local CSV to the test prefix on S3; the cell displays the call's return value.
sagemaker.s3.S3Uploader.upload('bank_clean.csv', 's3://gmb-ds-dbgap/test_dir/sagemaker_upload')

's3://gmb-ds-dbgap/test_dir/sagemaker_upload/bank_clean.csv'

In [53]:
# Print the .svs keys among the first 10 objects under the TCGA prefix.
bucket = 'gmb-ds-dbgap'
subfolder = 'data/Digital_pathology/TCGA/'

conn = boto3.client('s3')
response = conn.list_objects(Bucket=bucket, Prefix=subfolder, MaxKeys=10)
# 'Contents' is missing from the response when no object matches the prefix,
# so fall back to an empty list instead of raising KeyError.
contents = response.get('Contents', [])
for f in contents:
    if f['Key'].endswith('.svs'):
        print(f['Key'])

data/Digital_pathology/TCGA/TCGA-ACC/0151a039-5482-48cd-bf25-7df4349069e8/TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D31-81B7941F896E.svs
data/Digital_pathology/TCGA/TCGA-ACC/01d60958-1b6b-4c74-b406-e73800ee7f7f/TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C4-6914BBEB5E09.svs
data/Digital_pathology/TCGA/TCGA-ACC/023ea8be-b887-4c3a-9695-b8a21d9375a5/TCGA-OR-A5JL-01Z-00-DX2.2C12A8F7-C0CE-48DE-B007-9EBE72645512.svs
data/Digital_pathology/TCGA/TCGA-ACC/02413797-21db-4407-837a-03e68cc5a98c/TCGA-OR-A5K1-01Z-00-DX4.5D8E9E35-B179-44C5-8228-090808AD2AC4.svs
data/Digital_pathology/TCGA/TCGA-ACC/02433c5c-c3a3-45c5-9ff8-30aababaa134/TCGA-OR-A5K2-01Z-00-DX4.151DF136-DFC9-4369-8FC1-1EAEEB3177BD.svs


In [63]:
# List everything under the TCGA prefix (no trailing slash, no .svs filter).
aa= sagemaker.s3.S3Downloader.list('s3://gmb-ds-dbgap/data/Digital_pathology/TCGA')

In [76]:

# os and pandas are imported here because no other notebook cell imports them
# (only the written-out preprocessing.py does), so a fresh kernel would raise NameError.
import os

import pandas as pd

# Build a table of all .svs slide files under the TCGA prefix.
all_tcga_files = sagemaker.s3.S3Downloader.list('s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/')
all_tcga_files = [i for i in all_tcga_files if i.endswith('.svs')]
dat_TCGA = pd.DataFrame()
dat_TCGA['File'] = all_tcga_files
# Cancer = first path component after ".../TCGA/".
dat_TCGA['Cancer'] = dat_TCGA['File'].replace(".*TCGA/", "", regex=True).replace("/.*", '', regex=True)
# Image = file name without its directory part.
dat_TCGA['Image'] = dat_TCGA['File'].transform(lambda x: os.path.basename(x))


In [81]:
# Display the TCGA file table (File, Cancer, Image columns).
dat_TCGA

Unnamed: 0,File,Cancer,Image
0,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D3...
1,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C...
2,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5JL-01Z-00-DX2.2C12A8F7-C0CE-48DE-B00...
3,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5K1-01Z-00-DX4.5D8E9E35-B179-44C5-822...
4,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5K2-01Z-00-DX4.151DF136-DFC9-4369-8FC...
...,...,...,...
11888,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-WC-AA9A-01Z-00-DX1.623129B9-6420-4844-8E2...
11889,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-V4-A9EO-01Z-00-DX1.F918F843-0A2E-4ECF-953...
11890,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-V4-A9F3-01Z-00-DX1.39FC86CA-2833-4361-93A...
11891,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-VD-AA8M-01Z-00-DX1.AA194E33-B494-44E3-9C7...


In [67]:
# Start a new table from the listing stored in `aa`.
# NOTE(review): this rebinds dat_TCGA, which other cells also assign — confirm the intended run order.
dat_TCGA= pd.DataFrame()
dat_TCGA['File']= aa

In [74]:
# Preview the file-name part (basename) of every entry in the File column.
dat_TCGA['File'].transform(lambda x: os.path.basename(x))

0        TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D3...
1        TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D3...
2        TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C...
3        TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C...
4        TCGA-OR-A5JL-01Z-00-DX2.2C12A8F7-C0CE-48DE-B00...
                               ...                        
23796    TCGA-VD-AA8M-01Z-00-DX1.AA194E33-B494-44E3-9C7...
23797    TCGA-VD-AA8M-01Z-00-DX1.AA194E33-B494-44E3-9C7...
23798    TCGA-V4-A9EF-01Z-00-DX1.3084FABF-8AC4-403A-ADD...
23799    TCGA-V4-A9EF-01Z-00-DX1.3084FABF-8AC4-403A-ADD...
23800                               TCGA_imaging_files.tsv
Name: File, Length: 23801, dtype: object

In [32]:
from sagemaker import get_execution_role

# Look up the execution role for the current environment.
role = get_execution_role()

# Display the role for inspection.
role

'arn:aws:iam::667175760002:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole'