# Spin up a processing node via SageMaker and run some analysis
* Set up simplified ML workflow
  * Upload data to s3
  * Run container
  * Upload results to s3 from processing instance
* Remaining issues:
  * EFS: we can mount the efs but the owners are different. as a result, /efs/ is read only
    * use chmod go+rw efs/
* Change log:
  * now use a different docker image (base-layer) that has GPU support
  * now using tumor purity piptline and app data bundle

## Build a preprocessing script template
* Download from S3
* Upload to S3
* Batch mode?
* Input parameters

In [1]:
import sagemaker
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput


In [2]:
sagemaker_session = sagemaker.Session(default_bucket= 'sagemaker-tumor-purity-workflow')
role = sagemaker.get_execution_role()
role

'arn:aws:iam::667175760002:role/service-role/AmazonSageMaker-ExecutionRole-20211001T082927'

In [3]:
# container_url= '667175760002.dkr.ecr.eu-central-1.amazonaws.com/base-layer:latest' # my custom container

container_url= '667175760002.dkr.ecr.eu-central-1.amazonaws.com/base-layer:latest'
# container_url= '667175760002.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-processing-container-test:latest' # container from sagemaker tutorial - use this for speed

In [4]:
! pwd

/home/ec2-user/SageMaker


## Processing script
* download app bundle from s3
* unpack
* show file structure and export to file 

In [17]:
# edit and test run_workflow.py


In [19]:
sagemaker.s3.S3Downloader.download("s3://gmb-ds-dbgap/test_dir/sagemaker_upload/docker_workflow_package.tgz", local_path= 'input/')

In [23]:
"alksdf {} laksdf {}".format(1,2)

'alksdf 1 laksdf 2'

In [21]:
! ls input
! tar xvfz input/docker_workflow_package.tgz

docker_workflow_package.tgz
workflow_tumor_purity.py
DP_lib/
DP_lib/img_classifier.py
DP_lib/__pycache__/
DP_lib/__pycache__/perf_eval.cpython-37.pyc
DP_lib/__pycache__/__init__.cpython-37.pyc
DP_lib/__pycache__/util_tumor_purity.cpython-37.pyc
DP_lib/__pycache__/_nbdev.cpython-37.pyc
DP_lib/__pycache__/img_classifier.cpython-37.pyc
DP_lib/_nbdev.py
DP_lib/img_clustering.py
DP_lib/perf_eval.py
DP_lib/util_tumor_purity.py
DP_lib/__init__.py
data/GEN1046/GEN1046_meta_bioset_export7.csv
models/model_MLC_export7_bio_c10_lr0.01_stage3_fold0.pth
tar: models/model_MLC_export7_bio_c10_lr0.01_stage3_fold0.pth: Wrote only 3584 of 10240 bytes
models/model_MLC_export7_bio_c10_lr0.01_stage3_fold1.pth
tar: models/model_MLC_export7_bio_c10_lr0.01_stage3_fold1.pth: Cannot write: No space left on device
models/model_MLC_export7_bio_c10_lr0.01_stage3_fold2.pth
tar: models/model_MLC_export7_bio_c10_lr0.01_stage3_fold2.pth: Cannot write: No space left on device
models/model_MLC_export7_bio_c10_lr0.01_stag

## Submit job to queue
1. basic example from tutorial
2. add input directory and show what's on the exec node when processing job is run
3. once done, the outputs are in the sagemaker s3 bucket

### Test 1 - dummy input files

In [5]:

script_processor = ScriptProcessor(
    command=["python3"],
    image_uri= container_url,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large", # ml.m5.xlarge
)

In [6]:
script_processor.run(
    code="run_workflow.py",
    # inputs=[ProcessingInput(source=input_data, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="outputs", source="/opt/ml/processing/outputs"),
    ],
    arguments=["--input", "file1", 'file2', 'file3', 'file4', 'file5'],
)
script_processor_job_description = script_processor.jobs[-1].describe()
print(script_processor_job_description)


Job Name:  base-layer-2022-05-06-17-35-30-667
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/base-layer-2022-05-06-17-35-30-667/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'outputs', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/base-layer-2022-05-06-17-35-30-667/output/outputs', 'LocalPath': '/opt/ml/processing/outputs', 'S3UploadMode': 'EndOfJob'}}]
......................................................[34m** checking input directory: /opt/ml/processing/input[0m
[34m/opt/ml/processing/input/code[0m
[34m/opt/ml/processing/input/code/preprocessing.py[0m
[34madding output file file1.csv[0m
[34madding output file file2.csv[0m
[34madding output file file3.csv[0m
[34madding output file file

In [10]:
container_url

'667175760002.dkr.ecr.eu-central-1.amazonaws.com/base-layer:latest'

### Test 2 - now we copy files from s3 as input
* TO DO: need to copy an entire directory recursively if path ends with /

In [33]:
"s3://".startswith('s3://')
" ".join([ "'{}'".format(i) for i in ['abc def', 'ghi adsf']])

"'abc def' 'ghi adsf'"

In [42]:
# note:
#  ProcessingInput only takes a single file (local or s3 path) as input. It doesn't recursively copy files.

# input_data= 'efs/project/Digital_pathology/output/export/p_test/'

def testrun2():
    sp2 = ScriptProcessor(
        command=["python3"],
        image_uri= container_url,
        role=role,
        instance_count=1,
        instance_type="ml.m5.large", # fast launch
    )

    sp2.run(
        code="run_workflow.py",
        inputs=[
            # ProcessingInput(source=input_data, destination="/opt/ml/processing/input", s3_compression_type='None'),
            # ProcessingInput(source='s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/', destination="/opt/ml/processing/another_input", s3_compression_type='None'),
            # ProcessingInput(source='s3://gmb-ds-dbgap/test_dir/test_data/HE_images/', destination="/opt/ml/processing/HE_input", s3_compression_type='None'),        
        ],
        outputs=[
            ProcessingOutput(output_name="outputs", source="/opt/ml/processing/output_workflow"),
            # ProcessingOutput(output_name="outputs", source="/opt/ml/processing/outputs"),
            # ProcessingOutput(output_name="output2", source="/opt/ml/processing/output2"),

        ],
        arguments=["--input", "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-2145A.svs",
                               "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-8925A-1_1005413.svs",
                               "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-66JDF-6969A-1_1005414.svs",
                   "--app-data", "s3://gmb-ds-dbgap/test_dir/sagemaker_upload/docker_workflow_package.tgz",
                  "--download", "s3://gmb-ds-dbgap/test_dir/sagemaker_upload",
                  "--show_image_specs"],
    )

multiprocessing.Process(target=testrun2).start()



Job Name:  base-layer-2022-05-09-15-44-27-265
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/base-layer-2022-05-09-15-44-27-265/input/code/run_workflow.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'outputs', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/base-layer-2022-05-09-15-44-27-265/output/outputs', 'LocalPath': '/opt/ml/processing/output_workflow', 'S3UploadMode': 'EndOfJob'}}]
..............................................[34mINFO:root:Tumor purity workflow[0m
[34mINFO:root:input: ['s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-2145A.svs', 's3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-8925A-1_1005413.svs', 's3

### Test 2B - run tumor purity with multiple CPUs

In [47]:
# note:
#  ProcessingInput only takes a single file (local or s3 path) as input. It doesn't recursively copy files.

# input_data= 'efs/project/Digital_pathology/output/export/p_test/'

def testrun2B():
    sp2 = ScriptProcessor(
        command=["python3"],
        image_uri= container_url,
        role=role,
        instance_count=1,
        instance_type="ml.m5.12xlarge", # fast launch
    )

    sp2.run(
        code="run_workflow.py",
        inputs=[
            # ProcessingInput(source=input_data, destination="/opt/ml/processing/input", s3_compression_type='None'),
            # ProcessingInput(source='s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/', destination="/opt/ml/processing/another_input", s3_compression_type='None'),
            # ProcessingInput(source='s3://gmb-ds-dbgap/test_dir/test_data/HE_images/', destination="/opt/ml/processing/HE_input", s3_compression_type='None'),        
        ],
        outputs=[
            ProcessingOutput(output_name="outputs", source="/opt/ml/processing/output_workflow"),
            # ProcessingOutput(output_name="outputs", source="/opt/ml/processing/outputs"),
            # ProcessingOutput(output_name="output2", source="/opt/ml/processing/output2"),

        ],
        arguments=["--input", "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-2145A.svs",
                   "--app-data", "s3://gmb-ds-dbgap/test_dir/sagemaker_upload/docker_workflow_package.tgz",
                  "--download", "s3://gmb-ds-dbgap/test_dir/sagemaker_upload",
                  ],
    )

multiprocessing.Process(target=testrun2B).start()



Job Name:  base-layer-2022-05-09-17-01-18-894
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/base-layer-2022-05-09-17-01-18-894/input/code/run_workflow.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'outputs', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-667175760002/base-layer-2022-05-09-17-01-18-894/output/outputs', 'LocalPath': '/opt/ml/processing/output_workflow', 'S3UploadMode': 'EndOfJob'}}]
.............................................[34mINFO:root:Tumor purity workflow[0m
[34mINFO:root:input: ['s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-2145A.svs'][0m
[34mINFO:root:app-data: s3://gmb-ds-dbgap/test_dir/sagemaker_upload/docker_workflow_package.tgz[0m
[34mINFO:root:base_d

### Test 3 - test workflow on GPU instance
* https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-available-instance-types.html

In [13]:
import multiprocessing, time


In [None]:
def testrun3():
    print("tumor purity workflow using GPU")
    sp3 = ScriptProcessor(
        command=["python3"],
        image_uri= container_url,
        role=role,
        instance_count=1,
        instance_type=  "ml.g4dn.xlarge" # "ml.p3.2xlarge" # "ml.g4dn.xlarge", # fast launch
    )
    sp3.run(
        code="run_workflow.py",
        inputs=[
            # ProcessingInput(source=input_data, destination="/opt/ml/processing/input", s3_compression_type='None'),
        ],
        outputs=[
            ProcessingOutput(output_name="outputs_workflow", source="/opt/ml/processing/output_workflow"),
        ],
        arguments=["--input", "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-2145A.svs",
                               "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-44AOK-8925A-1_1005413.svs",
                               "s3://gmb-ds-dbgap/data/Digital_pathology/pathology_lab/mirror_Y_drive/Tumor Purity/NSCLC/AVD-66JDF-6969A-1_1005414.svs",
                   "--app-data", "s3://gmb-ds-dbgap/test_dir/sagemaker_upload/docker_workflow_package.tgz",
                  "--download", "s3://gmb-ds-dbgap/test_dir/sagemaker_upload"],
    )

multiprocessing.Process(target=testrun3).start()


In [46]:
import torch

In [39]:

def network_call():
    for i in range(20):
        print(i)
        time.sleep(1)
    
multiprocessing.Process(target=network_call).start()



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [None]:
(sagemaker.s3.S3Downloader.list('s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs'))


In [4]:
# s3 download test - it's very slow
def s3_download(s3_path, local_path, dryrun= False):
    s3_files= sagemaker.s3.S3Downloader.list(s3_path)
    print("total of {} files found".format(len(s3_files)))
    for i in s3_files:
        print('downloading ', i)
        if dryrun is False:
            sagemaker.s3.S3Downloader.download(i, local_path)

In [2]:
sagemaker.s3.S3Downloader.download('s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/df_noisy_student_combined_meta.csv', 's3_cache/test')

In [14]:
def run_cmd(cmd, dryrun= True, check= True):
    print(cmd)
    if dryrun is not True:
        # print("exec")
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        out, err= p.communicate()
        if err is not None:
            err= err.decode('UTF-8')
        if out is not None:
            out= out.decode('UTF-8')
        p_status = p.wait()

        print(out)
        
        if p_status != 0:
            print("** Non-zero status returned! **")
            if err is not None:
                print(err)
            if check:
                assert p_status==0, "Command execution failed. Abort!"
        # subprocess.run(cmd, shell= True, check= check)

In [17]:
run_cmd('aws s3 ls', dryrun= False)

aws s3 ls
2021-01-04 18:24:02 aws-logs-667175760002-eu-central-1
2020-02-25 21:04:02 cf-templates-1gir9gjzsrb3z-eu-central-1
2019-12-15 01:33:37 cf-templates-1gir9gjzsrb3z-us-east-1
2020-05-14 16:20:30 developability
2020-12-10 17:24:16 dp.data.hasi
2021-03-24 14:39:00 dsd0001-tech8170-antibodydevelopability
2021-03-23 13:41:08 dsd0002-tech8110-developmentofnewfunctionalfcmutants
2021-04-13 13:31:51 gmb-commercial-ds
2021-01-29 09:35:03 gmb-cro-genewiz
2019-09-17 14:34:50 gmb-cro-histogenix
2019-10-31 19:22:46 gmb-cro-logs
2021-06-09 08:25:10 gmb-cro-medgenome1
2021-06-07 11:58:12 gmb-cro-ventana-backup
2021-02-16 15:33:59 gmb-discovery-ds-scrna-wra
2020-06-10 01:34:54 gmb-ds-dbgap
2019-09-13 17:46:21 gmb-ds-demo
2021-09-02 09:23:40 gmb-ds-disc
2019-08-15 18:28:50 gmb-ds-exp
2020-12-04 17:32:57 gmb-ds-nanopore
2020-06-10 01:40:43 gmb-ds-public-dlbcl
2020-08-13 06:07:26 gmb-flagship
2021-05-04 07:07:58 gmb-hac
2020-01-09 15:02:04 gmb-lab-lims
2020-01-30 19:28:15 gmb-lab-lims-dev0
2019-1

In [18]:
import subprocess

cmd= "aws s3 cp {} {} --recursive".format('s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs', 's3_cache3')
print(cmd)
run_cmd(cmd, dryrun=False)


aws s3 cp s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs s3_cache3 --recursive
aws s3 cp s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs s3_cache3 --recursive
download: s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_61_232.png to s3_cache3/pred_HE_12380021C0001X_18_61_232.png
download: s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380019C0001X_18_61_302.png to s3_cache3/pred_HE_12380019C0001X_18_61_302.png
download: s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_87_222.png to s3_cache3/pred_HE_12380021C0001X_18_87_222.png
download: s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_108_241.png to s3_cache3/pred_HE_12380021C0001X_18_108_241.png
download: s3://gmb-ds-dbgap/test_dir/test_data/output/export

In [8]:
s3_download('s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs', 's3_cache2', dryrun= False)

total of 500 files found
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380007C0001X_18_100_287.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380019C0001X_18_37_237.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380019C0001X_18_61_302.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_108_241.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_61_232.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_80_224.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output/export/noisy_student_test2/pred_WBCs/pred_HE_12380021C0001X_18_87_222.png
downloading  s3://gmb-ds-dbgap/test_dir/test_data/output

KeyboardInterrupt: 

In [34]:
import glob
glob.glob('/opt/ml/processing/input/*', recursive=True)
glob.glob('efs/project/Digital_pathology/output/export/p_test/*', recursive= True)

['efs/project/Digital_pathology/output/export/p_test/pred_Mucin']

In [47]:
import os
from pathlib import Path

path = Path(input_data) # '/home/janbodnar/Documents/prog/python/')

for e in path.rglob('*'):
    print(e)

efs/project/Digital_pathology/output/export/p_test/pred_Mucin
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380013C0001X_18_45_325.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380050C0001X_18_99_305.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380035C0001X_18_95_261.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380092C0002X_19_77_187.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380106B0001S_19_141_209.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380094C0001X_19_56_349.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380001C0001X_18_64_141.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380007C0001X_18_88_280.png
efs/project/Digital_pathology/output/export/p_test/pred_Mucin/pred_HE_12380035C0001X_18_146_307.png
efs/project/Digital_pathology/output/export/p

## Old code

In [109]:
sagemaker.s3.S3Uploader.upload('bank_clean.csv', 's3://gmb-ds-dbgap/test_dir/sagemaker_upload')

's3://gmb-ds-dbgap/test_dir/sagemaker_upload/bank_clean.csv'

In [53]:
bucket= 'gmb-ds-dbgap'
subfolder='data/Digital_pathology/TCGA/'

conn = boto3.client('s3')
contents = conn.list_objects(Bucket=bucket, Prefix=subfolder, MaxKeys= 10)['Contents']
for f in contents:
    if f['Key'].endswith('.svs'):
        print(f['Key'])

data/Digital_pathology/TCGA/TCGA-ACC/0151a039-5482-48cd-bf25-7df4349069e8/TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D31-81B7941F896E.svs
data/Digital_pathology/TCGA/TCGA-ACC/01d60958-1b6b-4c74-b406-e73800ee7f7f/TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C4-6914BBEB5E09.svs
data/Digital_pathology/TCGA/TCGA-ACC/023ea8be-b887-4c3a-9695-b8a21d9375a5/TCGA-OR-A5JL-01Z-00-DX2.2C12A8F7-C0CE-48DE-B007-9EBE72645512.svs
data/Digital_pathology/TCGA/TCGA-ACC/02413797-21db-4407-837a-03e68cc5a98c/TCGA-OR-A5K1-01Z-00-DX4.5D8E9E35-B179-44C5-8228-090808AD2AC4.svs
data/Digital_pathology/TCGA/TCGA-ACC/02433c5c-c3a3-45c5-9ff8-30aababaa134/TCGA-OR-A5K2-01Z-00-DX4.151DF136-DFC9-4369-8FC1-1EAEEB3177BD.svs


In [63]:
aa= sagemaker.s3.S3Downloader.list('s3://gmb-ds-dbgap/data/Digital_pathology/TCGA')

In [76]:

all_tcga_files= sagemaker.s3.S3Downloader.list('s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/')
all_tcga_files= [i for i in all_tcga_files if i.endswith('.svs')]
dat_TCGA= pd.DataFrame()
dat_TCGA['File']= all_tcga_files
dat_TCGA['Cancer']= dat_TCGA['File'].replace(".*TCGA/", "", regex= True).replace("/.*", '', regex= True)
dat_TCGA['Image']= dat_TCGA['File'].transform(lambda x: os.path.basename(x))


In [81]:
dat_TCGA

Unnamed: 0,File,Cancer,Image
0,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D3...
1,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C...
2,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5JL-01Z-00-DX2.2C12A8F7-C0CE-48DE-B00...
3,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5K1-01Z-00-DX4.5D8E9E35-B179-44C5-822...
4,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-ACC,TCGA-OR-A5K2-01Z-00-DX4.151DF136-DFC9-4369-8FC...
...,...,...,...
11888,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-WC-AA9A-01Z-00-DX1.623129B9-6420-4844-8E2...
11889,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-V4-A9EO-01Z-00-DX1.F918F843-0A2E-4ECF-953...
11890,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-V4-A9F3-01Z-00-DX1.39FC86CA-2833-4361-93A...
11891,s3://gmb-ds-dbgap/data/Digital_pathology/TCGA/...,TCGA-UVM,TCGA-VD-AA8M-01Z-00-DX1.AA194E33-B494-44E3-9C7...


In [67]:
dat_TCGA= pd.DataFrame()
dat_TCGA['File']= aa

In [74]:
dat_TCGA['File'].transform(lambda x: os.path.basename(x))

0        TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D3...
1        TCGA-OR-A5JZ-01Z-00-DX5.DAFFD4FE-CF6A-427C-9D3...
2        TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C...
3        TCGA-OR-A5JY-01Z-00-DX1.23AEFC00-3720-48F1-A2C...
4        TCGA-OR-A5JL-01Z-00-DX2.2C12A8F7-C0CE-48DE-B00...
                               ...                        
23796    TCGA-VD-AA8M-01Z-00-DX1.AA194E33-B494-44E3-9C7...
23797    TCGA-VD-AA8M-01Z-00-DX1.AA194E33-B494-44E3-9C7...
23798    TCGA-V4-A9EF-01Z-00-DX1.3084FABF-8AC4-403A-ADD...
23799    TCGA-V4-A9EF-01Z-00-DX1.3084FABF-8AC4-403A-ADD...
23800                               TCGA_imaging_files.tsv
Name: File, Length: 23801, dtype: object

In [32]:
from sagemaker import get_execution_role

role = get_execution_role()

role

'arn:aws:iam::667175760002:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole'