In [4]:
import logging
import urllib.request
import shutil
import tarfile

from hashlib import sha256
from pathlib import Path

from Pegasus.api import *

logging.basicConfig(level=logging.DEBUG)


# Download the initial input (not using a pegasus job, because we will need
# iterate through it and add all files to the rc)
with urllib.request.urlopen("http://www.isi.edu/~tanaka/nested-dir.tar.gz") as response,\
    open("nested-dir.tar.gz", "wb") as f:
    shutil.copyfileobj(response, f)

with tarfile.open("nested-dir.tar.gz") as tar:
    tar.extractall()
    
rc = ReplicaCatalog()

# Iterate through the directories adding files
# nested-dir
# ├── cats
# │   └── cat_sounds.txt
# └── dogs
#     └── dog_sounds.txt

in_files = list()
def get_files(d: Path) -> None:
    for p in d.iterdir():
        if p.is_file():
            lfn = "_".join([sha256(p.name.encode()).hexdigest(), p.name])
            f = File(lfn)
            in_files.append(f)
            rc.add_replica("local", lfn, str(p.resolve()))
            
            print("Added file: {}, lfn: {}, pfn: {}".format(
                    str(p),
                    lfn,
                    p.resolve()
                ))
        else:
            get_files(p)

get_files(Path("nested-dir"))

rc.write()

tc = TransformationCatalog()\
        .add_transformations(
            Transformation(
                "print_file_contents",
                site="local",
                pfn="/home/scitech/shared-data/sample-deep-lfn-condorio-wf/print_file_contents.py",
                is_stageable=True
            )
        ).write()

wf = Workflow("nested-dir-tutorial")
wf.add_jobs(
    Job("print_file_contents")
        .add_inputs(*in_files)
        .set_stdout("out.txt")
)


try:
    wf.plan(submit=True).wait()
except PegasusClientError as e:
    print(e.output)


Added file: nested-dir/dogs/dog_sounds.txt, lfn: 9ed1913037f173c9e346ee9111cc4c31c3cd1a9c4536c80ae8a7d4f95cf0dd6f_dog_sounds.txt, pfn: /home/scitech/shared-data/sample-deep-lfn-condorio-wf/nested-dir/dogs/dog_sounds.txt
Added file: nested-dir/cats/cat_sounds.txt, lfn: c38546df40a0c17b18c407c37bb762eff252ac7fe4cd4f324f5c84d69f53ffdf_cat_sounds.txt, pfn: /home/scitech/shared-data/sample-deep-lfn-condorio-wf/nested-dir/cats/cat_sounds.txt


Plan:
[main] WARN  schema.JsonMetaSchema  - Unknown keyword $defs - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword additionalItems - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword examples - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
2020.06.12 01:03:35.337 UTC:    
2020.06.12 01:03:35.344 UTC:   ----------------------------------------------------------------------- 
2020.06.12 01:03:35.350 UTC:   File for submitting this DAG to HTCondor           : nested-dir-tutorial-0.dag.condor.sub 
2020.06.12 01:03:35.357 UTC:   Log of DAGMan debugging messages                 : nested-dir-tutorial-0.dag.dagman.out 
2020.06.12 01:03:35.363 UTC:   Log of HTCondor library output    

[[1;32m##################################################[0m] 100.0% ..Success ([1;32mCompleted: 6[0m, [1;33mQueued: 0[0m, [1;36mRunning: 0[0m, [1;31mFailed: 0[0m)
