In [1]:
!sudo pip3 install --upgrade pip keras==2.1.5 tensorflow==1.13.1 numpy pandas pillow sklearn optuna scikit-image  optkeras h5py==2.10.0

Collecting pip
  Downloading pip-21.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.6 MB/s eta 0:00:01
[?25hCollecting keras==2.1.5
  Downloading Keras-2.1.5-py2.py3-none-any.whl (334 kB)
[K     |████████████████████████████████| 334 kB 2.5 MB/s eta 0:00:01
[?25hCollecting tensorflow==1.13.1
  Downloading tensorflow-1.13.1-cp36-cp36m-manylinux1_x86_64.whl (92.5 MB)
[K     |████████████████████████████████| 92.5 MB 113 kB/s  eta 0:00:01   |▉                               | 2.3 MB 3.0 MB/s eta 0:00:30     |█                               | 2.7 MB 3.0 MB/s eta 0:00:30     |██████▉                         | 19.6 MB 3.3 MB/s eta 0:00:22     |███████▍                        | 21.5 MB 9.6 MB/s eta 0:00:08     |████████▍                       | 24.3 MB 9.6 MB/s eta 0:00:08     |████████▌                       | 24.7 MB 9.6 MB/s eta 0:00:08     |███████████████████████▊        | 68.7 MB 7.5 MB/s eta 0:00:04     |█████████████████████████       | 72.1 MB 8.5 M

Collecting greenlet!=0.4.17; python_version >= "3"
  Downloading greenlet-1.0.0-cp36-cp36m-manylinux2010_x86_64.whl (156 kB)
[K     |████████████████████████████████| 156 kB 14.3 MB/s eta 0:00:01
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 2.7 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting pyperclip>=1.6
  Downloading pyperclip-1.8.2.tar.gz (20 kB)
Collecting colorama>=0.3.7
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Using legacy 'setup.py install' for termcolor, since package 'wheel' is not installed.
Using legacy 'setup.py install' for pyperclip, since package 'wheel' is not installed.
Installing collected packages: pip, numpy, scipy, keras, gast, keras-preprocessing, absl-py, mock, tensorflow-estimator, grpcio

In [5]:
#!/usr/bin/env python3
import os
import logging
from pathlib import Path
import requests 
from glob import glob
from zipfile import ZipFile
import pickle
import pandas as pd

logging.basicConfig(level=logging.DEBUG)

#Import Pegasus API
from Pegasus.api import *

#Properties
# Collect the settings in one mapping, copy them into the Properties object,
# then write the properties out.
props = Properties()
for prop_key, prop_value in {
    "dagman.retry": "100",
    "pegasus.transfer.arguments": "-m 1",
}.items():
    props[prop_key] = prop_value
props.write()

#Replica Catalog
rc = ReplicaCatalog()
workdir = Path(".").resolve()

# Every *.jpg in the current directory is a workflow input, in sorted order.
input_files = sorted(glob('*.jpg'))

# Make sure both checkpoint files exist before they are registered below.
# NOTE(review): the placeholders are written with DataFrame.to_csv, so they hold
# CSV text rather than real HDF5 / pickle data - confirm the consuming scripts
# tolerate that on their first run.
checkpoint_file = "checkpoint_file2.hdf5"
hpo_checkpoint_file = 'hpo_checkpoint.pkl'
for placeholder in (checkpoint_file, hpo_checkpoint_file):
    if not os.path.isfile(placeholder):
        pd.DataFrame(list()).to_csv(placeholder)

# One File handle per image for the jobs, and one "local" replica per image
# pointing at its absolute path.
in_files = [File(name) for name in input_files]
for name in input_files:
    rc.add_replica("local", File(name), str(workdir / name))

rc.add_replica("local", checkpoint_file, workdir / checkpoint_file)
rc.add_replica("local", hpo_checkpoint_file, workdir / hpo_checkpoint_file)
rc.write()


#Transformation
# Docker container registered in the catalog. Only the disabled transformations
# further down reference it.
tools_container = Container(
    "tools-container",
    Container.DOCKER,
    image="docker:///ssrujanaa/catsanddogs:latest",
)

# Active transformations: stageable scripts registered on the "local" site.
pre_process_resize = Transformation(
    "preprocess1.py",
    site="local",
    pfn="/home/scitech/shared-data/CatsAndDogs/preprocess1.py",
    is_stageable=True,
)

data_split = Transformation(
    "Data_Split.py",
    site="local",
    pfn="/home/scitech/shared-data/CatsAndDogs/Data_Split.py",
    is_stageable=True,
)

# Disabled transformations, kept so the full pipeline can be re-enabled.
# pre_process_augment = Transformation( "Augmentation.py",
#             site="condorpool",
#             pfn="/usr/bin/Augmentation.py",
#             is_stageable=True
#             )

# hpo  = Transformation( "hpo_checkpointing.py",
#             site="condorpool",
#             pfn="/usr/bin/hpo_checkpointing.py",
#             is_stageable=False,
#             container=tools_container
#             )

# vgg_model  = Transformation( "VGG_model.py",
#             site="condorpool",
#             pfn="/usr/bin/VGG_model.py",
#             is_stageable=False,
#             container=tools_container
#             )

# test_model =  Transformation( "Test.py",
#             site="local",
#             pfn="/home/scitech/shared-data/CatsAndDogs/Test.py",
#             is_stageable=True
#             )

# Register the container and the two active transformations, then write the catalog.
tc = (
    TransformationCatalog()
    .add_containers(tools_container)
    .add_transformations(pre_process_resize, data_split)
    .write()
)
# Full-pipeline variant of the add_transformations call:
#     .add_transformations(pre_process_resize,pre_process_augment,data_split,hpo,vgg_model,test_model)\
#Workflow
wf = Workflow("Cats_and_Dogs", infer_dependencies=True)

# Outputs declared for the resize step: two text files plus one
# "resized_<lfn>" file per input image.
resized_images = File('resized_images.txt')
labels = File('labels.txt')
all_files = [File("resized_{}".format(src.lfn)) for src in in_files]

job_preprocess1 = (
    Job(pre_process_resize)
    .add_inputs(*in_files)
    .add_outputs(*all_files, resized_images, labels)
)

# The Augmentation.py, hpo_checkpointing.py, VGG_model.py and Test.py
# transformations are commented out in the Transformation section, so the
# variables pre_process_augment / hpo / vgg_model / test_model do not exist on a
# fresh kernel. The jobs that use them therefore name their transformation by
# string. They are still built here, but only the resize and data-split jobs are
# added to the workflow at the bottom of this cell.

# Augmentation step: three "Aug_<stem>_<i><ext>" files per resized image.
aug_images_txt = File('augmentation.txt')
aug_labels_txt = File('aug_labels.txt')
augmented_files = []
for resized in all_files:
    stem, ext = os.path.splitext(resized.lfn)
    augmented_files.extend(
        File("Aug_{}_{}{}".format(stem, i, ext)) for i in range(3)
    )

job_preprocess2 = (
    Job("Augmentation.py")
    .add_inputs(*all_files, labels)
    .add_outputs(aug_images_txt, aug_labels_txt, *augmented_files)
)

# Data split: consumes the resized images directly while augmentation is
# disabled. Swap *all_files for *augmented_files when re-enabling it.
training_data = File('training.pkl')
testing_data = File('testing.pkl')
val_data = File('validation.pkl')

job_data_split = (
    Job(data_split)
    .add_inputs(*all_files, labels)
    .add_outputs(training_data, testing_data, val_data)
)

# Hyper-parameter search; its checkpoint file is staged out with the job.
model = File('model.h5')
output_file = File('hpo_results.pkl')
job_hpo = (
    Job("hpo_checkpointing.py")
    .add_checkpoint(File(hpo_checkpoint_file), stage_out=True)
    .add_inputs(*augmented_files, training_data, testing_data, val_data)
    .add_profiles(Namespace.PEGASUS, key="maxwalltime", value=1)
    .add_outputs(output_file)
)

# Model training, fed by the HPO results.
# NOTE(review): "-epochs" has a single dash while "--batch_size" has two -
# confirm against the argument parser in VGG_model.py.
job_vgg_model = (
    Job("VGG_model.py")
    .add_args("-epochs", 6, "--batch_size", 2)
    .add_checkpoint(File(checkpoint_file), stage_out=True)
    .add_inputs(*augmented_files, training_data, testing_data, val_data, output_file)
    .add_profiles(Namespace.PEGASUS, key="maxwalltime", value=1)
    .add_outputs(model)
)

# Evaluation of the trained model.
results_file = File('Result_Metrics.txt')
job_test_model = (
    Job("Test.py")
    .add_inputs(*augmented_files, testing_data, model)
    .add_outputs(results_file)
)

# Full pipeline (requires the disabled transformations to be registered again):
# wf.add_jobs(job_preprocess1,job_preprocess2,job_data_split,job_hpo,job_vgg_model,job_test_model)
wf.add_jobs(job_preprocess1, job_data_split)

<Pegasus.api.workflow.Workflow at 0x7f8f56c22208>

In [6]:
try:
    # Plan and submit the workflow, wait on it, then run analyze and statistics.
    (
        wf.plan(submit=True)
        .wait()
        .analyze()
        .statistics()
    )
except PegasusClientError as err:
    # Show the output captured on the client error.
    print(err.output)


################
# pegasus-plan #
################
[main] WARN  schema.JsonMetaSchema  - Unknown keyword $defs - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword additionalItems - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword examples - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
2021.04.16 19:23:24.127 UTC:
2021.04.16 19:23:24.133 UTC:   -----------------------------------------------------------------------
2021.04.16 19:23:24.138 UTC:   File for submitting this DAG to HTCondor           : Cats_and_Dogs-0.dag.condor.sub
2021.04.16 19:23:24.144 UTC:   Log of DAGMan debugging messages                 : Cats_and_Dogs-0.dag.dagman.out
2021.04.16 19:23:24.152 UTC:   Log of 

[[1;32m#######[0m-------------------------------------------]  14.3% ..Running ([1;32mCompleted: 2[0m, [1;33mQueued: 0[0m, [1;36mRunning: 0[0m, [1;31mFailed: 0[0m)

KeyboardInterrupt: 