In [2]:
!sudo pip3 install --upgrade pip keras==2.1.5 tensorflow==1.13.1 numpy pandas pillow sklearn optuna scikit-image  optkeras

Collecting pip
  Downloading pip-20.2.3-py2.py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.9 MB/s eta 0:00:01
[?25hCollecting keras==2.1.5
  Downloading Keras-2.1.5-py2.py3-none-any.whl (334 kB)
[K     |████████████████████████████████| 334 kB 1.9 MB/s eta 0:00:01
[?25hCollecting tensorflow==1.13.1
  Downloading tensorflow-1.13.1-cp36-cp36m-manylinux1_x86_64.whl (92.5 MB)
[K     |████████████████████████████████| 92.5 MB 6.2 MB/s eta 0:00:011    |████                            | 11.6 MB 3.5 MB/s eta 0:00:24     |████▎                           | 12.3 MB 3.5 MB/s eta 0:00:24     |█████████▊                      | 28.0 MB 6.8 MB/s eta 0:00:10     |██████████▍                     | 30.0 MB 3.1 MB/s eta 0:00:21     |██████████▊                     | 31.0 MB 3.1 MB/s eta 0:00:20     |███████████▍                    | 32.8 MB 3.1 MB/s eta 0:00:20     |███████████▋                    | 33.6 MB 3.1 MB/s eta 0:00:20     |████████████                    | 34.4

[?25hCollecting PrettyTable<0.8,>=0.7.2
  Downloading prettytable-0.7.2.tar.bz2 (21 kB)
Collecting stevedore>=2.0.1
  Downloading stevedore-3.2.2-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 2.2 MB/s  eta 0:00:01
Collecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (88 kB)
[K     |████████████████████████████████| 88 kB 7.0 MB/s  eta 0:00:01
Collecting colorama>=0.3.7
  Downloading colorama-0.4.3-py2.py3-none-any.whl (15 kB)
Collecting pyperclip>=1.6
  Downloading pyperclip-1.8.0.tar.gz (16 kB)
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Using legacy 'setup.py install' for termcolor, since package 'wheel' is not installed.
Using legacy 'setup.py install' for PrettyTable, since package 'wheel' is not installed.
Using legacy 'setup.py install' for pyperclip, since package 'wheel' is not installed.

In [3]:
#!/usr/bin/env python3
import os
import logging
from pathlib import Path
import requests 
from glob import glob
from zipfile import ZipFile
import pickle
import pandas as pd

logging.basicConfig(level=logging.DEBUG)

#Import Pegasus API
from Pegasus.api import *

#Properties
props = Properties()
props["dagman.retry"] = "2"
props["pegasus.transfer.arguments"] = "-m 1"
props.write()

#Replica Catalog
rc = ReplicaCatalog()
input_files = glob('*.jpg')
input_files.sort()
in_files=[]

checkpoint_file = "checkpoint1.csv"
if not os.path.isfile(checkpoint_file):
    df = pd.DataFrame(list())
    df.to_csv(checkpoint_file)
        

for file in input_files:
    in_files.append(File(file))
    rc.add_replica("local", File(file), str(Path(".").resolve() / file))  
rc.add_replica("local", File(checkpoint_file), str(Path(".").resolve() / checkpoint_file)) 
rc.write()


#Transformation
pre_process_resize = Transformation( "preprocess1.py",
        site = "local",
        pfn = "/home/scitech/shared-data/CatsAndDogs/preprocess1.py",
        is_stageable=True)

pre_process_augment = Transformation( "Augmentation.py",
        site = "local",
        pfn = "/home/scitech/shared-data/CatsAndDogs/Augmentation.py",
        is_stageable=True)

data_split  = Transformation( "Data_Split.py",
        site = "local",
        pfn = "/home/scitech/shared-data/CatsAndDogs/Data_Split.py",
        is_stageable=True)

vgg_model  = Transformation( "VGG_model.py",
        site = "local",
        pfn = "/home/scitech/shared-data/CatsAndDogs/VGG_model.py",
        is_stageable=True)

hpo =  Transformation( "hpo_checkpointing.py",
        site = "local",
        pfn=str(Path(".").resolve() /"hpo_checkpointing.py"),
        is_stageable=True)
                    
tc = TransformationCatalog()\
    .add_transformations(pre_process_resize,pre_process_augment,data_split,vgg_model,hpo)\
    .write()

#Workflow
wf = Workflow("Cats_and_Dogs", infer_dependencies=True)


resized_images = File('resized_images.txt')
all_files = [File("resized_{}".format(f.lfn)) for f in in_files]
labels = File('labels.txt')

job_preprocess1 = Job(pre_process_resize)\
                    .add_inputs(*in_files)\
                    .add_outputs(*all_files,resized_images,labels) 

aug_images_txt = File('augmentation.txt')
aug_labels_txt = File('aug_labels.txt')
augmented_files = []
for f in all_files:
    augmented_files.extend([File(str(f).replace("{}".format(os.path.splitext(str(f))[0]), "Aug_{}_{}".format(os.path.splitext(str(f))[0],i))) for i in range(3)])

    
job_preprocess2 = Job(pre_process_augment)\
                    .add_inputs(*all_files,labels)\
                    .add_outputs(aug_images_txt,aug_labels_txt,*augmented_files)

training_data = File('training.pkl')
testing_data = File('testing.pkl')
val_data = File('validation.pkl')

job_data_split = Job(data_split)\
                    .add_inputs(*augmented_files,labels)\
                    .add_outputs(training_data,testing_data,val_data)

model = File('model.h5')

job_vgg_model = Job(vgg_model)\
                    .add_inputs(*augmented_files,training_data,testing_data,val_data)\
                    .add_outputs(model)

job_hpo = Job(hpo)\
                    .add_checkpoint(File(checkpoint_file), stage_out=True)\
                    .add_inputs(model,*augmented_files,training_data,testing_data,val_data)\
                    .add_profiles(Namespace.PEGASUS, key="maxwalltime", value=100)

wf.add_jobs(job_preprocess1,job_preprocess2,job_data_split,job_vgg_model,job_hpo)                                    

<Pegasus.api.workflow.Workflow at 0x7f652eca7f28>

In [4]:
try:
     wf.plan(submit=True)\
        .wait()\
        .analyze()\
        .statistics()
except PegasusClientError as e:
    print(e.output)


################
# pegasus-plan #
################
[main] WARN  schema.JsonMetaSchema  - Unknown keyword $defs - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword additionalItems - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword examples - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
2020.09.28 19:15:14.109 UTC: [FATAL ERROR]  
 [1] java.lang.RuntimeException: Unable to parse URL file:///home/scitech/shared-data/Scientific-Workflows-master 3/shared-data/CatsAndDogs at edu.isi.pegasus.common.util.PegasusURL.parse(PegasusURL.java:113) 



[main] WARN  schema.JsonMetaSchema  - Unknown keyword $defs - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword additionalItems - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword
[main] WARN  schema.JsonMetaSchema  - Unknown keyword examples - you should define your own Meta Schema. If the keyword is irrelevant for validation, just use a NonValidationKeyword

2020.09.28 19:15:14.109 UTC: [FATAL ERROR]  
 [1] java.lang.RuntimeException: Unable to parse URL file:///home/scitech/shared-data/Scientific-Workflows-master 3/shared-data/CatsAndDogs at edu.isi.pegasus.common.util.PegasusURL.parse(PegasusURL.java:113) 



In [139]:
import csv

with open("wf-output/checkpoint1.csv", encoding="utf8") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")

    for row in csvreader:
        print(": ".join(row))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [140]:
with open("wf-output/checkpoint1.csv", 'r', encoding='ISO-8859-1', newline='') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")
    for row in csvreader:
        print(', '.join(row))

Error: line contains NULL byte

In [126]:
import codecs
csvReader = csv.reader(codecs.open("wf-output/checkpoint.csv", 'rU', 'ISO-8859-1'))

In [127]:
for row in csvReader:
        print(', '.join(row))

Error: line contains NULL byte