# SISTER Production Summary

In [2]:
import json
import os
import pandas as pd
import IPython
import datetime as dt 
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from maap.maap import MAAP
# API client used below to list jobs; the host is named so it is easy to spot and change.
MAAP_HOST = "sister-api.imgspec.org"
maap = MAAP(maap_host=MAAP_HOST)

### Get list of jobs

_Rerun this cell to retrieve an up-to-date job list_

In [3]:
# Request the job listing for the 'anonymous' user and decode the JSON body.
# `job_list` keeps the raw text; `jobs` is the parsed structure used downstream.
jobs_response = maap.listJobs('anonymous')
job_list = jobs_response.text
jobs = json.loads(job_list)

### Set regex pattern to match tags

In [4]:
# CRID suffix that a job tag must end with to be included in the summary.
crid = "001"

# Tags of interest start with "SISTER" and end with the CRID. The CRID is
# escaped so it is always matched literally, even if a future value contains
# regex metacharacters.
pattern = f"^SISTER.*{re.escape(crid)}$"

### Cycle through jobs and store job information for tags matching regex pattern
_If duplicate tags exist, only the job with the most recent start time is retained_

TODO: Retain all jobs, index by job ID instead of tag

In [5]:
# Maps job tag -> summary record (dict) of the retained job for that tag.
production = {}

# Placeholder timestamp for jobs that have not reported a start/end time, and
# the format used to parse every timestamp.
MISSING_TIME = '2000-01-01T00:00:00.0Z'
TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'

for job in jobs['jobs']:
    # Each entry is indexed by its job ID, taken here as the first key.
    job_id = list(job.keys())[0]
    job_record = job[job_id]

    # A job without tags cannot match the production pattern; skip it instead
    # of failing on the missing first tag.
    tags = job_record.get('tags') or []
    if not tags:
        continue
    tag = tags[0]

    if not re.match(pattern, tag):
        continue

    details = job_record['job']['job_info']

    job_info = {}
    job_info['id'] = job_id
    job_info['status'] = job_record['status'].replace('job-', '')

    # Tags are underscore-separated: workflow ('CWL') tags have five fields,
    # all others six (with an extra processing-level field). Unused fields get
    # private names so the notebook-level `crid` is not overwritten and no
    # variable called `datetime` is created.
    if 'CWL' in tag:
        _prefix, sensor, product, _timestamp, _tag_crid = tag.split('_')
        job_info['PGE'] = product
    else:
        _prefix, sensor, level, product, _timestamp, _tag_crid = tag.split('_')
        job_info['PGE'] = f'{level}_{product}'

    job_info['sensor'] = sensor

    start_time = details.get('time_start', MISSING_TIME)
    job_info['start_time'] = dt.datetime.strptime(start_time, TIME_FORMAT)

    end_time = details.get('time_end', MISSING_TIME)
    job_info['end_time'] = dt.datetime.strptime(end_time, TIME_FORMAT)

    # Reported duration divided by 60 (presumably seconds -> minutes; confirm
    # against the API). -1 marks a job with no reported duration.
    if 'duration' in details:
        job_info['duration'] = float(details['duration']) / 60
    else:
        job_info['duration'] = -1

    # NaN when the job carries no instance facts.
    job_info['instance_type'] = details.get('facts', {}).get('ec2_instance_type', np.nan)

    output_datasets = []
    input_datasets = []
    inputs_disk_usage = 0
    staged_disk_usage = 0

    if job_info['status'] == 'completed':
        # Both metric lists are read through the same guard: a completed job
        # with no 'metrics' entry contributes nothing instead of raising
        # KeyError.
        metrics = details.get('metrics', {})

        for staged in metrics.get('products_staged', []):
            # Only staged products whose dataset type starts with 'L' are counted.
            if staged['dataset_type'].startswith('L'):
                # Disk usage is scaled by 1E9 (presumably bytes -> GB; confirm).
                staged_disk_usage += staged['disk_usage'] / 1E9
                for url in staged['urls']:
                    if url.startswith('http'):
                        output_datasets.append(url)

        for input_dataset in metrics.get('inputs_localized', []):
            inputs_disk_usage += input_dataset['disk_usage'] / 1E9
            input_datasets.append(input_dataset['url'])

    job_info['output_datasets'] = output_datasets
    job_info['input_datasets'] = input_datasets
    job_info['inputs_disk_usage'] = inputs_disk_usage
    job_info['staged_disk_usage'] = staged_disk_usage

    # Store the record only once it is complete. For duplicate tags, keep the
    # job with the latest start time.
    if tag not in production or job_info['start_time'] > production[tag]['start_time']:
        production[tag] = job_info

## Create production results dataframe
---

In [7]:
# One row per retained job, indexed by tag. Building the frame in a single
# call from the list of records:
#   * does not depend on the loop variable `job_info` leaking out of the
#     previous cell (which is undefined when no job matched), and
#   * lets pandas infer column dtypes, so numeric columns are numeric rather
#     than `object` as they are when an empty frame is filled row by row.
production_df = pd.DataFrame(list(production.values()),
                             index=list(production.keys()))



## Get job statistics by sensor and PGE

In [9]:
group_keys = ['sensor', 'PGE']
numeric_columns = ['duration', 'inputs_disk_usage', 'staged_disk_usage']


def _most_common(values):
    """Return the most frequent non-null value in `values`, or NaN if there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Only completed jobs contribute to the statistics.
prod_complete = production_df[production_df.status == 'completed']

# Cast the numeric columns explicitly so the averages are floats regardless of
# how the frame was built, then group once and reuse the grouping.
prod_grouped = (prod_complete
                .astype({column: float for column in numeric_columns})
                .groupby(group_keys))

# Select columns *before* aggregating: averaging or taking the mode of every
# column (strings, timestamps, lists) is wasted work and raises on recent
# pandas versions.
production_mean = prod_grouped[numeric_columns].mean()
production_mode = prod_grouped['instance_type'].agg(_most_common)
production_count = prod_grouped.count()

production_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,inputs_disk_usage,staged_disk_usage
sensor,PGE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AVCL,CWL,164.50669,0.0,0.0
AVCL,L1B_RDN,23.760056,3.127971,3.352559
AVCL,L2A_CORFL,5.367303,3.277368,3.129379
AVCL,L2A_RFL,104.450215,3.490446,6.593808
AVCL,L2A_RSRFL,6.131268,6.638825,6.253548
AVCL,L2B_FRCOVER,9.657685,3.129379,0.039016
AVCL,L2B_GRAINSIZE,1.459346,3.183356,0.000592
AVCL,L2B_ROUTER,0.343597,0.039016,0.0
AVCL,L2B_VEGBIOCHEM,2.221901,3.168395,0.040053
AVNG,CWL,62.580039,0.0,0.0


## Print CSV

In [11]:
# Display names for each PGE code; iteration order defines the CSV row order.
PGEs = {'CWL':'CWL Workflow',
         'L1B_RDN':'L1 Preprocess',
         'L2A_RFL':'L2 ISOFIT',
         'L2A_RSRFL' :'L2 Resample',
         'L2A_CORFL':'L2 Correction',
         'L2B_ROUTER': 'L2 Router',
         'L2B_FRCOVER':'L2 Fractional Cover',
         'L2B_GRAINSIZE':'L2 Snow grain size',
         'L2B_VEGBIOCHEM':'L2 Vegetation Traits',}

# Display names for each sensor code.
sensors = {'AVCL': 'AVIRIS-CL',
           'AVNG': 'AVIRIS-NG',
           'PRISMA':'PRISMA',
           'DESIS':'DESIS'}

# One CSV line per (PGE, sensor):
#   label, mean duration, mean input disk usage, mean staged disk usage,
#   most common instance type, number of completed jobs
for pge, pge_label in PGEs.items():
    for sensor, sensor_label in sensors.items():
        # This combination is deliberately left out of the report
        # (presumably snow grain size is not produced for DESIS; confirm).
        if sensor == 'DESIS' and pge == 'L2B_GRAINSIZE':
            continue

        # Skip combinations with no completed jobs instead of raising KeyError.
        if (sensor, pge) not in production_mean.index:
            continue

        row = (production_mean.loc[(sensor, pge)]
               .values.astype(float).round(3).astype(str).tolist())
        # str() keeps the join below working even when the type is missing (NaN).
        instance_type = str(production_mode.loc[(sensor, pge)])
        count = str(production_count.loc[(sensor, pge), 'id'])

        print(','.join([f'{pge_label} - {sensor_label}'] + row + [instance_type, count]))


CWL Workflow - AVIRIS-CL,164.507,0.0,0.0,t3a.large,23
CWL Workflow - AVIRIS-NG,62.58,0.0,0.0,t3a.large,59
CWL Workflow - PRISMA,52.929,0.0,0.0,t3.large,27
CWL Workflow - DESIS,47.242,0.0,0.0,t3.large,8
L1 Preprocess - AVIRIS-CL,23.76,3.128,3.353,c5.9xlarge,25
L1 Preprocess - AVIRIS-NG,11.443,8.427,0.3,t3.xlarge,61
L1 Preprocess - PRISMA,6.506,0.804,1.408,t3a.xlarge,28
L1 Preprocess - DESIS,2.274,0.565,2.157,t3a.xlarge,8
L2 ISOFIT - AVIRIS-CL,104.45,3.49,6.594,c5.9xlarge,24
L2 ISOFIT - AVIRIS-NG,22.91,0.3,0.567,c5.9xlarge,61
L2 ISOFIT - PRISMA,21.04,1.402,2.651,c5.9xlarge,27
L2 ISOFIT - DESIS,18.512,2.157,4.084,c5.9xlarge,8
L2 Resample - AVIRIS-CL,6.131,6.639,6.254,t3a.xlarge,23
L2 Resample - AVIRIS-NG,0.633,0.567,0.282,t3.xlarge,61
L2 Resample - PRISMA,1.943,2.651,2.432,t3a.xlarge,27
L2 Resample - DESIS,1.777,4.084,1.049,t3a.xlarge,8
L2 Correction - AVIRIS-CL,5.367,3.277,3.129,t3a.xlarge,23
L2 Correction - AVIRIS-NG,0.766,0.16,0.145,t3a.xlarge,59
L2 Correction - PRISMA,1.089,1.275,1.21