In [29]:

import os
import re
import glob
import collections as col

import numpy as np

root_path = '/home/local/work/data/hgsvc/rsrc_stats'

def load_resources(file_path):
    """Read one resource-statistics table and return its key figures.

    The file is expected to hold a header line followed by one line of
    tab-separated values. Column 0 is returned as the wall time and
    column 2 as the memory figure (seconds and megabytes according to
    the variable naming; the files themselves carry no unit info here).

    The CPU multiplier is taken from the file name: if the second
    dot-separated component of the base name has the form ``t<digits>``
    (e.g. ``sample_cluster1.t6.rsrc``), the number is used; otherwise
    the multiplier defaults to 1.

    Args:
        file_path: Path to the resource table.

    Returns:
        Tuple ``(walltime_sec, memory_mb, cpu_mult)``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, IndexError: If the value line is missing columns or
            holds non-numeric entries.
    """
    cpu_mult = 1
    name_parts = os.path.basename(file_path).split('.', 2)
    # A name without any dot has no room for a thread component.
    threads_info = name_parts[-2] if len(name_parts) > 1 else ''
    # Require "t" followed only by digits, so that names which merely
    # start with "t" (e.g. "test.rsrc") do not crash the int() conversion.
    thread_match = re.fullmatch('t([0-9]+)', threads_info)
    if thread_match is not None:
        cpu_mult = int(thread_match.group(1))
    with open(file_path, 'r') as table:
        table.readline()  # skip the header line
        rsrc_values = table.readline().split('\t')
    walltime_sec = float(rsrc_values[0])
    memory_mb = float(rsrc_values[2])
    return walltime_sec, memory_mb, cpu_mult


def extract_cluster(file_name):
    """Split a file name into its sample prefix and cluster identifier.

    The cluster identifier is the first occurrence of ``cluster``
    followed by one or more digits; the sample is everything before the
    first underscore of the name.

    Args:
        file_name: Base name of a per-cluster resource file.

    Returns:
        Tuple ``(sample, cluster_id)``.

    Raises:
        ValueError: If no ``cluster<digits>`` token occurs in the name.
    """
    hit = re.search('cluster[0-9]+', file_name)
    if hit is not None:
        sample, _, _ = file_name.partition('_')
        return sample, hit.group(0)
    raise ValueError('Cannot id cluster name: {}'.format(file_name))


def find_outliers(run_infos):
    """Rank runs by how far their runtime and memory exceed the mean.

    For every run, runtime and memory are divided by the respective mean
    over all runs (rounded to 2 decimals). The average of the two ratios
    (rounded to 1 decimal) serves as the ranking score.

    Args:
        run_infos: Sequence of tuples whose first four items are
            ``(sample, seq_id, runtime, memory)``; further items are
            ignored.

    Returns:
        List of tuples ``(score, "<sample>_<seq_id>", runtime_ratio,
        runtime, memory_ratio, memory)`` sorted in descending order.
        An empty input yields an empty list.
    """
    if not run_infos:
        # Avoid taking the mean of an empty array (NaN plus a warning).
        return []

    # Plain Python strings suffice here; the removed ``np.object`` alias
    # (gone since NumPy 1.24) is not needed.
    header = [t[0] + '_' + t[1] for t in run_infos]
    runtime = np.array([t[2] for t in run_infos], dtype=np.float64)
    memory = np.array([t[3] for t in run_infos], dtype=np.float64)

    runtime_frac = (runtime / runtime.mean()).round(2)
    memory_frac = (memory / memory.mean()).round(2)

    mean_rank = ((runtime_frac + memory_frac) / 2).round(1)

    candidates = zip(mean_rank, header, runtime_frac, runtime, memory_frac, memory)
    return sorted(candidates, reverse=True)
        
    

# Each entry is (glob pattern below <root_path>/<sub_path>,
#                whether file names are parsed with extract_cluster,
#                short label used as part of the grouping key).
check_paths = [
    ('variant_calls/deepvar/*/*/processing/10-norm/splits/*.rsrc', True, 'VarCall'),
    ('integrative_phasing/processing/whatshap/deepvar_QUAL10_GQ100/*/*/*/*.rsrc', True, 'WHPhase'),
    ('diploid_assembly/strandseq_split/deepvar_QUAL10_GQ100/*/*/*/polishing/alignments/*h?-un*racon-p1*.rsrc', True, 'AlnPol1'),
    ('diploid_assembly/strandseq_split/deepvar_QUAL10_GQ100/*/*/*/polishing/alignments/*h?-un*racon-p2*.rsrc', True, 'AlnPol2'),
    ('reference_assembly/non-hap-res/*.rsrc', False, 'NHRassm'),
    ('diploid_assembly/strandseq_split/deepvar_QUAL10_GQ100/*/*/*/draft/haploid_assembly/*.rsrc', True, 'ClustAssm')
]

# Maps (label, sub_path) -> list of (sample, seq_id, walltime, memory, cpu_mult).
rsrc_collect = col.defaultdict(list)

for sub_path in ['hifi']:
    search_path = os.path.join(root_path, sub_path)
    for (path, is_cluster, label) in check_paths:
        files = glob.glob(os.path.join(search_path, path))
        # Files without a cluster id are recorded under the fixed id 'wg'.
        extractor = extract_cluster if is_cluster else (lambda x: (x.split('_')[0], 'wg'))
        for f in files:
            sample, seq_id = extractor(os.path.basename(f))
            walltime_sec, memory_mb, cpu_mult = load_resources(f)
            run_record = (sample, seq_id, walltime_sec, memory_mb, cpu_mult)
            rsrc_collect[(label, sub_path)].append(run_record)

# Print, per (label, sub_path) group, every run whose mean score from
# find_outliers is above 3, followed by a separator line.
for run_type, infos in rsrc_collect.items():
    print(run_type)
    res = find_outliers(infos)
    for r in res:
        if not (r[0] > 3):
            continue
        print(r)
    print('===')


('VarCall', 'hifi')
===
('WHPhase', 'hifi')
===
('AlnPol1', 'hifi')
(34.7, 'NA19240_cluster16', 64.3, 45765.4544, 5.15, 34163.99)
(34.5, 'NA19240_cluster16', 63.95, 45518.5325, 4.97, 33025.76)
(18.4, 'HG00513_cluster16', 33.62, 23927.6205, 3.25, 21554.93)
(17.4, 'HG00513_cluster16', 31.07, 22111.3438, 3.69, 24475.39)
(7.6, 'HG02818_cluster21', 11.96, 8509.4627, 3.28, 21775.25)
(7.4, 'HG02818_cluster21', 11.6, 8253.4252, 3.19, 21185.18)
(5.2, 'HG03125_cluster9', 8.08, 5753.7531, 2.42, 16091.38)
(4.8, 'HG03125_cluster9', 7.27, 5171.9566, 2.39, 15889.26)
(4.2, 'HG02818_cluster24', 5.42, 3858.062, 2.97, 19724.46)
(3.2, 'HG02818_cluster24', 4.26, 3032.4036, 2.22, 14734.21)
===
('AlnPol2', 'hifi')
(45.4, 'NA19240_cluster16', 85.41, 42042.7095, 5.34, 32951.34)
(45.2, 'NA19240_cluster16', 84.87, 41774.7158, 5.43, 33530.82)
(24.0, 'HG00513_cluster16', 44.53, 21919.4059, 3.4, 20965.08)
(22.7, 'HG00513_cluster16', 42.05, 20698.5265, 3.3, 20372.53)
(14.5, 'HG00733_cluster3', 25.55, 12578.6265, 3.3