In [39]:
import os
import fnmatch
import json
import dateutil.parser
import re
import pprint
import glob
import pandas as pd

In [52]:
with open("/treehouse/treeshop/manifests/148.txt") as f:
    ids = sorted([word.strip() for line in f.readlines() for word in line.split(',') if word.strip()])
print("Found {} ids".format(len(ids)))

Found 22 ids


In [53]:
# All methods
methods = [json.loads(open(m).read()) for id in ids 
           for m in glob.glob("/treehouse/archive/downstream/{}/secondary/*/methods.json".format(id))]
print("Found {} secondary outputs".format(len(methods)))

Found 110 secondary outputs


In [54]:
# UMEND QC
reports = [{"id": id, **json.loads(open(m).read())} for id in ids 
           for m in glob.glob("/treehouse/archive/downstream/{}/secondary/*bam-umend-qc-1.1.0-cc481e4/bam_umend_qc.json".format(id))]
print("{} Completed RNASeq and BAM UMEND QC".format(len(reports)))
print("{} Passed".format(sum(r["qc"] == "PASS" for r in reports)))
print("{} Failed: {}".format(sum(r["qc"] == "FAIL" for r in reports), [r["id"] for r in reports if r["qc"] == "FAIL"]))

22 Completed RNASeq and BAM UMEND QC
18 Passed
4 Failed: ['THR32_0958_S01', 'THR32_0959_S01', 'THR32_0961_S01', 'THR32_0962_S01']


In [51]:
[(r["id"], r["qc"], r["estExonicUniqMappedNonDupeReadCount"]) for r in reports]

[('THR32_0941_S01', 'PASS', 67902686.305),
 ('THR32_0948_S01', 'PASS', 55603774.125),
 ('THR32_0950_S01', 'PASS', 61940149.125),
 ('THR32_0955_S01', 'PASS', 63723171.19),
 ('THR32_0956_S01', 'PASS', 81539059.74),
 ('THR32_0958_S01', 'FAIL', 116638.62),
 ('THR32_0959_S01', 'FAIL', 116469.08),
 ('THR32_0960_S01', 'PASS', 89613340.78),
 ('THR32_0961_S01', 'FAIL', 142184.295),
 ('THR32_0962_S01', 'FAIL', 102334.05)]

In [44]:
reports = [{"id": id, 
            "methods": [m for m in glob.glob("/treehouse/archive/downstream/{}/secondary/*/methods.json".format(id))]}
            for id in ids]
[(r["id"], len(r["methods"])) for r in reports]

[('THR32_0941_S01', 5),
 ('THR32_0948_S01', 5),
 ('THR32_0950_S01', 5),
 ('THR32_0955_S01', 5),
 ('THR32_0956_S01', 3),
 ('THR32_0958_S01', 5),
 ('THR32_0959_S01', 5),
 ('THR32_0960_S01', 5),
 ('THR32_0961_S01', 0),
 ('THR32_0962_S01', 5)]

In [55]:
runtimes = pd.DataFrame([{"id": m["sample_id"], 
               "pipeline": m["pipeline"]["docker"]["url"],
               "duration": (dateutil.parser.parse(m["end"]) 
                            - dateutil.parser.parse(m["start"])).total_seconds()/(60*60)}
 for m in methods])
runtimes

Unnamed: 0,duration,id,pipeline
0,2.145330,THR32_0941_S01,https://hub.docker.com/alpine
1,17.353016,THR32_0941_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
2,5.714362,THR32_0941_S01,https://hub.docker.com/r/ucsctreehouse/bam-ume...
3,3.126580,THR32_0941_S01,https://hub.docker.com/r/ucsctreehouse/fusion
4,0.151184,THR32_0941_S01,https://hub.docker.com/r/ucsctreehouse/mini-va...
5,0.024285,THR32_0942_S01,https://hub.docker.com/alpine
6,23.184223,THR32_0942_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
7,9.450743,THR32_0942_S01,https://hub.docker.com/r/ucsctreehouse/bam-ume...
8,2.992001,THR32_0942_S01,https://hub.docker.com/r/ucsctreehouse/fusion
9,0.261034,THR32_0942_S01,https://hub.docker.com/r/ucsctreehouse/mini-va...


In [56]:
stats = runtimes.pivot(columns="pipeline").describe()
stats.mean()

          pipeline                                            
duration  https://hub.docker.com/alpine                            3.447476
          https://hub.docker.com/r/ucsctreehouse/bam-umend-qc      7.572514
          https://hub.docker.com/r/ucsctreehouse/fusion            4.881853
          https://hub.docker.com/r/ucsctreehouse/mini-var-call     2.881609
          https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline            15.750411
dtype: float64

In [57]:
stats.sum(axis=1)

count    110.000000
mean      25.208235
std       10.114932
min       10.003490
25%       17.880441
50%       25.618358
75%       29.683642
max       47.761812
dtype: float64