In [2]:
import os
import fnmatch
import json
import dateutil.parser
import re
import pprint
import glob
import pandas as pd

In [14]:
with open("/treehouse/treeshop/manifests/134.txt") as f:
    ids = sorted([word.strip() for line in f.readlines() for word in line.split(',') if word.strip()])
print("Found {} ids".format(len(ids)))

Found 84 ids


In [15]:
# Get all methods
methods = [json.loads(open(m).read()) for id in ids 
           for m in glob.glob("/treehouse/archive/downstream/{}/secondary/*/methods.json".format(id))]
print("Found {} secondary outputs".format(len(methods)))

Found 335 secondary outputs


In [11]:
# UMEND QC
reports = [{"id": id, **json.loads(open(m).read())} for id in ids 
           for m in glob.glob("/treehouse/archive/downstream/{}/secondary/*bam-umend-qc-*/bam_umend_qc.json".format(id))]
print("{} Completed RNASeq and BAM UMEND QC".format(len(reports)))
print("{} Passed".format(sum(r["qc"] == "PASS" for r in reports)))
print("{} Failed: {}".format(sum(r["qc"] == "FAIL" for r in reports), [r["id"] for r in reports if r["qc"] == "FAIL"]))

1 Completed RNASeq and BAM UMEND QC
1 Passed
0 Failed: []


In [5]:
[(r["id"], r["qc"], r["estExonicUniqMappedNonDupeReadCount"]) for r in reports]

[('TH34_1149_S01', 'PASS', 42801908.76),
 ('TH34_1150_S01', 'PASS', 74938147.44)]

In [6]:
reports = [{"id": id, 
            "methods": [m for m in glob.glob("/treehouse/archive/downstream/{}/secondary/*/methods.json".format(id))]}
            for id in ids]
[(r["id"], len(r["methods"])) for r in reports]

[('TH34_1149_S01', 5), ('TH34_1150_S01', 5)]

In [12]:
runtimes = pd.DataFrame([{"id": m["sample_id"], 
               "pipeline": m["pipeline"]["docker"]["url"],
               "duration": (dateutil.parser.parse(m["end"]) 
                            - dateutil.parser.parse(m["start"])).total_seconds()/(60*60)}
 for m in methods])
runtimes.groupby("id").head()

Unnamed: 0,duration,id,pipeline
0,0.019739,TH34_1149_S01,https://hub.docker.com/alpine
1,21.380914,TH34_1149_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
2,5.655238,TH34_1149_S01,https://hub.docker.com/r/ucsctreehouse/bam-ume...
3,3.653716,TH34_1149_S01,https://hub.docker.com/r/ucsctreehouse/fusion
4,0.299909,TH34_1149_S01,https://hub.docker.com/r/ucsctreehouse/mini-va...
5,0.025909,TH34_1150_S01,https://hub.docker.com/alpine
6,18.864039,TH34_1150_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
7,6.689221,TH34_1150_S01,https://hub.docker.com/r/ucsctreehouse/bam-ume...
8,4.068017,TH34_1150_S01,https://hub.docker.com/r/ucsctreehouse/fusion
9,0.215631,TH34_1150_S01,https://hub.docker.com/r/ucsctreehouse/mini-va...


In [10]:
stats = runtimes.pivot(columns="pipeline").describe()
stats.max()

          pipeline                                            
duration  https://hub.docker.com/alpine                            2.000000
          https://hub.docker.com/r/ucsctreehouse/bam-umend-qc      6.689221
          https://hub.docker.com/r/ucsctreehouse/fusion            4.068017
          https://hub.docker.com/r/ucsctreehouse/mini-var-call     2.000000
          https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline            21.380914
dtype: float64

In [9]:
stats.sum(axis=1)

count    10.000000
mean     30.436167
std       2.867746
min      28.408364
25%      29.422266
50%      30.436167
75%      31.450068
max      32.463970
dtype: float64