In [2]:
import os
import fnmatch
import json
import dateutil.parser
import re
import pprint
import glob
import pandas as pd

In [14]:
with open("/treehouse/treeshop/manifests/134.txt") as f:
    ids = sorted([word.strip() for line in f.readlines() for word in line.split(',') if word.strip()])
print("Found {} ids".format(len(ids)))

Found 84 ids


In [1]:
ids = ["SJINF001_E", "SJBALL010_D"]

In [6]:
# Get all methods
methods = [json.loads(open(m).read()) for id in ids 
           for m in glob.glob("/treehouse/treeshop/stjude/btfv9/{}/secondary/*/methods.json".format(id))]
print("Found {} secondary outputs".format(len(methods)))

Found 10 secondary outputs


In [11]:
# UMEND QC
reports = [{"id": id, **json.loads(open(m).read())} for id in ids 
           for m in glob.glob("/treehouse/treeshop/stjude/btfv9/{}/secondary/*/methods.json".format(id))]
print("{} Completed RNASeq and BAM UMEND QC".format(len(reports)))
print("{} Passed".format(sum(r["qc"] == "PASS" for r in reports)))
print("{} Failed: {}".format(sum(r["qc"] == "FAIL" for r in reports), [r["id"] for r in reports if r["qc"] == "FAIL"]))

10 Completed RNASeq and BAM UMEND QC


KeyError: 'qc'

In [5]:
[(r["id"], r["qc"], r["estExonicUniqMappedNonDupeReadCount"]) for r in reports]

[('TH34_1149_S01', 'PASS', 42801908.76),
 ('TH34_1150_S01', 'PASS', 74938147.44)]

In [9]:
reports = [{"id": id, 
            "methods": [m for m in glob.glob("/treehouse/treeshop/stjude/btfv9/{}/secondary/*/methods.json".format(id))]}
            for id in ids]
[(r["id"], len(r["methods"])) for r in reports]

[('SJINF001_E', 5), ('SJBALL010_D', 5)]

In [10]:
runtimes = pd.DataFrame([{"id": m["sample_id"], 
               "pipeline": m["pipeline"]["docker"]["url"],
               "duration": (dateutil.parser.parse(m["end"]) 
                            - dateutil.parser.parse(m["start"])).total_seconds()/(60*60)}
 for m in methods])
runtimes.groupby("id").head()

Unnamed: 0,duration,id,pipeline
0,0.007623,SJINF001_E,https://hub.docker.com/alpine
1,3.324949,SJINF001_E,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
2,1.257035,SJINF001_E,https://hub.docker.com/r/ucsctreehouse/bam-ume...
3,4.332746,SJINF001_E,https://hub.docker.com/r/ucsctreehouse/fusion
4,0.062882,SJINF001_E,https://hub.docker.com/r/ucsctreehouse/mini-va...
5,0.005244,SJBALL010_D,https://hub.docker.com/alpine
6,2.644941,SJBALL010_D,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
7,0.969958,SJBALL010_D,https://hub.docker.com/r/ucsctreehouse/bam-ume...
8,2.899541,SJBALL010_D,https://hub.docker.com/r/ucsctreehouse/fusion
9,0.098115,SJBALL010_D,https://hub.docker.com/r/ucsctreehouse/mini-va...


In [13]:
stats = runtimes.pivot(columns="pipeline").describe()
stats.max()

          pipeline                                            
duration  https://hub.docker.com/alpine                           2.000000
          https://hub.docker.com/r/ucsctreehouse/bam-umend-qc     2.000000
          https://hub.docker.com/r/ucsctreehouse/fusion           4.332746
          https://hub.docker.com/r/ucsctreehouse/mini-var-call    2.000000
          https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline            3.324949
dtype: float64

In [14]:
stats.sum(axis=1)

count    10.000000
mean      7.801517
std       1.723856
min       6.582567
25%       7.192042
50%       7.801517
75%       8.410992
max       9.020468
dtype: float64