In [132]:
import os
import fnmatch
import json
import dateutil.parser
import re
import pprint
import glob
import pandas as pd

In [175]:
# Read the manifest: ids may be separated by commas and/or newlines.
with open("/treehouse/treeshop/manifests/140.1.txt") as f:
    stripped = (word.strip() for line in f for word in line.split(','))
    # Drop empty tokens (blank lines, trailing commas) and sort the rest.
    ids = sorted(filter(None, stripped))
print("Found {} ids".format(len(ids)))

Found 4 ids


In [186]:
# All methods: parse every methods.json found under each sample's secondary outputs.
methods = []
for sample_id in ids:
    for path in glob.glob("/treehouse/archive/downstream/{}/secondary/*/methods.json".format(sample_id)):
        # Use a context manager so each file handle is closed promptly
        # (a bare open(...).read() leaves closing to the garbage collector).
        with open(path) as handle:
            methods.append(json.load(handle))
print("Found {} secondary outputs".format(len(methods)))

Found 20 secondary outputs


In [187]:
# UMEND QC: load each sample's bam_umend_qc.json and tag it with the sample id.
reports = []
for sample_id in ids:
    for path in glob.glob("/treehouse/archive/downstream/{}/secondary/*bam-umend-qc-1.1.0-cc481e4/bam_umend_qc.json".format(sample_id)):
        # Use a context manager so each file handle is closed promptly
        # (a bare open(...).read() leaves closing to the garbage collector).
        with open(path) as handle:
            reports.append({"id": sample_id, **json.load(handle)})

# Build the list of failed ids once; it supplies both the count and the listing.
failed_ids = [r["id"] for r in reports if r["qc"] == "FAIL"]
print("{} Completed RNASeq and BAM UMEND QC".format(len(reports)))
print("{} Passed".format(sum(r["qc"] == "PASS" for r in reports)))
print("{} Failed: {}".format(len(failed_ids), failed_ids))

4 Completed RNASeq and BAM UMEND QC
3 Passed
1 Failed: ['TH01_0717_S01']


In [190]:
# Tabulate the reports (one row per report) and display them indexed by sample id.
reports_frame = pd.DataFrame(reports)
reports_frame.set_index('id')

Unnamed: 0_level_0,estExonicUniqMappedNonDupeReadCount,input,qc,uniqMappedNonDupeReadCount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TH01_0710_S01,44697630.0,readDist.txt,PASS,64867826.0
TH01_0713_S01,33975060.0,readDist.txt,PASS,71991565.0
TH01_0715_S01,41372970.0,readDist.txt,PASS,55923263.5
TH01_0717_S01,9226395.0,readDist.txt,FAIL,19771111.0


In [191]:
# Compact listing: one (id, qc, estExonicUniqMappedNonDupeReadCount) tuple per report.
summary_fields = ("id", "qc", "estExonicUniqMappedNonDupeReadCount")
[tuple(report[field] for field in summary_fields) for report in reports]

[('TH01_0710_S01', 'PASS', 44697633),
 ('TH01_0713_S01', 'PASS', 33975062.7),
 ('TH01_0715_S01', 'PASS', 41372967.375),
 ('TH01_0717_S01', 'FAIL', 9226394.94)]

In [183]:
# For every sample id, record the paths of the methods.json files found under
# its secondary outputs, then show how many were found per sample.
methods_pattern = "/treehouse/archive/downstream/{}/secondary/*/methods.json"
reports = [
    {"id": sample_id, "methods": glob.glob(methods_pattern.format(sample_id))}
    for sample_id in ids
]
[(report["id"], len(report["methods"])) for report in reports]

[('TH01_0710_S01', 2),
 ('TH01_0713_S01', 5),
 ('TH01_0715_S01', 5),
 ('TH01_0717_S01', 5)]

In [173]:
def _elapsed_hours(method):
    """Return the time between a method record's "start" and "end" stamps, in hours."""
    finished = dateutil.parser.parse(method["end"])
    started = dateutil.parser.parse(method["start"])
    # total_seconds() yields seconds; dividing by 60*60 converts to hours.
    return (finished - started).total_seconds() / (60 * 60)


# One row per methods.json record: sample id, pipeline docker URL and run time in hours.
runtimes = pd.DataFrame([
    {
        "id": method["sample_id"],
        "pipeline": method["pipeline"]["docker"]["url"],
        "duration": _elapsed_hours(method),
    }
    for method in methods
])
runtimes

Unnamed: 0,duration,id,pipeline
0,0.020491,TH01_0710_S01,https://hub.docker.com/alpine
1,0.011007,TH01_0713_S01,https://hub.docker.com/alpine
2,7.761169,TH01_0713_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
3,0.011527,TH01_0715_S01,https://hub.docker.com/alpine
4,7.843065,TH01_0715_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
5,0.006318,TH01_0717_S01,https://hub.docker.com/alpine
6,4.231748,TH01_0717_S01,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
7,4.446321,TH01_0717_S01,https://hub.docker.com/r/ucsctreehouse/bam-ume...
8,1.843002,TH01_0717_S01,https://hub.docker.com/r/ucsctreehouse/fusion
9,0.146709,TH01_0717_S01,https://hub.docker.com/r/ucsctreehouse/mini-va...


In [159]:
# Spread the runtimes out with one column per pipeline, then summarise
# each column with describe().
runtimes_by_pipeline = runtimes.pivot(columns="pipeline")
stats = runtimes_by_pipeline.describe()
stats

Unnamed: 0_level_0,duration,duration,duration,duration,duration
pipeline,https://hub.docker.com/alpine,https://hub.docker.com/r/ucsctreehouse/bam-umend-qc,https://hub.docker.com/r/ucsctreehouse/fusion,https://hub.docker.com/r/ucsctreehouse/mini-var-call,https://quay.io/ucsc_cgl/rnaseq-cgl-pipeline
count,12.0,12.0,12.0,12.0,12.0
mean,0.023692,6.956021,2.934652,0.214092,20.239206
std,0.006138,1.68841,0.963998,0.043735,4.083669
min,0.018981,5.237647,2.112196,0.154571,15.158212
25%,0.01959,5.626776,2.288067,0.186933,17.298667
50%,0.021123,6.219239,2.636636,0.216102,19.556427
75%,0.024795,8.177105,3.077107,0.233472,22.941253
max,0.038409,10.152545,5.49486,0.302643,27.597463


In [162]:
# Add the per-pipeline figures together across columns, giving one total per
# describe() row.
# NOTE(review): only the "mean" row sums exactly; summed std/min/quantile/max
# rows are a rough guide rather than statistics of the total run time.
stats.sum(axis="columns")

count    60.000000
mean     30.367663
std       6.785950
min      22.681608
25%      25.420033
50%      28.649527
75%      34.453732
max      43.585920
dtype: float64

In [160]:
# Fusion: load the methods.json of each sample's fusion output, tagged with the sample id.
reports = []
for sample_id in ids:
    for path in glob.glob("/treehouse/archive/downstream/{}/secondary/*fusion*/methods.json".format(sample_id)):
        # Use a context manager so each file handle is closed promptly
        # (a bare open(...).read() leaves closing to the garbage collector).
        with open(path) as handle:
            reports.append({"id": sample_id, **json.load(handle)})