In [2]:
import os
import fnmatch
import json
import dateutil.parser
import re
import pprint
import glob
import pandas as pd

In [3]:
# ids = os.listdir("/treehouse/treeshop/jpfeil/downstream/")

# Big files for final run
# Sample ids are kept as one comma-separated literal and split into a list below.
ids_csv = "G26192,G26226,G27317,G27324,G27327,G27328,G27330,G27332,G27334,G27337,G27367,G27371,G27375,G27385,G27452,G27453,G27470,G27478,G27518,G27520,G27526,G27535,G28548,G28554,G28577,G28581,G28590,G28598,G28599,G28600,G28614,G30579,G30601,G30614,G30624,G30647,G41669,G41683,G41699,G41716,G41724,G41727,G41741,G41750"
ids = ids_csv.split(",")

id_count = len(ids)
print("Found {} ids".format(id_count))

Found 44 ids


In [4]:
# All methods
# Load the secondary-pipeline methods.json of every sample in `ids`.
# glob returns an empty list when the file is absent, so samples without a
# methods.json simply contribute nothing.
methods = []
for sample_id in ids:
    pattern = "/treehouse/treeshop/jpfeil/downstream/{}/secondary/methods.json".format(sample_id)
    for path in glob.glob(pattern):
        # The context manager closes each file as soon as it is parsed,
        # instead of leaving the handle open until garbage collection.
        with open(path) as handle:
            methods.append(json.load(handle))
print("Found {} secondary outputs".format(len(methods)))

Found 44 secondary outputs


In [5]:
# UMEND QC
# Load each sample's bam_umend_qc.json and tag it with the sample id whose
# directory it was found under. Samples without a report are skipped by glob.
reports = []
for sample_id in ids:
    pattern = "/treehouse/treeshop/jpfeil/downstream/{}/secondary/*bam-umend-qc-1.1.0-cc481e4/bam_umend_qc.json".format(sample_id)
    for path in glob.glob(pattern):
        # The context manager closes each file as soon as it is parsed,
        # instead of leaving the handle open until garbage collection.
        with open(path) as handle:
            # Report keys are unpacked after "id", so an "id" key inside the
            # report itself would take precedence over the directory name.
            reports.append({"id": sample_id, **json.load(handle)})

print("{} Completed RNASeq and BAM UMEND QC".format(len(reports)))
print("{} Passed".format(sum(r["qc"] == "PASS" for r in reports)))
# Build the list of failing ids once and reuse it for both the count and the listing.
failed_ids = [r["id"] for r in reports if r["qc"] == "FAIL"]
print("{} Failed: {}".format(len(failed_ids), failed_ids))

# Displayed as the cell result: (id, qc verdict, estimated exonic count, unique mapped non-dupe count)
[(r["id"], r["qc"], r["estExonicUniqMappedNonDupeReadCount"], r["uniqMappedNonDupeReadCount"]) for r in reports]

44 Completed RNASeq and BAM UMEND QC
43 Passed
1 Failed: ['G30624']


[('G26192', 'PASS', 72380131.49, 82313949.5),
 ('G26226', 'PASS', 91845558, 107200308),
 ('G27317', 'PASS', 55596520.875, 65238683.5),
 ('G27324', 'PASS', 63887777.53, 73763241),
 ('G27327', 'PASS', 67504976.465, 76518108),
 ('G27328', 'PASS', 64001053.77, 73126639.5),
 ('G27330', 'PASS', 55863494.915, 60916595.5),
 ('G27332', 'PASS', 67570875.19, 78296005.5),
 ('G27334', 'PASS', 70914979.84, 82553615.5),
 ('G27337', 'PASS', 67834309.64, 77733435),
 ('G27367', 'PASS', 61552890.31, 70963336.5),
 ('G27371', 'PASS', 65296111.805, 75264661),
 ('G27375', 'PASS', 61894630.5, 73536795),
 ('G27385', 'PASS', 64598764.62, 78259475.5),
 ('G27452', 'PASS', 34014403.82, 42798798),
 ('G27453', 'PASS', 72846453.59, 85781894),
 ('G27470', 'PASS', 69099881.56, 79284923),
 ('G27478', 'PASS', 72755875.59, 82867810.5),
 ('G27518', 'PASS', 72732314.25, 84546216),
 ('G27520', 'PASS', 64742590.77, 80854321),
 ('G27526', 'PASS', 70903046.88, 78769468.5),
 ('G27535', 'PASS', 77675574.445, 89740137),
 ('G28548'

In [6]:
# Build one row per methods.json entry: the sample id and the elapsed time
# between its "start" and "end" timestamps, expressed in hours.
duration_rows = []
for method in methods:
    sample = method["sample_id"]
    finished = dateutil.parser.parse(method["end"])
    started = dateutil.parser.parse(method["start"])
    elapsed_hours = (finished - started).total_seconds() / (60 * 60)
    duration_rows.append({"id": sample, "duration": elapsed_hours})

runtimes = pd.DataFrame(duration_rows)
# NOTE(review): head() is applied per "id" group, so when every id occurs once
# this displays all rows; `runtimes.head()` may be what was intended - confirm.
runtimes.groupby("id").head()

Unnamed: 0,duration,id
0,14.886711,G26192
1,31.64981,G26226
2,17.365594,G27317
3,13.439397,G27324
4,13.464116,G27327
5,14.32554,G27328
6,17.545979,G27330
7,14.325338,G27332
8,14.316235,G27334
9,13.372023,G27337


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of `runtimes`; "duration" is the run time in hours computed above.
runtimes.describe()

Unnamed: 0,duration
count,44.0
mean,15.744809
std,4.622075
min,8.76956
25%,13.457936
50%,14.760824
75%,16.64887
max,31.64981


In [8]:
""" Experimental processing from ceph or s3 storage """
import boto3
from botocore.config import Config

# Objects larger than this many bytes (10 * 10**9) are skipped.
MAX_FASTQ_BYTES = 10000000000

boto3.setup_default_session(profile_name="ceph")
s3 = boto3.resource("s3", endpoint_url="http://ceph-gw-01.pod",
                    config=Config(signature_version='s3'))
# Every key in the CCLE bucket whose name contains "fastq" or "fq", sorted by name.
fastqs = sorted(obj.key for obj in s3.Bucket("CCLE").objects.all()
                if re.search(r"fastq|fq", obj.key))

# Adjacent keys in sorted order are treated as the two files of one sample.
# With an odd number of keys the last one has no partner; indexing
# fastqs[i + 1] would raise IndexError there, so report it and pair the rest.
if len(fastqs) % 2:
    print("Warning: odd number of fastq keys ({}); {} has no mate and is ignored".format(
        len(fastqs), fastqs[-1]))
pairs = list(zip(fastqs[0::2], fastqs[1::2]))
print("Found {} samples".format(len(pairs)))
print(fastqs[0:4])
for pair in pairs:
    first_key = pair[0]
    # The sample id is the part of the key before the first "."
    sample_id = first_key.split(".")[0]

    obj = s3.Object("CCLE", first_key)
    if obj.content_length > MAX_FASTQ_BYTES:
        print("Skipping {}, to large: {}".format(sample_id, obj.content_length))
        continue


Found 415 samples
['G15512.HCC1954.5.btfv9.R1.fastq.gz', 'G15512.HCC1954.5.btfv9.R2.fastq.gz', 'G20462.KMS-11.2.btfv9.R1.fastq.gz', 'G20462.KMS-11.2.btfv9.R2.fastq.gz']
Skipping G15512, to large: 164160828758
Skipping G26192, to large: 10561902095
Skipping G26226, to large: 15118416465
Skipping G27317, to large: 11674216298
Skipping G27324, to large: 10576490093
Skipping G27327, to large: 10090451623
Skipping G27328, to large: 10570373956
Skipping G27330, to large: 10305050147
Skipping G27332, to large: 10709224912
Skipping G27334, to large: 11148224464
Skipping G27337, to large: 10265234861
Skipping G27367, to large: 10059856820
Skipping G27371, to large: 10416632958
Skipping G27375, to large: 10553189737
Skipping G27385, to large: 10220244308
Skipping G27452, to large: 11041705763
Skipping G27453, to large: 10868463903
Skipping G27470, to large: 10266671112
Skipping G27478, to large: 10663105369
Skipping G27518, to large: 10050515018
Skipping G27520, to large: 11129507460
Skipping G2

In [9]:
# FASTQC Reads
# Pull the "Total Sequences" count out of every fastQC HTML report of each sample.
# Raw string: "\D" and "\d" are regex classes, not string escapes; without the
# r-prefix Python emits an invalid-escape warning (SyntaxWarning on 3.12+).
total_sequences_re = re.compile(r"Total Sequences\D*(\d*)")

fastqc = []
for sample_id in ids:
    pattern = "/treehouse/treeshop/jpfeil/downstream/{}/secondary/ucsc_cgl-rna*/QC/fastQC/*.html".format(sample_id)
    for path in glob.glob(pattern):
        # The context manager closes each report as soon as it has been read.
        with open(path) as handle:
            html = handle.read()
        # First match only; the count is kept as a string, as before.
        # Raises IndexError when a report has no "Total Sequences" entry.
        fastqc.append({"id": sample_id, "reads": total_sequences_re.findall(html)[0]})
print("Found {} fastQC reports".format(len(fastqc)))
print(fastqc)

Found 88 fastQC reports
[{'id': 'G26192', 'reads': '117946111'}, {'id': 'G26192', 'reads': '117946111'}, {'id': 'G26226', 'reads': '169743158'}, {'id': 'G26226', 'reads': '169743158'}, {'id': 'G27317', 'reads': '127733736'}, {'id': 'G27317', 'reads': '127733736'}, {'id': 'G27324', 'reads': '116068176'}, {'id': 'G27324', 'reads': '116068176'}, {'id': 'G27327', 'reads': '110926944'}, {'id': 'G27327', 'reads': '110926944'}, {'id': 'G27328', 'reads': '116779589'}, {'id': 'G27328', 'reads': '116779589'}, {'id': 'G27330', 'reads': '113077283'}, {'id': 'G27330', 'reads': '113077283'}, {'id': 'G27332', 'reads': '118571166'}, {'id': 'G27332', 'reads': '118571166'}, {'id': 'G27334', 'reads': '122896621'}, {'id': 'G27334', 'reads': '122896621'}, {'id': 'G27337', 'reads': '113804694'}, {'id': 'G27337', 'reads': '113804694'}, {'id': 'G27367', 'reads': '110625165'}, {'id': 'G27367', 'reads': '110625165'}, {'id': 'G27371', 'reads': '115153509'}, {'id': 'G27371', 'reads': '115153509'}, {'id': 'G27375'