Download the output of the RNA-Seq pipeline directly into memory and extract the expression levels into a pandas DataFrame.

In [1]:
import os
import io
import requests
import tarfile
import numpy as np
import pandas as pd

In [2]:
def get_redwood_signed_url(object_id, token_path=".redwood/token", verify=False, timeout=60):
    """
    Get a signed url to download object_id

    Parameters
    ----------
    object_id : str
        Redwood object id; interpolated into the download endpoint URL.
    token_path : str
        Path of the file holding the bearer access token. Surrounding
        whitespace (e.g. a trailing newline) is stripped before use.
    verify : bool or str
        Forwarded to ``requests.get`` as ``verify``. Defaults to ``False``
        only to keep the notebook's existing behaviour.
    timeout : float or tuple
        Forwarded to ``requests.get`` so an unresponsive server cannot hang
        the kernel indefinitely.

    Returns
    -------
    The ``"url"`` field of the first entry of ``"parts"`` in the JSON response.

    Raises
    ------
    IOError
        If the token file cannot be opened.
    requests.exceptions.HTTPError
        If the storage service answers with an error status.
    IndexError
        If the response lists no download parts.
    """
    url = "https://storage.ucsc-cgl.org:5431/download/{}".format(object_id)
    parameters = {"offset": "0", "length": "-1", "external": "true"}
    with open(token_path) as token:
        header = {"AUTHORIZATION": "Bearer {}".format(token.read().strip())}
    # NOTE(review): with verify=False the bearer token is sent without
    # authenticating the server certificate. Pass verify=True (or a CA bundle
    # path) as soon as the endpoint's certificate validates.
    response = requests.get(url, headers=header, params=parameters,
                            verify=verify, timeout=timeout)
    response.raise_for_status()
    parts = response.json()["parts"]
    if not parts:
        # Same exception type as the bare [0] lookup, but with a useful message.
        raise IndexError(
            "Redwood returned no download parts for object {}".format(object_id))
    return parts[0]["url"]

In [3]:
def get_redwood_object(object_id, timeout=60):
    """
    Get an object from Redwood by object id

    Resolves a signed download URL for ``object_id`` and returns the complete
    response body as bytes, held entirely in memory.

    Parameters
    ----------
    object_id : str
        Redwood object id, passed to ``get_redwood_signed_url``.
    timeout : float or tuple
        Forwarded to ``requests.get`` so a stalled transfer cannot hang the
        kernel indefinitely.

    Raises
    ------
    requests.exceptions.HTTPError
        If the signed URL answers with an error status.
    """
    response = requests.get(get_redwood_signed_url(object_id), stream=True, timeout=timeout)
    try:
        # stream=True fetches only the headers first, so an error status is
        # raised before the body is downloaded.
        response.raise_for_status()
        return response.content
    finally:
        # A streamed response keeps its connection checked out until the body
        # is consumed or the response is closed; close explicitly so the error
        # path does not leak it.
        response.close()

In [4]:
%%time
# Pull the whole RNA-Seq result archive into memory (no temporary file on disk) ...
content = get_redwood_object(
    "140e147d-d58c-56fe-b616-318dbd9f3e7a"
)
# ... and expose the gzip-compressed bytes as a tar archive for the cells below.
tar = tarfile.open(mode="r:gz", fileobj=io.BytesIO(content))



CPU times: user 1.91 s, sys: 952 ms, total: 2.86 s
Wall time: 53.3 s


In [48]:
# Print a detailed (ls -l style) table of contents of the downloaded archive.
tar.list(verbose=True)

-rw-r--r-- root/root    1563801 2017-02-03 08:27:27 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/rsem.genes.norm_counts.tab
-rw-r--r-- root/root    1439045 2017-02-03 08:27:26 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/rsem.genes.raw_counts.tab
-rw-r--r-- root/root    8841367 2017-02-03 08:27:30 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/rsem.isoform.norm_counts.tab
-rw-r--r-- root/root    8356848 2017-02-03 08:27:27 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/rsem.isoform.raw_counts.tab
-rw-r--r-- root/root    6512312 2017-02-03 08:27:18 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/rsem_genes.results
-rw-r--r-- root/root   13734108 2017-02-03 08:27:18 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/rsem_isoforms.results
-rw-r--r-- root/root     936376 2017-02-03 08:27:52 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/Hugo/rsem.genes.norm_counts.hugo.tab
-rw-r--r-- root/root     861854 2017-02-03 08:27:52 _EGAR00001270204_SJINF060_R-H8RMVADXX/RSEM/Hugo/rsem.genes.raw_counts.hugo.tab
-rw-r--r-- root/

In [51]:
# Locate the HUGO-named, normalized gene-level counts table inside the archive.
# Match on the end of the member path so only that exact file name qualifies.
norm_counts_path = next(
    (name for name in tar.getnames()
     if name.endswith("rsem.genes.norm_counts.hugo.tab")),
    None,
)
if norm_counts_path is None:
    raise ValueError("rsem.genes.norm_counts.hugo.tab not found in the archive")
norm_counts = tar.extractfile(norm_counts_path)

# Every column is read as float32; the converter keeps the gene_id column
# (used as the index) as plain strings.
expression = pd.read_csv(norm_counts, sep="\t", index_col=0,
                         converters={"gene_id": str}, dtype=np.float32)

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("Shape: {}".format(expression.shape))
print("Type: {}".format(expression.dtypes))
print(expression.head())

Shape: (60448, 1)
Type: _EGAR00001270204_SJINF060_R-H8RMVADXX    float32
dtype: object
          _EGAR00001270204_SJINF060_R-H8RMVADXX
gene_id                                        
TSPAN6                                 3.021100
TNMD                                   0.000000
DPM1                                 444.108795
SCYL3                                810.045288
C1orf112                            1055.513550


In [62]:
# Full file listing exported by the repository as a tab-separated table.
url = "https://ucsc-cgl.org/api/v1/repository/files/exportFull"
# pd.DataFrame.from_csv is deprecated (and removed in later pandas releases);
# pd.read_csv with index_col=0 is its documented replacement. from_csv also
# defaulted to parse_dates=True, which is omitted here because the first
# column is used as a plain label index, not a date.
files = pd.read_csv(url, sep="\t", index_col=0)
files.head()

Unnamed: 0_level_0,Project,Center Name,Submitter Donor ID,Donor UUID,Submitter Donor Primary Site,Submitter Specimen ID,Specimen UUID,Submitter Specimen Type,Submitter Experimental Design,Submitter Sample ID,Sample UUID,Analysis Type,Workflow Name,Workflow Version,File Type,File Path,Upload File ID,Data Bundle UUID,Metadata.json
Program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Treehouse,Expression Analysis,THR10,THR10_0242,cffe6db3-ec6d-5049-ab65-0f8091eafe6f,BTO:0000042,THR10_0242_S01,3c48be7e-84e6-5a3c-91d5-55b81d1bb0ed,Primary tumour - other,RNA-Seq,THR10_0242_S01,07a4a630-a706-5732-a2b5-b1a9db19e2da,sequence_upload,spinnaker,1.0.0,fastq.gz,THR10_0242_S01.5.R1.fq.gz,94e5dd47-dd5d-5d25-b2ba-258ea6848887,38072b6b-deea-5575-8b3b-ba276fef1b61,c0bb9c18-7797-5f72-afd3-cec18aea8beb
Treehouse,Expression Analysis,THR10,THR10_0242,cffe6db3-ec6d-5049-ab65-0f8091eafe6f,BTO:0000042,THR10_0242_S01,3c48be7e-84e6-5a3c-91d5-55b81d1bb0ed,Primary tumour - other,RNA-Seq,THR10_0242_S01,07a4a630-a706-5732-a2b5-b1a9db19e2da,sequence_upload,spinnaker,1.0.0,fastq.gz,THR10_0242_S01.5.R2.fq.gz,0475229e-981d-5752-ad3e-f6d1a7409bf4,38072b6b-deea-5575-8b3b-ba276fef1b61,c0bb9c18-7797-5f72-afd3-cec18aea8beb
SU2C,WCDT,UCSF,DTB-053,29cb5925-bc6f-5a1c-9f06-37ad4fe4eb76,,DTB-053_Baseline,ba348c00-8d1d-5dfd-b39f-f8bbffc657f0,Metastatic tumour - other,RNA-Seq,DTB-053_Baseline_1,64a0c69c-67d9-538a-bace-ad8921fd85d7,sequence_upload,Spinnaker,1.0.1,fastq.tar,DTB-053_rnaseq_fastq.tar,0d0dbe3c-1b1b-536f-a7db-0ecf32810bf7,03a49353-ff4d-5ca8-985c-55307685b8f0,939dd4b4-9ebc-5f30-8ebe-dc57b0342bab
SU2C,WCDT,UCSF,DTB-053,29cb5925-bc6f-5a1c-9f06-37ad4fe4eb76,,DTB-053_Baseline,ba348c00-8d1d-5dfd-b39f-f8bbffc657f0,Metastatic tumour - other,RNA-Seq,DTB-053_Baseline_1,64a0c69c-67d9-538a-bace-ad8921fd85d7,rna_seq_quantification,quay.io/ucsc_cgl/rnaseq-cgl-pipeline,3.2.1-1,bam,DTB-053_Baseline_1.sortedByCoord.md.bam,fc1dde3b-610d-5876-9fab-b12c25fbd988,a185fff2-5bcf-4422-a9f1-fd7747c1f651,d3965c49-6c30-525a-adb5-d953bf5198a5
SU2C,WCDT,UCSF,DTB-053,29cb5925-bc6f-5a1c-9f06-37ad4fe4eb76,,DTB-053_Baseline,ba348c00-8d1d-5dfd-b39f-f8bbffc657f0,Metastatic tumour - other,RNA-Seq,DTB-053_Baseline_1,64a0c69c-67d9-538a-bace-ad8921fd85d7,rna_seq_quantification,quay.io/ucsc_cgl/rnaseq-cgl-pipeline,3.2.1-1,gz,DTB-053_Baseline_1.tar.gz,96c5bb13-b31c-58b0-bfe1-6d047bded8f0,a185fff2-5bcf-4422-a9f1-fd7747c1f651,d3965c49-6c30-525a-adb5-d953bf5198a5


In [69]:
# Rows whose donor, specimen and sample submitter ids all match the same case.
files.loc[
    files["Submitter Donor ID"].eq("THR10_0242")
    & files["Submitter Specimen ID"].eq("THR10_0242_S01")
    & files["Submitter Sample ID"].eq("THR10_0242_S01")
]

Unnamed: 0_level_0,Project,Center Name,Submitter Donor ID,Donor UUID,Submitter Donor Primary Site,Submitter Specimen ID,Specimen UUID,Submitter Specimen Type,Submitter Experimental Design,Submitter Sample ID,Sample UUID,Analysis Type,Workflow Name,Workflow Version,File Type,File Path,Upload File ID,Data Bundle UUID,Metadata.json
Program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Treehouse,Expression Analysis,THR10,THR10_0242,cffe6db3-ec6d-5049-ab65-0f8091eafe6f,BTO:0000042,THR10_0242_S01,3c48be7e-84e6-5a3c-91d5-55b81d1bb0ed,Primary tumour - other,RNA-Seq,THR10_0242_S01,07a4a630-a706-5732-a2b5-b1a9db19e2da,sequence_upload,spinnaker,1.0.0,fastq.gz,THR10_0242_S01.5.R1.fq.gz,94e5dd47-dd5d-5d25-b2ba-258ea6848887,38072b6b-deea-5575-8b3b-ba276fef1b61,c0bb9c18-7797-5f72-afd3-cec18aea8beb
Treehouse,Expression Analysis,THR10,THR10_0242,cffe6db3-ec6d-5049-ab65-0f8091eafe6f,BTO:0000042,THR10_0242_S01,3c48be7e-84e6-5a3c-91d5-55b81d1bb0ed,Primary tumour - other,RNA-Seq,THR10_0242_S01,07a4a630-a706-5732-a2b5-b1a9db19e2da,sequence_upload,spinnaker,1.0.0,fastq.gz,THR10_0242_S01.5.R2.fq.gz,0475229e-981d-5752-ad3e-f6d1a7409bf4,38072b6b-deea-5575-8b3b-ba276fef1b61,c0bb9c18-7797-5f72-afd3-cec18aea8beb


In [70]:
# Every file registered for a single donor, regardless of specimen or sample.
files.loc[files["Submitter Donor ID"].eq("THR10_0242")]

Unnamed: 0_level_0,Project,Center Name,Submitter Donor ID,Donor UUID,Submitter Donor Primary Site,Submitter Specimen ID,Specimen UUID,Submitter Specimen Type,Submitter Experimental Design,Submitter Sample ID,Sample UUID,Analysis Type,Workflow Name,Workflow Version,File Type,File Path,Upload File ID,Data Bundle UUID,Metadata.json
Program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Treehouse,Expression Analysis,THR10,THR10_0242,cffe6db3-ec6d-5049-ab65-0f8091eafe6f,BTO:0000042,THR10_0242_S01,3c48be7e-84e6-5a3c-91d5-55b81d1bb0ed,Primary tumour - other,RNA-Seq,THR10_0242_S01,07a4a630-a706-5732-a2b5-b1a9db19e2da,sequence_upload,spinnaker,1.0.0,fastq.gz,THR10_0242_S01.5.R1.fq.gz,94e5dd47-dd5d-5d25-b2ba-258ea6848887,38072b6b-deea-5575-8b3b-ba276fef1b61,c0bb9c18-7797-5f72-afd3-cec18aea8beb
Treehouse,Expression Analysis,THR10,THR10_0242,cffe6db3-ec6d-5049-ab65-0f8091eafe6f,BTO:0000042,THR10_0242_S01,3c48be7e-84e6-5a3c-91d5-55b81d1bb0ed,Primary tumour - other,RNA-Seq,THR10_0242_S01,07a4a630-a706-5732-a2b5-b1a9db19e2da,sequence_upload,spinnaker,1.0.0,fastq.gz,THR10_0242_S01.5.R2.fq.gz,0475229e-981d-5752-ad3e-f6d1a7409bf4,38072b6b-deea-5575-8b3b-ba276fef1b61,c0bb9c18-7797-5f72-afd3-cec18aea8beb


In [None]:
# NOTE(review): unexecuted stub. `get_object`, `donor`, `sample` and `specimen`
# are not defined in any visible cell of this notebook, so running this cell
# as-is would raise NameError.
# Presumably intended to look up and load an object by donor/sample/specimen
# submitter ids - TODO confirm and implement, or remove the cell.
df = get_object(donor, sample, specimen)