In [1]:
import requests
import pandas

### Set up server URL
Change the URL here to point the client at another implementation of the API.  The examples here are based on the GTEx server running at:
http://gtexportal.org/rnaget

GTEx has provided documentation on their implementation at: http://gtexportal.org/rnaget/docs

In [2]:
# Registry of RNAget servers this notebook can talk to, keyed by a short nickname.
# Each entry carries the service base URL and the HTTP headers sent with every request.
server_params = {
    'gtex': dict(
        rnaget_url="http://gtexportal.org/rnaget",
        headers={
            'User-Agent': 'python-requests/2.21.0',
            'Accept-Encoding': 'gzip, deflate',
            'Accept': '*/*',
            'Connection': 'keep-alive',
        },
    ),
}

# Select the target server here.  This is meant to be the only setting to change:
# every later cell reads the URL and headers from rnaget_params.  Add an entry to
# server_params above to point the notebook at a different implementation.
rnaget_params = server_params['gtex']

### Retrieve implementation details
The /service-info endpoint will return useful information about the implementation including the API version implemented and the supported endpoints.

In [3]:
# GET <base URL>/service-info and show the decoded JSON body
r = requests.get(
    "{}/service-info".format(rnaget_params['rnaget_url']),
    headers=rnaget_params['headers'],
)
print(r.json())

{'id': 'org.gtexportal.api.rnaget', 'name': 'GTEx Portal RNAget', 'version': '1.0.0', 'type': {'group': 'org.ga4gh', 'artifact': 'rnaget', 'version': '1.2.0'}, 'organization': {'name': 'GTEx Project', 'url': 'https://gtexportal.org'}, 'description': 'This service provides access to GTEx public RNA-Seq data.', 'contactUrl': 'https://gtexportal.org/home/contact', 'documentationUrl': 'https://gtexportal.org/rnaget/docs', 'createdAt': None, 'updatedAt': None, 'environment': 'prod', 'supported': {'projects': True, 'studies': True, 'expressions': True, 'continuous': False}}


From the service-info we see that v1.2.0 of the API is implemented.  This server also supports the /projects, /studies and /expressions endpoints.

### Project

In [4]:
# GET <base URL>/projects and show the decoded JSON body
r = requests.get(
    "{base}/projects".format(base=rnaget_params['rnaget_url']),
    headers=rnaget_params['headers'],
)
print(r.json())

[{'id': 'gtex', 'version': '1.0', 'name': 'GTEx', 'description': 'GTEx Public includes all publicly available data generated by the Genotype Tissue Expression (GTEx) Project'}]


With the projectID we can get details

In [5]:
# Look up a single project by the id reported in the /projects listing
projectID = 'gtex'
r = requests.get(
    "{base}/projects/{project}".format(base=rnaget_params['rnaget_url'], project=projectID),
    headers=rnaget_params['headers'],
)
print(r.json())

{'id': 'gtex', 'version': '1.0', 'name': 'GTEx', 'description': 'GTEx Public includes all publicly available data generated by the Genotype Tissue Expression (GTEx) Project'}


### Study
The /studies endpoint behaves similarly

In [6]:
# GET <base URL>/studies and show the decoded JSON body
r = requests.get(
    "{}/studies".format(rnaget_params['rnaget_url']),
    headers=rnaget_params['headers'],
)
print(r.json())

[{'id': 'gtex_v8', 'version': '1.0', 'name': 'GTEx Analysis V8', 'description': 'This is the V8 release of GTEx data. V8 is a full release', 'parentProjectId': 'gtex', 'genome': 'GRCh38/hg38'}]


In [7]:
# Look up a single study by the id reported in the /studies listing
studyID = 'gtex_v8'
r = requests.get(
    "{base}/studies/{study}".format(base=rnaget_params['rnaget_url'], study=studyID),
    headers=rnaget_params['headers'],
)
print(r.json())

{'id': 'gtex_v8', 'version': '1.0', 'name': 'GTEx Analysis V8', 'description': 'This is the V8 release of GTEx data. V8 is a full release', 'parentProjectId': 'gtex', 'genome': 'GRCh38/hg38'}


### Expression
This is the heart of the RNAget API and allows for several operations.  These examples continue to use the GTEx implementation.  RNAget is a retrieval API so search and discovery are out of scope.  The GTEx documentation does inform us of the available values for the required path parameter expressionId: gene_tpms

In [8]:
# Ask for a slice of the matrix rather than the whole thing: this demonstrates a
# key API feature and keeps the example small.
expressionId = 'gene_tpms'
featureList = 'EGFR'
payload = dict(featureNameList=featureList, tissueSiteDetailIDs="Adipose_Subcutaneous")

# The slice filters travel as query-string parameters on the /ticket request
r = requests.get(
    "{base}/expressions/{expression}/ticket".format(base=rnaget_params['rnaget_url'],
                                                    expression=expressionId),
    params=payload,
    headers=rnaget_params['headers'],
)
print(r.json())

{'units': 'tsv', 'url': 'https://storage.cloud.google.com/prod_rnaget_working_storage/tmp5nr_zoy8.tsv', 'version': None, 'fileType': None, 'studyID': None, 'headers': None, 'MD5': None}


We have used the /ticket endpoint, so the returned JSON includes the URL from which the actual data can be downloaded.

## Working with the files
In addition to getting a download URL, the API has the /bytes endpoint for inline data.  Using this we can stream the expression file to a local file and run whatever downstream analysis we wish.  The following are some simple navigation examples.

In [9]:
# Ask for a slice of the matrix rather than the whole thing: this demonstrates a
# key API feature and keeps the example small.
featureList = ['EGFR','PTEN']
payload = {"featureNameList": featureList, "tissueSiteDetailIDs": "Adipose_Subcutaneous"}
expressionId = 'gene_tpms'

# use the /bytes endpoint to stream the data
r = requests.get(f"{rnaget_params['rnaget_url']}/expressions/{expressionId}/bytes", params=payload,
                 headers=rnaget_params['headers'], stream=True)

# Stop here on an HTTP error (4xx/5xx).  Without this check the server's error
# body would be written to disk below and later parsed as if it were TSV data.
r.raise_for_status()

# save the streamed data locally (to a file in this case)
demo_data_file = "adipose_sub_EGFR_PTEN.tsv"
with open(demo_data_file, 'wb') as fd:
    for chunk in r.iter_content(chunk_size=128):
        fd.write(chunk)

The tsv file can now be read into whatever dataframe you want using all the usual import mechanisms.  Here pandas is used as an example.

In [10]:
# Parse the downloaded TSV: the first line supplies the column labels and the
# first column supplies the row labels.
df = pandas.read_csv(
    demo_data_file,
    sep='\t',
    header=0,
    index_col=0,
)
df

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-9YFMG,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZXES-2026-SM-5NQ6R,GTEX-ZXG5-0226-SM-59HJI,GTEX-ZYFC-0326-SM-5NQ7H,GTEX-ZYFD-0226-SM-5NQ86,GTEX-ZYT6-0326-SM-7LG5R,GTEX-ZYVF-0226-SM-5GIEG,GTEX-ZYW4-0226-SM-5E44M,GTEX-ZYY3-0226-SM-5E45M,GTEX-ZZ64-1626-SM-5E43W,GTEX-ZZPU-2726-SM-5NQ8O
gencode_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000146648.17,82.49,44.13,37.12,16.52,42.25,30.76,41.34,33.35,68.27,33.0,...,46.05,66.89,22.35,28.32,46.09,49.73,59.95,31.19,49.65,56.88
ENSG00000171862.10,42.55,39.73,37.82,19.66,55.36,38.11,33.11,40.25,67.74,25.5,...,37.73,45.91,29.62,48.67,28.54,34.83,53.31,31.31,92.12,46.3


In [11]:
# Show the row labels of the frame (the first TSV column, selected as the index when the file was read)
df.index

Index(['ENSG00000146648.17', 'ENSG00000171862.10'], dtype='object', name='gencode_ids')

All the usual dataframe operations are available, such as selecting a subset:

In [12]:
# Positional slice: keep every row, but only the first five columns
df.iloc[:,:5]

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ
gencode_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000146648.17,82.49,44.13,37.12,16.52,42.25
ENSG00000171862.10,42.55,39.73,37.82,19.66,55.36


All of the usual statistics that pandas defines are available as well.

In [13]:
# Walk the rows by position and report the mean and standard deviation of each
# row's values, labelled with that row's index entry.
for pos, label in enumerate(df.index):
    row = df.iloc[pos, :]
    print(f'{label}')
    print(f'Mean: {row.mean()}\nStd Dev: {row.std()}\n')

ENSG00000146648.17
Mean: 41.620633484162894
Std Dev: 16.43412996464274

ENSG00000171862.10
Mean: 47.18760180995475
Std Dev: 17.90503536354805



## Wrap it up
We can tie it all together in a nice little function that uses RNAget to query the GTEx server for the data to calculate summary statistics on a list of genes for a provided tissue.

In [14]:
def summarize_gtex_tissue_expression(tissue, expression_id, studyID, genelist=None, localfile=None):
    """Fetch an expression slice through RNAget and print per-row summary statistics.

    The expression matrix is requested from the ``/expressions/<expression_id>/bytes``
    endpoint of the server selected in ``rnaget_params``, parsed as a tab-separated
    table (first line = column labels, first column = row labels), and the mean and
    standard deviation of every row are printed, preceded by the study's name as
    returned by ``/studies/<studyID>``.

    Parameters
    ----------
    tissue : str
        Sent as the ``tissueSiteDetailIDs`` query parameter.
    expression_id : str
        Path component identifying the expression resource to download.
    studyID : str
        Study whose ``name`` field is shown in the printed header.
    genelist : optional
        When not None, sent as the ``featureNameList`` query parameter.
    localfile : str, optional
        When given, the response body is written to this path and the table is
        read back from it.  When None, the body is parsed directly from memory
        and nothing is written to disk.

    Returns
    -------
    None
        The summary is printed, not returned.

    Raises
    ------
    requests.HTTPError
        If either request comes back with a 4xx/5xx status.
    """
    import io  # only needed for the in-memory path; imported locally to keep the cell self-contained

    payload = {"tissueSiteDetailIDs": tissue}
    if genelist is not None:
        payload['featureNameList'] = genelist

    # use the /bytes endpoint to stream the data
    # (the URL is built from the expression_id argument, not from a notebook global)
    r = requests.get(f"{rnaget_params['rnaget_url']}/expressions/{expression_id}/bytes", params=payload,
                     headers=rnaget_params['headers'], stream=True)
    # Do not treat an HTTP error body as expression data
    r.raise_for_status()

    if localfile is not None:
        # save the streamed data locally (to a file in this case)
        with open(localfile, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)

        df = pandas.read_csv(localfile, sep='\t', header=0, index_col=0)
    else:
        # No file requested: pull the whole body into memory and parse it from there
        df = pandas.read_csv(io.BytesIO(r.content), sep='\t', header=0, index_col=0)

    s = requests.get(f"{rnaget_params['rnaget_url']}/studies/{studyID}", headers=rnaget_params['headers'])
    s.raise_for_status()
    gtex_name = s.json()['name']

    print(f'Gene Summary Statistics for {tissue}\nStudy: {gtex_name}')
    for i in range(df.index.size):
        print(f'{df.index[i]}')
        print(f'Mean: {df.iloc[i, :].mean()}\nStd Dev: {df.iloc[i, :].std()}\n')

In [15]:
# Inputs for the summary: which study and expression resource to query, which
# tissue and genes to slice out, and where to keep the downloaded table.
studyID = 'gtex_v8'
expression_id = 'gene_tpms'
tissue = "Adipose_Subcutaneous"
featureList = ['EGFR','PTEN']
demo_data_file = "adipose_sub_EGFR_PTEN.tsv"

summarize_gtex_tissue_expression(
    tissue,
    expression_id,
    studyID,
    genelist=featureList,
    localfile=demo_data_file,
)

Gene Summary Statistics for Adipose_Subcutaneous
Study: GTEx Analysis V8
ENSG00000146648.17
Mean: 41.620633484162894
Std Dev: 16.43412996464274

ENSG00000171862.10
Mean: 47.18760180995475
Std Dev: 17.90503536354805

