# Getting Data by using EMG API

The following task shows how to retrieve metadata and annotations from the EMG API and store them in .json format.

In [1]:
import copy
import json
try:
    from urllib import urlencode
except ImportError:
    from urllib.parse import urlencode

import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from jsonapi_client import Session, Filter
%matplotlib inline

API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/'

In /home/mone/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.unicode rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In /home/mone/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/mone/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The pgf.debug rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2.
In /home/mone/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The verbose.level rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In /home/mone/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The verbose.fileo rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


List all runs
https://www.ebi.ac.uk/metagenomics/api/latest/runs?experiment_type=metagenomic&study_accession=PRJEB1787

In [2]:
# This request should be made only once, to avoid being blocked by the API
def find_metadata(metadata, key):
    """
    Look up a metadata value by key, ignoring letter case.

    `metadata` is iterated in order; each entry is indexed with 'key' and
    'value'. The 'value' of the first entry whose 'key' equals `key`
    (case-insensitively) is returned, or None when no entry matches.
    """
    matching_values = (
        entry['value']
        for entry in metadata
        if entry['key'].lower() == key.lower()
    )
    # first match wins; None is the fallback when nothing matched
    return next(matching_values, None)


metadata_key = 'geographic location (depth)'
normilize_key = 'Predicted CDS with InterProScan match'

# Maps each value of `metadata_key` (the sample depth, as a float) to a dict of
# "kingdom:phylum" label -> list of normalized taxonomy counts, with one list
# entry per analysis that reported that phylum at that depth.
result = {}


with Session(API_BASE) as s:

    # analysis accession -> depth, for every analysis with a usable depth value
    metadata_map = {}
    # accessions of analyses skipped because the depth value is missing or not numeric
    missing_meta = list()

    print('Loading data from API.', end='', flush=True)

    # query parameters: metagenomic analyses of one study, 20 per page;
    # 'include': 'sample' presumably asks the API to embed the related sample
    # so `anls.sample` needs no extra request -- TODO confirm against the API docs
    params = {
        'experiment_type': 'metagenomic',
        'study_accession': 'ERP001736',
        'page_size': 20,
        'include': 'sample',
    }
    f = Filter(urlencode(params))
    # iterate over every analysis matching the filter
    for anls in s.iterate('analyses', f):
        print('.', end='', flush=True)

        # Depth recorded for the analysed sample. The lookup stays best-effort
        # (any failure just marks the analysis as missing metadata), but it
        # catches Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop.
        try:
            m_value = float(find_metadata(anls.sample.sample_metadata, metadata_key))
        except Exception:
            m_value = None
        if m_value is None:
            # missing value, skip analysis!
            missing_meta.append(anls.accession)
            continue
        metadata_map[anls.accession] = m_value

        # Denominator used to normalize the taxonomy counts. find_metadata
        # returns None when the summary lacks the key, and int(None) raises
        # TypeError, so the conversion is guarded instead of testing for None
        # after the fact. A zero count is skipped too: it would otherwise
        # raise ZeroDivisionError below.
        try:
            _pcds = int(find_metadata(anls.analysis_summary, normilize_key))
        except (TypeError, ValueError):
            _pcds = None
        if not _pcds:
            # missing or unusable value, skip analysis!
            continue

        # NOTE: despite its name, `_temperature` holds the depth value read above
        _temperature = metadata_map[anls.accession]
        by_phylum = result.setdefault(_temperature, {})

        # collect the taxonomy annotations that carry a non-empty phylum
        for ann in anls.taxonomy:
            try:
                phylum = ann.hierarchy['phylum']
            except KeyError:
                continue
            if len(phylum) > 0:
                l = "{}:{}".format(ann.hierarchy['kingdom'], phylum)
                # normalize the annotation count by the CDS total of this analysis
                _norm = int(ann.count) / _pcds
                by_phylum.setdefault(l, []).append(_norm)

    print("DONE")

Loading data from API..........................................................................................................................................................................................................................................................DONE


In [303]:
# Fetch the record of study ERP001736 and print its name and abstract.
with Session(API_BASE) as s:
    study = s.get('studies', 'ERP001736').resource
    # (caption, attribute name) pairs, printed in this order
    for caption, field in (('Study name:', 'study_name'),
                           ('Study abstract:', 'study_abstract')):
        print(caption, getattr(study, field))

Study name: Shotgun Sequencing of Tara Oceans DNA samples corresponding to size fractions for  prokaryotes.
Study abstract: Seawater was filtered from different depths to retain small cell sizes (Bacteria Organisms). The DNA was extracted and submitted to high throughput sequencing.


### Clean up data

In [3]:
# remove invalid temperatures
for k in copy.deepcopy(list(result.keys())):
    if k > 2000:
        del result[k]
# average value of the same temperature
for k in result:
    for k1 in result[k]:
        result[k][k1] = np.mean(result[k][k1])
print(result)

{25.0: {'Bacteria:Proteobacteria': 3.6069983060860936e-06, 'Bacteria:WS3': 2.786268335162039e-08, 'Bacteria:Cyanobacteria': 1.6551575993332143e-05, 'Bacteria:SAR406': 7.266039340242523e-06, 'Bacteria:Firmicutes': 2.8773371701026925e-07, 'Bacteria:Bacteroidetes': 1.4416066926874579e-06, 'Archaea:Euryarchaeota': 1.0655570036305709e-05, 'Bacteria:Verrucomicrobia': 1.3859984887573495e-06, 'Bacteria:Chlamydiae': 4.3648632723626306e-08, 'Bacteria:Actinobacteria': 3.5377474992136654e-06, 'Bacteria:Planctomycetes': 6.01282329608303e-07, 'Bacteria:SBR1093': 9.015584892701995e-07, 'Bacteria:Gemmatimonadetes': 2.4265157633755684e-07, 'Bacteria:Lentisphaerae': 1.1256228099820371e-07, 'Bacteria:PAUC34f': 3.250434932988258e-07, 'Bacteria:ZB3': 1.6419252990516485e-07, 'Bacteria:AD3': 6.081723512497847e-08, 'Bacteria:Tenericutes': 1.83766490796169e-07, 'Bacteria:BHI80-139': 7.852148124748662e-08, 'Bacteria:Chloroflexi': 3.0675380605426125e-06, 'Archaea:Crenarchaeota': 2.160389821507764e-06, 'Bacteria:

### Storing data in a .json file

In [42]:
# Write `result` to data.json in the current working directory.
# `json` is imported in the notebook's import cell at the top.
# Note: json.dump writes the float keys of `result` as JSON strings, so they
# must be converted back to numbers when the file is loaded again.
with open('data.json', 'w') as fp:
    json.dump(result, fp)