# Associating Annotations with VR Objects

This notebook demonstrates how to associate information with VR objects.

Information is never embedded within VR objects. Instead, it is associated with those objects by means of their ids.  This approach to annotations scales better in size and distributes better across multiple data sources.

In [7]:
import collections
import json

from ga4gh.core import ga4gh_identify
from ga4gh.vrs import models
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
from ga4gh.vrs.extras.translator import Translator

# A SeqRepo REST service must be reachable at this URL
# (e.g., one started locally from the docker image).
seqrepo_rest_service_url = "https://services.genomicmedlab.org/seqrepo"
# Data proxy configured with the service URL above.
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

# Translator backed by that data proxy; used below to parse HGVS expressions.
tlr = Translator(data_proxy=dp)

In [8]:
# Declare some data as human-readable RS id labels with HGVS expressions.
# The trailing letter of each label is the base that allele carries:
# "=" keeps the reference base, while "X>Y" replaces X with Y.
# (The rs429358 labels were previously attached to the opposite expressions:
# "T>C" yields C, so it belongs to rs429358C, and "=" keeps the reference T.)
data = (
    ("rs7412C",   "NC_000019.10:g.44908822="),
    ("rs7412T",   "NC_000019.10:g.44908822C>T"),
    ("rs429358T", "NC_000019.10:g.44908684="),
    ("rs429358C", "NC_000019.10:g.44908684T>C"),
)

In [9]:
# Parse the HGVS expressions and generate three dicts:
# alleles[allele_id] ⇒ allele object
# rs_names[allele_id] ⇒ rs label
# hgvs_names[allele_id] ⇒ original hgvs expression

# For convenience, also build
# rs_to_id[rs_name] ⇒ allele_id

alleles = {}
rs_names = {}
# Plain dict on purpose: every value stored here is a single HGVS string, so a
# defaultdict that fabricates an empty dict for an unknown id would only hide
# lookup mistakes behind a value of the wrong type.
hgvs_names = {}
for rs, hgvs_expr in data:
    allele = tlr.translate_from(hgvs_expr, "hgvs")
    # The computed identifier is the key shared by every annotation dict.
    allele_id = ga4gh_identify(allele)
    alleles[allele_id] = allele
    hgvs_names[allele_id] = hgvs_expr
    rs_names[allele_id] = rs

# Reverse lookup: rs label ⇒ allele id.
rs_to_id = {label: identifier for identifier, label in rs_names.items()}

In [10]:
# A second kind of annotation: allele frequencies.
# The structure is nested (source ⇒ population ⇒ allele_id ⇒ frequency),
# which makes it slightly more involved than the name dicts above.
# Further frequencies could be added here just as easily,
# or kept in a separate data source.
freqs = {
    "gnomad": {
        "global": {
            rs_to_id[label]: frequency
            for label, frequency in (
                ("rs7412C", 0.9385),
                ("rs7412T", 0.0615),
                ("rs429358C", 0.1385),
                ("rs429358T", 0.8615),
            )
        }
    }
}

In [11]:
# These annotations could be saved together.
# One possible layout for such a document: one top-level key per annotation dict.
doc = dict(
    alleles=alleles,
    hgvs_names=hgvs_names,
    rs_names=rs_names,
    freqs=freqs,
)

In [12]:
# For the benefit of pretty printing, replace the allele objects in the document
# with their dict representations.
# The conversion reads from `alleles`, which still holds the allele objects,
# rather than from doc["alleles"]; re-running this cell therefore never calls
# as_dict() on entries that were already converted to plain dicts.
doc["alleles"] = {allele_id: allele.as_dict() for allele_id, allele in alleles.items()}
print(json.dumps(doc, indent=2))

{
  "alleles": {
    "ga4gh:VA.UUvQpMYU5x8XXBS-RhBhmipTWe2AALzj": {
      "_id": "ga4gh:VA.UUvQpMYU5x8XXBS-RhBhmipTWe2AALzj",
      "location": {
        "interval": {
          "end": 44908822,
          "start": 44908821,
          "type": "SimpleInterval"
        },
        "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        "type": "SequenceLocation"
      },
      "state": {
        "sequence": "C",
        "type": "SequenceState"
      },
      "type": "Allele"
    },
    "ga4gh:VA.EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_": {
      "_id": "ga4gh:VA.EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_",
      "location": {
        "interval": {
          "end": 44908822,
          "start": 44908821,
          "type": "SimpleInterval"
        },
        "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        "type": "SequenceLocation"
      },
      "state": {
        "sequence": "T",
        "type": "SequenceState"
      },
      "type": "Allele"
    },
    "ga4gh:VA.LQrGFIOAP8wEA