```
                             rs7412 
                             NC_000019.10:g.44908822
                             C          T
rs429358                 C   APOE-ε4    APOE-ε1
NC_000019.10:g.44908684  T   APOE-ε3    APOE-ε2
http://snpedia.com/index.php/APOE
```

# Setup

In [1]:
import json
import pprint

from vmc.richmodels import Interval, Locus, Allele, Haplotype, Genotype
import vmc.codecs.json

vmc.richmodels.id_function = "uuid"
vmc.richmodels.id_function = "digest"

def to_json(o):
    return json.dumps(o, indent=2, sort_keys=True, cls=vmc.codecs.json.JSONEncoder, ensure_ascii=False)

# Identifier

In [2]:
sr = "NCBI:NC_000019.10"

# Interval

In [3]:
intervals = {
    "rs429358": Interval(44908683, 44908684),
    "rs7412": Interval(44908821, 44908822),
    }

o = intervals["rs429358"]
print("r={o!r}\ns={o}\nj={j}".format(o=o, j=to_json(o)))

r=Interval(start=44908683, end=44908684)
s=44908683:44908684
j={
  "end": 44908684,
  "start": 44908683
}


# Locus

In [4]:
loci = {
    "rs429358": Locus(sr, intervals["rs429358"]),
    "rs7412": Locus(sr, intervals["rs7412"]),
}

o = loci["rs429358"]
print("r={o!r}\ns={o}\nid={o.id}\nj={j}".format(o=o, j=to_json(o)))

r=Locus(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), id=None)
s=NCBI:NC_000019.10:44908683:44908684
id=None
j={
  "id": null,
  "location": {
    "end": 44908684,
    "start": 44908683
  },
  "seqref": "NCBI:NC_000019.10"
}


# Alleles

In [5]:
alleles = {
    "rs429358T": Allele(sr, intervals["rs429358"], "T"),
    "rs429358C": Allele(sr, intervals["rs429358"], "C"),
    "rs7412T":   Allele(sr, intervals["rs7412"],   "T"),
    "rs7412C":   Allele(sr, intervals["rs7412"],   "C"),
    }
o = alleles["rs429358C"]
print("r={o!r}\ns={o}\nid={o.id}\nj={j}".format(o=o, j=to_json(o)))

r=Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), replacement='C', id='uBocJSAilo', identifiers=[])
s=NCBI:NC_000019.10:44908683:44908684:C
id=uBocJSAilo
j={
  "id": "uBocJSAilo",
  "identifiers": [],
  "location": {
    "end": 44908684,
    "start": 44908683
  },
  "replacement": "C",
  "seqref": "NCBI:NC_000019.10"
}


# Haplotypes

In [6]:
haplotypes = {
    "ε1": Haplotype([alleles["rs429358C"], alleles["rs7412T"]]),
    "ε2": Haplotype([alleles["rs429358T"], alleles["rs7412T"]]),
    "ε3": Haplotype([alleles["rs429358T"], alleles["rs7412C"]]),
    "ε4": Haplotype([alleles["rs429358C"], alleles["rs7412C"]]),
    "ε4r": Haplotype([alleles["rs7412C"], alleles["rs429358C"]]),
    }

o = haplotypes["ε1"]
print("r={o!r}\ns={o}\nid={o.id}\nj={j}".format(o=o, j=to_json(o)))

r=Haplotype(alleles=[Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), replacement='C', id='uBocJSAilo', identifiers=[]), Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908821, end=44908822), replacement='T', id='MY3zDtCFN8', identifiers=[])], id='i0SCxJp24-', identifiers=[])
s=Haplotype(alleles=[Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), replacement='C', id='uBocJSAilo', identifiers=[]), Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908821, end=44908822), replacement='T', id='MY3zDtCFN8', identifiers=[])], id='i0SCxJp24-', identifiers=[])
id=i0SCxJp24-
j={
  "allele_ids": [
    "MY3zDtCFN8",
    "uBocJSAilo"
  ],
  "id": "i0SCxJp24-"
}


In [7]:
from vmc.utils import multimap
haplotype_id_name_map = multimap((haplotypes[n].id, n) for n in haplotypes)
pprint.pprint(haplotype_id_name_map)

{'5nDsuTc-tN': ['ε2'],
 'i0SCxJp24-': ['ε1'],
 'i4BSqxa8G9': ['ε3'],
 'nGEpOLlNtf': ['ε4', 'ε4r']}


# Genotypes

In [8]:
genotypes = {
    "{}/{}".format(h1n, h2n): Genotype([h1, h2])
    for h1n, h1 in haplotypes.items()
    for h2n, h2 in haplotypes.items()
    }

o = genotypes["ε4/ε4r"]
print("r={o!r}\ns={o}\nid={o.id}\nj={j}".format(o=o, j=to_json(o)))

r=Genotype(haplotypes=[Haplotype(alleles=[Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), replacement='C', id='uBocJSAilo', identifiers=[]), Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908821, end=44908822), replacement='C', id='AUbGAr-aBT', identifiers=[])], id='nGEpOLlNtf', identifiers=[]), Haplotype(alleles=[Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908821, end=44908822), replacement='C', id='AUbGAr-aBT', identifiers=[]), Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), replacement='C', id='uBocJSAilo', identifiers=[])], id='nGEpOLlNtf', identifiers=[])], id='O9P1aweZ9k', identifiers=[])
s=Genotype(haplotypes=[Haplotype(alleles=[Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908683, end=44908684), replacement='C', id='uBocJSAilo', identifiers=[]), Allele(seqref='NCBI:NC_000019.10', location=Interval(start=44908821, end=44908822), replacement='C', id='AUbGAr-aBT', 

# Associating data

## External Identifiers

In [9]:
identifier_id_map = {}
identifier_id_map.update(multimap((a.computed_identifier(), a.id) for a in alleles.values()))
identifier_id_map.update(multimap((h.computed_identifier(), h.id) for h in haplotypes.values()))
identifier_id_map.update(multimap((g.computed_identifier(), g.id) for g in genotypes.values()))
identifier_id_map


{'GA:5Kz4eJfHVW2mKcGjY3rgjQAUbGAr-aBT': ['AUbGAr-aBT'],
 'GA:HW17jzWbPWQyIcRdwwyeKvuBocJSAilo': ['uBocJSAilo'],
 'GA:s9484RoL0-BQlf1sppO7HmDriWL4GHjx': ['DriWL4GHjx'],
 'GA:uynDR_lCUsmDMN0LWAbpQZMY3zDtCFN8': ['MY3zDtCFN8'],
 'GG:2CLolzZMnVzC_inXALl7eAufBIZXqUM0': ['ufBIZXqUM0', 'ufBIZXqUM0'],
 'GG:DA1Gtie9O6lySp-6ONeNhffOAcnwIq9P': ['fOAcnwIq9P'],
 'GG:QQgbv45hKoAP4CfL0DT5omsJKWRs_TOF': ['sJKWRs_TOF',
  'sJKWRs_TOF',
  'sJKWRs_TOF',
  'sJKWRs_TOF'],
 'GG:WWVaQU6YdwCpZLF1kR3se5vPDVomeEXQ': ['vPDVomeEXQ'],
 'GG:_SU3lBFQuqLW06NOZ8IAhDZkTgob7h3F': ['ZkTgob7h3F', 'ZkTgob7h3F'],
 'GG:_uq2kJfLezLKwSOB0mGGm9T0Jbj6ZMA0': ['T0Jbj6ZMA0',
  'T0Jbj6ZMA0',
  'T0Jbj6ZMA0',
  'T0Jbj6ZMA0'],
 'GG:cEyezPUQv9EuxUmZe6JbaVO9P1aweZ9k': ['O9P1aweZ9k',
  'O9P1aweZ9k',
  'O9P1aweZ9k',
  'O9P1aweZ9k'],
 'GG:e80rV05VbzK-b_duVlO3_4ZiuDaPcAUP': ['ZiuDaPcAUP',
  'ZiuDaPcAUP',
  'ZiuDaPcAUP',
  'ZiuDaPcAUP'],
 'GG:jhPS2pK6zHetS-wes_XGSBspLS2eLfgR': ['spLS2eLfgR'],
 'GG:uNQUl8igBqyKujxl-jVaGX9JDAVOY37V': ['9JDAVOY37V

## Allele equivalence when defined on different sequences

In [10]:
a1 = alleles["rs429358C"]

In [11]:
a2 = Allele("NCBI:CM000681.2", Interval(start=44908683, end=44908684), replacement="C")

In [12]:
print(a1, a1.id)
print(a2, a2.id)

NCBI:NC_000019.10:44908683:44908684:C uBocJSAilo
NCBI:CM000681.2:44908683:44908684:C 6dnjxvre_C


In [13]:
import biocommons.seqrepo
sr = biocommons.seqrepo.SeqRepo("/usr/local/share/seqrepo/master")

In [14]:
from vmc.utils import seq_id

In [15]:
b1 = Allele(seq_id(sr, a1.seqref), a1.location, a1.replacement)
b2 = Allele(seq_id(sr, a2.seqref), a2.location, a2.replacement)

In [16]:
print(b1, b1.computed_identifier())
print(b2, b2.computed_identifier())

GS:IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl:44908683:44908684:C GA:GMpC3hSMPgE9pZdbDdWE7iXFfjweLMRZ
GS:IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl:44908683:44908684:C GA:GMpC3hSMPgE9pZdbDdWE7iXFfjweLMRZ


## Associating Haplotype names with Haplotype computed identifiers

In [17]:
gt = genotypes["ε1/ε2"]

In [18]:
gt.haplotype_ids()

['5nDsuTc-tN', 'i0SCxJp24-']

In [19]:
[haplotype_id_name_map[hid] for hid in gt.haplotype_ids()]

[['ε2'], ['ε1']]

# Document Example

In [20]:
patient_data = {
    "sample-id": "e89c387a-b539-11e6-9d82-fb96077e5724",
    "date": "2016-11-27T00:00:00",
   
    "variation": {
        "meta": {
            "vmc_schema_version": 1,
            "id_scheme": "computed",
        },
        "alleles": alleles,
        "haplotypes": haplotypes,
        "genotypes": genotypes,
        "identifier_id_map": identifier_id_map
    }
}
print(to_json(patient_data))

{
  "date": "2016-11-27T00:00:00",
  "sample-id": "e89c387a-b539-11e6-9d82-fb96077e5724",
  "variation": {
    "alleles": {
      "rs429358C": {
        "id": "uBocJSAilo",
        "identifiers": [],
        "location": {
          "end": 44908684,
          "start": 44908683
        },
        "replacement": "C",
        "seqref": "NCBI:NC_000019.10"
      },
      "rs429358T": {
        "id": "DriWL4GHjx",
        "identifiers": [],
        "location": {
          "end": 44908684,
          "start": 44908683
        },
        "replacement": "T",
        "seqref": "NCBI:NC_000019.10"
      },
      "rs7412C": {
        "id": "AUbGAr-aBT",
        "identifiers": [],
        "location": {
          "end": 44908822,
          "start": 44908821
        },
        "replacement": "C",
        "seqref": "NCBI:NC_000019.10"
      },
      "rs7412T": {
        "id": "MY3zDtCFN8",
        "identifiers": [],
        "location": {
          "end": 44908822,
          "start": 44908821
        },