In [11]:
import itertools
import pandas as pd
from pivottablejs import pivot_ui

import ga4gh.client
# Record the ga4gh package version this notebook was run against.
print(ga4gh.__version__)

# HTTP client for a GA4GH server at localhost:8000; every query below goes
# through this object.  NOTE(review): the name `gc` is also the name of the
# stdlib garbage-collector module, which this would shadow if imported.
gc = ga4gh.client.HttpClient("http://localhost:8000")

0.1.dev616+ncbb0131b36d2


In [33]:
# Query window on reference "1".  The end coordinate is very large,
# presumably so that the window covers the whole reference.
region_constraints = dict(referenceName="1", start=0, end=int(1e10))
variant_set_id = 'YnJjYTE6T1I0Rg'
# Use the first annotation set the server returns for this variant set.
# The builtin next() works with both Python 2 and Python 3 iterators,
# whereas the .next() method exists only on Python 2 iterators.
variant_annotation_set = next(gc.searchVariantAnnotationSets(variant_set_id))
# Materialize every annotation in the window so it can be reused below.
variant_annotations = list(gc.searchVariantAnnotations(variant_annotation_set.id, **region_constraints))

In [97]:
# One record per variant annotation: the distinct SO effects it carries
# (as a sorted, ";"-joined string of "id:term" labels) and the annotation id.
def _so_terms_label(va):
    """Return the sorted, ';'-joined distinct "id:term" labels of one variant annotation."""
    labels = set()
    for te in va.transcriptEffects:
        for ef in te.effects:
            labels.add("{ef.id}:{ef.term}".format(ef=ef))
    return ";".join(sorted(labels))

soset_variant = (
    {'so_terms': _so_terms_label(va), 'variant_annotation_id': va.id}
    for va in variant_annotations
)
soset_variant_df = pd.DataFrame(soset_variant)
soset_variant_df.head()

Unnamed: 0,so_terms,variant_annotation_id
0,SO:0000605:intergenic_region,YnJjYTE6T1I0Rjp2YXJpYW50YW5ub3RhdGlvbnM6MTo0Nj...
1,SO:0000605:intergenic_region,YnJjYTE6T1I0Rjp2YXJpYW50YW5ub3RhdGlvbnM6MTo0Nj...
2,SO:0000605:intergenic_region,YnJjYTE6T1I0Rjp2YXJpYW50YW5ub3RhdGlvbnM6MTo0Nj...
3,SO:0000605:intergenic_region,YnJjYTE6T1I0Rjp2YXJpYW50YW5ub3RhdGlvbnM6MTo0Nj...
4,SO:0000605:intergenic_region,YnJjYTE6T1I0Rjp2YXJpYW50YW5ub3RhdGlvbnM6MTo0Nj...


In [98]:
# Interactive pivot view of soset_variant_df.  To reproduce the chart in the UI:
#   1) select "Bar Chart" from the menu that initially reads "Table";
#   2) drag-drop so_terms to the left column under count.
pivot_ui(soset_variant_df)

In [99]:
# Number of distinct variant annotation ids for each combination of SO terms.
(soset_variant_df
 .groupby("so_terms")["variant_annotation_id"]
 .nunique())

so_terms
SO:0000605:intergenic_region                                       697
SO:0000605:intergenic_region;SO:0001631:upstream_gene_variant       63
SO:0000605:intergenic_region;SO:0001632:downstream_gene_variant     56
SO:0001583:missense_variant                                         16
SO:0001587:stop_gained                                               1
SO:0001819:synonymous_variant                                        7
Name: variant_annotation_id, dtype: int64

In [123]:
# ensure that variant annotation ids are unique in all sets
def _vids(df,sit):
    return set(df[df["so_terms"] == sit]["variant_annotation_id"])

terms = soset_variant_df["so_terms"].unique()
# Build the id set for each term once, in the same order as `terms`.
vid_sets = [_vids(soset_variant_df, term) for term in terms]
# Columns: index 1, index 2, |set1|, |set2|, |set1-set2|, |set2-set1|,
# |intersection|, |union|, and "*" when the two sets share any id.
row_fmt = "{ti1} {ti2}  {l1:3d} {l2:3d}  {lo1:3d} {lo2:3d}  {li:3d}  {lu:3d}  {s}"
# Visit every unordered pair of term indices, self-pairs included, in
# ascending (ti1, ti2) order.
for ti1, ti2 in itertools.combinations_with_replacement(range(len(terms)), 2):
    vids1, vids2 = vid_sets[ti1], vid_sets[ti2]
    shared = vids1 & vids2
    print(row_fmt.format(
        ti1=ti1, ti2=ti2, l1=len(vids1), l2=len(vids2),
        lo1=len(vids1 - vids2), lo2=len(vids2 - vids1),
        li=len(shared), lu=len(vids1 | vids2),
        s="*" if shared else ''
    ))

0 0  697 697    0   0  697  697  *
0 1  697  63  697  63    0  760  
0 2  697  16  697  16    0  713  
0 3  697   7  697   7    0  704  
0 4  697   1  697   1    0  698  
0 5  697  56  697  56    0  753  
1 1   63  63    0   0   63   63  *
1 2   63  16   63  16    0   79  
1 3   63   7   63   7    0   70  
1 4   63   1   63   1    0   64  
1 5   63  56   63  56    0  119  
2 2   16  16    0   0   16   16  *
2 3   16   7   16   7    0   23  
2 4   16   1   16   1    0   17  
2 5   16  56   16  56    0   72  
3 3    7   7    0   0    7    7  *
3 4    7   1    7   1    0    8  
3 5    7  56    7  56    0   63  
4 4    1   1    0   0    1    1  *
4 5    1  56    1  56    0   57  
5 5   56  56    0   0   56   56  *


----

# Experiment with searching for SO ids (or terms)

Trying to answer two questions:

1. Does searching for a single id match variant annotations associated with multiple ids? For example, does searching with {SO:1} match only {SO:1}, or does it also match {SO:1, SO:2}?
1. Does searching with multiple effects imply conjunction (logical AND) or inclusive disjunction (logical OR)? For example, does searching with {SO:1, SO:2} also match {SO:1} (OR), or only {SO:1, SO:2} (AND)?

Using the data above, we can search for single and multiple terms.

We'll be using this function:

    Signature: gc.searchVariantAnnotations(variantAnnotationSetId, referenceName=None, referenceId=None, 
                                           start=None, end=None, featureIds=[], effects=[])
    Docstring:
    Returns an iterator over the Annotations fulfilling the specified conditions from the specified
    AnnotationSet.

    The JSON string for an effect term must be specified on the command line : 
    `--effects '{"term": "exon_variant"}'`.

In [81]:
def _mk_effect_filter(so_ids=[]):
    """return list of so_id effect filters for the given list of so_ids

    >>> print(_mk_effect_filter(so_ids="SO:1 SO:2 SO:3".split()))
    ['{"id":"SO:1"}', '{"id":"SO:2"}', '{"id":"SO:3"}']
    """
    return ['{{"id":"{so_id}"}}'.format(so_id=so_id) for so_id in so_ids]

def _fetch_variant_annotations(gc, so_ids=[], **args):
    """Search the notebook's variant annotation set, filtering by SO ids.

    Turns so_ids into effect filters with _mk_effect_filter and forwards
    them, plus any extra keyword arguments, to gc.searchVariantAnnotations.
    The annotation set searched is the module-level variant_annotation_set.
    Returns whatever gc.searchVariantAnnotations returns.
    """
    effect_filters = _mk_effect_filter(so_ids)
    return gc.searchVariantAnnotations(
        variant_annotation_set.id, effects=effect_filters, **args)

In [126]:
# expected:
#so_terms
#SO:0000605:intergenic_region                                       697
#SO:0000605:intergenic_region;SO:0001631:upstream_gene_variant       63
#SO:0000605:intergenic_region;SO:0001632:downstream_gene_variant     56
#SO:0001583:missense_variant                                         16
#SO:0001587:stop_gained                                               1
#SO:0001819:synonymous_variant                                        7

def _count_hits(so_set):
    """Return how many variant annotations a search for the space-separated
    SO ids in so_set yields within region_constraints."""
    hits = _fetch_variant_annotations(gc, so_ids=so_set.split(), **region_constraints)
    return len(list(hits))

# Single ids, a pair of real ids, an unknown id, and a real id paired with an
# unknown one.
[(so_set, _count_hits(so_set))
 for so_set in ["SO:0001819", "SO:0001632", "SO:0000605",
                "SO:0000605 SO:0001632",
                "SO:9999999", "SO:0000605 SO:999999"]
 ]

[('SO:0001819', 7),
 ('SO:0001632', 56),
 ('SO:0000605', 697),
 ('SO:0000605 SO:0001632', 697),
 ('SO:9999999', 0),
 ('SO:0000605 SO:999999', 697)]

Conclusion: Something's fishy.

SO:0001819 returns 7, which is correct. That SO doesn't occur anywhere else.

SO:0001632 returns 56, which suggests that it's finding the VAs in which it is one of two terms. That is, the filtering appears to use subset matching.

SO:0000605 returns 697, which suggests that it is finding only those VAs for which it is the sole term, and that it's not finding the VAs for which it is one of two terms. This appears to contradict the behavior for SO:0001632. One explanation is that searching is based on exact filtering first, and on subset if that fails.

SO:0000605,SO:0001632 returns 697 also, which excludes VAs for which SO:0000605 is a common term. This suggests that matching uses conjunction (AND).

SO:9999999 fails to find any hits, as expected.

SO:0000605,SO:999999 finds 697 hits. I could make up a complicated guess as to why, but I won't here.