Skip to content

Commit

Permalink
Merge pull request #46 from standage/feature/index
Browse files Browse the repository at this point in the history
New index class
  • Loading branch information
standage committed Jan 15, 2017
2 parents 43a510e + a0ad6b5 commit 233cf3d
Show file tree
Hide file tree
Showing 9 changed files with 215 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ env/
env3/
*.swp
.cache/
tagcli
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased]
### Added
- An index class for efficient in-memory access of sequence features.
- Module for mRNA handling, with a function for selecting the primary mRNA from
a gene or other feature.
- New CLI command `tag pmrna`.
Expand Down
6 changes: 6 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ Feature
.. automodule:: tag.feature
:members:

Index
-----

.. automodule:: tag.index
:members:

Readers
-------

Expand Down
6 changes: 6 additions & 0 deletions tag/directive.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ def type(self):
return self.dirtype
return None

@property
def slug(self):
if self.type == 'sequence-region':
return 'sequence {}[{}, {}]'.format(self.seqid, self.range.start+1,
self.range.end)

def __repr__(self):
return self._rawdata

Expand Down
120 changes: 120 additions & 0 deletions tag/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2016 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from collections import defaultdict
from intervaltree import IntervalTree
import tag


class Index(defaultdict):
"""
In-memory index for efficient interval-based queries for genome features.
Implemented as a dictionary, with sequence IDs as keys to interval trees
of features for the corresponding scaffold or contig sequence.
>>> import tag
>>> index = tag.index.Index()
>>> index.consume_file('tests/testdata/pcan-123.gff3.gz')
>>> sorted(index.keys())
['scaffold_123', 'scaffold_124', 'scaffold_125']
>>> for feature in index.query('scaffold_123', 5000, 6000):
... print(feature.slug)
gene@scaffold_123[5583, 5894]
>>> for feature in index.query('scaffold_125', 19000, 87000, strict=False):
... print(feature.slug)
gene@scaffold_125[18994, 19335]
gene@scaffold_125[57450, 57680]
gene@scaffold_125[86995, 87151]
>>> reader = tag.reader.GFF3Reader(tag.pkgdata('grape-cpgat.gff3'))
>>> index = tag.index.Index()
>>> index.consume(reader)
>>> index.yield_inferred = False
>>> for entry in index:
... print(entry.slug)
sequence chr8[1, 100000]
gene@chr8[72, 5081]
gene@chr8[10538, 11678]
gene@chr8[22053, 23448]
"""

def __init__(self):
defaultdict.__init__(self, IntervalTree)
self.declared_regions = dict()
self.inferred_regions = dict()
self.yield_inferred = True

def consume_file(self, infile):
"""Load the specified GFF3 file into memory."""
reader = tag.reader.GFF3Reader(infilename=infile)
self.consume(reader)

def consume_seqreg(self, seqreg):
"""Load a :code:`##sequence-region` directive into memory."""
if not isinstance(seqreg, tag.directive.Directive) or \
seqreg.type != 'sequence-region':
raise ValueError('expected ##sequence-region directive')
if seqreg.seqid in self.declared_regions:
msg = 'duplicate sequence region "{}"'.format(seqreg.seqid)
raise ValueError(msg)
self.declared_regions[seqreg.seqid] = seqreg.range.copy()

def consume_feature(self, feature):
"""Load a :code:`Feature` object into memory."""
if not isinstance(feature, tag.feature.Feature):
raise ValueError('expected Feature object')
self[feature.seqid][feature.start:feature.end] = feature
if feature.seqid not in self.inferred_regions:
self.inferred_regions[feature.seqid] = feature._range.copy()
newrange = self.inferred_regions[feature.seqid].merge(feature._range)
self.inferred_regions[feature.seqid].start = newrange.start
self.inferred_regions[feature.seqid].end = newrange.end

def consume(self, entrystream):
"""
Load a stream of entries into memory.
Only Feature objects and sequence-region directives are loaded, all
other entries are discarded.
"""
for entry in entrystream:
if isinstance(entry, tag.directive.Directive) and \
entry.type == 'sequence-region':
self.consume_seqreg(entry)
elif isinstance(entry, tag.feature.Feature):
self.consume_feature(entry)

def __iter__(self):
regions = self.inferred_regions
if not self.yield_inferred:
regions = self.declared_regions

for seqid in sorted(list(self.keys())):
sr = regions[seqid]
template = '##sequence-region {} {} {}'
data = template.format(seqid, sr.start + 1, sr.end)
yield tag.directive.Directive(data)

for seqid in sorted(list(self.keys())):
for interval in sorted(self[seqid]):
yield interval.data

def query(self, seqid, start, end, strict=True):
"""
Query the index for features in the specified range.
:param seqid: ID of the sequence to query
:param start: start of the query interval
:param end: end of the query interval
:param strict: indicates whether query is strict containment or overlap
(:code:`True` and :code:`False`, respectively)
"""
return sorted([
intvl.data for intvl in self[seqid].search(start, end, strict)
])
3 changes: 3 additions & 0 deletions tag/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,6 @@ def transform(self, offset):
'undefined'.format(offset, self._start+offset, self._end+offset))
self._start += offset
self._end += offset

def copy(self):
return Range(self._start, self._end)
2 changes: 2 additions & 0 deletions tests/test_directive.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_custom_directive():
assert d2.data == 'abc 1 2 3'
assert d3.type is None and d3.dirtype == 'Type'
assert d3.data == 'DNA NC_005213.1'
assert d3.slug is None


def test_sequence_region():
Expand All @@ -67,6 +68,7 @@ def test_sequence_region():
r3.range == Range(0, 1497228)
assert r4.type == 'sequence-region' and r4.seqid == '1' and \
r4.range == Range(0, 1000)
assert r4.slug == 'sequence 1[1, 1000]'

with pytest.raises(AssertionError) as ae:
r5 = Directive('##sequence-region BoGuScHr 123456 4321')
Expand Down
76 changes: 76 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2016 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

import pytest
import tag


@pytest.fixture
def pcan():
return index


def test_rice():
reader = tag.reader.GFF3Reader(tag.pkgdata('osat-twoscaf.gff3.gz'))
index = tag.index.Index()
index.consume(reader)
assert sorted(list(index.keys())) == ['NW_015379189.1', 'NW_015379208.1']
assert len(index['NW_015379189.1']) == 5
assert len(index['NW_015379208.1']) == 2


def test_volvox():
data = '##sequence-region chr1 1 {}'
entries = [tag.directive.Directive(data.format(x)) for x in range(2)]
index = tag.index.Index()
with pytest.raises(ValueError) as ve:
index.consume(entries)
assert 'duplicate sequence region' in str(ve)


@pytest.mark.parametrize('infer,s1,s2,s3,e1,e2,e3', [
(True, 5583, 6021, 18994, 452177, 444332, 429634),
(False, 1, 1, 1, 463498, 452451, 449039)
])
def test_declared(infer, s1, s2, s3, e1, e2, e3):
reader = tag.reader.GFF3Reader(tag.pkgdata('pcan-123.gff3.gz'))
index = tag.index.Index()
index.consume(reader)

index.yield_inferred = infer
sr = [e for e in index if isinstance(e, tag.directive.Directive)]
test = list()
for seqid, start, end in zip([123, 124, 125], [s1, s2, s3], [e1, e2, e3]):
data = '##sequence-region scaffold_{} {} {}'.format(seqid, start, end)
test.append(data)
assert [str(s) for s in sorted(sr)] == test


def test_consume():
reader = tag.reader.GFF3Reader(tag.pkgdata('pdom-withseq.gff3'))
entries = [e for e in reader]
assert len(entries) == 6
index = tag.index.Index()

with pytest.raises(ValueError) as ve:
index.consume_seqreg(entries[0])
assert 'expected ##sequence-region directive' in str(ve)

with pytest.raises(ValueError) as ve:
index.consume_seqreg(entries[4])
assert 'expected ##sequence-region directive' in str(ve)

with pytest.raises(ValueError) as ve:
index.consume_feature(entries[0])
assert 'expected Feature object' in str(ve)

index.consume_seqreg(entries[1])
index.consume_feature(entries[3])
index.consume_file('tests/testdata/pcan-123.gff3.gz')
assert len(index) == 4
Binary file added tests/testdata/pcan-123.gff3.gz
Binary file not shown.

0 comments on commit 233cf3d

Please sign in to comment.