Skip to content

Commit

Permalink
Merge pull request #37 from standage/add/generators
Browse files Browse the repository at this point in the history
Script and generators
  • Loading branch information
standage committed Dec 16, 2016
2 parents 4fde89f + 8a9b199 commit 2ef7685
Show file tree
Hide file tree
Showing 31 changed files with 586 additions and 107 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- Annotation I/O
- GFF3Reader
- GFF3Writer
- Composable generator functions for streaming annotation processing
- A command line interface through the `tag` script
- Package scaffolding
- README
- documentation
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ install:
pip install .

devenv:
pip install pytest pytest-cov pep8 sphinx
pip install pytest pytest-cov pep8 sphinx intervaltree

style:
pep8 tag/*.py tests/*.py scripts/*.py
pep8 tag/*.py tests/*.py tag/cli/*.py bin/tag

loc:
cloc --exclude-list-file=<(echo tag/_version.py) tag/*.py
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,14 @@
**tag** is a free open-source software package for analyzing genome annotation data.

```python
# Compute number of exons per gene
import tag
reader = tag.reader.GFF3Reader(infilename='/data/genomes/mybug.gff3.gz')
for gene in tag.select.features(reader, type='gene'):
exons = [feat for feat in gene if feat.type == 'exon']
print('num exons:', len(exons))
```

To install the most recent stable release execute `pip install tag` from your terminal.
Full installation instructions and project documentation are available at https://tag.readthedocs.io.
27 changes: 27 additions & 0 deletions bin/tag
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2016 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import print_function
import tag


# Dispatch table: maps each subcommand name (the value argparse stores in
# ``args.cmd``) to the function that implements it.
mains = {
    'gff3': tag.cli.gff3.main,
    'occ': tag.cli.occ.main,
}


def main(args):
    """Run the subcommand selected on the command line.

    ``args`` is the parsed argument namespace; its ``cmd`` attribute must be
    one of the keys of ``mains``. The matching entry point is called with the
    same namespace.
    """
    command = args.cmd
    assert command in mains
    mains[command](args)


# Script entry point: parse the command line with the package's parser and
# hand the resulting namespace to the dispatcher.
if __name__ == '__main__':
    main(tag.cli.parser().parse_args())
6 changes: 6 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,9 @@ but may include others in the future.

.. automodule:: tag.writer
:members:

Selectors
---------

.. automodule:: tag.select
:members:
8 changes: 5 additions & 3 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ Most GFF parsers will load data into memory for you--the trivial bit--but will n

.. code:: python
# Calculate number of exons per gene
for gene in gff3reader:
exons = [subfeat for subfeat in gene if subfeat.type == 'exon']
# Compute number of exons per gene
import tag
reader = tag.reader.GFF3Reader(infilename='/data/genomes/mybug.gff3.gz')
for gene in tag.select.features(reader, type='gene'):
exons = [feat for feat in gene if feat.type == 'exon']
print('num exons:', len(exons))
See :doc:`the primer on annotation formats <formats>` for more information.
Expand Down
7 changes: 3 additions & 4 deletions docs/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@ Please see the :doc:`API documentation <api>` for a description of the data stru
>>> import tag
>>> reader = tag.reader.GFF3Reader(infilename='/data/genomes/mybug.gff3.gz')
>>> for entry in reader
... if hasattr(entry, 'type') and entry.type == 'intron':
... if len(entry) > 100000:
... print(entry.slug)
>>> for entry in tag.select.features(reader, type='intron'):
... if len(entry) > 100000:
... print(entry.slug)
intron@scaffold3[37992, 149255]
intron@scaffold55[288477, 389001]
intron@scaffold192[1057, 196433]
Expand Down
62 changes: 0 additions & 62 deletions scripts/cds-occ.py

This file was deleted.

24 changes: 0 additions & 24 deletions scripts/tag-gff3.py

This file was deleted.

3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
author_email='daniel.standage@gmail.com',
license='BSD-3',
packages=['tag'],
scripts=list(glob.glob('scripts/*.py')),
scripts=list(glob.glob('bin/tag')),
install_requires=['intervaltree'],
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Console',
Expand Down
4 changes: 3 additions & 1 deletion tag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@

try:
import __builtin__ as builtins
except:
except: # pragma: no cover
import builtins
from . import comment
from . import feature
from . import range
from . import reader
from . import writer
from . import cli
from . import select
from gzip import open as gzopen

from ._version import get_versions
Expand Down
28 changes: 28 additions & 0 deletions tag/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2016 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

import argparse
import sys
import tag
from . import gff3
from . import occ


def parser():
    """Build the top-level argument parser for the ``tag`` command.

    The parser accepts the global ``-v/--version`` and ``-l/--logfile``
    options (the log file defaults to standard error) and exactly one
    subcommand, whose name is stored in ``args.cmd``. Each subcommand
    registers its own arguments through its module's ``subparser`` function.

    Returns:
        argparse.ArgumentParser: the fully configured parser.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('-v', '--version', action='version',
                           version='tag v{}'.format(tag.__version__))
    argparser.add_argument('-l', '--logfile', metavar='FILE',
                           default=sys.stderr, type=argparse.FileType('w'))
    subparsers = argparser.add_subparsers(dest='cmd', metavar='cmd',
                                          help='gff3 | occ')
    # Python 3 treats subcommands as optional by default, which would leave
    # ``args.cmd`` set to None. Require one so that a missing command produces
    # a normal usage error rather than a failure further downstream.
    subparsers.required = True
    tag.cli.gff3.subparser(subparsers)
    tag.cli.occ.subparser(subparsers)

    return argparser
27 changes: 27 additions & 0 deletions tag/cli/gff3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2015 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

import argparse
import tag


def subparser(subparsers):
    """Register the ``gff3`` subcommand on an argparse subparsers object.

    The subcommand takes one positional input file, ``-o/--out`` for the
    output destination (``/dev/stdout`` by default) and ``-r/--relax``, which
    stores False in ``strict`` (True otherwise).
    """
    out_help = ('write output to the specified file; '
                'default is terminal (stdout)')
    cmd = subparsers.add_parser('gff3')
    cmd.add_argument('-o', '--out', metavar='FILE', default='/dev/stdout',
                     help=out_help)
    cmd.add_argument('-r', '--relax', dest='strict', default=True,
                     action='store_false', help='relax parsing stringency')
    cmd.add_argument('gff3', help='input file in GFF3 format')


def main(args):
    """Pipe a GFF3 input file through a reader into a writer.

    A ``GFF3Reader`` is opened on ``args.gff3`` with the strictness given by
    ``args.strict``; a ``GFF3Writer`` built on that reader and ``args.out``
    then writes the result.
    """
    source = tag.reader.GFF3Reader(infilename=args.gff3, strict=args.strict)
    tag.writer.GFF3Writer(source, args.out).write()
58 changes: 58 additions & 0 deletions tag/cli/occ.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (C) 2015 Daniel Standage <daniel.standage@gmail.com>
#
# This file is part of tag (http://github.com/standage/tag) and is licensed
# under the BSD 3-clause license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import print_function
import argparse
from collections import defaultdict
from intervaltree import IntervalTree
import tag


def interval_set_span(intset):
    """Summarize a collection of ``(begin, end, data)`` triples.

    Returns a 3-tuple: the smallest begin, the largest end, and the set of
    all data values found in ``intset``.
    """
    lowest = min(lo for lo, _, _ in intset)
    highest = max(hi for _, hi, _ in intset)
    members = {data for _, _, data in intset}
    return lowest, highest, members


def subparser(subparsers):
    """Register the ``occ`` subcommand on an argparse subparsers object.

    The subcommand takes two positionals, the input file and a feature type,
    plus ``-r/--relax``, which stores False in ``strict`` (True otherwise).
    """
    cmd = subparsers.add_parser('occ')
    cmd.add_argument('-r', '--relax', dest='strict', default=True,
                     action='store_false', help='relax parsing stringency')
    for name, description in (('gff3', 'input file'),
                              ('type', 'feature type')):
        cmd.add_argument(name, help=description)


def main(args):
    """Print the total span covered by features of the requested type.

    Features of type ``args.type`` are read from ``args.gff3`` and stored in
    one interval tree per sequence ID. Overlapping intervals are then merged
    into clusters; each cluster contributes ``end - begin`` exactly once, and
    the sum over all sequences is printed.
    """
    trees = defaultdict(IntervalTree)
    reader = tag.reader.GFF3Reader(infilename=args.gff3, strict=args.strict)
    selected = tag.select.features(reader, type=args.type, traverse=True)
    for feature in selected:
        trees[feature.seqid].addi(feature.start, feature.end, feature)
        if not (feature.is_multi and feature.is_toplevel):
            continue
        # Top-level multi-features: index every sibling as well.
        for sibling in feature.siblings:
            trees[sibling.seqid].addi(sibling.start, sibling.end, sibling)

    total_occ = 0
    counted = defaultdict(IntervalTree)
    for seqid in trees:
        tree = trees[seqid]
        for begin, end, feat in tree:
            # Skip intervals that fall inside a cluster already tallied.
            if counted[seqid][begin:end] != set():
                continue

            # Grow the cluster until a query over its span finds no
            # features beyond those it already contains.
            cluster = {feat}
            lo, hi, found = interval_set_span(tree[begin:end])
            while cluster < found:
                begin, end, cluster = lo, hi, found
                lo, hi, found = interval_set_span(tree[begin:end])
            total_occ += end - begin
            counted[seqid].addi(begin, end, cluster)
    print(total_occ)
4 changes: 4 additions & 0 deletions tag/comment.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def __init__(self, data):
assert data.startswith('#')
self._rawdata = data

@property
def entry_type(self):
    """Return the constant string ``'tag.comment.Comment'`` for this entry."""
    return 'tag.comment.Comment'

def __repr__(self):
return self._rawdata

Expand Down
4 changes: 4 additions & 0 deletions tag/directive.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ def __init__(self, data):

assert self.dirtype is not None

@property
def entry_type(self):
    """Return the constant string ``'tag.directive.Directive'``."""
    return 'tag.directive.Directive'

@property
def type(self):
if self.dirtype in dirtypes:
Expand Down
4 changes: 4 additions & 0 deletions tag/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ def __init__(self, data):
assert self.phase in [0, 1, 2], \
'invalid phase "{}"'.format(self.phase)

@property
def entry_type(self):
    """Return the constant string ``'tag.feature.Feature'`` for this entry."""
    return 'tag.feature.Feature'

def __str__(self):
"""String representation of the feature, sans children."""
score = '.'
Expand Down

0 comments on commit 2ef7685

Please sign in to comment.