Skip to content

Commit

Permalink
Better handling of separator directives and feature selection (#75)
Browse files Browse the repository at this point in the history
This update mashes together two changes. First, it gives finer control to how and when separator directives are printed after complex features. Second, it enables selection of multiple feature types simultaneously in the `tag.select.features` function and updates the `bae` module to correctly use this function in `traverse` mode.
  • Loading branch information
standage authored Jun 27, 2019
1 parent 36c1a9b commit c78a163
Show file tree
Hide file tree
Showing 12 changed files with 76 additions and 21 deletions.
3 changes: 1 addition & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Changed
- Minor updates to compensate for a couple years' worth of neglect (see #64).
- Refactored `GFF3Reader` to better support processing of unsorted GFF3 data (see #65).
- Modified `GFF3Writer` to intermittently output separator directives (`###`) in large stretches of simple (childless) features (see #68).
- Implemented finer control of how and when output separator directives (`###`) are printed to GFF3 output (see #68, #75).
- Reorganized test suite code and data (see #70).
- Refactored the `Feature` and `Score` APIs with more sane default and static constructors (see #74).



## [0.3.3] - 2017-06-21
### Fixed
- Missing `extent` query from the index implementation.
Expand Down
19 changes: 12 additions & 7 deletions tag/bae.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,26 @@ def eval_locus(features):
intervals = defaultdict(int)
coverage = defaultdict(int)

numorfs = len(list(tag.select.features(features, type='CDS')))

for feature in features:
numorfs = len(list(
tag.select.features(features, type='CDS', traverse=True)
))
selector = tag.select.features(
features, type=set(['CDS', 'translated_nucleotide_match']),
traverse=True,
)
for feature in selector:
starts[feature.start] += 1
ends[feature.end] += 1
intervals[feature.range] += 1

aligns = tag.select.features(features, type='translated_nucleotide_match')
ranges = [a.range for a in aligns]
for block in tag.Range.merge_overlapping(ranges):
for feature in tag.select.features(features, type='CDS'):
bpoverlap = feature.range.overlap_extent(block)
coverage[feature] += bpoverlap
for feat in tag.select.features(features, type='CDS', traverse=True):
bpoverlap = feat.range.overlap_extent(block)
coverage[feat] += bpoverlap

for feature in tag.select.features(features, type='CDS'):
for feature in tag.select.features(features, type='CDS', traverse=True):
start_confirmed = starts[feature.start] - 1
start_shared = starts[feature.start] / sum(starts.values())
end_confirmed = ends[feature.end] - 1
Expand Down
1 change: 1 addition & 0 deletions tag/cli/bae.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def main(args):
)
evalstream = tag.bae.eval_stream(locusstream)
writer = tag.writer.GFF3Writer(evalstream, args.out)
writer.complex_separators = False
writer.write()
1 change: 1 addition & 0 deletions tag/cli/locuspocus.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,5 @@ def main(args):
minperc=args.min_perc
)
writer = tag.writer.GFF3Writer(locusstream, args.out)
writer.complex_separators = False
writer.write()
2 changes: 0 additions & 2 deletions tag/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ def __repr__(self):
if string != '':
string += '\n'
string += str(feature)
if self.is_complex:
string += '\n###'
return string

def __len__(self):
Expand Down
11 changes: 8 additions & 3 deletions tag/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,24 @@ def features(entrystream, type=None, traverse=False):
to :code:`True` to search each feature graph for the
specified feature type
"""
def _typecheck(feat):
if isinstance(type, str):
return feat.type == type
return feat.type in type

for feature in entry_type_filter(entrystream, tag.Feature):
if traverse:
if type is None:
message = 'cannot traverse without a specific feature type'
raise ValueError(message)
if type == feature.type:
if _typecheck(feature):
yield feature
else:
for subfeature in feature:
if type == subfeature.type:
if _typecheck(subfeature):
yield subfeature
else:
if not type or type == feature.type:
if not type or _typecheck(feature):
yield feature


Expand Down
1 change: 0 additions & 1 deletion tag/tests/data/amel-cdna-multi-out.gff3
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,3 @@ NC_007070.3 RefSeq cDNA_match 318980 319128 . + . ID=cDNA_match1;Target=XM_01691
NC_007070.3 RefSeq cDNA_match 320138 320990 . + . ID=cDNA_match1;Target=XM_016914591.1 1434 2286 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
NC_007070.3 RefSeq cDNA_match 323501 324025 . + . ID=cDNA_match1;Target=XM_016914591.1 2287 2811 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
NC_007070.3 RefSeq cDNA_match 325091 325784 . + . ID=cDNA_match1;Gap=M255 I1 M439;Target=XM_016914591.1 2812 3506 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
###
1 change: 0 additions & 1 deletion tag/tests/data/eden-mod.gff3
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,3 @@ ctg123 . CDS 5000 5500 . + 1 ID=cds00004;Parent=mRNA3;Name=edenprotein.4
ctg123 . CDS 7000 7600 . + 1 ID=cds00003;Parent=mRNA3;Name=edenprotein.3
ctg123 . CDS 7000 7600 . + 1 ID=cds00004;Parent=mRNA3;Name=edenprotein.4
ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mRNA3
###
30 changes: 30 additions & 0 deletions tag/tests/data/grape-cpgat-no-sep.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
##gff-version 3
##sequence-region chr8 1 100000
chr8 CpGAT gene 72 5081 . - . ID=gene1;Name=chr8.g1
chr8 CpGAT mRNA 72 5081 . - . ID=mRNA1;Parent=gene1;Name=chr8.g1.t1;index=1
chr8 CpGAT exon 72 167 . - 1 Parent=mRNA1
chr8 CpGAT CDS 72 167 . - 0 ID=CDS1;Parent=mRNA1
chr8 AEGeAn::CanonGFF3 intron 168 348 . - . Parent=mRNA1
chr8 CpGAT exon 349 522 . - 1 Parent=mRNA1
chr8 CpGAT CDS 349 522 . - 0 ID=CDS1;Parent=mRNA1
chr8 AEGeAn::CanonGFF3 intron 523 610 . - . Parent=mRNA1
chr8 CpGAT exon 611 702 . - 1 Parent=mRNA1
chr8 CpGAT CDS 611 702 . - 2 ID=CDS1;Parent=mRNA1
chr8 AEGeAn::CanonGFF3 intron 703 4915 . - . Parent=mRNA1
chr8 CpGAT exon 4916 5081 . - 1 Parent=mRNA1
chr8 CpGAT CDS 4916 5081 . - 0 ID=CDS1;Parent=mRNA1
chr8 CpGAT gene 10538 11678 . - . ID=gene2;Name=chr8.g2
chr8 CpGAT mRNA 10538 11678 . - . ID=mRNA2;Parent=gene2;Name=chr8.g2.t1;index=1
chr8 CpGAT three_prime_UTR 10538 10746 . - . Parent=mRNA2
chr8 CpGAT exon 10538 11678 . - 1 Parent=mRNA2
chr8 CpGAT CDS 10747 11577 . - 0 Parent=mRNA2
chr8 CpGAT five_prime_UTR 11578 11678 . - . Parent=mRNA2
chr8 CpGAT gene 22053 23448 . + . ID=gene3;Name=chr8.g3
chr8 CpGAT mRNA 22053 23448 . + . ID=mRNA3;Parent=gene3;Name=chr8.g3.t1;index=1
chr8 CpGAT five_prime_UTR 22053 22166 . + . Parent=mRNA3
chr8 CpGAT exon 22053 22550 . + 1 Parent=mRNA3
chr8 CpGAT CDS 22167 22550 . + 0 ID=CDS2;Parent=mRNA3
chr8 AEGeAn::CanonGFF3 intron 22551 22650 . + . Parent=mRNA3
chr8 CpGAT CDS 22651 23022 . + 0 ID=CDS2;Parent=mRNA3
chr8 CpGAT exon 22651 23448 . + 1 Parent=mRNA3
chr8 CpGAT three_prime_UTR 23023 23448 . + . Parent=mRNA3
4 changes: 3 additions & 1 deletion tag/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def test_grape():
reader = GFF3Reader(instream=infile)
output = ''
for record in reader:
output += '%r\n' % record
output += repr(record) + '\n'
if isinstance(record, Feature) and record.is_complex:
output += '###\n'
assert output == open(data_file('grape-cpgat-sorted.gff3')).read()


Expand Down
12 changes: 12 additions & 0 deletions tag/tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ def test_write_file():
assert obs_out.strip() == exp_out.strip()


def test_no_complex_separators():
reader = GFF3Reader(infilename=data_file('grape-cpgat.gff3'))
with NamedTemporaryFile(suffix='.gff3', mode='w+t') as outfile:
writer = GFF3Writer(reader, outfile)
writer.complex_separators = False
writer.write()
outfile.seek(0)
obs_out = outfile.read()
exp_out = data_stream('grape-cpgat-no-sep.gff3').read()
assert obs_out.strip() == exp_out.strip()


@pytest.mark.parametrize('gff3', [
'minimus.gff3',
'prokka.gff3',
Expand Down
12 changes: 8 additions & 4 deletions tag/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, instream, outfile='-'):
else:
self.outfile = outfile
self.retainids = False
self.complex_separators = True
self.feature_counts = defaultdict(int)
self._seq_written = False
self._block_count = 0
Expand Down Expand Up @@ -83,12 +84,15 @@ def write(self, blockitvl=0):
feature.add_attribute('ID', fid)
else:
feature.drop_attribute('ID')
if entry.is_complex:
self._block_count = 0
else:
self._block_count += 1
if isinstance(entry, Sequence) and not self._seq_written:
print('##FASTA', file=self.outfile)
self._seq_written = True
print(repr(entry), file=self.outfile)
if isinstance(entry, Feature):
if entry.is_complex:
self._block_count = 0
if self.complex_separators:
print('###', file=self.outfile)
else:
self._block_count += 1
self._write_separator(blockitvl)

0 comments on commit c78a163

Please sign in to comment.