Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better handling of separator directives and feature selection #75

Merged
merged 4 commits into from
Jun 27, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Changed
- Minor updates to compensate for a couple years' worth of neglect (see #64).
- Refactored `GFF3Reader` to better support processing of unsorted GFF3 data (see #65).
- Modified `GFF3Writer` to intermittently output separator directives (`###`) in large stretches of simple (childless) features (see #68).
- Implemented finer control of how and when output separator directives (`###`) are printed to GFF3 output (see #68, #75).
- Reorganized test suite code and data (see #70).
- Refactored the `Feature` and `Score` APIs with more sane default and static constructors (see #74).



## [0.3.3] - 2017-06-21
### Fixed
- Missing `extent` query from the index implementation.
Expand Down
19 changes: 12 additions & 7 deletions tag/bae.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,26 @@ def eval_locus(features):
intervals = defaultdict(int)
coverage = defaultdict(int)

numorfs = len(list(tag.select.features(features, type='CDS')))

for feature in features:
numorfs = len(list(
tag.select.features(features, type='CDS', traverse=True)
))
selector = tag.select.features(
features, type=set(['CDS', 'translated_nucleotide_match']),
traverse=True,
)
for feature in selector:
starts[feature.start] += 1
ends[feature.end] += 1
intervals[feature.range] += 1

aligns = tag.select.features(features, type='translated_nucleotide_match')
ranges = [a.range for a in aligns]
for block in tag.Range.merge_overlapping(ranges):
for feature in tag.select.features(features, type='CDS'):
bpoverlap = feature.range.overlap_extent(block)
coverage[feature] += bpoverlap
for feat in tag.select.features(features, type='CDS', traverse=True):
bpoverlap = feat.range.overlap_extent(block)
coverage[feat] += bpoverlap

for feature in tag.select.features(features, type='CDS'):
for feature in tag.select.features(features, type='CDS', traverse=True):
start_confirmed = starts[feature.start] - 1
start_shared = starts[feature.start] / sum(starts.values())
end_confirmed = ends[feature.end] - 1
Expand Down
1 change: 1 addition & 0 deletions tag/cli/bae.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,5 @@ def main(args):
)
evalstream = tag.bae.eval_stream(locusstream)
writer = tag.writer.GFF3Writer(evalstream, args.out)
writer.complex_separators = False
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, we want separators to mark boundaries between loci, not between features within a locus.

writer.write()
1 change: 1 addition & 0 deletions tag/cli/locuspocus.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,5 @@ def main(args):
minperc=args.min_perc
)
writer = tag.writer.GFF3Writer(locusstream, args.out)
writer.complex_separators = False
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, we want separators to mark boundaries between loci, not between features within a locus.

writer.write()
2 changes: 0 additions & 2 deletions tag/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ def __repr__(self):
if string != '':
string += '\n'
string += str(feature)
if self.is_complex:
string += '\n###'
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now delegated to the GFF3 writer.

return string

def __len__(self):
Expand Down
11 changes: 8 additions & 3 deletions tag/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,24 @@ def features(entrystream, type=None, traverse=False):
to :code:`True` to search each feature graph for the
specified feature type
"""
def _typecheck(feat):
if isinstance(type, str):
return feat.type == type
return feat.type in type

for feature in entry_type_filter(entrystream, tag.Feature):
if traverse:
if type is None:
message = 'cannot traverse without a specific feature type'
raise ValueError(message)
if type == feature.type:
if _typecheck(feature):
yield feature
else:
for subfeature in feature:
if type == subfeature.type:
if _typecheck(subfeature):
yield subfeature
else:
if not type or type == feature.type:
if not type or _typecheck(feature):
yield feature


Expand Down
1 change: 0 additions & 1 deletion tag/tests/data/amel-cdna-multi-out.gff3
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,3 @@ NC_007070.3 RefSeq cDNA_match 318980 319128 . + . ID=cDNA_match1;Target=XM_01691
NC_007070.3 RefSeq cDNA_match 320138 320990 . + . ID=cDNA_match1;Target=XM_016914591.1 1434 2286 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
NC_007070.3 RefSeq cDNA_match 323501 324025 . + . ID=cDNA_match1;Target=XM_016914591.1 2287 2811 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
NC_007070.3 RefSeq cDNA_match 325091 325784 . + . ID=cDNA_match1;Gap=M255 I1 M439;Target=XM_016914591.1 2812 3506 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
###
1 change: 0 additions & 1 deletion tag/tests/data/eden-mod.gff3
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,3 @@ ctg123 . CDS 5000 5500 . + 1 ID=cds00004;Parent=mRNA3;Name=edenprotein.4
ctg123 . CDS 7000 7600 . + 1 ID=cds00003;Parent=mRNA3;Name=edenprotein.3
ctg123 . CDS 7000 7600 . + 1 ID=cds00004;Parent=mRNA3;Name=edenprotein.4
ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mRNA3
###
30 changes: 30 additions & 0 deletions tag/tests/data/grape-cpgat-no-sep.gff3
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
##gff-version 3
##sequence-region chr8 1 100000
chr8 CpGAT gene 72 5081 . - . ID=gene1;Name=chr8.g1
chr8 CpGAT mRNA 72 5081 . - . ID=mRNA1;Parent=gene1;Name=chr8.g1.t1;index=1
chr8 CpGAT exon 72 167 . - 1 Parent=mRNA1
chr8 CpGAT CDS 72 167 . - 0 ID=CDS1;Parent=mRNA1
chr8 AEGeAn::CanonGFF3 intron 168 348 . - . Parent=mRNA1
chr8 CpGAT exon 349 522 . - 1 Parent=mRNA1
chr8 CpGAT CDS 349 522 . - 0 ID=CDS1;Parent=mRNA1
chr8 AEGeAn::CanonGFF3 intron 523 610 . - . Parent=mRNA1
chr8 CpGAT exon 611 702 . - 1 Parent=mRNA1
chr8 CpGAT CDS 611 702 . - 2 ID=CDS1;Parent=mRNA1
chr8 AEGeAn::CanonGFF3 intron 703 4915 . - . Parent=mRNA1
chr8 CpGAT exon 4916 5081 . - 1 Parent=mRNA1
chr8 CpGAT CDS 4916 5081 . - 0 ID=CDS1;Parent=mRNA1
chr8 CpGAT gene 10538 11678 . - . ID=gene2;Name=chr8.g2
chr8 CpGAT mRNA 10538 11678 . - . ID=mRNA2;Parent=gene2;Name=chr8.g2.t1;index=1
chr8 CpGAT three_prime_UTR 10538 10746 . - . Parent=mRNA2
chr8 CpGAT exon 10538 11678 . - 1 Parent=mRNA2
chr8 CpGAT CDS 10747 11577 . - 0 Parent=mRNA2
chr8 CpGAT five_prime_UTR 11578 11678 . - . Parent=mRNA2
chr8 CpGAT gene 22053 23448 . + . ID=gene3;Name=chr8.g3
chr8 CpGAT mRNA 22053 23448 . + . ID=mRNA3;Parent=gene3;Name=chr8.g3.t1;index=1
chr8 CpGAT five_prime_UTR 22053 22166 . + . Parent=mRNA3
chr8 CpGAT exon 22053 22550 . + 1 Parent=mRNA3
chr8 CpGAT CDS 22167 22550 . + 0 ID=CDS2;Parent=mRNA3
chr8 AEGeAn::CanonGFF3 intron 22551 22650 . + . Parent=mRNA3
chr8 CpGAT CDS 22651 23022 . + 0 ID=CDS2;Parent=mRNA3
chr8 CpGAT exon 22651 23448 . + 1 Parent=mRNA3
chr8 CpGAT three_prime_UTR 23023 23448 . + . Parent=mRNA3
4 changes: 3 additions & 1 deletion tag/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def test_grape():
reader = GFF3Reader(instream=infile)
output = ''
for record in reader:
output += '%r\n' % record
output += repr(record) + '\n'
if isinstance(record, Feature) and record.is_complex:
output += '###\n'
assert output == open(data_file('grape-cpgat-sorted.gff3')).read()


Expand Down
12 changes: 12 additions & 0 deletions tag/tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ def test_write_file():
assert obs_out.strip() == exp_out.strip()


def test_no_complex_separators():
reader = GFF3Reader(infilename=data_file('grape-cpgat.gff3'))
with NamedTemporaryFile(suffix='.gff3', mode='w+t') as outfile:
writer = GFF3Writer(reader, outfile)
writer.complex_separators = False
writer.write()
outfile.seek(0)
obs_out = outfile.read()
exp_out = data_stream('grape-cpgat-no-sep.gff3').read()
assert obs_out.strip() == exp_out.strip()


@pytest.mark.parametrize('gff3', [
'minimus.gff3',
'prokka.gff3',
Expand Down
12 changes: 8 additions & 4 deletions tag/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, instream, outfile='-'):
else:
self.outfile = outfile
self.retainids = False
self.complex_separators = True
self.feature_counts = defaultdict(int)
self._seq_written = False
self._block_count = 0
Expand Down Expand Up @@ -83,12 +84,15 @@ def write(self, blockitvl=0):
feature.add_attribute('ID', fid)
else:
feature.drop_attribute('ID')
if entry.is_complex:
self._block_count = 0
else:
self._block_count += 1
if isinstance(entry, Sequence) and not self._seq_written:
print('##FASTA', file=self.outfile)
self._seq_written = True
print(repr(entry), file=self.outfile)
if isinstance(entry, Feature):
if entry.is_complex:
self._block_count = 0
if self.complex_separators:
print('###', file=self.outfile)
else:
self._block_count += 1
self._write_separator(blockitvl)