From be3905e0d77cebfe8133aea6afebce8c34b2ab32 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 27 Jun 2019 16:10:53 -0400 Subject: [PATCH 1/4] Improved separator handling --- tag/cli/bae.py | 1 + tag/cli/locuspocus.py | 1 + tag/feature.py | 2 -- tag/tests/data/amel-cdna-multi-out.gff3 | 1 - tag/tests/data/eden-mod.gff3 | 1 - tag/tests/data/grape-cpgat-no-sep.gff3 | 30 +++++++++++++++++++++++++ tag/tests/test_reader.py | 4 +++- tag/tests/test_writer.py | 12 ++++++++++ tag/writer.py | 12 ++++++---- 9 files changed, 55 insertions(+), 9 deletions(-) create mode 100644 tag/tests/data/grape-cpgat-no-sep.gff3 diff --git a/tag/cli/bae.py b/tag/cli/bae.py index 35a2b9c..e70dfbb 100644 --- a/tag/cli/bae.py +++ b/tag/cli/bae.py @@ -53,4 +53,5 @@ def main(args): ) evalstream = tag.bae.eval_stream(locusstream) writer = tag.writer.GFF3Writer(evalstream, args.out) + writer.complex_separators = False writer.write() diff --git a/tag/cli/locuspocus.py b/tag/cli/locuspocus.py index 2b0cec7..40f8773 100644 --- a/tag/cli/locuspocus.py +++ b/tag/cli/locuspocus.py @@ -58,4 +58,5 @@ def main(args): minperc=args.min_perc ) writer = tag.writer.GFF3Writer(locusstream, args.out) + writer.complex_separators = False writer.write() diff --git a/tag/feature.py b/tag/feature.py index d6a65b9..6bfd72d 100644 --- a/tag/feature.py +++ b/tag/feature.py @@ -103,8 +103,6 @@ def __repr__(self): if string != '': string += '\n' string += str(feature) - if self.is_complex: - string += '\n###' return string def __len__(self): diff --git a/tag/tests/data/amel-cdna-multi-out.gff3 b/tag/tests/data/amel-cdna-multi-out.gff3 index 1c0e983..9d39686 100644 --- a/tag/tests/data/amel-cdna-multi-out.gff3 +++ b/tag/tests/data/amel-cdna-multi-out.gff3 @@ -5,4 +5,3 @@ NC_007070.3 RefSeq cDNA_match 318980 319128 . + . ID=cDNA_match1;Target=XM_01691 NC_007070.3 RefSeq cDNA_match 320138 320990 . + . ID=cDNA_match1;Target=XM_016914591.1 1434 2286 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1 NC_007070.3 RefSeq cDNA_match 323501 324025 . + . ID=cDNA_match1;Target=XM_016914591.1 2287 2811 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1 NC_007070.3 RefSeq cDNA_match 325091 325784 . + . ID=cDNA_match1;Gap=M255 I1 M439;Target=XM_016914591.1 2812 3506 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1 -### diff --git a/tag/tests/data/eden-mod.gff3 b/tag/tests/data/eden-mod.gff3 index d95586a..b3f77c9 100644 --- a/tag/tests/data/eden-mod.gff3 +++ b/tag/tests/data/eden-mod.gff3 @@ -21,4 +21,3 @@ ctg123 . CDS 5000 5500 . + 1 ID=cds00004;Parent=mRNA3;Name=edenprotein.4 ctg123 . CDS 7000 7600 . + 1 ID=cds00003;Parent=mRNA3;Name=edenprotein.3 ctg123 . CDS 7000 7600 . + 1 ID=cds00004;Parent=mRNA3;Name=edenprotein.4 ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mRNA3 -### diff --git a/tag/tests/data/grape-cpgat-no-sep.gff3 b/tag/tests/data/grape-cpgat-no-sep.gff3 new file mode 100644 index 0000000..c5084c9 --- /dev/null +++ b/tag/tests/data/grape-cpgat-no-sep.gff3 @@ -0,0 +1,30 @@ +##gff-version 3 +##sequence-region chr8 1 100000 +chr8 CpGAT gene 72 5081 . - . ID=gene1;Name=chr8.g1 +chr8 CpGAT mRNA 72 5081 . - . ID=mRNA1;Parent=gene1;Name=chr8.g1.t1;index=1 +chr8 CpGAT exon 72 167 . - 1 Parent=mRNA1 +chr8 CpGAT CDS 72 167 . - 0 ID=CDS1;Parent=mRNA1 +chr8 AEGeAn::CanonGFF3 intron 168 348 . - . Parent=mRNA1 +chr8 CpGAT exon 349 522 . - 1 Parent=mRNA1 +chr8 CpGAT CDS 349 522 . - 0 ID=CDS1;Parent=mRNA1 +chr8 AEGeAn::CanonGFF3 intron 523 610 . - . Parent=mRNA1 +chr8 CpGAT exon 611 702 . - 1 Parent=mRNA1 +chr8 CpGAT CDS 611 702 . - 2 ID=CDS1;Parent=mRNA1 +chr8 AEGeAn::CanonGFF3 intron 703 4915 . - . Parent=mRNA1 +chr8 CpGAT exon 4916 5081 . - 1 Parent=mRNA1 +chr8 CpGAT CDS 4916 5081 . - 0 ID=CDS1;Parent=mRNA1 +chr8 CpGAT gene 10538 11678 . - . ID=gene2;Name=chr8.g2 +chr8 CpGAT mRNA 10538 11678 . - . ID=mRNA2;Parent=gene2;Name=chr8.g2.t1;index=1 +chr8 CpGAT three_prime_UTR 10538 10746 . - . Parent=mRNA2 +chr8 CpGAT exon 10538 11678 . - 1 Parent=mRNA2 +chr8 CpGAT CDS 10747 11577 . - 0 Parent=mRNA2 +chr8 CpGAT five_prime_UTR 11578 11678 . - . Parent=mRNA2 +chr8 CpGAT gene 22053 23448 . + . ID=gene3;Name=chr8.g3 +chr8 CpGAT mRNA 22053 23448 . + . ID=mRNA3;Parent=gene3;Name=chr8.g3.t1;index=1 +chr8 CpGAT five_prime_UTR 22053 22166 . + . Parent=mRNA3 +chr8 CpGAT exon 22053 22550 . + 1 Parent=mRNA3 +chr8 CpGAT CDS 22167 22550 . + 0 ID=CDS2;Parent=mRNA3 +chr8 AEGeAn::CanonGFF3 intron 22551 22650 . + . Parent=mRNA3 +chr8 CpGAT CDS 22651 23022 . + 0 ID=CDS2;Parent=mRNA3 +chr8 CpGAT exon 22651 23448 . + 1 Parent=mRNA3 +chr8 CpGAT three_prime_UTR 23023 23448 . + . Parent=mRNA3 diff --git a/tag/tests/test_reader.py b/tag/tests/test_reader.py index dee87ee..7e2a6dd 100644 --- a/tag/tests/test_reader.py +++ b/tag/tests/test_reader.py @@ -20,7 +20,9 @@ def test_grape(): reader = GFF3Reader(instream=infile) output = '' for record in reader: - output += '%r\n' % record + output += repr(record) + '\n' + if isinstance(record, Feature) and record.is_complex: + output += '###\n' assert output == open(data_file('grape-cpgat-sorted.gff3')).read() diff --git a/tag/tests/test_writer.py b/tag/tests/test_writer.py index c23f00a..c90841b 100644 --- a/tag/tests/test_writer.py +++ b/tag/tests/test_writer.py @@ -43,6 +43,18 @@ def test_write_file(): assert obs_out.strip() == exp_out.strip() +def test_no_complex_separators(): + reader = GFF3Reader(infilename=data_file('grape-cpgat.gff3')) + with NamedTemporaryFile(suffix='.gff3', mode='w+t') as outfile: + writer = GFF3Writer(reader, outfile) + writer.complex_separators = False + writer.write() + outfile.seek(0) + obs_out = outfile.read() + exp_out = data_stream('grape-cpgat-no-sep.gff3').read() + assert obs_out.strip() == exp_out.strip() + + @pytest.mark.parametrize('gff3', [ 'minimus.gff3', 'prokka.gff3', diff --git a/tag/writer.py b/tag/writer.py index a7b677d..01528b5 100644 --- a/tag/writer.py +++ b/tag/writer.py @@ -41,6 +41,7 @@ def __init__(self, instream, outfile='-'): else: self.outfile = outfile self.retainids = False + self.complex_separators = True self.feature_counts = defaultdict(int) self._seq_written = False self._block_count = 0 @@ -83,12 +84,15 @@ def write(self, blockitvl=0): feature.add_attribute('ID', fid) else: feature.drop_attribute('ID') - if entry.is_complex: - self._block_count = 0 - else: - self._block_count += 1 if isinstance(entry, Sequence) and not self._seq_written: print('##FASTA', file=self.outfile) self._seq_written = True print(repr(entry), file=self.outfile) + if isinstance(entry, Feature): + if entry.is_complex: + self._block_count = 0 + if self.complex_separators: + print('###', file=self.outfile) + else: + self._block_count += 1 self._write_separator(blockitvl) From 0ce8c4ee7a25eefcd7c77721672d39f15f429aee Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 27 Jun 2019 16:51:42 -0400 Subject: [PATCH 2/4] Fix traversal and type checking --- tag/bae.py | 15 ++++++++++----- tag/select.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tag/bae.py b/tag/bae.py index 070fa71..c0416f8 100644 --- a/tag/bae.py +++ b/tag/bae.py @@ -26,9 +26,14 @@ def eval_locus(features): intervals = defaultdict(int) coverage = defaultdict(int) - numorfs = len(list(tag.select.features(features, type='CDS'))) - - for feature in features: + numorfs = len(list( + tag.select.features(features, type='CDS', traverse=True) + )) + selector = tag.select.features( + features, type=set(['CDS', 'translated_nucleotide_match']), + traverse=True, + ) + for feature in selector: starts[feature.start] += 1 ends[feature.end] += 1 intervals[feature.range] += 1 @@ -36,11 +41,11 @@ def eval_locus(features): aligns = tag.select.features(features, type='translated_nucleotide_match') ranges = [a.range for a in aligns] for block in tag.Range.merge_overlapping(ranges): - for feature in tag.select.features(features, type='CDS'): + for feature in tag.select.features(features, type='CDS', traverse=True): bpoverlap = feature.range.overlap_extent(block) coverage[feature] += bpoverlap - for feature in tag.select.features(features, type='CDS'): + for feature in tag.select.features(features, type='CDS', traverse=True): start_confirmed = starts[feature.start] - 1 start_shared = starts[feature.start] / sum(starts.values()) end_confirmed = ends[feature.end] - 1 diff --git a/tag/select.py b/tag/select.py index c888acc..fbb0d7a 100644 --- a/tag/select.py +++ b/tag/select.py @@ -23,19 +23,24 @@ def features(entrystream, type=None, traverse=False): to :code:`True` to search each feature graph for the specified feature type """ + def _typecheck(feat): + if isinstance(type, str): + return feat.type == type + return feat.type in type + for feature in entry_type_filter(entrystream, tag.Feature): if traverse: if type is None: message = 'cannot traverse without a specific feature type' raise ValueError(message) - if type == feature.type: + if _typecheck(feature): yield feature else: for subfeature in feature: - if type == subfeature.type: + if _typecheck(subfeature): yield subfeature else: - if not type or type == feature.type: + if not type or _typecheck(feature): yield feature From 6960acaa9556a90db222ba7335a06838067ce8eb Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 27 Jun 2019 17:00:13 -0400 Subject: [PATCH 3/4] Appease style gods --- tag/bae.py | 6 +++--- tag/select.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tag/bae.py b/tag/bae.py index c0416f8..3ffc7fc 100644 --- a/tag/bae.py +++ b/tag/bae.py @@ -41,9 +41,9 @@ def eval_locus(features): aligns = tag.select.features(features, type='translated_nucleotide_match') ranges = [a.range for a in aligns] for block in tag.Range.merge_overlapping(ranges): - for feature in tag.select.features(features, type='CDS', traverse=True): - bpoverlap = feature.range.overlap_extent(block) - coverage[feature] += bpoverlap + for feat in tag.select.features(features, type='CDS', traverse=True): + bpoverlap = feat.range.overlap_extent(block) + coverage[feat] += bpoverlap for feature in tag.select.features(features, type='CDS', traverse=True): start_confirmed = starts[feature.start] - 1 diff --git a/tag/select.py b/tag/select.py index fbb0d7a..003d47c 100644 --- a/tag/select.py +++ b/tag/select.py @@ -25,7 +25,7 @@ def features(entrystream, type=None, traverse=False): """ def _typecheck(feat): if isinstance(type, str): - return feat.type == type + return feat.type == type return feat.type in type for feature in entry_type_filter(entrystream, tag.Feature): From 39d57a88c183b9d93a553ba3e7285d052afbfb9d Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 27 Jun 2019 17:35:17 -0400 Subject: [PATCH 4/4] Update change log --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fceb6d..3da8a86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,12 +11,11 @@ This project adheres to [Semantic Versioning](http://semver.org/). ### Changed - Minor updates to compensate for a couple years' worth of neglect (see #64). - Refactored `GFF3Reader` to better support processing of unsorted GFF3 data (see #65). -- Modified `GFF3Writer` to intermittently output separator directives (`###`) in large stretches of simple (childless) features (see #68). +- Implemented finer control of how and when output separator directives (`###`) are printed to GFF3 output (see #68, #75). - Reorganized test suite code and data (see #70). - Refactored the `Feature` and `Score` APIs with more sane default and static constructors (see #74). - ## [0.3.3] - 2017-06-21 ### Fixed - Missing `extent` query from the index implementation.