standage · standage · Jun 27, 2019 · Jun 27, 2019 · Jun 27, 2019 · Jun 27, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,12 +11,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ### Changed
 - Minor updates to compensate for a couple years' worth of neglect (see #64).
 - Refactored `GFF3Reader` to better support processing of unsorted GFF3 data (see #65).
-- Modified `GFF3Writer` to intermittently output separator directives (`###`) in large stretches of simple (childless) features (see #68).
+- Implemented finer control of how and when output separator directives (`###`) are printed to GFF3 output (see #68, #75).
 - Reorganized test suite code and data (see #70).
 - Refactored the `Feature` and `Score` APIs with more sane default and static constructors (see #74).
 
 
-
 ## [0.3.3] - 2017-06-21
 ### Fixed
 - Missing `extent` query from the index implementation.

diff --git a/tag/bae.py b/tag/bae.py
@@ -26,21 +26,26 @@ def eval_locus(features):
     intervals = defaultdict(int)
     coverage = defaultdict(int)
 
-    numorfs = len(list(tag.select.features(features, type='CDS')))
-
-    for feature in features:
+    numorfs = len(list(
+        tag.select.features(features, type='CDS', traverse=True)
+    ))
+    selector = tag.select.features(
+        features, type=set(['CDS', 'translated_nucleotide_match']),
+        traverse=True,
+    )
+    for feature in selector:
         starts[feature.start] += 1
         ends[feature.end] += 1
         intervals[feature.range] += 1
 
     aligns = tag.select.features(features, type='translated_nucleotide_match')
     ranges = [a.range for a in aligns]
     for block in tag.Range.merge_overlapping(ranges):
-        for feature in tag.select.features(features, type='CDS'):
-            bpoverlap = feature.range.overlap_extent(block)
-            coverage[feature] += bpoverlap
+        for feat in tag.select.features(features, type='CDS', traverse=True):
+            bpoverlap = feat.range.overlap_extent(block)
+            coverage[feat] += bpoverlap
 
-    for feature in tag.select.features(features, type='CDS'):
+    for feature in tag.select.features(features, type='CDS', traverse=True):
         start_confirmed = starts[feature.start] - 1
         start_shared = starts[feature.start] / sum(starts.values())
         end_confirmed = ends[feature.end] - 1

diff --git a/tag/cli/bae.py b/tag/cli/bae.py
@@ -53,4 +53,5 @@ def main(args):
     )
     evalstream = tag.bae.eval_stream(locusstream)
     writer = tag.writer.GFF3Writer(evalstream, args.out)
+    writer.complex_separators = False
     writer.write()
diff --git a/tag/cli/locuspocus.py b/tag/cli/locuspocus.py
@@ -58,4 +58,5 @@ def main(args):
         minperc=args.min_perc
     )
     writer = tag.writer.GFF3Writer(locusstream, args.out)
+    writer.complex_separators = False
     writer.write()
diff --git a/tag/feature.py b/tag/feature.py
@@ -103,8 +103,6 @@ def __repr__(self):
             if string != '':
                 string += '\n'
             string += str(feature)
-        if self.is_complex:
-            string += '\n###'
         return string
 
     def __len__(self):

diff --git a/tag/select.py b/tag/select.py
@@ -23,19 +23,24 @@ def features(entrystream, type=None, traverse=False):
                      to :code:`True` to search each feature graph for the
                      specified feature type
     """
+    def _typecheck(feat):
+        if isinstance(type, str):
+            return feat.type == type
+        return feat.type in type
+
     for feature in entry_type_filter(entrystream, tag.Feature):
         if traverse:
             if type is None:
                 message = 'cannot traverse without a specific feature type'
                 raise ValueError(message)
-            if type == feature.type:
+            if _typecheck(feature):
                 yield feature
             else:
                 for subfeature in feature:
-                    if type == subfeature.type:
+                    if _typecheck(subfeature):
                         yield subfeature
         else:
-            if not type or type == feature.type:
+            if not type or _typecheck(feature):
                 yield feature
 
 

diff --git a/tag/tests/data/amel-cdna-multi-out.gff3 b/tag/tests/data/amel-cdna-multi-out.gff3
@@ -5,4 +5,3 @@ NC_007070.3	RefSeq	cDNA_match	318980	319128	.	+	.	ID=cDNA_match1;Target=XM_01691
 NC_007070.3	RefSeq	cDNA_match	320138	320990	.	+	.	ID=cDNA_match1;Target=XM_016914591.1 1434 2286 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
 NC_007070.3	RefSeq	cDNA_match	323501	324025	.	+	.	ID=cDNA_match1;Target=XM_016914591.1 2287 2811 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
 NC_007070.3	RefSeq	cDNA_match	325091	325784	.	+	.	ID=cDNA_match1;Gap=M255 I1 M439;Target=XM_016914591.1 2812 3506 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
-###
diff --git a/tag/tests/data/eden-mod.gff3 b/tag/tests/data/eden-mod.gff3
@@ -21,4 +21,3 @@ ctg123	.	CDS	5000	5500	.	+	1	ID=cds00004;Parent=mRNA3;Name=edenprotein.4
 ctg123	.	CDS	7000	7600	.	+	1	ID=cds00003;Parent=mRNA3;Name=edenprotein.3
 ctg123	.	CDS	7000	7600	.	+	1	ID=cds00004;Parent=mRNA3;Name=edenprotein.4
 ctg123	.	exon	7000	9000	.	+	.	ID=exon00005;Parent=mRNA3
-###
diff --git a/tag/tests/data/grape-cpgat-no-sep.gff3 b/tag/tests/data/grape-cpgat-no-sep.gff3
@@ -0,0 +1,30 @@
+##gff-version 3
+##sequence-region   chr8 1 100000
+chr8	CpGAT	gene	72	5081	.	-	.	ID=gene1;Name=chr8.g1
+chr8	CpGAT	mRNA	72	5081	.	-	.	ID=mRNA1;Parent=gene1;Name=chr8.g1.t1;index=1
+chr8	CpGAT	exon	72	167	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	72	167	.	-	0	ID=CDS1;Parent=mRNA1
+chr8	AEGeAn::CanonGFF3	intron	168	348	.	-	.	Parent=mRNA1
+chr8	CpGAT	exon	349	522	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	349	522	.	-	0	ID=CDS1;Parent=mRNA1
+chr8	AEGeAn::CanonGFF3	intron	523	610	.	-	.	Parent=mRNA1
+chr8	CpGAT	exon	611	702	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	611	702	.	-	2	ID=CDS1;Parent=mRNA1
+chr8	AEGeAn::CanonGFF3	intron	703	4915	.	-	.	Parent=mRNA1
+chr8	CpGAT	exon	4916	5081	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	4916	5081	.	-	0	ID=CDS1;Parent=mRNA1
+chr8	CpGAT	gene	10538	11678	.	-	.	ID=gene2;Name=chr8.g2
+chr8	CpGAT	mRNA	10538	11678	.	-	.	ID=mRNA2;Parent=gene2;Name=chr8.g2.t1;index=1
+chr8	CpGAT	three_prime_UTR	10538	10746	.	-	.	Parent=mRNA2
+chr8	CpGAT	exon	10538	11678	.	-	1	Parent=mRNA2
+chr8	CpGAT	CDS	10747	11577	.	-	0	Parent=mRNA2
+chr8	CpGAT	five_prime_UTR	11578	11678	.	-	.	Parent=mRNA2
+chr8	CpGAT	gene	22053	23448	.	+	.	ID=gene3;Name=chr8.g3
+chr8	CpGAT	mRNA	22053	23448	.	+	.	ID=mRNA3;Parent=gene3;Name=chr8.g3.t1;index=1
+chr8	CpGAT	five_prime_UTR	22053	22166	.	+	.	Parent=mRNA3
+chr8	CpGAT	exon	22053	22550	.	+	1	Parent=mRNA3
+chr8	CpGAT	CDS	22167	22550	.	+	0	ID=CDS2;Parent=mRNA3
+chr8	AEGeAn::CanonGFF3	intron	22551	22650	.	+	.	Parent=mRNA3
+chr8	CpGAT	CDS	22651	23022	.	+	0	ID=CDS2;Parent=mRNA3
+chr8	CpGAT	exon	22651	23448	.	+	1	Parent=mRNA3
+chr8	CpGAT	three_prime_UTR	23023	23448	.	+	.	Parent=mRNA3
diff --git a/tag/tests/test_reader.py b/tag/tests/test_reader.py
@@ -20,7 +20,9 @@ def test_grape():
         reader = GFF3Reader(instream=infile)
         output = ''
         for record in reader:
-            output += '%r\n' % record
+            output += repr(record) + '\n'
+            if isinstance(record, Feature) and record.is_complex:
+                output += '###\n'
         assert output == open(data_file('grape-cpgat-sorted.gff3')).read()
 
 

diff --git a/tag/tests/test_writer.py b/tag/tests/test_writer.py
@@ -43,6 +43,18 @@ def test_write_file():
     assert obs_out.strip() == exp_out.strip()
 
 
+def test_no_complex_separators():
+    reader = GFF3Reader(infilename=data_file('grape-cpgat.gff3'))
+    with NamedTemporaryFile(suffix='.gff3', mode='w+t') as outfile:
+        writer = GFF3Writer(reader, outfile)
+        writer.complex_separators = False
+        writer.write()
+        outfile.seek(0)
+        obs_out = outfile.read()
+    exp_out = data_stream('grape-cpgat-no-sep.gff3').read()
+    assert obs_out.strip() == exp_out.strip()
+
+
 @pytest.mark.parametrize('gff3', [
     'minimus.gff3',
     'prokka.gff3',

diff --git a/tag/writer.py b/tag/writer.py
@@ -41,6 +41,7 @@ def __init__(self, instream, outfile='-'):
         else:
             self.outfile = outfile
         self.retainids = False
+        self.complex_separators = True
         self.feature_counts = defaultdict(int)
         self._seq_written = False
         self._block_count = 0
@@ -83,12 +84,15 @@ def write(self, blockitvl=0):
                         feature.add_attribute('ID', fid)
                     else:
                         feature.drop_attribute('ID')
-                    if entry.is_complex:
-                        self._block_count = 0
-                    else:
-                        self._block_count += 1
             if isinstance(entry, Sequence) and not self._seq_written:
                 print('##FASTA', file=self.outfile)
                 self._seq_written = True
             print(repr(entry), file=self.outfile)
+            if isinstance(entry, Feature):
+                if entry.is_complex:
+                    self._block_count = 0
+                    if self.complex_separators:
+                        print('###', file=self.outfile)
+                else:
+                    self._block_count += 1
             self._write_separator(blockitvl)