Better handling of separator directives and feature selection (#75)

This update mashes together two changes. First, it gives finer control to how and when separator directives are printed after complex features. Second, it enables selection of multiple feature types simultaneously in the `tag.select.features` function and updates the `bae` module to correctly use this function in `traverse` mode.
standage · Jun 27, 2019 · c78a163 · c78a163
1 parent 36c1a9b
commit c78a163
Show file tree

Hide file tree

Showing 12 changed files with 76 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,12 +11,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 ### Changed
 - Minor updates to compensate for a couple years' worth of neglect (see #64).
 - Refactored `GFF3Reader` to better support processing of unsorted GFF3 data (see #65).
-- Modified `GFF3Writer` to intermittently output separator directives (`###`) in large stretches of simple (childless) features (see #68).
+- Implemented finer control of how and when output separator directives (`###`) are printed to GFF3 output (see #68, #75).
 - Reorganized test suite code and data (see #70).
 - Refactored the `Feature` and `Score` APIs with more sane default and static constructors (see #74).
 
 
-
 ## [0.3.3] - 2017-06-21
 ### Fixed
 - Missing `extent` query from the index implementation.

diff --git a/tag/bae.py b/tag/bae.py
@@ -26,21 +26,26 @@ def eval_locus(features):
     intervals = defaultdict(int)
     coverage = defaultdict(int)
 
-    numorfs = len(list(tag.select.features(features, type='CDS')))
-
-    for feature in features:
+    numorfs = len(list(
+        tag.select.features(features, type='CDS', traverse=True)
+    ))
+    selector = tag.select.features(
+        features, type=set(['CDS', 'translated_nucleotide_match']),
+        traverse=True,
+    )
+    for feature in selector:
         starts[feature.start] += 1
         ends[feature.end] += 1
         intervals[feature.range] += 1
 
     aligns = tag.select.features(features, type='translated_nucleotide_match')
     ranges = [a.range for a in aligns]
     for block in tag.Range.merge_overlapping(ranges):
-        for feature in tag.select.features(features, type='CDS'):
-            bpoverlap = feature.range.overlap_extent(block)
-            coverage[feature] += bpoverlap
+        for feat in tag.select.features(features, type='CDS', traverse=True):
+            bpoverlap = feat.range.overlap_extent(block)
+            coverage[feat] += bpoverlap
 
-    for feature in tag.select.features(features, type='CDS'):
+    for feature in tag.select.features(features, type='CDS', traverse=True):
         start_confirmed = starts[feature.start] - 1
         start_shared = starts[feature.start] / sum(starts.values())
         end_confirmed = ends[feature.end] - 1

diff --git a/tag/cli/bae.py b/tag/cli/bae.py
@@ -53,4 +53,5 @@ def main(args):
     )
     evalstream = tag.bae.eval_stream(locusstream)
     writer = tag.writer.GFF3Writer(evalstream, args.out)
+    writer.complex_separators = False
     writer.write()
diff --git a/tag/cli/locuspocus.py b/tag/cli/locuspocus.py
@@ -58,4 +58,5 @@ def main(args):
         minperc=args.min_perc
     )
     writer = tag.writer.GFF3Writer(locusstream, args.out)
+    writer.complex_separators = False
     writer.write()
diff --git a/tag/feature.py b/tag/feature.py
@@ -103,8 +103,6 @@ def __repr__(self):
             if string != '':
                 string += '\n'
             string += str(feature)
-        if self.is_complex:
-            string += '\n###'
         return string
 
     def __len__(self):

diff --git a/tag/select.py b/tag/select.py
@@ -23,19 +23,24 @@ def features(entrystream, type=None, traverse=False):
                      to :code:`True` to search each feature graph for the
                      specified feature type
     """
+    def _typecheck(feat):
+        if isinstance(type, str):
+            return feat.type == type
+        return feat.type in type
+
     for feature in entry_type_filter(entrystream, tag.Feature):
         if traverse:
             if type is None:
                 message = 'cannot traverse without a specific feature type'
                 raise ValueError(message)
-            if type == feature.type:
+            if _typecheck(feature):
                 yield feature
             else:
                 for subfeature in feature:
-                    if type == subfeature.type:
+                    if _typecheck(subfeature):
                         yield subfeature
         else:
-            if not type or type == feature.type:
+            if not type or _typecheck(feature):
                 yield feature
 
 

diff --git a/tag/tests/data/amel-cdna-multi-out.gff3 b/tag/tests/data/amel-cdna-multi-out.gff3
@@ -5,4 +5,3 @@ NC_007070.3	RefSeq	cDNA_match	318980	319128	.	+	.	ID=cDNA_match1;Target=XM_01691
 NC_007070.3	RefSeq	cDNA_match	320138	320990	.	+	.	ID=cDNA_match1;Target=XM_016914591.1 1434 2286 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
 NC_007070.3	RefSeq	cDNA_match	323501	324025	.	+	.	ID=cDNA_match1;Target=XM_016914591.1 2287 2811 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
 NC_007070.3	RefSeq	cDNA_match	325091	325784	.	+	.	ID=cDNA_match1;Gap=M255 I1 M439;Target=XM_016914591.1 2812 3506 +;for_remapping=2;gap_count=1;num_ident=3505;num_mismatch=0;pct_coverage=99.9715;pct_coverage_hiqual=99.9715;pct_identity_gap=99.9715;pct_identity_ungap=100;rank=1
-###
diff --git a/tag/tests/data/eden-mod.gff3 b/tag/tests/data/eden-mod.gff3
@@ -21,4 +21,3 @@ ctg123	.	CDS	5000	5500	.	+	1	ID=cds00004;Parent=mRNA3;Name=edenprotein.4
 ctg123	.	CDS	7000	7600	.	+	1	ID=cds00003;Parent=mRNA3;Name=edenprotein.3
 ctg123	.	CDS	7000	7600	.	+	1	ID=cds00004;Parent=mRNA3;Name=edenprotein.4
 ctg123	.	exon	7000	9000	.	+	.	ID=exon00005;Parent=mRNA3
-###
diff --git a/tag/tests/data/grape-cpgat-no-sep.gff3 b/tag/tests/data/grape-cpgat-no-sep.gff3
@@ -0,0 +1,30 @@
+##gff-version 3
+##sequence-region   chr8 1 100000
+chr8	CpGAT	gene	72	5081	.	-	.	ID=gene1;Name=chr8.g1
+chr8	CpGAT	mRNA	72	5081	.	-	.	ID=mRNA1;Parent=gene1;Name=chr8.g1.t1;index=1
+chr8	CpGAT	exon	72	167	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	72	167	.	-	0	ID=CDS1;Parent=mRNA1
+chr8	AEGeAn::CanonGFF3	intron	168	348	.	-	.	Parent=mRNA1
+chr8	CpGAT	exon	349	522	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	349	522	.	-	0	ID=CDS1;Parent=mRNA1
+chr8	AEGeAn::CanonGFF3	intron	523	610	.	-	.	Parent=mRNA1
+chr8	CpGAT	exon	611	702	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	611	702	.	-	2	ID=CDS1;Parent=mRNA1
+chr8	AEGeAn::CanonGFF3	intron	703	4915	.	-	.	Parent=mRNA1
+chr8	CpGAT	exon	4916	5081	.	-	1	Parent=mRNA1
+chr8	CpGAT	CDS	4916	5081	.	-	0	ID=CDS1;Parent=mRNA1
+chr8	CpGAT	gene	10538	11678	.	-	.	ID=gene2;Name=chr8.g2
+chr8	CpGAT	mRNA	10538	11678	.	-	.	ID=mRNA2;Parent=gene2;Name=chr8.g2.t1;index=1
+chr8	CpGAT	three_prime_UTR	10538	10746	.	-	.	Parent=mRNA2
+chr8	CpGAT	exon	10538	11678	.	-	1	Parent=mRNA2
+chr8	CpGAT	CDS	10747	11577	.	-	0	Parent=mRNA2
+chr8	CpGAT	five_prime_UTR	11578	11678	.	-	.	Parent=mRNA2
+chr8	CpGAT	gene	22053	23448	.	+	.	ID=gene3;Name=chr8.g3
+chr8	CpGAT	mRNA	22053	23448	.	+	.	ID=mRNA3;Parent=gene3;Name=chr8.g3.t1;index=1
+chr8	CpGAT	five_prime_UTR	22053	22166	.	+	.	Parent=mRNA3
+chr8	CpGAT	exon	22053	22550	.	+	1	Parent=mRNA3
+chr8	CpGAT	CDS	22167	22550	.	+	0	ID=CDS2;Parent=mRNA3
+chr8	AEGeAn::CanonGFF3	intron	22551	22650	.	+	.	Parent=mRNA3
+chr8	CpGAT	CDS	22651	23022	.	+	0	ID=CDS2;Parent=mRNA3
+chr8	CpGAT	exon	22651	23448	.	+	1	Parent=mRNA3
+chr8	CpGAT	three_prime_UTR	23023	23448	.	+	.	Parent=mRNA3
diff --git a/tag/tests/test_reader.py b/tag/tests/test_reader.py
@@ -20,7 +20,9 @@ def test_grape():
         reader = GFF3Reader(instream=infile)
         output = ''
         for record in reader:
-            output += '%r\n' % record
+            output += repr(record) + '\n'
+            if isinstance(record, Feature) and record.is_complex:
+                output += '###\n'
         assert output == open(data_file('grape-cpgat-sorted.gff3')).read()
 
 

diff --git a/tag/tests/test_writer.py b/tag/tests/test_writer.py
@@ -43,6 +43,18 @@ def test_write_file():
     assert obs_out.strip() == exp_out.strip()
 
 
+def test_no_complex_separators():
+    reader = GFF3Reader(infilename=data_file('grape-cpgat.gff3'))
+    with NamedTemporaryFile(suffix='.gff3', mode='w+t') as outfile:
+        writer = GFF3Writer(reader, outfile)
+        writer.complex_separators = False
+        writer.write()
+        outfile.seek(0)
+        obs_out = outfile.read()
+    exp_out = data_stream('grape-cpgat-no-sep.gff3').read()
+    assert obs_out.strip() == exp_out.strip()
+
+
 @pytest.mark.parametrize('gff3', [
     'minimus.gff3',
     'prokka.gff3',

diff --git a/tag/writer.py b/tag/writer.py
@@ -41,6 +41,7 @@ def __init__(self, instream, outfile='-'):
         else:
             self.outfile = outfile
         self.retainids = False
+        self.complex_separators = True
         self.feature_counts = defaultdict(int)
         self._seq_written = False
         self._block_count = 0
@@ -83,12 +84,15 @@ def write(self, blockitvl=0):
                         feature.add_attribute('ID', fid)
                     else:
                         feature.drop_attribute('ID')
-                    if entry.is_complex:
-                        self._block_count = 0
-                    else:
-                        self._block_count += 1
             if isinstance(entry, Sequence) and not self._seq_written:
                 print('##FASTA', file=self.outfile)
                 self._seq_written = True
             print(repr(entry), file=self.outfile)
+            if isinstance(entry, Feature):
+                if entry.is_complex:
+                    self._block_count = 0
+                    if self.complex_separators:
+                        print('###', file=self.outfile)
+                else:
+                    self._block_count += 1
             self._write_separator(blockitvl)