Add gff2gff.py script contributed by @flashton2003 in #530

samtools · Sep 7, 2020 · d1b74cd · d1b74cd · flashton2003 · Sep 7, 2020
1 parent ebf8aa3
commit d1b74cd
Showing 1 changed file with 172 additions and 0 deletions.
diff --git a/misc/gff2gff.py b/misc/gff2gff.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+#
+# This is script was contributed by Philip Ashton (https://github.com/flashton2003)
+# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format
+# supported by bcftools csq. Please expand as necessary and send a pull request.
+# 
+# See also
+#   https://github.com/samtools/bcftools/issues/530#issuecomment-268278248
+#   https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831
+#   https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373
+#
+import sys
+import gffutils
+
+class PseudoFeature():
+    def __init__(self, chrom, start, stop, strand):
+        self.chrom = chrom
+        self.start = start
+        self.stop = stop
+        self.strand = strand
+
+
+class FeatureGroup():
+    def __init__(self, gene_id):
+        ## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603
+        self.id = gene_id
+        ## the below should be lists of instances of Feature class.
+        self.gene = None # one feature group should have a single gene
+        self.transcript = None
+        ## assuming here that ncRNA is more like transcript than exon
+        ## i.e. there is only one ncRNA feature per feature group.
+        self.ncRNA = None
+        self.exons = []
+        self.CDSs = []
+
+    def add_gene(self, feature):
+        self.gene = feature
+
+    def add_transcript(self, feature):
+        self.transcript = feature # one feature group should have a single gene
+
+    def add_exon(self, feature):
+        self.exons.append(feature)
+
+    def add_cds(self, feature):
+        self.CDSs.append(feature)
+
+    def add_ncRNA(self, feature):
+        self.ncRNA = feature
+
+
+def get_args():
+    if len(sys.argv) != 3:
+        print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>')
+        sys.exit()
+    else:
+        return sys.argv[1], sys.argv[2]
+
+def group_features(db):
+    feature_groups = {}
+    for feature in db.all_features():
+        if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'):
+            if feature.featuretype == 'gene':
+                ## for any particular feature group, the gene doesn't necassarily come
+                ## before the CDS in the gff, so need to have the if/else logic
+                gene_id = feature.id.split('.')[0]
+                if gene_id not in feature_groups:
+                    feature_groups[gene_id] = FeatureGroup(gene_id)
+                    feature_groups[gene_id].add_gene(feature)
+                else:
+                    feature_groups[gene_id].add_gene(feature)
+            ## in this gff, the transcripts are referred to as mRNA
+            ## not all genes have mRNA records
+            elif feature.featuretype == 'mRNA':
+                gene_id = feature.id.split('.')[0]
+                # print(feature.id)
+                # print(gene_id)
+                feature_groups[gene_id].add_transcript(feature)
+            elif feature.featuretype == 'exon':
+                gene_id = feature.attributes['locus_tag'][0]
+                if gene_id not in feature_groups:
+                    feature_groups[gene_id] = FeatureGroup(gene_id)
+                    feature_groups[gene_id].add_exon(feature)
+                else:
+                    feature_groups[gene_id].add_exon(feature)
+
+            elif feature.featuretype == 'CDS':
+                gene_id = feature.attributes['locus_tag'][0]
+                if gene_id not in feature_groups:
+                    feature_groups[gene_id] = FeatureGroup(gene_id)
+                    feature_groups[gene_id].add_cds(feature)
+                else:
+                    feature_groups[gene_id].add_cds(feature)
+
+            elif feature.featuretype == 'ncRNA':
+                gene_id = feature.attributes['locus_tag'][0]
+                # print(gene_id)
+                if gene_id not in feature_groups:
+                    feature_groups[gene_id] = FeatureGroup(gene_id)
+                    feature_groups[gene_id].add_ncRNA(feature)
+                else:
+                    feature_groups[gene_id].add_ncRNA(feature)
+    return feature_groups
+
+def make_transcript_pseudofeature(CDSs):
+    ## takes list of CDSs, list len will often be 1, but will sometimes be 2
+    start = min([x.start for x in CDSs])
+    stop = max([x.stop for x in CDSs])
+    chrom = CDSs[0].chrom
+    ## check that both on the same strand
+    assert len(set([x.strand for x in CDSs])) == 1
+    strand = CDSs[0].strand
+    pf = PseudoFeature(chrom, start, stop, strand)
+    return pf
+
+
+def check_feature_groups(feature_groups):
+    ## checking that feature group has a gene, at least one CDS, and a transcript
+    ## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript
+    ## only checking that have transcript for everything
+    for gene_id in feature_groups:
+        ## don't want to analyse ncRNAs here, but need to include them up 
+        ## to this point so that we know that that gene is an ncRNA gene
+        if feature_groups[gene_id].ncRNA != None:
+            continue
+        assert feature_groups[gene_id].gene != None
+        assert len(feature_groups[gene_id].CDSs) > 0
+        if feature_groups[gene_id].transcript == None:
+            ## using pseudofeature because the gffutils feature says it's not really intended
+            ## for direct instantiation by users
+            feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs)
+
+def print_features(feature_groups):
+    for gene_id in feature_groups:
+        feature_group = feature_groups[gene_id]
+        if feature_group.ncRNA != None:
+            continue
+        print('###')
+        name = feature_group.gene.attributes['Name'][0]
+        gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}'
+        print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t')
+
+        transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}'
+        print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t')
+
+        cds_attributes =  f'Parent=transcript:{gene_id};Name={name}'
+
+        for c in feature_group.CDSs:
+            print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t')
+
+def main():
+    '''
+    Ensembl gff should have one gene and one transcript per "feature group"
+    Then, can have multiple CDS/exons
+    read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially)
+    multiple CDS/exon
+    Exons aren't printed, as not needed bt bcftools csq
+    '''
+    gff_handle, db_handle = get_args()
+    fn = gffutils.example_filename(gff_handle)
+    db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)
+    feature_groups = group_features(db)
+    check_feature_groups(feature_groups)
+    print_features(feature_groups)
+
+
+
+## gff input is generated from a genbank file by bp_genbank2gff3.pl
+
+if __name__ == '__main__':
+    main()
+