Skip to content

Commit

Permalink
Add gff2gff.py script contributed by @flashton2003 in #530
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Sep 7, 2020
1 parent ebf8aa3 commit d1b74cd
Showing 1 changed file with 172 additions and 0 deletions.
172 changes: 172 additions & 0 deletions misc/gff2gff.py
@@ -0,0 +1,172 @@
#!/usr/bin/env python
#
# This is script was contributed by Philip Ashton (https://github.com/flashton2003)
# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format
# supported by bcftools csq. Please expand as necessary and send a pull request.
#
# See also
# https://github.com/samtools/bcftools/issues/530#issuecomment-268278248
# https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831
# https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373
#
import sys
import gffutils

class PseudoFeature():
def __init__(self, chrom, start, stop, strand):
self.chrom = chrom
self.start = start
self.stop = stop
self.strand = strand


class FeatureGroup():
def __init__(self, gene_id):
## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603
self.id = gene_id
## the below should be lists of instances of Feature class.
self.gene = None # one feature group should have a single gene
self.transcript = None
## assuming here that ncRNA is more like transcript than exon
## i.e. there is only one ncRNA feature per feature group.
self.ncRNA = None
self.exons = []
self.CDSs = []

def add_gene(self, feature):
self.gene = feature

def add_transcript(self, feature):
self.transcript = feature # one feature group should have a single gene

def add_exon(self, feature):
self.exons.append(feature)

def add_cds(self, feature):
self.CDSs.append(feature)

def add_ncRNA(self, feature):
self.ncRNA = feature


def get_args():
if len(sys.argv) != 3:
print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>')
sys.exit()
else:
return sys.argv[1], sys.argv[2]

def group_features(db):
feature_groups = {}
for feature in db.all_features():
if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'):
if feature.featuretype == 'gene':
## for any particular feature group, the gene doesn't necassarily come
## before the CDS in the gff, so need to have the if/else logic
gene_id = feature.id.split('.')[0]
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_gene(feature)
else:
feature_groups[gene_id].add_gene(feature)
## in this gff, the transcripts are referred to as mRNA
## not all genes have mRNA records
elif feature.featuretype == 'mRNA':
gene_id = feature.id.split('.')[0]
# print(feature.id)
# print(gene_id)
feature_groups[gene_id].add_transcript(feature)
elif feature.featuretype == 'exon':
gene_id = feature.attributes['locus_tag'][0]
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_exon(feature)
else:
feature_groups[gene_id].add_exon(feature)

elif feature.featuretype == 'CDS':
gene_id = feature.attributes['locus_tag'][0]
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_cds(feature)
else:
feature_groups[gene_id].add_cds(feature)

elif feature.featuretype == 'ncRNA':
gene_id = feature.attributes['locus_tag'][0]
# print(gene_id)
if gene_id not in feature_groups:
feature_groups[gene_id] = FeatureGroup(gene_id)
feature_groups[gene_id].add_ncRNA(feature)
else:
feature_groups[gene_id].add_ncRNA(feature)
return feature_groups

def make_transcript_pseudofeature(CDSs):
## takes list of CDSs, list len will often be 1, but will sometimes be 2
start = min([x.start for x in CDSs])
stop = max([x.stop for x in CDSs])
chrom = CDSs[0].chrom
## check that both on the same strand
assert len(set([x.strand for x in CDSs])) == 1
strand = CDSs[0].strand
pf = PseudoFeature(chrom, start, stop, strand)
return pf


def check_feature_groups(feature_groups):
## checking that feature group has a gene, at least one CDS, and a transcript
## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript
## only checking that have transcript for everything
for gene_id in feature_groups:
## don't want to analyse ncRNAs here, but need to include them up
## to this point so that we know that that gene is an ncRNA gene
if feature_groups[gene_id].ncRNA != None:
continue
assert feature_groups[gene_id].gene != None
assert len(feature_groups[gene_id].CDSs) > 0
if feature_groups[gene_id].transcript == None:
## using pseudofeature because the gffutils feature says it's not really intended
## for direct instantiation by users
feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs)

def print_features(feature_groups):
for gene_id in feature_groups:
feature_group = feature_groups[gene_id]
if feature_group.ncRNA != None:
continue
print('###')
name = feature_group.gene.attributes['Name'][0]
gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}'
print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t')

transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}'
print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t')

cds_attributes = f'Parent=transcript:{gene_id};Name={name}'

for c in feature_group.CDSs:
print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t')

def main():
'''
Ensembl gff should have one gene and one transcript per "feature group"
Then, can have multiple CDS/exons
read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially)
multiple CDS/exon
Exons aren't printed, as not needed bt bcftools csq
'''
gff_handle, db_handle = get_args()
fn = gffutils.example_filename(gff_handle)
db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True)
feature_groups = group_features(db)
check_feature_groups(feature_groups)
print_features(feature_groups)



## gff input is generated from a genbank file by bp_genbank2gff3.pl

if __name__ == '__main__':
main()

1 comment on commit d1b74cd

@flashton2003
Copy link

@flashton2003 flashton2003 commented on d1b74cd Sep 7, 2020 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.