Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add gff2gff.py script contributed by @flashton2003 in #530
- Loading branch information
Showing
1 changed file
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
#!/usr/bin/env python | ||
# | ||
# This is script was contributed by Philip Ashton (https://github.com/flashton2003) | ||
# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format | ||
# supported by bcftools csq. Please expand as necessary and send a pull request. | ||
# | ||
# See also | ||
# https://github.com/samtools/bcftools/issues/530#issuecomment-268278248 | ||
# https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831 | ||
# https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373 | ||
# | ||
import sys | ||
import gffutils | ||
|
||
class PseudoFeature(): | ||
def __init__(self, chrom, start, stop, strand): | ||
self.chrom = chrom | ||
self.start = start | ||
self.stop = stop | ||
self.strand = strand | ||
|
||
|
||
class FeatureGroup(): | ||
def __init__(self, gene_id): | ||
## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603 | ||
self.id = gene_id | ||
## the below should be lists of instances of Feature class. | ||
self.gene = None # one feature group should have a single gene | ||
self.transcript = None | ||
## assuming here that ncRNA is more like transcript than exon | ||
## i.e. there is only one ncRNA feature per feature group. | ||
self.ncRNA = None | ||
self.exons = [] | ||
self.CDSs = [] | ||
|
||
def add_gene(self, feature): | ||
self.gene = feature | ||
|
||
def add_transcript(self, feature): | ||
self.transcript = feature # one feature group should have a single gene | ||
|
||
def add_exon(self, feature): | ||
self.exons.append(feature) | ||
|
||
def add_cds(self, feature): | ||
self.CDSs.append(feature) | ||
|
||
def add_ncRNA(self, feature): | ||
self.ncRNA = feature | ||
|
||
|
||
def get_args(): | ||
if len(sys.argv) != 3: | ||
print('Usage: gff2gff.py <gff_inhandle> </path/where/you/want/to/save/gffutils_db>') | ||
sys.exit() | ||
else: | ||
return sys.argv[1], sys.argv[2] | ||
|
||
def group_features(db): | ||
feature_groups = {} | ||
for feature in db.all_features(): | ||
if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'): | ||
if feature.featuretype == 'gene': | ||
## for any particular feature group, the gene doesn't necassarily come | ||
## before the CDS in the gff, so need to have the if/else logic | ||
gene_id = feature.id.split('.')[0] | ||
if gene_id not in feature_groups: | ||
feature_groups[gene_id] = FeatureGroup(gene_id) | ||
feature_groups[gene_id].add_gene(feature) | ||
else: | ||
feature_groups[gene_id].add_gene(feature) | ||
## in this gff, the transcripts are referred to as mRNA | ||
## not all genes have mRNA records | ||
elif feature.featuretype == 'mRNA': | ||
gene_id = feature.id.split('.')[0] | ||
# print(feature.id) | ||
# print(gene_id) | ||
feature_groups[gene_id].add_transcript(feature) | ||
elif feature.featuretype == 'exon': | ||
gene_id = feature.attributes['locus_tag'][0] | ||
if gene_id not in feature_groups: | ||
feature_groups[gene_id] = FeatureGroup(gene_id) | ||
feature_groups[gene_id].add_exon(feature) | ||
else: | ||
feature_groups[gene_id].add_exon(feature) | ||
|
||
elif feature.featuretype == 'CDS': | ||
gene_id = feature.attributes['locus_tag'][0] | ||
if gene_id not in feature_groups: | ||
feature_groups[gene_id] = FeatureGroup(gene_id) | ||
feature_groups[gene_id].add_cds(feature) | ||
else: | ||
feature_groups[gene_id].add_cds(feature) | ||
|
||
elif feature.featuretype == 'ncRNA': | ||
gene_id = feature.attributes['locus_tag'][0] | ||
# print(gene_id) | ||
if gene_id not in feature_groups: | ||
feature_groups[gene_id] = FeatureGroup(gene_id) | ||
feature_groups[gene_id].add_ncRNA(feature) | ||
else: | ||
feature_groups[gene_id].add_ncRNA(feature) | ||
return feature_groups | ||
|
||
def make_transcript_pseudofeature(CDSs): | ||
## takes list of CDSs, list len will often be 1, but will sometimes be 2 | ||
start = min([x.start for x in CDSs]) | ||
stop = max([x.stop for x in CDSs]) | ||
chrom = CDSs[0].chrom | ||
## check that both on the same strand | ||
assert len(set([x.strand for x in CDSs])) == 1 | ||
strand = CDSs[0].strand | ||
pf = PseudoFeature(chrom, start, stop, strand) | ||
return pf | ||
|
||
|
||
def check_feature_groups(feature_groups): | ||
## checking that feature group has a gene, at least one CDS, and a transcript | ||
## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript | ||
## only checking that have transcript for everything | ||
for gene_id in feature_groups: | ||
## don't want to analyse ncRNAs here, but need to include them up | ||
## to this point so that we know that that gene is an ncRNA gene | ||
if feature_groups[gene_id].ncRNA != None: | ||
continue | ||
assert feature_groups[gene_id].gene != None | ||
assert len(feature_groups[gene_id].CDSs) > 0 | ||
if feature_groups[gene_id].transcript == None: | ||
## using pseudofeature because the gffutils feature says it's not really intended | ||
## for direct instantiation by users | ||
feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs) | ||
|
||
def print_features(feature_groups): | ||
for gene_id in feature_groups: | ||
feature_group = feature_groups[gene_id] | ||
if feature_group.ncRNA != None: | ||
continue | ||
print('###') | ||
name = feature_group.gene.attributes['Name'][0] | ||
gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}' | ||
print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t') | ||
|
||
transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}' | ||
print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t') | ||
|
||
cds_attributes = f'Parent=transcript:{gene_id};Name={name}' | ||
|
||
for c in feature_group.CDSs: | ||
print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t') | ||
|
||
def main(): | ||
''' | ||
Ensembl gff should have one gene and one transcript per "feature group" | ||
Then, can have multiple CDS/exons | ||
read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially) | ||
multiple CDS/exon | ||
Exons aren't printed, as not needed bt bcftools csq | ||
''' | ||
gff_handle, db_handle = get_args() | ||
fn = gffutils.example_filename(gff_handle) | ||
db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True) | ||
feature_groups = group_features(db) | ||
check_feature_groups(feature_groups) | ||
print_features(feature_groups) | ||
|
||
|
||
|
||
## gff input is generated from a genbank file by bp_genbank2gff3.pl | ||
|
||
if __name__ == '__main__': | ||
main() | ||
|
d1b74cd
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.