From d1b74cda228f3964f55db80f4b1f655c6eea36ec Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 7 Sep 2020 18:00:00 +0200 Subject: [PATCH] Add gff2gff.py script contributed by @flashton2003 in #530 --- misc/gff2gff.py | 172 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 misc/gff2gff.py diff --git a/misc/gff2gff.py b/misc/gff2gff.py new file mode 100644 index 000000000..208403fe3 --- /dev/null +++ b/misc/gff2gff.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# +# This is script was contributed by Philip Ashton (https://github.com/flashton2003) +# with the intention to convert from the many GFF flavours to the Ensembl GFF3 format +# supported by bcftools csq. Please expand as necessary and send a pull request. +# +# See also +# https://github.com/samtools/bcftools/issues/530#issuecomment-268278248 +# https://github.com/samtools/bcftools/issues/1078#issuecomment-527484831 +# https://github.com/samtools/bcftools/issues/1208#issuecomment-620642373 +# +import sys +import gffutils + +class PseudoFeature(): + def __init__(self, chrom, start, stop, strand): + self.chrom = chrom + self.start = start + self.stop = stop + self.strand = strand + + +class FeatureGroup(): + def __init__(self, gene_id): + ## see here https://github.com/samtools/bcftools/issues/530#issuecomment-614410603 + self.id = gene_id + ## the below should be lists of instances of Feature class. + self.gene = None # one feature group should have a single gene + self.transcript = None + ## assuming here that ncRNA is more like transcript than exon + ## i.e. there is only one ncRNA feature per feature group. + self.ncRNA = None + self.exons = [] + self.CDSs = [] + + def add_gene(self, feature): + self.gene = feature + + def add_transcript(self, feature): + self.transcript = feature # one feature group should have a single gene + + def add_exon(self, feature): + self.exons.append(feature) + + def add_cds(self, feature): + self.CDSs.append(feature) + + def add_ncRNA(self, feature): + self.ncRNA = feature + + +def get_args(): + if len(sys.argv) != 3: + print('Usage: gff2gff.py ') + sys.exit() + else: + return sys.argv[1], sys.argv[2] + +def group_features(db): + feature_groups = {} + for feature in db.all_features(): + if feature.featuretype not in ('repeat_region', 'regulatory', 'stem_loop', 'gene_component_region', 'repeat_region'): + if feature.featuretype == 'gene': + ## for any particular feature group, the gene doesn't necassarily come + ## before the CDS in the gff, so need to have the if/else logic + gene_id = feature.id.split('.')[0] + if gene_id not in feature_groups: + feature_groups[gene_id] = FeatureGroup(gene_id) + feature_groups[gene_id].add_gene(feature) + else: + feature_groups[gene_id].add_gene(feature) + ## in this gff, the transcripts are referred to as mRNA + ## not all genes have mRNA records + elif feature.featuretype == 'mRNA': + gene_id = feature.id.split('.')[0] + # print(feature.id) + # print(gene_id) + feature_groups[gene_id].add_transcript(feature) + elif feature.featuretype == 'exon': + gene_id = feature.attributes['locus_tag'][0] + if gene_id not in feature_groups: + feature_groups[gene_id] = FeatureGroup(gene_id) + feature_groups[gene_id].add_exon(feature) + else: + feature_groups[gene_id].add_exon(feature) + + elif feature.featuretype == 'CDS': + gene_id = feature.attributes['locus_tag'][0] + if gene_id not in feature_groups: + feature_groups[gene_id] = FeatureGroup(gene_id) + feature_groups[gene_id].add_cds(feature) + else: + feature_groups[gene_id].add_cds(feature) + + elif feature.featuretype == 'ncRNA': + gene_id = feature.attributes['locus_tag'][0] + # print(gene_id) + if gene_id not in feature_groups: + feature_groups[gene_id] = FeatureGroup(gene_id) + feature_groups[gene_id].add_ncRNA(feature) + else: + feature_groups[gene_id].add_ncRNA(feature) + return feature_groups + +def make_transcript_pseudofeature(CDSs): + ## takes list of CDSs, list len will often be 1, but will sometimes be 2 + start = min([x.start for x in CDSs]) + stop = max([x.stop for x in CDSs]) + chrom = CDSs[0].chrom + ## check that both on the same strand + assert len(set([x.strand for x in CDSs])) == 1 + strand = CDSs[0].strand + pf = PseudoFeature(chrom, start, stop, strand) + return pf + + +def check_feature_groups(feature_groups): + ## checking that feature group has a gene, at least one CDS, and a transcript + ## if doesn't have a transcript (as many don't), then make a pseudo feature corresponding to a transcript + ## only checking that have transcript for everything + for gene_id in feature_groups: + ## don't want to analyse ncRNAs here, but need to include them up + ## to this point so that we know that that gene is an ncRNA gene + if feature_groups[gene_id].ncRNA != None: + continue + assert feature_groups[gene_id].gene != None + assert len(feature_groups[gene_id].CDSs) > 0 + if feature_groups[gene_id].transcript == None: + ## using pseudofeature because the gffutils feature says it's not really intended + ## for direct instantiation by users + feature_groups[gene_id].transcript = make_transcript_pseudofeature(feature_groups[gene_id].CDSs) + +def print_features(feature_groups): + for gene_id in feature_groups: + feature_group = feature_groups[gene_id] + if feature_group.ncRNA != None: + continue + print('###') + name = feature_group.gene.attributes['Name'][0] + gene_attributes = f'ID=gene:{gene_id};Name={name};biotype=protein_coding;gene_id:{gene_id}' + print(feature_group.gene.chrom, 'EMBL', 'gene', feature_group.gene.start, feature_group.gene.stop, '.', feature_group.gene.strand, '.', gene_attributes, sep = '\t') + + transcript_attributes = f'ID=transcript:{gene_id};Parent=gene:{gene_id};Name={name};biotype=protein_coding;transcript_id={gene_id}' + print(feature_group.transcript.chrom, 'EMBL', 'transcript', feature_group.transcript.start, feature_group.transcript.stop, '.', feature_group.transcript.strand, '.', transcript_attributes, sep = '\t') + + cds_attributes = f'Parent=transcript:{gene_id};Name={name}' + + for c in feature_group.CDSs: + print(c.chrom, 'EMBL', 'CDS', c.start, c.stop, '.', c.strand, '0', cds_attributes, sep = '\t') + +def main(): + ''' + Ensembl gff should have one gene and one transcript per "feature group" + Then, can have multiple CDS/exons + read in from the input gff, one FeatueGroup instance has one gene, one transcript and (potentially) + multiple CDS/exon + Exons aren't printed, as not needed bt bcftools csq + ''' + gff_handle, db_handle = get_args() + fn = gffutils.example_filename(gff_handle) + db = gffutils.create_db(fn, dbfn=db_handle, force=True, keep_order=True,merge_strategy='merge', sort_attribute_values=True) + feature_groups = group_features(db) + check_feature_groups(feature_groups) + print_features(feature_groups) + + + +## gff input is generated from a genbank file by bp_genbank2gff3.pl + +if __name__ == '__main__': + main() +