In [1]:
from collections import defaultdict
import warnings
import logging
import gffutils
import pybedtools
import pandas as pd
from copy import deepcopy
import re

In [2]:
# GENCODE v25 annotation (GTF) for hg38; the source file for the gffutils database.
gencode_v25 = '/home/cmb-06/as/wenzhenl/genomes/hg38/annotation/gencode.v25.annotation.gtf'
# gffutils database file for that annotation; opened with gffutils.FeatureDB.
gencode_v25_db = '/home/cmb-06/as/wenzhenl/genomes/hg38/annotation/gencode.v25.annotation.gtf.db'
# Prefix for the BED files written by this notebook ('<prefix>.UTR3.bed',
# '<prefix>.UTR5.bed', '<prefix>.cds.bed').
# NOTE: the prefix ends in '_', so the resulting names look like 'hg38_.UTR3.bed'.
prefix = '/staging/as/wenzhenl/hg38_'

In [3]:
# One-off step, kept for reference: build the gffutils database from the GTF.
# NOTE(review): presumably run once and then disabled, since the notebook opens
# the existing database file instead -- uncomment to rebuild (force=True
# overwrites the file at gencode_v25_db).
#db = gffutils.create_db(gencode_v25, dbfn=gencode_v25_db, force=True,
#                        merge_strategy='merge',
#                        disable_infer_genes=True, disable_infer_transcripts=True)

In [4]:
# Open the gffutils database stored at gencode_v25_db. The feature queries
# (features_of_type) and the merge in create_bed all go through this handle.
db = gffutils.FeatureDB(gencode_v25_db, keep_order=True)

In [5]:
# gene_id -> list of features for that gene (CDS and UTR respectively).
# Two independent mappings; missing genes start out with an empty list.
all_cds, all_utrs = defaultdict(list), defaultdict(list)

In [6]:
# Group every CDS feature under its gene_id; the database yields them ordered
# by start, so each per-gene list is filled in that order.
for cds in db.features_of_type('CDS', order_by='start'):
    cds_gene_ids = cds['gene_id']
    # Each feature is expected to carry exactly one gene_id value.
    assert len(cds_gene_ids) == 1
    all_cds[cds_gene_ids[0]].append(cds)

In [7]:
# Group every UTR feature under its gene_id; the database yields them ordered
# by start, so each per-gene list is filled in that order.
for utr in db.features_of_type('UTR', order_by='start'):
    utr_gene_ids = utr['gene_id']
    # Each feature is expected to carry exactly one gene_id value.
    assert len(utr_gene_ids) == 1
    all_utrs[utr_gene_ids[0]].append(utr)

In [8]:
# gene_id -> list of (trimmed) UTR features classified as 3' and 5' respectively.
all_utr3, all_utr5 = defaultdict(list), defaultdict(list)

In [9]:
# Split each gene's UTRs into 5' and 3' by where they lie relative to the
# gene's overall CDS span (smallest CDS start .. largest CDS stop).
#   * UTR starting before the span  -> 5' on '+', 3' on any other strand
#   * UTR ending after the span     -> 3' on '+', 5' on any other strand
#   * UTR entirely inside the span  -> not recorded
# Any part of a UTR that overlaps the span is trimmed off; the trimming is done
# on a copy so the features stored in all_utrs stay untouched.
for gene, gene_cds in all_cds.items():
    cds_span_start = min(feature.start for feature in gene_cds)
    cds_span_stop = max(feature.stop for feature in gene_cds)

    # .get() rather than indexing: all_utrs is a defaultdict and must not grow
    # an empty entry for genes that have no UTRs.
    for source_utr in all_utrs.get(gene, ()):
        utr = deepcopy(source_utr)
        if utr.strand == '+':
            before_cds, after_cds = all_utr5, all_utr3
        else:
            before_cds, after_cds = all_utr3, all_utr5

        if utr.start < cds_span_start:
            # Clip the right edge so the UTR ends just before the CDS span.
            if utr.stop >= cds_span_start:
                utr.stop = cds_span_start - 1
            before_cds[gene].append(utr)
        elif utr.stop > cds_span_stop:
            # Clip the left edge so the UTR begins just after the CDS span.
            if utr.start <= cds_span_stop:
                utr.start = cds_span_stop + 1
            after_cds[gene].append(utr)

In [10]:
def create_bed(region_dict):
    bed = ""
    for gene, regions in sorted(region_dict.items(), key=lambda x: x[0]):
        if regions:
            regions = list(db.merge(regions))
            regions.sort(key=lambda x: x.start)
            for region in regions:
                bed += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom, region.start-1, region.stop, re.sub('\.\d+', '', gene), '.', region.strand)
    return bed

In [11]:
# 3' UTRs: render as BED text, load into pybedtools, then filter invalid
# records, sort, and save under the configured prefix.
utr3_bed = create_bed(all_utr3)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
utr3_path = '{}.UTR3.bed'.format(prefix)
utr3_bedtool.remove_invalid().sort().saveas(utr3_path)

<BedTool(/staging/as/wenzhenl/hg38_.UTR3.bed)>

In [12]:
# 5' UTRs: render as BED text, load into pybedtools, then filter invalid
# records, sort, and save under the configured prefix.
utr5_bed = create_bed(all_utr5)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr5_path = '{}.UTR5.bed'.format(prefix)
utr5_bedtool.remove_invalid().sort().saveas(utr5_path)

<BedTool(/staging/as/wenzhenl/hg38_.UTR5.bed)>

In [13]:
# CDS regions: render as BED text, load into pybedtools, then filter invalid
# records, sort, and save under the configured prefix.
cds_bed = create_bed(all_cds)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)
cds_path = '{}.cds.bed'.format(prefix)
cds_bedtool.remove_invalid().sort().saveas(cds_path)

<BedTool(/staging/as/wenzhenl/hg38_.cds.bed)>