Take 3'UTR coordinates exported from the UCSC Table Browser (hgTables), see
https://groups.google.com/a/soe.ucsc.edu/g/genome/c/3R59gKDrKu8

* add HGNC symbol (table from Biomart)
* remove irrelevant contigs (e.g. decoy sequences)
* keep only contiguous UTR regions (transcripts whose 3'UTR is split across several BED records are dropped entirely)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Notebook-wide matplotlib defaults: font sizes for tick labels, axis titles
# and axis labels.
mpl.rcParams.update({
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'axes.titlesize': 16,
    'axes.labelsize': 18,
})

In [2]:
# Project root for all input and output files. Paths are built below by plain
# string concatenation (data_dir + 'UTR_coords/...'), so the trailing slash
# is required.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

In [3]:
# 3'UTR intervals in BED layout. The file is read without a header row, so the
# six column names are supplied here.
all_utr_df = pd.read_csv(
    data_dir + 'UTR_coords/GRCh38_3_prime_UTR.bed.gz',
    sep='\t',
    names=['chrom', 'start', 'stop', 'utr_name', 'score', 'strand'],
)

# Table matching Ensembl transcripts to HGNC gene names. The file's own header
# line is skipped and only columns 1-3 are loaded (column 0 is ignored).
gene_annot_df = pd.read_csv(
    data_dir + 'UTR_coords/GRCh38_EnsembleCanonical_HGNC.tsv.gz',
    sep='\t',
    skiprows=1,
    header=None,
    usecols=[1, 2, 3],
    names=['transcript_id', 'canonical', 'HGNC_symbol'],
)

# The transcript ID is the part of the UTR name before the first '.'.
all_utr_df['transcript_id'] = all_utr_df['utr_name'].map(
    lambda utr_name: utr_name.split('.')[0]
)

# Inner join on the only shared column, so UTRs whose transcript is missing
# from the annotation table are dropped here.
df = pd.merge(all_utr_df, gene_annot_df, on='transcript_id')

# Keep only Ensembl canonical transcripts that have an HGNC symbol.
df = df.loc[df['canonical'].eq(1) & df['HGNC_symbol'].notna()]

# Exclude decoy sequences etc.: any contig whose name contains an underscore.
df = df.loc[~df['chrom'].str.contains('_')]

# Keep only UTRs that are not split: keep=False removes *every* row of a
# transcript that occurs more than once, not just the extra occurrences.
df = df.drop_duplicates(subset=['transcript_id'], keep=False)

In [70]:
# Save the cleaned table as tab-separated text with neither a header line nor
# the index column; every column currently in df is written.
df.to_csv(
    data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean.bed',
    sep='\t',
    index=False,
    header=False,
)