Get statistics about the 3'UTR sequences of Zoonomia species in Ensembl

In [1]:
import pandas as pd
import numpy as np

from urllib.request import urlopen
import gzip
import re

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import urllib

In [2]:
# Path to the species list file; its first column is loaded (lower-cased) into `zoonomia_species`.
species_list='/lustre/groups/epigenereg01/workspace/projects/vale/mlm/fasta/241_species.txt'

In [3]:
# Read the header-less species list and keep its first column, lower-cased,
# as a NumPy array of names.
species_table = pd.read_csv(species_list, header=None)
zoonomia_species = species_table.iloc[:, 0].str.lower().values

In [4]:
# Gene identifiers are taken from column index 8 of the tab-separated,
# header-less human 3'UTR BED file.
human_utr_bed = pd.read_csv(
    '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/UTR_coords/GRCh38_3_prime_UTR_clean.bed',
    sep='\t',
    header=None,
    usecols=[8],
)
genes = human_utr_bed.iloc[:, 0].values
len(genes)

18178

In [5]:
# Ensembl release number; it is interpolated into every FTP URL built in this notebook.
ensembl_release = 112

In [6]:
# Fetch and parse the GTF directory index of the selected Ensembl release.
# The response is closed as soon as it has been parsed, and the parser is named
# explicitly so the result does not depend on which optional parsers are installed.
with urllib.request.urlopen(f"https://ftp.ensembl.org/pub/release-{ensembl_release}/gtf/") as html:
    soup = BeautifulSoup(html, "html.parser")

In [7]:
# Directory entries in the index are flagged by an icon whose alt text is "[DIR]";
# the first link following such an icon holds the species folder name.
ensembl_species = [
    icon.find_next("a", href=True)['href'].rstrip('/')
    for icon in soup.find_all('img')
    if icon['alt'] == "[DIR]"
]

In [8]:
# Restrict the Ensembl species to those that also appear in the Zoonomia list.
ensembl_species = list(filter(lambda name: name in zoonomia_species, ensembl_species))
len(ensembl_species)

72

In [9]:
def get_gtf_url(species):
    """Return the URL of the release GTF file for one Ensembl species.

    The species' directory listing for ``ensembl_release`` is scanned and the first
    link whose target contains ``"<release>.gtf.gz"`` is used.

    Parameters
    ----------
    species : str
        Species directory name as it appears in the Ensembl GTF index.

    Returns
    -------
    str or None
        Full URL of the matching file, or ``None`` when no link matches.
    """
    listing_url = f"https://ftp.ensembl.org/pub/release-{ensembl_release}/gtf/{species}"
    # Close the HTTP response once the listing is parsed, and name the parser
    # explicitly so behaviour does not vary with the installed optional parsers.
    with urllib.request.urlopen(listing_url) as html:
        soup = BeautifulSoup(html, "html.parser")
    wanted = str(ensembl_release) + '.gtf.gz'
    for a in soup.find_all("a", href=True):
        href = a['href']
        if wanted in href:
            return f'{listing_url}/{href}'
    return None

In [10]:
def get_utrs_species(gtf_url, species=None):
    """Stream a gzipped GTF file and collect canonical 3'UTR records for known genes.

    A line is kept when its feature column is ``three_prime_utr``, it carries the
    ``Ensembl_canonical`` tag, and its ``gene_name`` attribute is one of the
    notebook-level ``genes``.  One tuple is produced per matching line, so a
    transcript annotated with several ``three_prime_utr`` lines yields several tuples.

    Parameters
    ----------
    gtf_url : str
        URL of a ``.gtf.gz`` file.
    species : str, optional
        Label stored in the first field of every returned tuple.  When omitted it is
        taken from the URL, assuming the ``.../<species>/<file>`` layout produced by
        ``get_gtf_url``; this replaces the former reliance on a global ``species``.

    Returns
    -------
    list of tuple
        ``(species, contig, strand, gene_name, start, end, length)`` with ``start``
        0-based inclusive and ``end`` 0-based exclusive.

    Raises
    ------
    ValueError
        If ``gtf_url`` is ``None`` (e.g. no GTF link was found for the species).
    """
    if gtf_url is None:
        raise ValueError("gtf_url is None: no GTF file URL was provided")
    if species is None:
        # The species directory is the path component right before the file name.
        species = gtf_url.rstrip('/').split('/')[-2]

    # Build the lookup once: membership tests on a set are constant time, whereas
    # `in` on the original array scans it for every candidate line.
    gene_set = set(genes)
    name_pattern = re.compile('gene_name "([^;]*)";')

    utrs = []
    # Both the HTTP stream and the gzip wrapper are closed when the block exits.
    with urlopen(gtf_url) as streamed_file, gzip.GzipFile(fileobj=streamed_file) as f_in:
        for raw_line in f_in:
            line = raw_line.decode()
            # Cheap substring pre-filters before any parsing.
            if 'three_prime_utr' not in line or "Ensembl_canonical" not in line:
                continue
            name_match = name_pattern.search(line)
            if not name_match:
                continue
            gene_name = name_match.groups()[0]
            if gene_name not in gene_set:
                continue
            contig, source, region, start, end, _, strand, *_ = line.split()
            if region != 'three_prime_utr':
                # The text matched somewhere other than the feature column.
                continue
            start = int(start) - 1  # to 0-based, inclusive
            end = int(end)  # to 0-based, non-inclusive
            length = end - start
            utrs.append((species, contig, strand, gene_name, start, end, length))
    return utrs

In [11]:
# Build one table of 3'UTR records per species, then stack them into a single frame.
utr_columns = ['species','contig','strand','HGNC_Symbol','start','end','length']
utrs_df = []
# `species` is intentionally a plain notebook-level loop variable: code invoked from
# inside this loop may look it up by that name, so it must not become
# comprehension-local or be renamed.
for species in tqdm(ensembl_species):
    species_records = get_utrs_species(get_gtf_url(species))
    utrs_df.append(pd.DataFrame(species_records, columns=utr_columns))

ensembl_utr_df = pd.concat(utrs_df)

  0%|          | 0/72 [00:00<?, ?it/s]

In [12]:
# UTR coordinates for Zoonomia species, detected based on stop codon position + human 3'UTR length
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

# Load the table, give the coordinate columns Zoonomia-specific names, add the
# interval length and lower-case the species names, all in one chain.
zoonomia_utr_df = (
    pd.read_csv(data_dir + 'UTR_coords/GRCh38_3_prime_UTR_all_species.tsv.gz', sep='\t')
    .rename(columns={'3_prime_UTR_start': 'zoonomia_start', '3_prime_UTR_end': 'zoonomia_end'})
    .assign(
        zoonomia_length=lambda table: table.zoonomia_end - table.zoonomia_start,
        species=lambda table: table.species.str.lower(),
    )
)

In [13]:
# Pair Ensembl records with Zoonomia records. No join keys are passed, so pandas
# performs an inner join on every column name the two tables share.
ensembl_subset = ensembl_utr_df[['HGNC_Symbol','species','contig','start','end','length']]
comparative_df = ensembl_subset.merge(zoonomia_utr_df)

In [14]:
# When `human_transcript_strand` equals `MAF_strand`, the start difference is the
# 5' shift and the end difference the 3' shift; otherwise the two roles are swapped.
same_strand = comparative_df['human_transcript_strand'] == comparative_df['MAF_strand']
start_shift = comparative_df['start'] - comparative_df['zoonomia_start']
end_shift = comparative_df['end'] - comparative_df['zoonomia_end']
comparative_df['5_end_shift'] = np.where(same_strand, start_shift, end_shift)
comparative_df['3_end_shift'] = np.where(same_strand, end_shift, start_shift)

In [16]:
# Summary statistics of both end shifts.
comparative_df.loc[:, ['5_end_shift', '3_end_shift']].describe()

Unnamed: 0,5_end_shift,3_end_shift
count,103463.0,103463.0
mean,-1012.846,-997.9633
std,207127.2,207150.8
min,-36374130.0,-36375630.0
25%,0.0,-322.0
50%,0.0,0.0
75%,0.0,326.0
max,30745560.0,30745560.0


In [49]:
# Fraction of pairs whose 5' end shift is exactly zero.
comparative_df['5_end_shift'].eq(0).mean()

0.9261281810888917

In [68]:
# Fraction of pairs where the Ensembl interval is longer than the Zoonomia interval.
zoonomia_span = comparative_df['zoonomia_end'] - comparative_df['zoonomia_start']
comparative_df['length'].gt(zoonomia_span).mean()

0.3770816620434358

In [69]:
def get_intersection(interval1, interval2):
    """Return the length of the overlap between two ``(start, end)`` intervals.

    Parameters
    ----------
    interval1, interval2 : sequence of two numbers
        ``[start, end]`` pairs on the same coordinate axis.

    Returns
    -------
    number
        ``min(ends) - max(starts)``, or ``0`` when the intervals do not overlap.
    """
    new_min = max(interval1[0], interval2[0])
    new_max = min(interval1[1], interval2[1])
    return max(new_max - new_min, 0)


def get_jaccard(interval1, interval2):
    """Return the Jaccard index (overlap length / union length) of two intervals.

    Parameters
    ----------
    interval1, interval2 : sequence of two numbers
        ``[start, end]`` pairs on the same coordinate axis.

    Returns
    -------
    float
        Intersection over union.  When both intervals have zero length the index is
        undefined and ``nan`` is returned instead of raising ``ZeroDivisionError``.
    """
    intersection = get_intersection(interval1, interval2)
    union = (interval2[1] - interval2[0]) + (interval1[1] - interval1[0]) - intersection
    if union == 0:
        # 0/0: undefined; NaN is skipped by pandas aggregations such as median().
        return float('nan')
    return intersection / union

In [70]:
# Overlap length between the Ensembl and Zoonomia intervals, computed pair by pair.
comparative_df['intersection'] = [
    get_intersection([ens_start, ens_end], [zoo_start, zoo_end])
    for ens_start, ens_end, zoo_start, zoo_end in zip(
        comparative_df['start'],
        comparative_df['end'],
        comparative_df['zoonomia_start'],
        comparative_df['zoonomia_end'],
    )
]

In [71]:
# Jaccard index between the Ensembl and Zoonomia intervals, computed pair by pair.
comparative_df['jaccard_idx'] = [
    get_jaccard([ens_start, ens_end], [zoo_start, zoo_end])
    for ens_start, ens_end, zoo_start, zoo_end in zip(
        comparative_df['start'],
        comparative_df['end'],
        comparative_df['zoonomia_start'],
        comparative_df['zoonomia_end'],
    )
]

In [72]:
# Median Jaccard index across all compared pairs.
comparative_df['jaccard_idx'].median()

0.6925215457325549