In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob
import re

import sklearn, sklearn.linear_model, sklearn.metrics, sklearn.pipeline,sklearn.ensemble

import matplotlib
import scipy.stats

# Set the default font size used by all matplotlib figures in this notebook.
matplotlib.rcParams.update({'font.size': 16})

In [2]:
# Base directory; every input path read and the output file written by this notebook
# is built by appending a relative path to it.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/'

In [3]:
# 3'UTR table: start, end, name and strand of each sequence, taken from
# columns 1, 2, 3 and 5 of the header-less BED file.
utr_bed_path = data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed'

utr_table = pd.read_csv(
    utr_bed_path,
    sep='\t',
    header=None,
    usecols=[1, 2, 3, 5],
    names=['seq_start', 'seq_end', 'seq_name', 'strand'],
)

In [4]:
# gnomAD variants in 3'UTRs. Column names are supplied here, so the first
# line of the file is read as data rather than as a header.
gnomAD_all = pd.read_csv(
    data_dir + 'gnomAD/gnomAD_GRCh38.utr3.tsv.gz',
    sep='\t',
    names=['chrom', 'pos', 'ref', 'alt', 'seq_name', 'AC', 'AF'],
)

# Negative set (label 0): variants with allele frequency above 5%.
# .copy() detaches the selection from gnomAD_all so the in-place edits below
# operate on an independent frame (no SettingWithCopyWarning, no ambiguity
# about whether gnomAD_all is modified).
gnomAD_neg = gnomAD_all.loc[gnomAD_all.AF > 0.05, ['chrom', 'pos', 'ref', 'alt', 'seq_name']].copy()
gnomAD_neg['pos'] = gnomAD_neg['pos'] - 1  # position to 0-based
gnomAD_neg['label'] = 0

In [22]:
def _read_grasp_bed(file_name, qtl_type):
    """Read one GRASP QTL BED file and tag every row with its QTL type.

    Only BED columns 0, 1 and 3 are kept; they are named chrom, pos and info.

    Parameters
    ----------
    file_name : str
        Path of the BED file relative to `data_dir`.
    qtl_type : str
        Value stored in the added `QTL_type` column.

    Returns
    -------
    pandas.DataFrame
        Columns chrom, pos, info, QTL_type.
    """
    return pd.read_csv(
        data_dir + file_name,
        sep='\t',
        names=['chrom', 'pos', 'info'],
        usecols=[0, 1, 3],
    ).assign(QTL_type=qtl_type)


eqtl_scores = _read_grasp_bed('GRASP/GRASP2eQTL.3utr.bed', 'eQTL')
meqtl_scores = _read_grasp_bed('GRASP/GRASP2meQTL.3utr.bed', 'meQTL')

# ignore_index gives the combined table a unique 0..n-1 index instead of two
# overlapping ranges carried over from the individual files.
qtl_scores = pd.concat([eqtl_scores, meqtl_scores], ignore_index=True)

In [23]:
def search_pattern(pattern, string, placeholder=None):
    """Return the first capture group of the first match of `pattern` in `string`.

    When `pattern` does not occur in `string` at all, `placeholder` is
    returned instead.
    """
    match = re.search(pattern, string)
    return match.groups()[0] if match else placeholder

# Extract reference allele, alternative allele and sequence name from the
# free-text `info` field; a field whose pattern is not found becomes None.
info_patterns = {
    'ref': 'dbSNPalleles=(.*)/',
    'alt': 'dbSNPalleles=[ACGT]/(.*);',
    'seq_name': 'seq_name=(.*)',
}
for column, pattern in info_patterns.items():
    qtl_scores[column] = qtl_scores['info'].apply(
        lambda info, pattern=pattern: search_pattern(pattern, info)
    )

# Drop records where either allele could not be parsed ...
has_both_alleles = qtl_scores.ref.notna() & qtl_scores.alt.notna()
qtl_scores = qtl_scores[has_both_alleles]

# ... then keep only records whose two alleles are each a single character.
single_base = (qtl_scores.ref.str.len() == 1) & (qtl_scores.alt.str.len() == 1)
qtl_scores = qtl_scores[single_base]

In [24]:
# Keep the variant columns plus QTL_type and label every QTL variant with 1.
# .copy() detaches the selection from the filtered frame it is taken from, so
# adding the label column writes to an independent frame (avoids
# SettingWithCopyWarning).
qtl_scores = qtl_scores[['chrom', 'pos', 'ref', 'alt', 'seq_name', 'QTL_type']].copy()
qtl_scores['label'] = 1

In [25]:
# Stack the QTL variants (label 1) on top of the gnomAD variants (label 0).
# QTL rows come first and keep='first' is used, so a variant present in both
# tables keeps its QTL row.
stacked_variants = pd.concat([qtl_scores, gnomAD_neg], ignore_index=True)
utr_variants = stacked_variants.drop_duplicates(
    subset=['chrom', 'pos', 'ref', 'alt'],
    keep='first',
)

In [26]:
# Allele lengths. .str.len() yields NaN for a missing allele instead of
# raising like len() would; such rows match none of the classes below.
len_ref = utr_variants.ref.str.len()
len_alt = utr_variants.alt.str.len()

# Build the classification as a separate Series and attach it with assign(),
# rather than writing into utr_variants with .loc: the frame may itself be a
# slice (e.g. when this cell is re-run), which triggers SettingWithCopyWarning.
vartype = pd.Series(None, index=utr_variants.index, dtype=object)
vartype.loc[(len_ref == 1) & (len_alt == 1)] = 'SNP'
vartype.loc[(len_ref > 1) & (len_alt == 1)] = 'DEL'
vartype.loc[(len_ref == 1) & (len_alt > 1)] = 'INS'

utr_variants = utr_variants.assign(vartype=vartype)

# Only single-nucleotide variants are analysed further; .copy() makes the
# filtered table independent of the unfiltered one.
utr_variants = utr_variants[utr_variants.vartype == 'SNP'].copy()

  utr_variants.loc[(len_ref==1)&(len_alt==1),'vartype'] = 'SNP'


In [27]:
# Attach seq_start / seq_end / strand of the containing 3'UTR to every variant.
# The join key is spelled out so the merge cannot silently switch to a
# different set of shared columns (e.g. when this cell is re-run on a frame
# that already carries the UTR columns).
utr_variants = utr_variants.merge(utr_table, how='left', on='seq_name')

In [28]:
# Position of each variant relative to the start of its 3'UTR, measured along
# the strand the UTR lies on.
on_plus = utr_variants.strand == '+'
on_minus = utr_variants.strand == '-'

# '+' strand: offset from seq_start.
utr_variants.loc[on_plus, 'pos_rel'] = (
    utr_variants.loc[on_plus, 'pos'] - utr_variants.loc[on_plus, 'seq_start']
)
# '-' strand: offset counted backwards from seq_end.
utr_variants.loc[on_minus, 'pos_rel'] = (
    utr_variants.loc[on_minus, 'seq_end'] - utr_variants.loc[on_minus, 'pos'] - 1
)

utr_variants['pos_rel'] = utr_variants['pos_rel'].astype(int)

In [29]:
# Limit by the maximum sequence length of the species-aware model.
MAX_SEQ_LEN = 5000

# .copy() so that the score columns written row by row later on go into an
# independent frame rather than into a slice of the unfiltered table.
utr_variants = utr_variants[utr_variants.pos_rel < MAX_SEQ_LEN].copy()

In [30]:
# Model name -> location of its predictions, relative to data_dir.
# All entries except DNABERT get '.pickle' appended when loaded; the DNABERT
# entry is a directory prefix searched for preds*.npy files.
models = {
    'Species-agnostic': 'motif_predictions/species_agnostic/probas',
    'Species-aware': 'motif_predictions/species_aware/probas',
    'DNABERT': 'motif_predictions/split_75_25/dnabert/default/',
    '11-mer': 'motif_predictions/K-mer/11_mer',
    'PhyloP100': 'PhyloP/PhyloP100_3UTR',
    'PhyloP241': 'PhyloP/PhyloP241_3UTR',
}

In [31]:
#utr_variants = pd.read_csv(data_dir + 'gnomAD/model_scores.tsv.gz', sep='\t')

In [32]:
# Models that provide a single score per sequence position: the score at the
# variant position (or the mean over a window for non-SNP variants) is stored
# in a column named after the model.
for model_name in ('PhyloP100', 'PhyloP241', '11-mer'):

    print(model_name)

    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(data_dir + models[model_name] + '.pickle', 'rb') as f:
        probas = dict(pickle.load(f))

    for var_idx, var in tqdm(utr_variants.iterrows(), total=len(utr_variants)):
        scores = probas.get(var.seq_name)
        if scores is None:
            continue  # no scores for this sequence: leave the value missing

        if var.vartype == 'SNP':
            utr_variants.at[var_idx, model_name] = scores[var.pos_rel]
            continue

        if var.vartype == 'INS':
            left, right = var.pos_rel - 2, var.pos_rel + 2
        elif var.strand == '+':
            left, right = var.pos_rel, var.pos_rel + len(var.ref)
        else:
            # NOTE(review): this window ends just before pos_rel, whereas the
            # '+' strand window starts at pos_rel — confirm this is intended.
            left, right = var.pos_rel - len(var.ref), var.pos_rel

        # Clamp the start at 0: a negative start would be interpreted as an
        # offset from the end of the sequence and select the wrong positions.
        window = scores[max(left, 0):right]
        utr_variants.at[var_idx, model_name] = np.mean(window) if len(window) else np.nan

PhyloP100


100%|██████████| 60134/60134 [00:03<00:00, 15114.35it/s]


PhyloP241


100%|██████████| 60134/60134 [00:03<00:00, 15206.10it/s]


11-mer


100%|██████████| 60134/60134 [00:03<00:00, 15302.30it/s]


In [33]:
# Nucleotide -> column index used when indexing per-position probability arrays.
mapping = {base: idx for idx, base in enumerate('ACGT')}

In [34]:
human_fasta = data_dir + 'fasta/240_species/species/Homo_sapiens.fa' #3'UTR on negative strand should already be reversed

# Sequence name -> upper-cased sequence. A defaultdict(str) is used so that
# multi-line records can be appended to, and so that looking up an unknown
# name yields an empty sequence.
human_utr = defaultdict(str)

with open(human_fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            # Header: the name is everything between '>' and the first ':'.
            # Strip the line ending first so that a header without ':' does
            # not keep a trailing newline in the dictionary key.
            seq_name = line[1:].rstrip().split(':')[0]
        else:
            human_utr[seq_name] += line.rstrip().upper()

In [35]:
def get_dnabert(res_path):
    """Collect DNABERT predictions for every sequence stored under `res_path`.

    Every file matching ``res_path + 'preds*.npy'`` holds the prediction rows
    of several sequences stacked one after another. Its companion CSV (same
    path with ``/preds`` replaced by ``/seq`` and ``.npy`` by ``.csv``) lists,
    in the same order, the name and sequence of each of them; the sequence
    length determines how many rows belong to it.

    Parameters
    ----------
    res_path : str
        Path prefix (including the trailing separator) to search.

    Returns
    -------
    dict
        seq_name -> slice of the prediction array with one row per sequence
        position. Empty when no prediction file is found.

    Raises
    ------
    AssertionError
        If the summed sequence lengths do not equal the number of rows in
        the prediction array.
    """
    res = {}
    # sorted() makes the processing order (and thus the dict order) deterministic.
    for probas_file in sorted(glob(res_path + 'preds*.npy')):
        seq_file = probas_file.replace('/preds', '/seq').replace('.npy', '.csv')
        # Squeeze only the column axis: a bare .squeeze() would collapse a
        # table with a single sequence to a scalar string, which has no .items().
        seq_matching = pd.read_csv(seq_file).set_index('seq_name').squeeze('columns')
        probas = np.load(probas_file)
        offset = 0
        for seq_name, seq in seq_matching.items():
            res[seq_name] = probas[offset:offset + len(seq), :]
            offset += len(seq)
        assert offset == len(probas), f'{probas_file}: sequence lengths do not add up to the number of prediction rows'
    return res

In [36]:
def add_model_res(model_name):
    """Add the scores of one nucleotide-probability model to `utr_variants`.

    The predictions are loaded from the location registered in `models`
    (a pickle file, or the DNABERT prediction directory when the name contains
    'DNABERT'). For every variant whose sequence has predictions:

    * SNP: the probability of the alternative and of the reference base at the
      variant position is written to ``<model_name>_alt`` / ``<model_name>_ref``.
    * other variant types: the mean probability of the reference sequence
      bases over a window around the variant is written to ``<model_name>_ref``.

    Parameters
    ----------
    model_name : str
        Key into `models`; also used as the prefix of the created columns.

    Returns
    -------
    None
        The module-level `utr_variants` frame is modified in place.
    """

    print(model_name)

    if 'DNABERT' not in model_name:
        # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
        with open(data_dir + models[model_name] + '.pickle', 'rb') as f:
            probas = dict(pickle.load(f))
        #if 'Species' in model_name:
        #    probas = {k:v/v.sum(1, keepdims=True) for k,v in probas.items()} #normalize probabilities to 1
    else:
        probas = get_dnabert(data_dir + models[model_name])

    ref_col = model_name + '_ref'
    alt_col = model_name + '_alt'

    for var_idx, var in tqdm(utr_variants.iterrows(), total=len(utr_variants)):
        seq_probas = probas.get(var.seq_name)
        if seq_probas is None:
            continue  # no predictions for this sequence: leave the values missing

        if var.vartype == 'SNP':
            utr_variants.at[var_idx, alt_col] = seq_probas[var.pos_rel, mapping[var.alt]]
            utr_variants.at[var_idx, ref_col] = seq_probas[var.pos_rel, mapping[var.ref]]
            continue

        if var.vartype == 'INS':
            left, right = var.pos_rel - 2, var.pos_rel + 2
        elif var.strand == '+':
            left, right = var.pos_rel, var.pos_rel + len(var.ref)
        else:
            # NOTE(review): this window ends just before pos_rel, whereas the
            # '+' strand window starts at pos_rel — confirm this is intended.
            left, right = var.pos_rel - len(var.ref), var.pos_rel

        # Probability assigned to the actual reference base at each window
        # position; the window is clipped to the bounds of the sequence.
        seq = human_utr[var.seq_name]
        ref_score = [
            seq_probas[pos_rel, mapping[seq[pos_rel]]]
            #np.max(seq_probas[pos_rel])
            for pos_rel in range(max(left, 0), min(right, len(seq)))
        ]
        # An empty window has no mean; store NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning.
        utr_variants.at[var_idx, ref_col] = np.mean(ref_score) if ref_score else np.nan

In [37]:
# Score the variants with each of the nucleotide-probability models in turn.
sequence_models = ['Species-aware', 'Species-agnostic', 'DNABERT']
for model_name in sequence_models:
    add_model_res(model_name)

Species-aware


100%|██████████| 60134/60134 [00:05<00:00, 10217.28it/s]


Species-agnostic


100%|██████████| 60134/60134 [00:05<00:00, 10261.50it/s]


DNABERT


100%|██████████| 60134/60134 [00:03<00:00, 19840.66it/s]


In [38]:
# Save the scored variant table as a tab-separated file without the row index.
utr_variants.to_csv(data_dir + 'GRASP/model_scores.tsv.gz', sep='\t', index=False)

In [39]:
# Display the final variant table with all score columns added above.
utr_variants

Unnamed: 0,chrom,pos,ref,alt,seq_name,QTL_type,label,vartype,seq_start,seq_end,...,pos_rel,PhyloP100,PhyloP241,11-mer,Species-aware_alt,Species-aware_ref,Species-agnostic_alt,Species-agnostic_ref,DNABERT_alt,DNABERT_ref
0,chr1,100082437,C,T,ENST00000370152.8_utr3_11_0_chr1_100082210_f,eQTL,1,SNP,100082209,100083377,...,228,3.389,7.029,0.376044,0.432109,0.021974,0.421849,0.020861,,
1,chr1,100541020,A,G,ENST00000315033.5_utr3_1_0_chr1_100540122_f,eQTL,1,SNP,100540121,100542021,...,899,0.436,1.825,0.140695,0.287335,0.187009,0.266291,0.210644,,
2,chr1,10442652,C,T,ENST00000309048.8_utr3_4_0_chr1_10442406_f,eQTL,1,SNP,10442405,10442808,...,247,0.171,-0.140,0.527256,0.280972,0.194812,0.288658,0.164913,,
3,chr1,1055036,A,G,ENST00000379370.7_utr3_35_0_chr1_1054982_f,eQTL,1,SNP,1054981,1056116,...,55,0.281,2.565,0.451307,0.242463,0.244533,0.335231,0.198622,,
4,chr1,107571242,C,T,ENST00000370056.9_utr3_26_0_chr1_107571161_r,eQTL,1,SNP,107571160,107573330,...,2087,3.091,0.436,0.143238,0.529939,0.080935,0.597279,0.045834,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66812,chrX,155774973,G,T,ENST00000695325.1_utr3_2_0_chrX_155774739_f,,0,SNP,155774738,155782459,...,235,2.205,4.597,0.155799,0.476815,0.242620,0.435704,0.286503,0.464461,0.298754
66813,chrX,155775591,C,T,ENST00000695325.1_utr3_2_0_chrX_155774739_f,,0,SNP,155774738,155782459,...,853,1.299,0.854,0.035936,0.464169,0.088179,0.448774,0.069748,0.694404,0.035328
66814,chrX,155776960,A,G,ENST00000695325.1_utr3_2_0_chrX_155774739_f,,0,SNP,155774738,155782459,...,2222,2.374,3.702,0.240397,0.153028,0.273667,0.177053,0.349643,0.094252,0.434636
66815,chrX,155777573,G,A,ENST00000695325.1_utr3_2_0_chrX_155774739_f,,0,SNP,155774738,155782459,...,2835,2.189,5.932,0.306997,0.170795,0.140824,0.171251,0.127239,0.169549,0.136584
