In [1]:
import numpy as np
import pandas as pd
import os

import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob

import matplotlib
import scipy.stats

# Use a larger default font size (16) for all matplotlib figures in this session.
matplotlib.rcParams.update({'font.size': 16})

In [2]:
# Root directory of the project data. Defaults to the original cluster location, but can be
# redirected with the MLM_DATA_DIR environment variable so the notebook also runs elsewhere.
# os.path.join(..., '') guarantees the trailing separator that the string concatenations
# in the following cells rely on.
data_dir = os.path.join(
    os.environ.get('MLM_DATA_DIR', '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'),
    '',
)

In [3]:
# Directory holding the per-model prediction pickles for the human 3'UTR sequences.
probs_dir = data_dir + 'human_3utr/probs/'

# Display name of each model -> name used for that model in on-disk paths.
# The display names also become the prefixes of the score columns added below.
models = {
    # language models
    'StateSpace': 'stspace',
    'StateSpace-SA': 'stspace-spaw',
    'DNABERT': 'dnabert',
    'DNABERT-3UTR': 'dnabert-3utr',
    'DNABERT-2': 'dnabert2',
    'DNABERT2-3UTR': 'dnabert2-3utr',
    '13-mer': 'K-mer/13_mer',
    'NTv2-250M': 'ntrans-v2-250m',
    'NTv2-250M-3UTR': 'ntrans-v2-250m-3utr',
    # conservation scores
    'PhyloP-100way': 'PhyloP100',
    'PhyloP-241way': 'PhyloP241',
}

# Get scores from probabilities

In [4]:
def get_model(glob_path):
    '''
    Load per-sequence predictions from all pickle files matching a glob pattern

    Each file must unpickle to a dict with parallel 'seq_names' and 'probs' entries.
    Files are read in sorted path order: glob itself returns matches in arbitrary
    order, so without sorting the entry kept for a sequence name present in
    several files would depend on the filesystem.

    Returns a dict {seq_name: probs}; it is empty when no file matches.

    NOTE: pickle.load can execute arbitrary code - only use on trusted files.
    '''
    res = {}
    for probas_file in sorted(glob(glob_path)):
        with open(probas_file, 'rb') as f:
            fold_res = pickle.load(f)
        # later files overwrite earlier ones for repeated sequence names
        res.update(zip(fold_res['seq_names'], fold_res['probs']))
    return res

In [5]:
# Load the predictions of every model: display name -> {sequence name -> per-position scores}.
all_model_probas = {}

for model, model_path in models.items():
    predictions_glob = probs_dir + '/' + model_path + '/predictions*.pickle'
    loaded = get_model(predictions_glob)
    all_model_probas[model] = loaded
    print(f'{model} loaded, {len(loaded)} sequences')

StateSpace loaded, 18134 sequences
StateSpace-SA loaded, 18134 sequences
DNABERT loaded, 18134 sequences
DNABERT-3UTR loaded, 18134 sequences
DNABERT2 loaded, 0 sequences
DNABERT2-3UTR loaded, 0 sequences
13-mer loaded, 18134 sequences
NTv2-250M loaded, 18134 sequences
NTv2-250M-3UTR loaded, 18134 sequences
PhyloP-100way loaded, 18134 sequences
PhyloP-241way loaded, 18134 sequences


In [6]:
def reverse_complement(seq):
    '''
    Return the reverse complement of a nucleotide sequence

    Upper-case A, C, G and T are complemented; any other character
    (e.g. N or lower-case letters) is kept as is.
    '''
    complement = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
    return ''.join(complement.get(base, base) for base in reversed(seq))

In [7]:
# Nucleotide -> column index used to address the per-position model probabilities.
mapping = {base: idx for idx, base in enumerate('ACGT')}

In [8]:
human_fasta = data_dir + 'fasta/Homo_sapiens_rna.fa' #3'UTR on negative strand should be reverse complemented

def read_fasta(fasta):
    '''
    Read a FASTA file into a mapping from sequence name to sequence

    The name is the full header line without the leading '>' and without
    trailing whitespace. Sequence lines are upper-cased and concatenated;
    records sharing a header are merged, and a header that is not followed
    by any line yields no entry.

    Returns a defaultdict(str) {seq_name: sequence}.
    Raises ValueError if sequence data appears before the first header.
    '''
    # collect line chunks per record and join once: repeated string += on a dict
    # entry copies the whole sequence every time, which is quadratic for long records
    chunks = defaultdict(list)
    seq_name = None

    with open(fasta, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            if line.startswith('>'):
                seq_name = line[1:].rstrip()
            elif seq_name is None:
                # blank lines before the first header are harmless, anything else is malformed
                if line.strip():
                    raise ValueError(f'{fasta}: sequence data before the first header (line {line_no})')
            else:
                chunks[seq_name].append(line.rstrip().upper())

    seqs = defaultdict(str)
    for name, parts in chunks.items():
        seqs[name] = ''.join(parts)
    return seqs

In [9]:
# Load all sequences of the human FASTA file, keyed by their header line.
human_utr = read_fasta(human_fasta)

In [10]:
# Variant table (tab-separated). The cells below read its seq_name, strand, vartype,
# pos_rel, ref and alt columns and add one score column per model and metric.
utr_variants = pd.read_csv(data_dir+'variants/selected/variants_snp.tsv', sep='\t') 

In [11]:
# 3'UTR coordinates from a header-less BED file; only columns 1, 2, 3 and 5 are kept
# and named seq_start, seq_end, seq_name and strand.
# NOTE(review): utr_table is not referenced by any other visible cell - confirm it is still needed.
utr_table = pd.read_csv(data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed', sep='\t',
                       header = None, names=['seq_start','seq_end','seq_name','strand'], usecols=[1,2,3,5])

In [12]:
#utr_variants['var_id'] = utr_variants.chrom + '_' + utr_variants.pos.astype(str) + '_' + utr_variants.ref + '_' + utr_variants.alt
#utr_variants.set_index('var_id', inplace=True)

In [13]:
# For variants in genes on the negative strand, replace ref and alt by their reverse complement:
# the model predictions are reverse complemented for these sequences.
# pos_rel already accounts for the strand (see dataprep).
# NOTE: this cell is not idempotent - running it twice flips the alleles back.

on_negative_strand = utr_variants.strand == '-'

for allele_col in ('ref', 'alt'):
    utr_variants[allele_col] = [
        reverse_complement(allele) if is_negative else allele
        for allele, is_negative in zip(utr_variants[allele_col], on_negative_strand)
    ]

In [14]:
# Get PhyloP conservation scores at variant positions.
# SNPs take the score at the variant position. Other variants take the mean score over a window:
# 2 positions on each side of pos_rel for insertions, the span of the reference allele otherwise.

for model_name in ('PhyloP-100way','PhyloP-241way',):

    print(model_name)

    probas = all_model_probas[model_name]
    score_col = model_name+'-pref'

    for var_idx, var in tqdm(utr_variants.iterrows(), total=len(utr_variants)):
        # skip variants whose sequence has no scores or is absent from the FASTA
        if var.seq_name not in probas or var.seq_name not in human_utr:
            continue

        seq = human_utr[var.seq_name]
        seq_scores = probas[var.seq_name]

        if var.vartype=='SNP':
            assert seq[var.pos_rel] == var.ref
            utr_variants.at[var_idx, score_col] = seq_scores[var.pos_rel]
            continue

        if var.vartype=='INS':
            left, right = var.pos_rel-2, var.pos_rel+2
        else:
            if var.strand=='+':
                left, right = var.pos_rel, var.pos_rel+len(var.ref)
            else:
                left, right = var.pos_rel-len(var.ref), var.pos_rel
            assert seq[left:right] == var.ref

        # A negative slice start counts from the END of the array and would silently give an
        # empty or wrong window (e.g. an insertion at pos_rel < 2), so clamp it to the sequence
        # start, as add_model_res does. An end beyond the array is already truncated by slicing.
        window = seq_scores[max(left,0):right]
        utr_variants.at[var_idx, score_col] = np.mean(window) if len(window) else np.nan

PhyloP-100way


100%|██████████| 61000/61000 [00:05<00:00, 11714.17it/s]


PhyloP-241way


100%|██████████| 61000/61000 [00:05<00:00, 11777.75it/s]


In [15]:
def add_model_res(model_name):
    '''
    Add the probability-based scores of one model to the global utr_variants table

    For every variant whose sequence is present both in the model predictions
    (all_model_probas[model_name]) and in human_utr:

    * SNP: '<model_name>-palt' and '<model_name>-pref' receive the predicted
      probability of the alternative / reference base at pos_rel.
    * other variants: '<model_name>-pref' receives the mean predicted probability
      of the reference bases over a window (2 positions on each side of pos_rel
      for insertions, the span of the reference allele otherwise), clipped to
      the sequence boundaries.

    The table is modified in place; nothing is returned.
    '''

    print(model_name)

    probas = all_model_probas[model_name]
    palt_col, pref_col = model_name+'-palt', model_name+'-pref'

    for var_idx, var in tqdm(utr_variants.iterrows(), total=len(utr_variants)):
        # skip variants whose sequence has no predictions or is absent from the FASTA
        if var.seq_name not in probas or var.seq_name not in human_utr:
            continue

        seq = human_utr[var.seq_name]
        seq_probas = probas[var.seq_name]

        if var.vartype=='SNP':
            assert seq[var.pos_rel] == var.ref
            utr_variants.at[var_idx, palt_col] = seq_probas[var.pos_rel, mapping[var.alt]]
            utr_variants.at[var_idx, pref_col] = seq_probas[var.pos_rel, mapping[var.ref]]
            continue

        if var.vartype=='INS':
            left, right = var.pos_rel-2, var.pos_rel+2
        else:
            if var.strand=='+':
                left, right = var.pos_rel, var.pos_rel+len(var.ref)
            else:
                left, right = var.pos_rel-len(var.ref), var.pos_rel
            # Only here does the window span the reference allele. The fixed +-2 window of an
            # insertion is unrelated to var.ref, so it must not be compared with it
            # (same rule as in the PhyloP scoring loop).
            assert seq[left:right] == var.ref

        ref_score = [seq_probas[pos_rel, mapping[seq[pos_rel]]]
                     for pos_rel in range(max(left,0), min(right,len(seq)))]
        # an empty window gives NaN explicitly instead of a numpy RuntimeWarning
        utr_variants.at[var_idx, pref_col] = np.mean(ref_score) if ref_score else np.nan

In [16]:
# Add the -pref / -palt probability columns of each of these models to utr_variants.
probability_models = (
    'StateSpace',
    'StateSpace-SA',
    'DNABERT',
    'DNABERT-3UTR',
    '13-mer',
    'NTv2-250M',
    'NTv2-250M-3UTR',
)

for model_name in probability_models:
    add_model_res(model_name)

StateSpace


100%|██████████| 61000/61000 [00:07<00:00, 8448.38it/s]


StateSpace-SA


100%|██████████| 61000/61000 [00:07<00:00, 8372.01it/s]


DNABERT


100%|██████████| 61000/61000 [00:07<00:00, 8190.75it/s]


DNABERT-3UTR


100%|██████████| 61000/61000 [00:07<00:00, 8024.41it/s]


13-mer


100%|██████████| 61000/61000 [00:07<00:00, 8066.58it/s]


NTv2-250M


100%|██████████| 61000/61000 [00:07<00:00, 8004.46it/s]


NTv2-250M-3UTR


100%|██████████| 61000/61000 [00:07<00:00, 8075.69it/s]


# Get scores from embeddings

In [17]:
def compute_embeddings_score(seq_names,embeddings,losses,model_name):
    '''
    Compare the reference and alternative embeddings of each variant

    The inputs are parallel and ordered in consecutive pairs: entry 2k is the
    reference and entry 2k+1 the alternative sequence of the same variant.
    The two names of a pair must be equal once 'alt' is replaced by 'ref'.
    After removing '_ref', the reference name is split on '_' and its first
    four fields are taken as chrom, pos, ref and alt.

    Returns a DataFrame with one row per variant: chrom, pos, ref, alt, the
    L1 and L2 distance, dot product and cosine similarity of the two embeddings,
    and the two losses; the metric columns are prefixed with '<model_name>-'.
    '''
    rows = []
    for ref_idx in range(0, len(embeddings), 2):
        alt_idx = ref_idx + 1
        ref_name = seq_names[ref_idx]
        assert ref_name == seq_names[alt_idx].replace('alt','ref')

        emb_ref = embeddings[ref_idx]
        emb_alt = embeddings[alt_idx]
        delta = emb_ref - emb_alt
        dot = np.dot(emb_ref, emb_alt)
        norm_product = np.linalg.norm(emb_ref) * np.linalg.norm(emb_alt)

        fields = ref_name.replace('_ref','').split('_')
        rows.append((
            fields[0], int(fields[1]), fields[2], fields[3],
            np.linalg.norm(delta, ord=1),   # l1
            np.linalg.norm(delta),          # l2
            dot,
            dot / norm_product,             # cosine
            losses[ref_idx], losses[alt_idx],
        ))

    column_names = [
        'chrom', 'pos', 'ref', 'alt',
        f'{model_name}-l1', f'{model_name}-l2', f'{model_name}-dot', f'{model_name}-cosine',
        f'{model_name}-loss_ref', f'{model_name}-loss_alt',
    ]
    return pd.DataFrame(rows, columns=column_names)

In [18]:
# Directory with one sub-directory per model holding the variant embeddings.
emb_dir = data_dir + 'variants/embeddings/'

embedding_models = (
    'StateSpace', 'StateSpace-SA',
    'DNABERT-2', 'DNABERT2-3UTR',
    'DNABERT', 'DNABERT-3UTR',
    'NTv2-250M', 'NTv2-250M-3UTR',
)

for model_name in embedding_models:

    print(model_name)

    # NOTE: pickle.load can execute arbitrary code - only use on trusted files.
    with open(emb_dir + models[model_name] + '/predictions.pickle', 'rb') as f:
        data = pickle.load(f)

    embeddings_scores = compute_embeddings_score(
        data['seq_names'],
        data['embeddings'],
        data['losses'],
        model_name,
    )

    # Left merge on all shared column names (the score table provides chrom, pos, ref, alt).
    # NOTE(review): ref/alt of negative-strand variants were reverse complemented in an earlier
    # cell; presumably the names stored in the pickle use the same orientation - confirm,
    # otherwise those variants silently get NaN scores.
    utr_variants = utr_variants.merge(embeddings_scores, how='left')

StateSpace
StateSpace-SA
DNABERT2
DNABERT2-3UTR
DNABERT
DNABERT-3UTR
NTv2-250M
NTv2-250M-3UTR


# Get scores from supervised learning

In [19]:
# Merge the predictions of the supervised classifiers trained on each model's embeddings.
# The y_pred column of every model/classifier pair becomes '<model_name>-<classifier>'.
supervised_models = (
    'StateSpace', 'StateSpace-SA',
    'DNABERT-2', 'DNABERT2-3UTR',
    'DNABERT', 'DNABERT-3UTR',
    'NTv2-250M', 'NTv2-250M-3UTR',
)
variant_subsets = ('clinvar', 'gnomAD', 'eQTL-susie', 'eQTL-GRASP')

for classifier in ('LogisticRegression','MLP'):

    pred_dir = data_dir + f'variants/predictions/merge_embeddings_1/{classifier}/'

    for model_name in supervised_models:

        model_res = []

        for subset in variant_subsets:
            pred_res = pred_dir + subset + '-' + models[model_name] + '.tsv'
            if not os.path.isfile(pred_res):
                # report prediction files that are missing
                print(pred_res)
                continue
            model_res.append(pd.read_csv(pred_res, sep='\t'))

        if not model_res:
            continue

        model_res = pd.concat(model_res).rename(columns={'y_pred':model_name+'-'+classifier})
        # NOTE(review): the merge keys are implicit (every column name shared by both tables).
        # If a variant occurs in several subset files, this left merge presumably duplicates
        # its row in utr_variants - confirm the subsets are disjoint.
        utr_variants = utr_variants.merge(model_res, how='left')

In [20]:
# Save the variant table with all model scores as TSV, without the index column.
# index is documented as a bool; index=None only worked because None is falsy.
utr_variants.to_csv(data_dir + 'variants/model_scores_snp.tsv', sep='\t', index=False)