In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm
from glob import glob
from collections import defaultdict

import sys
sys.path.append("/home/icb/sergey.vilov/workspace/MLM/utils") 

from misc import model_alias, dna_models

In [2]:
# Root directory of the project data. Most paths in this notebook are built by
# plain string concatenation onto it, which is why it ends with a slash.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

In [3]:
# tab-separated table with one row per motif occurrence (see the preview below)
eclip_tsv  = data_dir + 'motif_analysis/eclip.tsv'

# low_memory=False: parse the file in one pass so that the dtype of each column is
# inferred from the whole column; chunked parsing can produce mixed-type columns
# and a DtypeWarning
eclip_df = pd.read_csv(eclip_tsv,sep='\t',low_memory=False)

eclip_df.head()

  eclip_df = pd.read_csv(eclip_tsv,sep='\t')


Unnamed: 0,chrom,pos,seq_name,strand,motif,motif_id,motif_start,RBP,stepwiseR_minus1,eCLIP_RBNS
0,chr7,127591380,ENST00000000233.10_utr3_5_0_chr7_127591300_f,+,TGCCC,chr7_127591380_TGCCC_f,81,PCBP1,2.26,True
1,chr7,127591402,ENST00000000233.10_utr3_5_0_chr7_127591300_f,+,TCCCC,chr7_127591402_TCCCC_f,103,PCBP1,1.92,True
2,chr7,127591403,ENST00000000233.10_utr3_5_0_chr7_127591300_f,+,CCCCC,chr7_127591403_CCCCC_f,104,PCBP1,2.99,True
3,chr7,127591404,ENST00000000233.10_utr3_5_0_chr7_127591300_f,+,CCCCA,chr7_127591404_CCCCA_f,105,PCBP1,2.49,True
4,chr7,127591542,ENST00000000233.10_utr3_5_0_chr7_127591300_f,+,GCCCC,chr7_127591542_GCCCC_f,243,PCBP1,3.7,True


In [4]:
# For every sequence collect its motif hits as a list of [motif_start, motif] pairs;
# include_groups=False keeps the grouping column (seq_name) out of each group
eclip_hits = (
    eclip_df[['seq_name','motif_start','motif']]
    .groupby('seq_name')
    .apply(lambda hits: hits.values.tolist(), include_groups=False)
)

eclip_hits.head() #relative k-mer positions for each 3'UTR sequence

seq_name
ENST00000000233.10_utr3_5_0_chr7_127591300_f    [[81, TGCCC], [103, TCCCC], [104, CCCCC], [105...
ENST00000000412.8_utr3_6_0_chr12_8940361_r                         [[1058, CCCAC], [1024, GTCCC]]
ENST00000001008.6_utr3_9_0_chr12_2803259_f      [[474, TTCCC], [485, ACCCC], [486, CCCCC], [48...
ENST00000001146.7_utr3_5_0_chr2_72129238_r      [[717, ACACA], [19, AGCCC], [1731, CCCCA], [38...
ENST00000002165.11_utr3_6_0_chr6_143494812_r    [[128, TTCCC], [136, TTCCC], [138, CCCAC], [16...
dtype: object

In [5]:
# directory with the prediction pickles, one sub-directory per model alias
pred_dir = f'{data_dir}human_3utr/probs/'

# models/scores evaluated in this notebook
models = [
    'DNABERT', 'DNBT-3UTR-RNA', 'NT-MS-v2-100M', 'NT-3UTR-RNA',
    'STSP-3UTR-RNA', 'STSP-3UTR-RNA-HS', 'STSP-3UTR-DNA',
    'PhyloP-100way', 'PhyloP-241way', 'CADD-1.7', 'Zoo-AL',
]

In [6]:
# seq_name -> strand lookup built from columns 3 and 5 of the BED file
strand_info = pd.read_csv(
    data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed',
    sep='\t',
    header=None,
    usecols=[3,5],
    names=['seq_name','strand'],
).set_index('seq_name').squeeze()

In [7]:
# nucleotide -> column index used to pick the reference base in the per-position scores
mapping = {nt: idx for idx, nt in enumerate('ACGT')}

In [8]:
def get_model(glob_path,seq_list,dna_model=False):
    '''
    Load per-sequence predictions from all pickle files matching glob_path.

    Parameters
    ----------
    glob_path : str
        Glob pattern of the prediction files. Each file must unpickle to a mapping
        with the parallel entries 'seq_names', 'probs' and 'seqs'.
    seq_list : iterable of str
        Names of the sequences to keep, all other sequences are skipped.
        Any iterable is accepted (list, dict keys, generator, ...).
    dna_model : bool
        If True, every sequence whose entry in the global strand_info is '-'
        is reverse complemented together with its scores.

    Returns
    -------
    dict
        {seq_name: {'probs': scores, 'seq': upper-cased sequence}}.
        If a sequence occurs in several files, the file that sorts last wins.
    '''

    # materialise once: O(1) membership tests, and a one-shot iterable
    # (e.g. a generator) is not exhausted by the first lookup
    wanted = set(seq_list)

    res = {}
    # glob() returns files in arbitrary order; sorting makes the result reproducible
    # when the same sequence is present in more than one file
    for probas_file in sorted(glob(glob_path)):
        with open(probas_file, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code, only load trusted files
            fold_res = pickle.load(f)
        for seq_name,prob,seq in zip(fold_res['seq_names'],fold_res['probs'],fold_res['seqs']):
            if seq_name in wanted:
                res[seq_name] = {'probs':prob,'seq':seq.upper()}

    if dna_model:
        for seq_name,entry in res.items():
            if strand_info.loc[seq_name]=='-':
                #reverse complement to match rna sequences
                entry['seq'] = reverse_complement(entry['seq'])
                if entry['probs'].ndim>1:
                    # reverse the positions and keep columns 3,2,1,0 only, i.e. swap
                    # 0<->3 and 1<->2 (assumes the first four columns are ordered A,C,G,T — TODO confirm)
                    entry['probs'] = entry['probs'][::-1,[3,2,1,0]] #reverse complement probabilities
                else:
                    # one score per position: only the order has to be reversed
                    entry['probs'] = entry['probs'][::-1] #reverse complement probabilities

    return res

def read_fasta(fasta):
    '''
    Read a FASTA file into a mapping {sequence name: upper-cased sequence}.

    The sequence name is the whole header line without the leading '>' and
    without trailing whitespace. All sequence lines following a header are
    concatenated; records that share a name are concatenated as well.
    Blank lines before the first header are ignored.

    Returns a defaultdict(str): looking up an unknown name yields ''.

    Raises ValueError if sequence data is found before the first header.
    '''

    seqs = defaultdict(str)
    seq_name = None #no header seen yet

    with open(fasta, 'r') as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('>'):
                seq_name = line[1:]
            elif seq_name is not None:
                seqs[seq_name] += line.upper()
            elif line:
                # without this check the lookup of the still undefined name fails
                # with an uninformative error
                raise ValueError(f'{fasta}: sequence data found before the first FASTA header')
    return seqs

def reverse_complement(seq):
    '''
    Return the reverse complement of seq.

    Only upper-case A, C, G and T are complemented,
    every other character is kept unchanged.
    '''
    complement = str.maketrans('ACGT', 'TGCA')
    return seq.translate(complement)[::-1]

In [9]:
# read all sequences, then keep only those that have at least one entry in eclip_hits
human_3utr = read_fasta(data_dir + '/fasta/Homo_sapiens_rna.fa')
human_3utr = {
    seq_name: seq
    for seq_name, seq in human_3utr.items()
    if seq_name in eclip_hits.index
}

len(human_3utr)

11358

In [10]:
# model name -> predictions returned by get_model for the sequences kept in human_3utr
model_probs = {}

for model in models:
    # predictions of DNA models are reoriented by get_model for '-' strand sequences
    dna_model = model in dna_models
    predictions_glob = pred_dir + '/' + model_alias[model] + '/predictions*.pickle'
    model_probs[model] = get_model(predictions_glob, human_3utr.keys(), dna_model)

    print(f'{model} loaded, {len(model_probs[model])} sequences')

DNABERT loaded, 11358 sequences
DNBT-3UTR-RNA loaded, 11358 sequences
NT-MS-v2-100M loaded, 11358 sequences
NT-3UTR-RNA loaded, 11358 sequences
STSP-3UTR-RNA loaded, 11358 sequences
STSP-3UTR-RNA-HS loaded, 11358 sequences
STSP-3UTR-DNA loaded, 11358 sequences
PhyloP-100way loaded, 11358 sequences
PhyloP-241way loaded, 11358 sequences
CADD-1.7 loaded, 11358 sequences
Zoo-AL loaded, 11358 sequences


In [11]:
# How agg_func_LM aggregates the per-position scores of a motif; the value is also
# part of the output file name. Note that agg_func_LM treats any value other than
# 'MEAN' as MAX.
agg_strategy = 'MEAN' #MEAN or MAX

In [12]:
def agg_func_LM(x):
    '''Aggregate scores with the mean if agg_strategy is 'MEAN', with the max otherwise.'''
    if agg_strategy=='MEAN':
        return np.mean(x)
    return np.max(x)

def agg_func_zoo_al(x):
    '''Aggregate scores with the mean, whatever the value of agg_strategy.'''
    return np.mean(x)

In [13]:
motif_len = 5 #length of the motifs listed in eclip_hits

log_eps = 1e-14 #added to probabilities before taking the log to avoid log(0)

# one tuple per (sequence, motif occurrence, model)
res = []

for seq_name,seq in tqdm(human_3utr.items()):

    # Motif coordinates and the nucleotide -> column indices do not depend on the model:
    # validate and compute them once per sequence instead of once per model
    seq_motifs = []
    for motif_start,motif in eclip_hits.loc[seq_name]:
        motif_end = motif_start+motif_len
        assert seq[motif_start:motif_end]==motif, f'{seq_name}: {motif} not found at position {motif_start} of the FASTA sequence'
        seq_motifs.append((motif_start, motif_end, motif, [mapping[x] for x in motif]))

    for model in models:

        model_seq = model_probs[model][seq_name]['seq']
        model_scores = model_probs[model][seq_name]['probs']

        # Zoo-AL scores are always averaged, the other models follow agg_strategy
        agg_func = agg_func_zoo_al if model == 'Zoo-AL' else agg_func_LM

        for motif_start,motif_end,motif,motif_mapping in seq_motifs:

            # the predictions must refer to the same sequence as the FASTA file
            assert model_seq[motif_start:motif_end] == motif, f'{seq_name}: {motif} not found at position {motif_start} of the {model} sequence'

            motif_scores = model_scores[motif_start:motif_end]

            if 'PhyloP' in model:

                # NOTE(review): the max is taken regardless of agg_strategy — confirm this is intended
                score_ref, score_alt, score_ref_vs_alt = np.max(motif_scores), None, None

            elif 'CADD' in model:

                # NOTE(review): the max is taken regardless of agg_strategy — confirm this is intended
                score_ref, score_alt, score_ref_vs_alt = None, np.max(motif_scores), None

            else:

                # keep the first four columns and renormalise each position to sum to 1
                motif_scores = motif_scores[::,:4]
                motif_scores = motif_scores/motif_scores.sum(1,keepdims=True)

                # probability of the reference nucleotide at each motif position
                pref = np.array([motif_scores[pos_idx, ref_nt_idx] for pos_idx, ref_nt_idx in enumerate(motif_mapping)])
                # smallest probability among the three non-reference nucleotides at each position
                paltmin = np.array([np.delete(motif_scores[pos_idx],ref_nt_idx).min() for pos_idx, ref_nt_idx in enumerate(motif_mapping)])

                log_paltmin = np.log(paltmin+log_eps)
                log_palt_minus1 = -log_paltmin
                log_pref_palt = np.log(pref+log_eps) - log_paltmin

                score_ref, score_alt, score_ref_vs_alt = agg_func(pref), agg_func(log_palt_minus1), agg_func(log_pref_palt)

            res.append((seq_name, motif, motif_start, model, score_ref, score_alt, score_ref_vs_alt))

In [14]:
# turn the collected tuples into a table, one row per (sequence, motif occurrence, model)
res = pd.DataFrame(
    res,
    columns=['seq_name','motif','motif_start','model','score_ref','score_alt','score_ref_vs_alt'],
)

In [15]:
# Attach the eclip_df annotation to every scored motif. The join keys are spelled out
# (they are the columns shared by both tables) so that adding a column to either table
# cannot silently change the join.
# NOTE(review): if eclip_df holds several rows with the same (seq_name, motif, motif_start),
# every scored row is matched to each of them — confirm that these keys are unique
res = (
    res
    .merge(eclip_df, on=['seq_name','motif','motif_start'], how='inner')
    .rename(columns={'seq_name':'UTR_ID','motif_start':'motif_start_UTR'})
)

# keep the output columns in a fixed order and record the FASTA file assigned to each row:
# one file for the models listed in dna_models, another one for all remaining models
res = res[['UTR_ID','chrom','pos','motif_start_UTR','motif','stepwiseR_minus1','RBP',
           'model','eCLIP_RBNS','score_ref','score_alt','score_ref_vs_alt']].assign(
    fasta_file=lambda df: np.where(df.model.isin(dna_models),
                                   'fasta/Homo_sapiens_dna_fwd.fa',
                                   'fasta/Homo_sapiens_rna.fa')
)

res.head()

Unnamed: 0,UTR_ID,chrom,pos,motif_start_UTR,motif,stepwiseR_minus1,RBP,model,eCLIP_RBNS,score_ref,score_alt,score_ref_vs_alt,fasta_file
0,ENST00000641515.2_utr3_2_0_chr1_70009_f,chr1,71397,1389,ACACA,,,DNABERT,False,0.353159,2.40957,1.121219,fasta/Homo_sapiens_dna_fwd.fa
1,ENST00000641515.2_utr3_2_0_chr1_70009_f,chr1,70043,35,ACACA,,,DNABERT,False,0.324869,2.654499,1.425663,fasta/Homo_sapiens_dna_fwd.fa
2,ENST00000641515.2_utr3_2_0_chr1_70009_f,chr1,71398,1390,CACAT,,,DNABERT,False,0.311161,2.48205,1.035661,fasta/Homo_sapiens_dna_fwd.fa
3,ENST00000641515.2_utr3_2_0_chr1_70009_f,chr1,71097,1089,GTGTG,,,DNABERT,False,0.660861,3.309041,2.860582,fasta/Homo_sapiens_dna_fwd.fa
4,ENST00000641515.2_utr3_2_0_chr1_70009_f,chr1,70260,252,TATAT,,,DNABERT,False,0.346097,2.186077,1.108866,fasta/Homo_sapiens_dna_fwd.fa


In [16]:
# write the scores as a tab-separated table without the row index (index is documented
# as a bool, hence False rather than None); compression is inferred from the .gz suffix
res.to_csv(data_dir + f'all_scores/motifs_k{motif_len}_agg_{agg_strategy}_zoo-al_hs.tsv.gz',sep='\t',index=False)