In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob

import sklearn, sklearn.linear_model, sklearn.metrics, sklearn.pipeline,sklearn.ensemble

import matplotlib
import scipy.stats

matplotlib.rcParams.update({'font.size': 16})

In [2]:
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/'

In [3]:
#3'UTR table

utr_table = pd.read_csv(data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed', sep='\t',
                       header = None, names=['seq_start','seq_end','seq_name','strand'], usecols=[1,2,3,5])

In [4]:
utr_variants = pd.read_csv(data_dir + 'gnomAD/gnomAD_GRCh38.utr3.tsv.gz', names=['chrom','pos','ref','alt','seq_name','AC','AF','AF_popmax'],
                        comment='#',sep='\t',header=None)

In [5]:
utr_variants.pos = utr_variants.pos-1 #to 0-based

len_ref = utr_variants.ref.apply(len)
len_alt = utr_variants.alt.apply(len)

utr_variants.loc[(len_ref==1)&(len_alt==1),'vartype'] = 'SNP'
utr_variants.loc[(len_ref>1)&(len_alt==1),'vartype'] = 'DEL'
utr_variants.loc[(len_ref==1)&(len_alt>1),'vartype'] = 'INS'

utr_variants = utr_variants[~utr_variants.isna()] #not SNPs and not INDELs

  utr_variants.loc[(len_ref==1)&(len_alt==1),'vartype'] = 'SNP'


In [6]:
utr_variants = utr_variants.merge(utr_table, how='left')

In [7]:
utr_variants.loc[utr_variants.strand=='+','pos_rel'] = utr_variants[utr_variants.strand=='+'].pos - utr_variants[utr_variants.strand=='+'].seq_start
utr_variants.loc[utr_variants.strand=='-','pos_rel'] = utr_variants[utr_variants.strand=='-'].seq_end - utr_variants[utr_variants.strand=='-'].pos-1
utr_variants.pos_rel = utr_variants.pos_rel.astype(int)

In [8]:
utr_variants = utr_variants[utr_variants.pos_rel<5000] #limit by the maximum sequence length on species-aware model

In [13]:
models = { 'Species-agnostic':'motif_predictions/species_agnostic/probas','Species-aware':'motif_predictions/species_aware/probas', 
          'DNABERT': 'motif_predictions/split_75_25/dnabert/default/', '11-mer':'motif_predictions/K-mer/11_mer',
          'PhyloP100': 'PhyloP/PhyloP100_3UTR' ,'PhyloP241': 'PhyloP/PhyloP241_3UTR'}

In [14]:
#utr_variants = pd.read_csv(data_dir + 'gnomAD/model_scores.tsv.gz', sep='\t')

In [None]:
for model_name in ('PhyloP100','PhyloP241','11-mer',):

    print(model_name)

    with open(data_dir + models[model_name] + '.pickle', 'rb') as f:
        probas = dict(pickle.load(f))
        
    for var_idx, var in tqdm(utr_variants.iterrows(), total=len(utr_variants)):
        if var.seq_name in probas.keys():
            if var.vartype=='SNP':
                utr_variants.at[var_idx,model_name] = probas[var.seq_name][var.pos_rel]
            else:
                if var.vartype=='INS':
                    left, right = var.pos_rel-2, var.pos_rel+2
                else:
                    if var.strand=='+':
                        left, right = var.pos_rel, var.pos_rel+len(var.ref)
                    else:
                        left, right = var.pos_rel-len(var.ref), var.pos_rel
                utr_variants.at[var_idx,model_name] = np.mean(probas[var.seq_name][left:right])

In [16]:
mapping = {'A':0,'C':1,'G':2,'T':3}

In [20]:
human_fasta = data_dir + 'fasta/240_species/species/Homo_sapiens.fa' #3'UTR on hegative strand should already be reversed

human_utr = defaultdict(str)

with open(human_fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            seq_name = line[1:].split(':')[0]
        else:
            human_utr[seq_name] += line.rstrip().upper()

In [17]:
def get_dnabert(res_path):
    res = {}
    for probas_file in glob(res_path + 'preds*.npy'):
        seq_matching = pd.read_csv(probas_file.replace('/preds','/seq').replace('.npy','.csv')).set_index('seq_name').squeeze()
        probas = np.load(probas_file)
        s = 0
        for seq_name, seq in seq_matching.items():
            #res[seq_name] = [x[mapping[seq[seq_idx]]] for seq_idx,x in enumerate(probas[s:s+len(seq),:])]
            res[seq_name] = probas[s:s+len(seq),:]
            s += len(seq)
        assert s==len(probas)
    return res

In [21]:
def add_model_res(model_name):

    print(model_name)

    if not 'DNABERT' in model_name:
        with open(data_dir + models[model_name] + '.pickle', 'rb') as f:
            probas = dict(pickle.load(f))
        #if 'Species' in model_name:
        #    probas = {k:v/v.sum(1, keepdims=True) for k,v in probas.items()} #normalize probabilities to 1
    else:
        probas = get_dnabert(data_dir + models[model_name])
        
    for var_idx, var in tqdm(utr_variants.iterrows(), total=len(utr_variants)):
        if var.seq_name in probas.keys():
            if var.vartype=='SNP':
                utr_variants.at[var_idx, model_name+'_alt'] = probas[var.seq_name][var.pos_rel, mapping[var.alt]]
                utr_variants.at[var_idx, model_name+'_ref'] = probas[var.seq_name][var.pos_rel, mapping[var.ref]]
            else:
                if var.vartype=='INS':
                    left, right = var.pos_rel-2, var.pos_rel+2
                else:
                    if var.strand=='+':
                        left, right = var.pos_rel, var.pos_rel+len(var.ref)
                    else:
                        left, right = var.pos_rel-len(var.ref), var.pos_rel
                ref_score = []
                seq = human_utr[var.seq_name]
                for pos_rel in range(max(left,0),min(right,len(seq))):
                    ref_score.append(probas[var.seq_name][pos_rel, mapping[seq[pos_rel]]]) 
                    #ref_score.append(np.max(probas[var.seq_name][pos_rel])) 
                utr_variants.at[var_idx, model_name+'_ref'] = np.mean(ref_score)

In [22]:
add_model_res('DNABERT')

DNABERT


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 6377294/6377294 [05:36<00:00, 18962.88it/s]


In [None]:
for model_name in ('Species-aware', 'Species-agnostic', 'DNABERT'):
    add_model_res(model_name)

In [23]:
utr_variants.to_csv(data_dir + 'gnomAD/model_scores.tsv.gz', sep='\t', index=None)

In [24]:
! ls -alh {data_dir + 'gnomAD/model_scores.tsv.gz'}

-rw-rw-r--+ 1 sergey.vilov OG-ICB-User 416M Oct 10 15:38 /lustre/groups/epigenereg01/workspace/projects/vale/MLM/gnomAD/model_scores.tsv.gz
