In [1]:
import pickle
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

In [2]:
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

In [3]:
model = 'PhyloP-100way' #PhyloP-100way or PhyloP-241way

In [4]:
utr_table = pd.read_csv(data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed', sep='\t',
                       header = None, names=['chrom','seq_start','seq_end','seq_name','strand'], usecols=[0,1,2,3,5]).set_index('seq_name')

In [5]:
phylop_res = pd.read_csv(data_dir + f'variants/prefiltered/PhyloP/{model}.3utr.scores.tsv.gz', sep='\t',
                           header = None, names=['chrom','pos',f'{model}-score']).sort_values(by=['chrom','pos']).set_index('chrom')

In [6]:
phylop_res.pos = phylop_res.pos-1 #to 0-based

In [7]:
fasta = data_dir + '/fasta/Homo_sapiens_dna_fwd.fa' #rna dataset

human_seqs = defaultdict(str)

with open(fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            seq_name = line[1:].rstrip()
        else:
            human_seqs[seq_name] += line.rstrip().upper()

In [8]:
utr_table = utr_table[utr_table.index.isin(human_seqs.keys())]

In [9]:
res = []

for seq_name, (chrom, start_seq,end_seq,strand) in tqdm(utr_table.iterrows(),total=len(utr_table)):
    L = end_seq-start_seq #sequence length
    seq_scores = np.full((L,), np.nan)
    df = phylop_res.loc[chrom]
    start_idx = np.searchsorted(df.pos,start_seq)
    end_idx = np.searchsorted(df.pos,end_seq)
    for _, row in df.iloc[start_idx:end_idx].iterrows():
        seq_scores[int(row.pos)-start_seq] = row[f'{model}-score']
    res.append((seq_name,human_seqs[seq_name],seq_scores))

100%|██████████| 18134/18134 [21:03<00:00, 14.35it/s] 


In [12]:
output_dir = data_dir + f'/human_3utr/probs/{model}/'

os.makedirs(output_dir, exist_ok=True)

with open(output_dir + 'predictions.pickle', 'wb') as f:
    seq_names, seqs, probs = zip(*res)
    pickle.dump({'seq_names':seq_names, 'seqs':seqs, 'probs':probs},f)