In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
import pickle
from collections import defaultdict
import re
from tqdm import tqdm


In [2]:
motif_len = 5 # length of the motifs cut from each sequence, also part of the output file name; 5 or 6

In [3]:
# root folder of the motif prediction data; every other input and the output are addressed relative to it
# (the commented-out line is an alternative location kept by the author)
#data_dir = '/s/project/mll/sergey/effect_prediction/MLM/motif_predictions/'
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/motif_predictions/'

In [4]:
# eCLIP intervals in 3'UTRs, looked up by sequence name in the main loop below
# NOTE(review): presumably maps seq_name -> intervals sorted by start coordinate - confirm against the file
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted location
eclip_pos_intervals = pd.read_pickle(data_dir + '../eCLIP/eCLIP_intervals_3UTR.df')

In [5]:
#compute conservation scores only on fraction of data
#indexed by seq_name; iterated further below as (seq_name, seq) pairs
test_df = (
    pd.read_csv(data_dir + 'split_75_25/test.csv')
    .set_index('seq_name')
    .squeeze()
)

In [6]:
#3'UTR table

utr_bed_file = data_dir + '../UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed'

#the BED file has no header row; only columns 3, 5 and 9 are kept
utr_table = pd.read_csv(
    utr_bed_file,
    sep='\t',
    header=None,
    usecols=[3,5,9],
    names=['seq_name','strand','seq_len'],
).set_index('seq_name')

# Conservation

For each motif in each human 3'UTR sequence, count the aligned sequences (one per species) that contain the same motif at the same position, or within a given tolerance window around it.

In [7]:
# all 3'UTR FASTA alignments are distributed across subfolders
# find all FASTA files and their subfolders

MAX_ALIGNMENT_LEN = 5000 #length at which sequences were cut after extracting

fasta_path = data_dir + '../aligned/data/3_prime_UTR/'

fasta_dirs = []

for fasta_file in glob(fasta_path + '**/*.fa', recursive=True):
    # separate the containing folder from the file name; unlike unpacking split('/'),
    # this also works for files located directly in fasta_path or nested more than one level deep
    folder, file_name = os.path.split(os.path.relpath(fasta_file, fasta_path))
    # drop only the trailing extension; replace('.fa','') would also cut '.fa' inside the name
    seq_id = os.path.splitext(file_name)[0]
    fasta_dirs.append((folder,seq_id))

# seq_name -> folder; selecting the column explicitly (instead of squeeze) keeps a Series
# even when a single FASTA file is found
fasta_dirs = pd.DataFrame(fasta_dirs, columns=['folder','seq_name']).set_index('seq_name')['folder']

In [8]:
def read_fasta(fasta_file):
    
    '''
    Read FASTA file

    Every line starting with '>' opens a new record; the header text itself is ignored.
    The remaining lines of a record are stripped of trailing whitespace, upper-cased and
    concatenated. Blank lines are skipped wherever they occur.

    fasta_file: path to the FASTA file

    Returns a list with one sequence string per record, in file order
    (a header without sequence lines yields an empty string).

    Raises ValueError if sequence data appears before the first header.
    '''
    
    records = [] #one list of line chunks per record, joined once at the end

    with open(fasta_file,'r') as f:
        for line in f:
            if line.startswith('>'):
                records.append([])
                continue
            chunk = line.rstrip().upper()
            if not chunk:
                #blank line: adds nothing to a sequence and must not crash before the first header
                continue
            if not records:
                raise ValueError(f"{fasta_file}: sequence data found before the first '>' header")
            records[-1].append(chunk)
                
    return [''.join(chunks) for chunks in records]

In [9]:
def get_matches(fasta_seqs, motif_start, motif, tol=0, max_alignment_len=None):
    
    '''
    For fasta_seqs yield the number of strings with the given motif
    The motif can be located anywhere between motif_start-tol and motif_start+tol within the sequence

    fasta_seqs: list of sequences; fasta_seqs[0] is the reference and must carry the motif at motif_start
    motif_start: 0-based start of the motif on the reference sequence
    motif: motif string
    tol: how far (in positions) the motif start may be shifted to either side
    max_alignment_len: length limit of the sequences; defaults to the global MAX_ALIGNMENT_LEN

    Returns the number of sequences with a match as a float (the reference itself is counted),
    or NaN if the motif does not fit between 0 and max_alignment_len.

    Raises Exception if the reference sequence does not have the motif at motif_start.
    '''
    
    if max_alignment_len is None:
        max_alignment_len = MAX_ALIGNMENT_LEN

    motif_len = len(motif)

    if motif_start<0 or motif_start>max_alignment_len-motif_len:
        return np.nan #np.NaN was removed in NumPy 2.0
    
    if fasta_seqs[0][motif_start:motif_start+motif_len] != motif:
        print(motif,motif_start)
        raise Exception("Wrong motif on reference sequence") 
    
    #the search window is the same for all sequences
    window_start = max(motif_start-tol,0)
    window_end = motif_start+motif_len+tol

    N_matches = 0.
    
    for seq in fasta_seqs:
        if motif in seq[window_start:window_end]:
            N_matches+=1

    return N_matches

In [10]:
def is_interval_in_seq(find_me, interval_seq):
    '''
    Tell whether find_me lies completely inside one of the intervals of interval_seq

    find_me: (start, end) interval to look for
    interval_seq: iterable of (start, end) intervals, sorted by start

    Returns True if some interval fully contains find_me, False otherwise
    '''
    for candidate in interval_seq:
        candidate_start = candidate[0]
        if candidate_start > find_me[-1]:
            # this interval and, thanks to the sorting, all following ones begin after find_me ends
            return False
        if candidate_start <= find_me[0] and candidate[1] >= find_me[1]:
            # find_me does not stick out on either side
            return True
    
    return False

In [11]:
# one record per motif occurrence: (seq_name, motif, motif_start, is_eCLIP, Nmatches for each tolerance)
consv_df = []

#tolerances (in positions) around the motif start; one Nmatches value is stored per tolerance, in this order
tolerances = (0,10,50,100,250,500,1000,2500,5000)

for seq_name,seq in tqdm(test_df.items(), total=len(test_df)):

    file_path = fasta_path + fasta_dirs.loc[seq_name] + '/' + seq_name + '.fa'
    
    fasta_seqs = read_fasta(file_path)

    if seq_name in eclip_pos_intervals.index:
        eCLIP_valid_intervals = eclip_pos_intervals.loc[seq_name]
    else:
        eCLIP_valid_intervals = []

    utr_row = utr_table.loc[seq_name] #single lookup for both fields
    strand,seq_len = utr_row.strand, utr_row.seq_len

    #shift between positions in seq and positions in the aligned reference, constant for the whole sequence
    #NOTE(review): presumably alignments of '-' strand sequences keep the end of the sequence when cut - confirm
    if strand=='-':
        alignment_shift = seq_len - len(fasta_seqs[0])
    else:
        alignment_shift = 0
    
    #+1: the last motif starts at len(seq)-motif_len and was skipped by the previous upper bound
    for motif_start in range(0,len(seq)-motif_len+1):
        motif_start_aligned = motif_start - alignment_shift
        if motif_start_aligned<0 or motif_start_aligned>MAX_ALIGNMENT_LEN-motif_len:
            #motif is not covered by the alignment; get_matches would return NaN
            continue
        motif = seq[motif_start:motif_start+motif_len]
        is_eCLIP = is_interval_in_seq((motif_start,motif_start+motif_len),eCLIP_valid_intervals)
        Nmatches = [int(get_matches(fasta_seqs, motif_start_aligned, motif, tol)) for tol in tolerances]
        consv_df.append((seq_name, motif, motif_start, int(is_eCLIP),*Nmatches))

  1%|          | 37/4534 [02:21<4:46:59,  3.83s/it]

KeyboardInterrupt



In [77]:
# one Nmatches column per tolerance, in the order in which the counts were appended to each record
nmatches_columns = [f'Nmatches_{tol}' for tol in (0,10,50,100,250,500,1000,2500,5000)]

motifs_df = pd.DataFrame(
    consv_df,
    columns=['seq_name','motif','motif_start','is_eCLIP'] + nmatches_columns,
)

In [None]:
#for each motif, get distance to the nearest position of the same motif

NO_NEIGHBOUR_DISTANCE = 5000 #only one motif in this sequence - no neighbours, give max distance

# occurrences of the same motif in the same sequence become consecutive rows, ordered by position
occurrences = motifs_df[['seq_name','motif','motif_start']].sort_values(by=['seq_name','motif','motif_start'])
same_motif_start = occurrences.groupby(['seq_name','motif'], sort=False).motif_start

# distance to the previous / next occurrence of the same motif; NaN where that neighbour does not exist
# (the leftmost occurrence now gets the distance to its right neighbour, not the neighbour's position)
gap_left = occurrences.motif_start - same_motif_start.shift(1)
gap_right = same_motif_start.shift(-1) - occurrences.motif_start

# row-wise minimum skips a missing neighbour; rows without any neighbour stay NaN and get the max distance
L_nearest = (
    pd.concat([gap_left, gap_right], axis=1)
    .min(axis=1)
    .fillna(NO_NEIGHBOUR_DISTANCE)
    .astype(int)
)

# list of (seq_name, motif, motif_start, L_nearest) records
motif_nearest = list(zip(occurrences.seq_name, occurrences.motif, occurrences.motif_start, L_nearest))

In [None]:
# turn the collected (seq_name, motif, motif_start, L_nearest) records into a table
nearest_columns = ['seq_name','motif','motif_start','L_nearest']
motif_nearest = pd.DataFrame(motif_nearest, columns=nearest_columns)

In [None]:
# attach L_nearest to every motif row; no `on=` is given, so the join uses the columns both frames share
# how='left' keeps every row of motifs_df even if it has no counterpart in motif_nearest
motifs_df = motifs_df.merge(motif_nearest, how='left')

In [80]:
# output file name carries the motif length; the row index is not written
output_path = data_dir + f'species_aware/conservation_k{motif_len}_new.csv.gz'
motifs_df.to_csv(output_path, index=None)