In [1]:
import os
from glob import glob

import itertools

import pandas as pd
import numpy as np

import re

from collections import defaultdict

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

In [2]:
# Length of the analysed motifs (k-mers). It is used as the slice width when motif
# positions are verified against the sequences and as the suffix of the output file
# name (conservation_k{motif_len}.tsv).
motif_len = 5 #5 or 6
# Read by read_fasta: when True, lowercase a/c/g/t (repeats) are replaced by '-'
# before upper-casing, so a motif can never match inside a repeat.
NOREPEATS = False #ignore all motif matches in repeats (lowercase in fasta)

In [3]:
# Root directory of the motif-analysis data. Every other path in this notebook is
# built by plain string concatenation on top of it, so it must end with a separator.
# The optional MOTIF_ANALYSIS_DIR environment variable overrides the cluster default,
# which lets the notebook run on another machine without editing the code.
data_dir = os.path.join(
    os.environ.get(
        'MOTIF_ANALYSIS_DIR',
        '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/motif_analysis/',
    ),
    '',  # joining with an empty component guarantees the trailing separator
)

In [8]:
eclip_tsv  = data_dir + 'eclip.tsv'

# the three columns that identify one motif occurrence in a 3'UTR
eclip_columns = ['seq_name','motif_start','motif']

# Parse only the columns that are kept afterwards: less memory, and columns that are
# never used can no longer trigger parser warnings.
# NOTE(review): the saved cell output shows a warning raised by read_csv; if it is
# caused by one of the three columns above, an explicit dtype= is still needed.
eclip_df = pd.read_csv(eclip_tsv, sep='\t', usecols=eclip_columns)

# usecols keeps the file's column order, so re-select to fix the order, then
# keep each (seq_name, motif_start, motif) combination once
eclip_df = eclip_df[eclip_columns].drop_duplicates()

eclip_df.head()

  eclip_df = pd.read_csv(eclip_tsv,sep='\t')


Unnamed: 0,seq_name,motif_start,motif
0,ENST00000379370.7_utr3_35_0_chr1_1054982_f,30,TTATT
1,ENST00000379370.7_utr3_35_0_chr1_1054982_f,40,TTTTT
2,ENST00000379370.7_utr3_35_0_chr1_1054982_f,58,TTTTT
3,ENST00000379370.7_utr3_35_0_chr1_1054982_f,764,ACCCC
4,ENST00000379370.7_utr3_35_0_chr1_1054982_f,789,TTCCC


In [5]:
# Series indexed by seq_name; each value is the sorted array of distinct motifs of that sequence
eclip_motifs = eclip_df.groupby('seq_name')['motif'].apply(np.unique)

# Conservation

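For each eCLIP motif occurrence in each human 3'UTR sequence, count the aligned sequences (species) in which the same motif is found within a tolerance distance $R$ of the human position. The count is reported for every $R$ in `tol_distances`.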

In [9]:
# All 3'UTR FASTA alignments are distributed across subfolders of fasta_path.
# Build a lookup seq_name -> subfolder (relative to fasta_path) for every *.fa file.

fasta_path = data_dir + '../fasta/aligned_3UTR/fa/'

fasta_records = []

for fa_file in glob(fasta_path + '**/*.fa', recursive=True):
    # os.path.split copes with files lying directly in fasta_path (folder == '') and
    # with deeper nesting (folder == 'a/b'); unpacking split('/') would raise there
    folder, fa_name = os.path.split(os.path.relpath(fa_file, fasta_path))
    seq_id = fa_name[:-len('.fa')] # strip only the extension, not every '.fa' inside the name
    fasta_records.append((folder,seq_id))

# select the column explicitly: squeeze() would collapse a one-row table to a bare string
fasta_dirs = pd.DataFrame(fasta_records, columns=['folder','seq_name']).set_index('seq_name')['folder']

fasta_dirs.head()

seq_name
ENST00000394232.6_utr3_12_0_chr15_91292553_f    18
ENST00000291442.4_utr3_3_0_chr19_17231883_r     18
ENST00000310981.6_utr3_0_0_chr2_127702177_f     18
ENST00000418260.3_utr3_1_0_chr12_31615023_r     18
ENST00000380985.10_utr3_12_0_chr5_65822916_f    18
Name: folder, dtype: object

In [10]:
def read_fasta(fasta, mask_repeats=None):
    '''
    Read a (multi-)FASTA file into a mapping {header: sequence}.

    fasta: path of the FASTA file
    mask_repeats: if True, lowercase a/c/g/t (repeats) are replaced by '-' so that
                  no motif can match there; if None (default), the notebook-level
                  NOREPEATS switch is used

    The header is the text after '>' with trailing whitespace removed. All sequence
    lines that follow a header are concatenated and upper-cased; records sharing a
    header are concatenated as well.

    Returns a defaultdict(str) in the order in which the records appear in the file.
    Raises ValueError if sequence text appears before the first header line.
    '''
    if mask_repeats is None:
        mask_repeats = NOREPEATS

    chunks = defaultdict(list) # header -> pieces of sequence, joined once at the end
    seq_name = None

    with open(fasta, 'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_name = line[1:].rstrip()
                continue
            seq = line.rstrip()
            if seq_name is None:
                if not seq:
                    continue # blank lines before the first record are harmless
                raise ValueError(f'{fasta}: sequence data found before the first ">" header line')
            if mask_repeats:
                seq = re.sub('[acgt]','-',seq) #treat repeats as deletions
            chunks[seq_name].append(seq.upper())

    seqs = defaultdict(str)
    for name, pieces in chunks.items():
        seqs[name] = ''.join(pieces)
    return seqs

In [11]:
def is_interval_in_seq(find_me, interval_seq):
    '''
    Tell whether the interval find_me lies completely inside one of the intervals
    of interval_seq.

    interval_seq has to be sorted by the start (0th) coordinate; this allows the
    scan to stop at the first interval that begins after the end of find_me.
    '''
    for candidate in interval_seq:
        if find_me[-1] < candidate[0]:
            # every remaining interval starts even further to the right
            break
        # containment: the candidate starts no later and ends no earlier than find_me
        if candidate[0] <= find_me[0] and find_me[1] <= candidate[1]:
            return True

    return False

In [12]:
def best_match_dm(motif_pos_ref, motif_pos_test):
    '''
    Find the best matching positions between reference and test motif positions

    motif_pos_ref: positions of the motif in the reference sequence (any order)
    motif_pos_test: positions of the motif in the test sequence

    A test position is a candidate for a reference position only if it is strictly
    closer to it than the neighbouring reference position on the same side. Among
    the candidates, pairs are assigned greedily by increasing distance (ties: lowest
    reference position first, then first test position in input order); every
    reference and every test position is used at most once.

    Returns a list of (reference position, test position) tuples in the order in
    which they were assigned; the list is empty if either input is empty.
    '''
    ref_sorted = sorted(motif_pos_ref)

    if len(ref_sorted) == 0 or len(motif_pos_test) == 0:
        return [] # nothing to match; np.nanmin/nanargmin would raise on an empty matrix

    ref_arr = np.asarray(ref_sorted, dtype=float)
    test_arr = np.asarray(motif_pos_test, dtype=float)

    # distance to the neighbouring reference position on each side (unbounded at the ends)
    gaps = np.diff(ref_arr)
    dist_max_left = np.concatenate(([np.inf], gaps))
    dist_max_right = np.concatenate((gaps, [np.inf]))

    offset = ref_arr[:, None] - test_arr[None, :] # >= 0: test position left of (or at) the reference
    distance = np.abs(offset)
    #avoid that conservation windows around reference motifs overlap
    allowed = np.where(offset >= 0,
                       distance < dist_max_left[:, None],
                       distance < dist_max_right[:, None])
    distance_matrix = np.where(allowed, distance, np.nan)

    matched_pairs = [] #best (reference,test) matches

    # explicit all-NaN test: no reliance on the all-NaN RuntimeWarning being silenced
    while not np.isnan(distance_matrix).all():
        # first minimum in row-major order
        row_idx, col_idx = np.unravel_index(np.nanargmin(distance_matrix), distance_matrix.shape)

        matched_pairs.append((ref_sorted[row_idx], motif_pos_test[col_idx]))

        # both positions are consumed
        distance_matrix[row_idx,:] = np.nan
        distance_matrix[:,col_idx] = np.nan

    return matched_pairs

In [13]:
# Sanity check for best_match_dm (uncomment to run); expected result: [(2, 0), (7, 5)]
#motif = 'ACTAC'
#
#refseq = 'TAACTACACTAC'
#testseq = 'ACTACACTAC'
#
#motif_pos_ref = [m.start() for m in re.finditer(motif,refseq)] #all positions of given motif in the sequence
#motif_pos_test = [m.start() for m in re.finditer(motif,testseq)] #all positions of given motif in the sequence
#
#
#best_match_dm(motif_pos_ref, motif_pos_test)

In [16]:
def get_matches(fasta_seqs, tol_distances, motif_len, eCLIP_intervals_seq, overlapping=False):
    '''
    compute conservation scores for various tol_distances in given fasta_seqs for all eCLIP motifs

    fasta_seqs: mapping {sequence name: sequence}; the FIRST entry is the reference,
                all following entries are compared against it
    tol_distances: iterable of tolerance distances R
    motif_len: not used by this function, kept so that existing calls stay valid
    eCLIP_intervals_seq: iterable of motif strings to look for
    overlapping: if False (default, original behaviour) occurrences are found with a
                 plain left-to-right scan, so overlapping occurrences (e.g. 'TTTTT' at
                 positions 0 and 1 of 'TTTTTT') are reported only once; if True every
                 start position is reported

    Returns a list of tuples (motif, reference position, n_0, n_1, ...), where n_i is the
    number of non-reference sequences whose occurrence matched to that reference position
    (see best_match_dm) lies within tol_distances[i] of it. Reference positions that are
    never matched in any other sequence produce no tuple at all.
    '''

    seq_res = []

    for motif in eCLIP_intervals_seq:
        # the motif is literal text, not a pattern; the lookahead form matches at
        # every start position because it does not consume the motif
        literal = re.escape(motif)
        motif_re = re.compile(f'(?={literal})' if overlapping else literal)

        motif_pos_ref = None
        motif_res = {} #conservation scores for given motif

        # sequence names are not needed here, so headers of any format are accepted
        for seq_idx, seq in enumerate(fasta_seqs.values()):
            motif_pos_cur = [m.start() for m in motif_re.finditer(seq)] #all positions of given motif in the sequence
            if seq_idx == 0:
                if not motif_pos_cur: #motif isn't found in reference
                    break
                motif_pos_ref = motif_pos_cur
                continue
            if not motif_pos_cur:
                continue
            for matched_pos_ref, matched_pos_cur in best_match_dm(motif_pos_ref, motif_pos_cur):
                if matched_pos_ref not in motif_res:
                    motif_res[matched_pos_ref] = {R:0 for R in tol_distances}
                shift = abs(matched_pos_ref-matched_pos_cur)
                for R in tol_distances:
                    if shift <= R:
                        motif_res[matched_pos_ref][R] += 1

        for matched_pos_ref, motif_counts in motif_res.items():
            seq_res.append((motif,matched_pos_ref,*motif_counts.values()))

    return seq_res

In [28]:
#tol_distance = W//2

# 0 followed by 16 log-spaced tolerances: half of 10**1 ... 10**4, truncated to integers
tol_distances = np.insert(
    (np.power(10, np.linspace(1, 4, 16)) / 2).astype(int),
    0,
    0,
)

In [29]:
# human 3'UTR sequences, restricted to those that have at least one eCLIP motif
human_3utr = {
    name: sequence
    for name, sequence in read_fasta(data_dir + '../fasta/Homo_sapiens_rna.fa').items()
    if name in eclip_motifs.index
}

In [30]:
# names of the per-tolerance count columns; identical for every sequence, so built once
score_columns = [f'R_{R}' for R in tol_distances]

per_utr_frames = [] # one conservation table per human 3'UTR

for seq_name, seq in tqdm(human_3utr.items()):

    # multi-species alignment of this 3'UTR
    file_path = fasta_path + fasta_dirs.loc[seq_name] + '/' + seq_name + '.fa'

    fasta_seqs = read_fasta(file_path)

    seq_res = get_matches(fasta_seqs, tol_distances, motif_len, eclip_motifs.loc[seq_name])

    seq_res = pd.DataFrame(seq_res,columns=['motif','motif_start']+score_columns)

    # first sequence of the alignment; '' for an empty file (seq_res is empty then as well)
    refseq = next(iter(fasta_seqs.values()), '')

    # reported positions must point at the motif in both copies of the human sequence
    for motif,motif_start in seq_res[['motif','motif_start']].values.tolist():
        #Homo_sapiens sequence from reference RNA fasta
        assert seq[motif_start:motif_start+motif_len]==motif, \
            f'{seq_name}: {motif} not found at {motif_start} in the reference RNA fasta'
        #Homo_sapiens sequence from alignment
        assert refseq[motif_start:motif_start+motif_len]==motif, \
            f'{seq_name}: {motif} not found at {motif_start} in the alignment'

    seq_res['seq_name'] = seq_name

    per_utr_frames.append(seq_res)

if per_utr_frames:
    consv_df = pd.concat(per_utr_frames)
else:
    # pd.concat raises on an empty list; keep the expected columns instead
    consv_df = pd.DataFrame(columns=['motif','motif_start']+score_columns+['seq_name'])

  0%|          | 0/11503 [00:00<?, ?it/s]

In [31]:
# keep only motif occurrences listed in the eCLIP table; the keys are exactly the
# columns that both tables share
key_columns = ['seq_name','motif','motif_start']
score_columns = [f'R_{R}' for R in tol_distances]

consv_df = consv_df.merge(eclip_df, how='inner', on=key_columns)
consv_df = consv_df[key_columns + score_columns]

# tab-separated output without the row index
consv_df.to_csv(data_dir + f'conservation_k{motif_len}.tsv', sep='\t', index=False)