In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
import pickle
from collections import defaultdict
import re
from tqdm import tqdm


In [2]:
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/motif_predictions/'

In [3]:
test_df = pd.read_csv(data_dir + 'split_75_25/test.csv').set_index('seq_name').squeeze() #compute conservation scores only on fraction of data

In [4]:
table_motifs = pd.read_csv(data_dir + 'motifs.csv') #predictions for motifs from Dominguez et al. 2018 + 10 random motifx
table_motifs.loc[table_motifs.protein.isna(), 'protein'] = 'Random' 

In [5]:
with open(data_dir + 'species_aware/probas.pickle','rb') as f:
    probs = dict(pickle.load(f)) 

In [6]:
motif_preds = []

for seq_name,seq in tqdm(test_df.items(), total=len(test_df)):
    for protein, motif in table_motifs.set_index('protein').squeeze().items():
        for match in re.finditer(motif,seq):
            avg_target_prob = np.mean(probs[seq_name][match.start():match.end()])
            motif_preds.append((seq_name,match.start(),motif,avg_target_prob))
            
motif_preds = pd.DataFrame(motif_preds,columns = ['seq_name','motif_start','motif','avg_target_prob']).set_index('seq_name')

100%|██████████| 4534/4534 [00:59<00:00, 75.83it/s] 


In [7]:
#3'UTR table

utr_table = pd.read_csv(data_dir + '../UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed', sep='\t',
                       header = None, names=['seq_name','strand','seq_len'], usecols=[3,5,9]).set_index('seq_name')

In [None]:
#test for some seq_idx

seq_idx = 3699

seq_name = seq_matching.loc[seq_idx]
file_path = fasta_path + fasta_dirs.loc[seq_name] + '/' + seq_name + '.fa'
file_path

In [40]:
utr_table.loc[seq_name]

strand        +
seq_len    1357
Name: ENST00000338167.10_utr3_16_0_chr11_130143449_f, dtype: object

# Conservation

For each motif in each 3'UTR Human sequence, compute the number of sequences (species) that also have this motif

In [9]:
# all 3'UTR FASTA alignments are distributed across subfolders
# find all FASTA files and their subfolders

MAX_ALIGNMENT_LEN = 5000 #length at which sequences were cut after extracting

fasta_path = data_dir + '../aligned/data/3_prime_UTR/'

fasta_dirs = []

for file in glob(fasta_path + '**/*.fa', recursive=True):
    relative_path = os.path.relpath(file, fasta_path)
    folder, seq_id = relative_path.split('/')
    seq_id = seq_id.replace('.fa','')
    fasta_dirs.append((folder,seq_id))
    
fasta_dirs = pd.DataFrame(fasta_dirs, columns=['folder','seq_name']).set_index('seq_name').squeeze()

In [10]:
def read_fasta(fasta_file):
    
    '''
    Read FASTA file
    sequence names are ignored
    outputs an array of sequences
    '''
    
    fasta_seqs = []

    with open(fasta_file,'r') as f:
        for line in f:
            if line.startswith('>'):
                fasta_seqs.append('')
            else:
                fasta_seqs[-1] += line.rstrip().upper()
                
    return fasta_seqs

In [11]:
def get_matches(fasta_seqs, motif_start, motif, tol=0):
    
    '''
    For fasta_seqs yield the number of strings with the given motif
    The motif can be located anywhere between motif_start-tol and motif_start+tol within the sequence
    '''
    
    motif_len = len(motif)

    if motif_start<0 or motif_start>MAX_ALIGNMENT_LEN-motif_len:
        return np.NaN
    
    N_matches = 0.
    
    if fasta_seqs[0][motif_start:motif_start+motif_len] != motif:
        print(motif,motif_start)
        raise Exception("Wrong motif on reference sequence") 
    
    for seq in fasta_seqs:
        if seq[max(motif_start-tol,0):motif_start+motif_len+tol].find(motif)>-1:
            N_matches+=1

    return N_matches

In [12]:
%%time

#loop over all FASTA sequences

for seq_idx, seq_name in enumerate(motif_preds.index.unique()):
        
    file_path = fasta_path + fasta_dirs.loc[seq_name] + '/' + seq_name + '.fa'
    
    fasta_seqs = read_fasta(file_path)
    
    motif_df = motif_preds.loc[seq_name,['motif','motif_start']] #motifs found within this FASTA sequence
    
    if utr_table.loc[seq_name].strand=='-':
        motif_df.motif_start = motif_df.motif_start - (utr_table.loc[seq_name].seq_len - len(fasta_seqs[0])) #correct error (we cut seqeunces and then took RC when aligned)
    
    #compute the number of motif matches for several tolerance values
    for tol in (0,10,50,100,200,300):
        if type(motif_df)==pd.DataFrame:
            motif_preds.loc[seq_name,f'Nmatches_{tol}'] = motif_df.apply(lambda x: get_matches(fasta_seqs,x.motif_start,x.motif,tol), axis=1)
        else:
            motif_preds.loc[seq_name,f'Nmatches_{tol}'] = get_matches(fasta_seqs,motif_df.motif_start,motif_df.motif,tol)
            #motif_preds.loc[seq_name,f'Nmatches_ref'] = len(list(re.finditer(motif_df.motif,fasta_seqs[0])))
    
    #compute motif counts in the ref sequence
    #if type(motif_df)==pd.DataFrame:
    #    motif_preds.loc[seq_name,f'motif_counts_ref'] = motif_df.motif.apply(lambda x: len(list(re.finditer(x,fasta_seqs[0]))))
    #else:
    #    motif_preds.loc[seq_name,f'motif_counts_ref'] = len(list(re.finditer(motif_df.motif,fasta_seqs[0])))
                                               
    if (seq_idx)%500==0:
        print(seq_idx)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
CPU times: user 1h 36min 12s, sys: 1.74 s, total: 1h 36min 14s
Wall time: 1h 38min 48s


In [11]:
def seq_entropy(seq):
    probs = np.zeros((4,)) #probability for each base in the sequence
    L = len(seq)
    for base_idx, base in enumerate('ACTG'):
        prob_base = len([True for x in seq if x==base])/L #probability of base in the sequence
        probs[base_idx] = prob_base
    probs = probs[probs!=0] #exclude zeros to avoid error in log
    entropy = (-probs*np.log2(probs)).sum()
    return entropy

In [12]:
entropy_dict = {seq:seq_entropy(seq) for seq in motif_preds.motif.unique()}

In [13]:
motif_preds.to_csv(data_dir + 'species_aware/conservation_df.csv.gz')

# Old

In [13]:
motif_preds['motif_entropy'] = motif_preds.motif.map(entropy_dict)

In [20]:
human_fasta = data_dir + 'fasta/240_mammals/species/Homo_sapiens.fa'

In [21]:
human_utr = defaultdict(str)

with open(human_fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            seq_name = line[1:].split(':')[0]
        else:
            human_utr[seq_name] += line.rstrip()

In [204]:
entropy_dict = {seq_idx:seq_entropy(human_utr[seq_name].upper()) for seq_idx,seq_name in  seq_matching.items()}

In [205]:
motif_preds['seq_entropy'] = motif_preds.index.map(entropy_dict)

In [170]:
seqlen_dict = {seq_idx:len(human_utr[seq_name]) for seq_idx,seq_name in  seq_matching.items()}

In [171]:
motif_preds['seq_len'] = motif_preds.index.map(seqlen_dict)

In [177]:
motif_reoccurence = motif_preds.motif.value_counts().to_dict()

In [181]:
motif_preds['motif_counts'] = motif_preds.motif.map(motif_reoccurence)

In [207]:
#motif_preds.to_csv(data_dir + 'motif_predictions/species_aware/conservation_df.csv.gz')

# Old

In [None]:
motif_preds.loc[seq_idx]

In [None]:
conservation_df = pd.read_parquet(data_dir + "ML4RG_project/students_data/conservation/part2_full_df_70.parquet")

In [None]:
with open(data_dir + 'ML4RG_project/students_data/conservation/final_self_df_70.pickle','rb') as f:
    indexseq_df = pickle.load(f)
    
debug_seq = ''.join(indexseq_df['3UTR'].tolist())

In [None]:
len(debug_seq)

22829308

In [206]:
alphabet_seq = conservation_df.motif_range.apply(lambda x:debug_seq[x[0]:x[1]])

len(set(alphabet_seq))

1024

In [5]:
nucleotides = 'ACGTN'

with open(data_dir + 'ML4RG_project/students_data/conservation/final_whole_seq_70.pickle','rb') as f:
    debug_seq = pickle.load(f)
    debug_seq = ''.join(list(map(lambda x:nucleotides[x], debug_seq)))

In [6]:
len(debug_seq)

13114727

In [7]:
alphabet_seq = conservation_df.motif_range.apply(lambda x:debug_seq[x[0]:x[1]])

len(set(alphabet_seq))

283

In [8]:
motifs_debugseq = set(alphabet_seq)

In [9]:
human_fasta = data_dir + 'fasta/240_mammals/species/Homo_sapiens.fa'

In [13]:
from collections import defaultdict

human_utr = defaultdict(str)

with open(human_fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            fasta_id = line[1:].split(':')[0]
        else:
            human_utr[fasta_id] += line.rstrip()

In [43]:
seqpos = []

for fasta_id,seq in human_utr.items():
    seq_start = debug_seq.find(seq.upper())
    if seq_start != -1:
        seq_end = seq_start + len(seq)
        seqpos.append((fasta_id, seq_start, seq_end))

In [33]:
motif_df = conservation_df.motif_range.apply(lambda x:x[0]).rename('motif_start').to_frame()

In [44]:
for fasta_id, seq_start, seq_end in seqpos:
    motif_df.loc[(motif_df.motif_start>=seq_start) & (motif_df.motif_start<seq_end), 'id'] = fasta_id

In [45]:
motif_df.id.isna().sum()

15434

In [46]:
seqpos = pd.DataFrame(seqpos, columns=['id','seq_start','seq_end'])

In [56]:
seq=human_utr['ENST00000528848.3_utr3_0_0_chr11_11351942_r']

In [51]:
debug_seq.find(seq.upper())

0

In [57]:
seq

'CG'

In [48]:
seqpos.sort_values(by='seq_start')

Unnamed: 0,id,seq_start,seq_end
4504,ENST00000395343.6_utr3_15_0_chr20_62877743_r,0,1490
1147,ENST00000528848.3_utr3_0_0_chr11_11351942_r,1,3
1240,ENST00000641865.1_utr3_1_0_chr11_58402464_r,1,2
4649,ENST00000390318.2_utr3_1_0_chr22_22872034_f,2,4
4887,ENST00000545770.7_utr3_3_0_chr3_49177634_r,14,16
...,...,...,...
6011,ENST00000371117.8_utr3_66_0_chr6_51615299_r,13107115,13110897
5095,ENST00000475390.2_utr3_1_0_chr3_165186720_r,13110897,13112074
3340,ENST00000592588.7_utr3_3_0_chr19_1440463_f,13112074,13112107
546,ENST00000302101.6_utr3_1_0_chr1_160371134_f,13112107,13113820


In [26]:
fasta_path = data_dir + 'aligned/data/3_prime_UTR/'

fasta_dirs = []

for file in glob(fasta_path + '**/*.fa', recursive=True):
    relative_path = os.path.relpath(file, fasta_path)
    folder, seq_id = relative_path.split('/')
    seq_id = seq_id.replace('.fa','')
    fasta_dirs.append((folder,seq_id))
    
fasta_dirs = pd.DataFrame(fasta_dirs, columns=['folder','id']).set_index('id').squeeze()

In [27]:
def read_fasta(fasta_file):
    
    fasta_seqs = []

    with open(fasta_file,'r') as f:
        for line in f:
            if line.startswith('>'):
                fasta_seqs.append('')
            else:
                fasta_seqs[-1] += line.rstrip().upper()
                
    return fasta_seqs

In [28]:
def get_consv_score(fasta_seqs, start_pos, motif_len=5, norm=None):
    
    N_matches = 0.
    
    motif = fasta_seqs[0][start_pos:start_pos+motif_len]

    for seq in fasta_seqs:
        if seq[start_pos:start_pos+motif_len] == motif:
            N_matches+=1

    N_seqs = len(fasta_seqs)
    
    if norm=='local':
        consv_score = N_matches/N_seqs
    else:
        consv_score = N_matches
        
    return consv_score

In [229]:
conservation_df = conservation_df.sort_values(by='id').set_index('id')

In [4]:
motifs = pd.read_excel(data_dir + 'dominguez_2018/1-s2.0-S1097276518303514-mmc4.xlsx',
             sheet_name =1)

motifs = motifs.iloc[1:,0::2].values.flatten()

motifs_table = set(filter(lambda v: v==v, motifs))

In [10]:
motifs_debugseq-motifs_table

{'ACTCC', 'ACTTA', 'ATGTC', 'CCACA', 'TGACT', 'TTCCG', 'TTGGG'}

In [117]:
motifs_debugseq = set(alphabet_seq)

In [149]:
len(set(motifs_debugseq))

1021

In [143]:
def int_to_str(motif):
    matchdict = 'ACGTN'
    return ''.join([matchdict[x] for x in motif])

In [146]:
motifs_debugseq = set(int_to_str(x.numpy()) for x in alphabet_seq)

In [126]:
list(filter(lambda x:x.numpy(),alphabet_seq))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [75]:
len(motifs_table.subtract(motifs_debugseq))

AttributeError: 'set' object has no attribute 'subtract'

In [116]:
alphabet_seq = conservation_df[['motif_range','seq_range']].apply(lambda x:debug_seq[x.motif_range[0]-x.seq_range[0]:x.motif_range[1]-x.seq_range[0]], axis=1)

In [94]:
motifs_debugseq = set(conservation_df.alphabet_seq)

In [133]:
conservation_df['motif_start'] = conservation_df[['motif_range','seq_range']].apply(lambda x:x.motif_range[0]-x.seq_range[0], axis=1)

In [None]:
for fasta_id in conservation_df.index.unique():
    file_path = fasta_path + fasta_dirs.loc[fasta_id] + fasta_id + '.fa'
    fasta_seqs = read_fasta(fasta_file)
    

In [146]:
for fasta_id in conservation_df.index.unique():
    file_path = fasta_path + fasta_dirs.loc[fasta_id] + fasta_id + '.fa'
    fasta_seqs = read_fasta(fasta_file)
    conservation_df.loc[fasta_id,'consv_score_counts'] = conservation_df.loc[fasta_id,'motif_start'].apply(lambda x: get_consv_score(fasta_seqs, x))
    conservation_df.loc[fasta_id,'consv_score_students'] = conservation_df.loc[fasta_id,'motif_start'].apply(lambda x: get_consv_score(fasta_seqs, x, norm='local'))

AttributeError: 'numpy.int64' object has no attribute 'apply'

FileNotFoundError: [Errno 2] No such file or directory: '/s/project/mll/sergey/effect_prediction/MLM/ML4RG_project/students_data/final_whole_seq_70.pickle'

In [19]:
seq

tensor([0, 1, 2,  ..., 0, 2, 0])

In [18]:
df

Unnamed: 0,id,3UTR,seq_range
0,ENST00000334600.7_utr3_3_0_chr1_13341892_r,GGAAGGCGTGCCTAGCGGGGTAGAGAAATCCAAAGTTCTCTTCCAG...,"(0, 636)"
1,ENST00000398665.8_utr3_27_0_chr19_2229793_f,GATTTCTACCTCAACCGCGAGACCTATGCAAGGACGGTGTGGACCA...,"(636, 3422)"
2,ENST00000352681.8_utr3_7_0_chr16_678053_f,CCGGCTACCTGAGGCTGCACAGGCCAGGGCTCGGGCATGTGGTGGC...,"(3422, 3638)"
3,ENST00000375341.8_utr3_14_0_chr1_18871430_r,GCCCCTCTCGGGCTCCACCGTCCAGCTGTCTGTCCGTCCAGGTGGC...,"(3638, 5053)"
4,ENST00000336985.11_utr3_7_0_chr22_44737248_f,GGCCTCACAGCTGGCCTTGAGTTTTTACTGACACGTCCCTGTGTGC...,"(5053, 5487)"
...,...,...,...
12688,ENST00000285896.11_utr3_6_0_chr5_154875440_f,TGGCGCCAGGCTCTGCAGGGTGGGCCTGATCCCAGAGTGGTGCTTA...,"(22820103, 22821456)"
12689,ENST00000549884.6_utr3_28_0_chr12_56595596_r,GGCAAGGGAGGTGGGGAGTCACCTTGTGGCATCTCCCCCCACCTTC...,"(22821456, 22824478)"
12690,ENST00000389532.9_utr3_16_0_chr9_135808487_r,CCCCGGTGCAGCTGGCGTCCAAAGGGTGACCCAGACTCGTAAATGA...,"(22824478, 22827300)"
12691,ENST00000340524.10_utr3_8_0_chr3_191267168_r,AGCTTTTTCTCTGGATGCAAAAAAAGATAAGAATATCAGGAAAATA...,"(22827300, 22828548)"


In [15]:
seq[8964378:8964383]

tensor([3, 0, 3, 0, 3])

In [16]:
df

NameError: name 'df' is not defined

In [157]:
fasta_id = 'ENST00000334600.7_utr3_3_0_chr1_13341892_r'

fasta_file = fasta_path  + fasta_dirs.loc[fasta_id] +'/'+ fasta_id + '.fa'

fasta_seqs = read_fasta(fasta_file)

In [10]:
model_df['start_pos'] = model_df.motif_range.apply(lambda x:x[0])

In [None]:
for fasta_id in conservation_df.index.unique():
    file_path = fasta_path + fasta_dirs.loc[fasta_id] + fasta_id + '.fa'
    fasta_seqs = read_fasta(fasta_file)
    if fasta_seqs[0][0:5]==''

In [11]:
model_df.sort_values(by='start_pos').drop_duplicates(subset='motif')

Unnamed: 0,motif_range,motif,Model,avg_target_probas,start_pos
266914,"[0, 5]",PCBP2,11-mer,0.305935,0
127190,"[281, 286]",CELF1,Species-aware,0.261089,281
216375,"[381, 386]",KHSRP,Species-agnostic,0.314125,381
143399,"[381, 386]",PUM1,Species-aware,0.290721,381
112149,"[509, 514]",RBFOX2,Species-aware,0.249339,509
38641,"[587, 592]",PTBP3,Dinucleotide distribution,0.298225,587
262806,"[747, 752]",TRA2A,11-mer,0.298917,747
96131,"[839, 844]",PABPN1L,Species-aware,0.341799,839
227689,"[895, 900]",KHDRBS2,Species-agnostic,0.315431,895
272183,"[968, 973]",TARDBP,11-mer,0.274026,968


In [151]:
conservation_df.loc['ENST00000334600.7_utr3_3_0_chr1_13341892_r']

Unnamed: 0_level_0,motif_range,motif,model,avg_target_probas,alphabet_seq,shanon_entropy,seq_range,conserv_rate,motif_start,consv_score_counts,consv_score_students
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[577, 582]",RBMS3_3,Species aware all motifs,0.238700,ATATA,0.970951,"[0, 636]",0.033333,577,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[207, 212]",TRNAU1AP_4,Species aware all motifs,0.418737,ATTTA,0.970951,"[0, 636]",0.133333,207,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[299, 304]",HNRNPF_9,Species aware all motifs,0.262675,AGGGA,0.970951,"[0, 636]",0.033333,299,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[22, 27]",RALY_5,Species aware all motifs,0.389148,CTTTT,0.721928,"[0, 636]",0.066667,22,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[284, 289]",ELAVL4_3,Species aware all motifs,0.384210,TTATT,0.721928,"[0, 636]",0.033333,284,,
...,...,...,...,...,...,...,...,...,...,...,...
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[573, 578]",TRNAU1AP_3,Species aware all motifs,0.474998,TTTTA,0.721928,"[0, 636]",0.033333,573,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[121, 126]",SFPQ_5,Species aware all motifs,0.343194,GTAAT,1.521928,"[0, 636]",0.133333,121,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[369, 374]",TRNAU1AP_3,Species aware all motifs,0.327616,TTTTA,0.721928,"[0, 636]",0.033333,369,,
ENST00000334600.7_utr3_3_0_chr1_13341892_r,"[593, 598]",ELAVL4_6,Species aware all motifs,0.354730,ATTTT,0.721928,"[0, 636]",0.033333,593,,


In [152]:
df

Unnamed: 0,id,3UTR,seq_range
0,ENST00000334600.7_utr3_3_0_chr1_13341892_r,GGAAGGCGTGCCTAGCGGGGTAGAGAAATCCAAAGTTCTCTTCCAG...,"(0, 636)"
1,ENST00000398665.8_utr3_27_0_chr19_2229793_f,GATTTCTACCTCAACCGCGAGACCTATGCAAGGACGGTGTGGACCA...,"(636, 3422)"
2,ENST00000352681.8_utr3_7_0_chr16_678053_f,CCGGCTACCTGAGGCTGCACAGGCCAGGGCTCGGGCATGTGGTGGC...,"(3422, 3638)"
3,ENST00000375341.8_utr3_14_0_chr1_18871430_r,GCCCCTCTCGGGCTCCACCGTCCAGCTGTCTGTCCGTCCAGGTGGC...,"(3638, 5053)"
4,ENST00000336985.11_utr3_7_0_chr22_44737248_f,GGCCTCACAGCTGGCCTTGAGTTTTTACTGACACGTCCCTGTGTGC...,"(5053, 5487)"
...,...,...,...
12688,ENST00000285896.11_utr3_6_0_chr5_154875440_f,TGGCGCCAGGCTCTGCAGGGTGGGCCTGATCCCAGAGTGGTGCTTA...,"(22820103, 22821456)"
12689,ENST00000549884.6_utr3_28_0_chr12_56595596_r,GGCAAGGGAGGTGGGGAGTCACCTTGTGGCATCTCCCCCCACCTTC...,"(22821456, 22824478)"
12690,ENST00000389532.9_utr3_16_0_chr9_135808487_r,CCCCGGTGCAGCTGGCGTCCAAAGGGTGACCCAGACTCGTAAATGA...,"(22824478, 22827300)"
12691,ENST00000340524.10_utr3_8_0_chr3_191267168_r,AGCTTTTTCTCTGGATGCAAAAAAAGATAAGAATATCAGGAAAATA...,"(22827300, 22828548)"


In [None]:
fasta_seqsfasta_seqsfasta_seqs

In [140]:
    motif_len = 5
    start_pos = 1061
    
    N_matches = 0.

    for seq in fasta_seqs:
        if seq[start_pos:start_pos+motif_len] == motif:
            N_matches+=1
            
    N_matches

0.0

In [76]:
fasta_dirs.loc['ENST00000394468.7_utr3_9_0_chr7_93188534_r']

'822'

In [68]:
fasta_seqs[0][12515548-12514726:]

'AATGTTATTGTAATAAAGTGTGATGGAAAATCCAGGTAATTAAAAAATAAATTATAACTA'

In [88]:
fasta_seqs[-1].find('ACTCC')

-1

In [74]:
len(fasta_seqs[0])

882

In [92]:
start_pos = 12515548-12514726

In [94]:
motif = fasta_seqs[0][start_pos:start_pos+5]

In [100]:
n_matches/N_seqs

0.13278008298755187

In [62]:
conservation_df

Unnamed: 0,motif_range,motif,model,avg_target_probas,alphabet_seq,shanon_entropy,id,seq_range,conserv_rate
0,"[12515548, 12515553]",non_motif,Species aware all motifs,0.315958,ACTCC,1.370951,ENST00000298542.9_utr3_11_0_chrX_132076990_r,"[12514726, 12515608]",0.132780
1,"[7290525, 7290530]",non_motif,Species aware all motifs,0.249635,ACTCC,1.370951,ENST00000374160.8_utr3_3_0_chr10_48909480_r,"[7289818, 7290575]",0.132743
2,"[10009478, 10009483]",non_motif,Species aware all motifs,0.262820,TGACT,1.921928,ENST00000397256.5_utr3_11_0_chr3_9834822_f,"[10009238, 10009818]",0.583333
3,"[7881980, 7881985]",non_motif,Species aware all motifs,0.214460,ACTCC,1.370951,ENST00000396267.3_utr3_1_0_chr17_8358822_r,"[7878900, 7889212]",0.178423
4,"[12389715, 12389720]",non_motif,Species aware all motifs,0.215341,TTCCG,1.521928,ENST00000360280.8_utr3_71_0_chr9_77416007_f,"[12388946, 12394477]",0.244813
...,...,...,...,...,...,...,...,...,...
10840718,"[13108524, 13108529]",UNK_17,Species aware all motifs,0.275231,TTAGG,1.521928,ENST00000394468.7_utr3_9_0_chr7_93188534_r,"[13108075, 13108808]",0.054167
10840719,"[13108828, 13108833]",UNK_17,Species aware all motifs,0.272801,TTAGG,1.521928,ENST00000339754.11_utr3_7_0_chr12_8096091_f,"[13108808, 13110599]",0.190871
10840720,"[13109676, 13109681]",UNK_17,Species aware all motifs,0.243267,TTAGG,1.521928,ENST00000339754.11_utr3_7_0_chr12_8096091_f,"[13108808, 13110599]",0.510373
10840721,"[13112894, 13112899]",UNK_17,Species aware all motifs,0.271611,TTAGG,1.521928,ENST00000529689.6_utr3_8_0_chr11_83259081_r,"[13111707, 13114537]",0.405286
