In [1]:
import pandas as pd

## Pre-process VMIR output

In [2]:
def parse_vmir_sequence(path):
    """Return the 11th line of the file at ``path`` with surrounding whitespace removed.

    The whole file is read into memory as a list of lines and the entry at
    index 10 is returned.  An ``IndexError`` is raised if the file has fewer
    than 11 lines.
    """
    with open(path, 'r') as vmir_file:
        lines = vmir_file.readlines()
    # Index 10 is hard-coded: presumably where VMIR writes the analysed
    # sequence in its report -- TODO confirm against the VMIR file format.
    return lines[10].strip()
    
    
def complementary_seq(sequence, reverse=False):
    """Return the DNA complement of ``sequence`` (A<->T, G<->C).

    Only the upper-case characters ``A``, ``T``, ``G`` and ``C`` are mapped;
    any other character is passed through unchanged.

    Parameters
    ----------
    sequence : str
        Nucleotide string to complement.
    reverse : bool, default False
        When true, the complemented string is also reversed, yielding the
        reverse complement.
    """
    base_pairs = str.maketrans('ATGC', 'TACG')
    complemented = sequence.translate(base_pairs)
    return complemented[::-1] if reverse else complemented

    
def get_seq_from_summary(summary, full_sequence):
    """Extract the sub-sequence described by one summary row.

    Parameters
    ----------
    summary : object with ``Start``, ``Size`` and ``Orientation`` attributes
        ``Start`` is treated as a 1-based position and ``Size`` as the number
        of characters to take.
    full_sequence : str
        The sequence the coordinates refer to.

    Returns
    -------
    str
        ``full_sequence[Start-1 : Start-1+Size]``; when ``Orientation`` equals
        ``'Reverse'`` the slice is reverse-complemented via
        ``complementary_seq``.
    """
    seq_start = summary.Start - 1  # convert the 1-based start to a 0-based index
    # Slice the sequence that was passed in (previously this read the
    # notebook-level global ``full_seq`` and ignored the parameter).
    sequence = full_sequence[seq_start:seq_start + summary.Size]
    if summary.Orientation == 'Reverse':
        sequence = complementary_seq(sequence, reverse=True)
    return sequence

In [51]:
# VMIR result files to merge into a single table.
vmir_outputs = ['sequence_no_header_no_whitespace_Export.txt', 'default_min_score_115_win_count_35_size_50-220.txt']

vmir_combined = pd.DataFrame()

for vmir_output in vmir_outputs:
    # The first 32 lines are skipped -- presumably the VMIR report preamble
    # (TODO confirm).  The separator is written as a raw string so "\s" is not
    # an invalid string escape, and the python engine is requested explicitly
    # because pandas needs it for regex separators.
    df = pd.read_csv(vmir_output, skiprows=32, sep=r'\s', engine='python')
    df['Source'] = vmir_output  # remember which file each row came from
    print(f'loaded dataframe of size {len(df)} from {vmir_output}')
    if vmir_combined.empty:
        vmir_combined = df
    else:
        # The newly loaded frame goes first, so when two rows share the same
        # (Start, Size) the row from the later file is the one that is kept.
        vmir_combined = pd.concat([df, vmir_combined]).drop_duplicates(subset=['Start', 'Size']).reset_index(drop=True)

vmir_combined

loaded dataframe of size 222 from sequence_no_header_no_whitespace_Export.txt
loaded dataframe of size 51 from default_min_score_115_win_count_35_size_50-220.txt


  df = pd.read_csv(vmir_output, skiprows=32, sep='\s')


Unnamed: 0,Rank,Name,Orientation,Start,Apex,Size,Score,Sub,HPs,Rep,HPs.1,Wind.Cnt.Abs.,Wind.Cnt.Rel.,Source
0,14,MD1,Direct,398,445,94,171.8,1,0,1,40,,,default_min_score_115_win_count_35_size_50-220...
1,29,MD3,Direct,564,594,59,142.4,1,0,44,45,,,default_min_score_115_win_count_35_size_50-220...
2,25,MD8,Direct,1968,2002,68,144.9,2,0,20,45,,,default_min_score_115_win_count_35_size_50-220...
3,10,MD10,Direct,2107,2162,107,180.1,3,0,1,41,,,default_min_score_115_win_count_35_size_50-220...
4,44,MD12,Direct,2284,2348,120,123.7,2,0,3,44,,,default_min_score_115_win_count_35_size_50-220...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,157,MR77,Reverse,27867,27900,74,129.2,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt
252,203,MR78,Reverse,27942,27986,89,119.5,0,0,2,2,,,sequence_no_header_no_whitespace_Export.txt
253,120,MR79,Reverse,28216,28252,81,137.4,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt
254,73,MR80,Reverse,28421,28460,80,155.1,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt


In [52]:
# Load the reference sequence that the VMIR Start/Size coordinates index into.
# No empty-string placeholder is assigned first: if the read fails, later
# cells should fail loudly rather than slice an empty sequence.
with open('sequence_no_header_no_whitespace.fasta', 'r') as f:
    # strip() removes any leading/trailing whitespace (e.g. a final newline)
    # so it is neither counted in the length nor included in extracted slices.
    full_seq = f.read().strip()

len(full_seq)

29903

In [53]:
# Look up the nucleotide sequence of every VMIR candidate in the reference
# sequence, one row at a time, and attach the result as a new column.
sequences = [
    get_seq_from_summary(row, full_seq)
    for index, row in vmir_combined.iterrows()
]

vmir_combined['Sequence'] = sequences
vmir_combined

Unnamed: 0,Rank,Name,Orientation,Start,Apex,Size,Score,Sub,HPs,Rep,HPs.1,Wind.Cnt.Abs.,Wind.Cnt.Rel.,Source,Sequence
0,14,MD1,Direct,398,445,94,171.8,1,0,1,40,,,default_min_score_115_win_count_35_size_50-220...,CATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCG...
1,29,MD3,Direct,564,594,59,142.4,1,0,44,45,,,default_min_score_115_win_count_35_size_50-220...,GTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACC...
2,25,MD8,Direct,1968,2002,68,144.9,2,0,20,45,,,default_min_score_115_win_count_35_size_50-220...,CAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGC...
3,10,MD10,Direct,2107,2162,107,180.1,3,0,1,41,,,default_min_score_115_win_count_35_size_50-220...,TAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGAT...
4,44,MD12,Direct,2284,2348,120,123.7,2,0,3,44,,,default_min_score_115_win_count_35_size_50-220...,GAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,157,MR77,Reverse,27867,27900,74,129.2,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,AATGCAGCTACAGTTGTGATGATTCCTAAGAAAACAAGAAATTTCA...
252,203,MR78,Reverse,27942,27986,89,119.5,0,0,2,2,,,sequence_no_header_no_whitespace_Export.txt,TACCATTTAGAATAGAAGTGAATAGGACACGGGTCATCAACTACAT...
253,120,MR79,Reverse,28216,28252,81,137.4,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,TTTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAA...
254,73,MR80,Reverse,28421,28460,80,155.1,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,GTGTTAATTGGAACGCCTTGTCCTCGAGGGAATTTAAGGTCTTCCT...


In [54]:
# vmir_combined.to_csv('vmir-pre-mirnas-with-seqs-combined.tsv', sep='\t', index=False)

## Pre-process miRNAFold output

In [47]:
# Records in the miRNAFold output are separated by blank lines.
with open('miRNAfold_output.txt', 'r') as f:
    mirna_records = f.read().split('\n\n')

# The second line of each non-empty record is taken as the sequence.
# NOTE: we replace uracil by thymine for compatibility with VMIR output
mirnas = [
    record.split('\n')[1].strip().replace('U', 'T')
    for record in mirna_records
    if len(record) > 0
]

len(mirnas)

519

# Find common pre-miRNA predictions

In [55]:
# Compare the two tools' predictions as sets of exact sequence strings.
vmir_seqs = set(vmir_combined['Sequence'])
mirnafold_seqs = set(mirnas)

# Sequences reported identically by both tools.
direct_intersection = mirnafold_seqs & vmir_seqs
# miRNAFold sequences whose reverse complement matches a VMIR sequence.
reverse_intersection = {
    complementary_seq(seq, reverse=True) for seq in mirnafold_seqs
} & vmir_seqs
total_intersection = direct_intersection | reverse_intersection

print('Direct intersections:', len(direct_intersection))
print('Reverse intersections:', len(reverse_intersection))

Direct intersections: 4
Reverse intersections: 1


In [56]:
print('miRNA candidates predicted by both VMIR and miRNAFold:')
# Keep only the VMIR rows whose sequence is in the combined intersection.
in_both_tools = vmir_combined['Sequence'].isin(total_intersection)
vmir_combined[in_both_tools]

miRNA candidates predicted by both VMIR and miRNAFold:


Unnamed: 0,Rank,Name,Orientation,Start,Apex,Size,Score,Sub,HPs,Rep,HPs.1,Wind.Cnt.Abs.,Wind.Cnt.Rel.,Source,Sequence
18,21,MD76,Direct,17137,17166,61,159.0,1,0,1,44,,,default_min_score_115_win_count_35_size_50-220...,TCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATG...
20,23,MD79,Direct,17951,18020,131,151.5,7,0,8,41,,,default_min_score_115_win_count_35_size_50-220...,TACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATT...
55,95,MD5,Direct,1083,1124,86,147.3,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,TTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGT...
89,15,MD40,Direct,7723,7764,89,191.5,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,GTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATC...
241,187,MR64,Reverse,24722,24762,79,123.0,0,0,1,1,,,sequence_no_header_no_whitespace_Export.txt,CAGGAGCAGTTGTGAAGTTCTTTTCTTGTGCAGGGACATAAGTCAC...


# MFEI calculation

In [1]:
# Pre-miRNA candidate sequence used as the worked example below.
# A lower (more negative) MFEI is generally better; a candidate should reach at least
# MFEI ≤ -0.85 kcal/mol per https://www.biorxiv.org/content/10.1101/2020.11.02.365049v1.full.pdf
string = 'GAUUGCUGCAGUCAUAACAAGAGAAGUGGGUUUUGUCGUGCCUGGUUUGCCUGGCACGAUAUUACGCACAACUAAUGGUGACUUUUUGCAUUUC'

def mfei_calculation(precursor, mfe):
    """Compute the minimal folding free energy index (MFEI) of a precursor.

    The value returned is ``mfe / len(precursor) / gc_fraction`` where
    ``gc_fraction`` is the share of ``G`` and ``C`` characters in the
    sequence.  Counting is case-insensitive.

    Parameters
    ----------
    precursor : str
        Nucleotide sequence of the candidate.
    mfe : float
        Minimal folding free energy of the candidate.

    Raises
    ------
    ZeroDivisionError
        If ``precursor`` is empty or contains no ``G``/``C`` characters, in
        which case the index is undefined.
    """
    # Normalise case so lower-case input is not miscounted as having no G/C.
    bases = precursor.upper()
    length = len(bases)
    if length == 0:
        raise ZeroDivisionError('MFEI is undefined for an empty precursor sequence')
    g_and_c = (bases.count('G') + bases.count('C')) / length
    if g_and_c == 0:
        raise ZeroDivisionError('MFEI is undefined for a precursor without G or C')
    return mfe / length / g_and_c

# The MFE of -37.3 is entered by hand -- presumably taken from an external
# folding tool (TODO confirm its source).
mfei_calculation(string,-37.3)

-0.8880952380952382