In [6]:
import pandas as pd

### Overlap between (non-filtered) pre-miRNAs

In [102]:
full_genome_vmir = pd.read_csv('./full-genome/raw/vmir-output.csv')
full_genome_vmir.rename(columns={'Sequence':'sequence'}, inplace=True)
full_genome_mirnafold = pd.read_csv('./full-genome/raw/mirnafold-output.csv')
full_genome_mirnafold.rename(columns={'0':'sequence'}, inplace=True)
full_genome = pd.concat([full_genome_vmir, full_genome_mirnafold], ignore_index=True, verify_integrity=True)

print(f'Full-genome total: {len(full_genome)} (VMIR: {len(full_genome_vmir)}, miRNAFold: {len(full_genome_mirnafold)})')

Full-genome total: 775 (VMIR: 256, miRNAFold: 519)


In [93]:
non_conserved_region_vmir = pd.read_csv('./non-conserved-region/raw/vmir-output.csv')
non_conserved_region_vmir.rename(columns={'Sequence':'sequence'}, inplace=True)
non_conserved_region_mirnafold = pd.read_csv('./non-conserved-region/raw/mirnafold-output.csv')
non_conserved_region_mirnafold.rename(columns={'0':'sequence'}, inplace=True)
non_conserved = pd.concat([non_conserved_region_vmir, non_conserved_region_mirnafold], ignore_index=True, verify_integrity=True)

print(f'Non-conserved total: {len(non_conserved)} (VMIR: {len(non_conserved_region_vmir)}, miRNAFold: {len(non_conserved_region_mirnafold)})')

Non-conserved total: 153 (VMIR: 14, miRNAFold: 139)


In [111]:
full_genome_seqs = set(full_genome.sequence)
non_conserved_seqs = set(non_conserved.sequence)

combination = full_genome_seqs.union(non_conserved_seqs)
new_non_conserved = combination.difference(full_genome_seqs)
print(f'Found {len(new_non_conserved)} new sequences in non-conserved region ðŸ˜…')

vmir = 0
mirnafold = 0

for sequence in new_non_conserved:
    if sequence in list(non_conserved_region_vmir.sequence):
        vmir += 1
    if sequence in list(non_conserved_region_mirnafold.sequence):
        mirnafold += 1
        
print(f'Vmir: {vmir} miRNAFold: {mirnafold}')

Found 6 new sequences in non-conserved region ðŸ˜…
Vmir: 1 miRNAFold: 5


### Overlap between *filtered* pre-miRNAs

In [119]:
full_genome_vmir = pd.read_csv('./full-genome/filtered/vmir_output_filtered.csv')
full_genome_vmir.rename(columns={'Sequence':'sequence'}, inplace=True)
full_genome_mirnafold = pd.read_csv('./full-genome/filtered/filtered-pre-mirnas.csv')
full_genome_mirnafold.rename(columns={'pre-miRNA candidate seq':'sequence', 'MFE':'mfe', 'MFEIS':'mfeis'}, inplace=True)
full_genome = pd.concat([full_genome_vmir, full_genome_mirnafold], ignore_index=True, verify_integrity=True)

print(f'Full-genome total: {len(full_genome)} (VMIR: {len(full_genome_vmir)}, miRNAFold: {len(full_genome_mirnafold)})')

Full-genome total: 37 (VMIR: 10, miRNAFold: 27)


In [120]:
non_conserved_region_vmir = pd.read_csv('./non-conserved-region/filtered/vmir_output_from_S_filtered.csv')
non_conserved_region_mirnafold = pd.read_csv('./non-conserved-region/filtered/mirnafold_output_from_S_filtered.csv')
non_conserved_region_mirnafold.rename(columns={'pre-miRNA candidate seq':'sequence', 'MFE':'mfe', 'MFEIS':'mfeis'}, inplace=True)
non_conserved = pd.concat([non_conserved_region_vmir, non_conserved_region_mirnafold], ignore_index=True, verify_integrity=True)

print(f'Non-conserved total: {len(non_conserved)} (VMIR: {len(non_conserved_region_vmir)}, miRNAFold: {len(non_conserved_region_mirnafold)})')

Non-conserved total: 15 (VMIR: 4, miRNAFold: 11)


In [121]:
full_genome_seqs = set(full_genome.sequence)
non_conserved_seqs = set(non_conserved.sequence)

combination = full_genome_seqs.union(non_conserved_seqs)
new_non_conserved = combination.difference(full_genome_seqs)
print(f'Found {len(new_non_conserved)} new sequences in non-conserved region ðŸ˜…')

for sequence in new_non_conserved:
    if sequence in list(non_conserved_region_vmir.sequence):
        vmir += 1
    if sequence in list(non_conserved_region_mirnafold.sequence):
        mirnafold += 1
        
print(f'Vmir: {vmir} miRNAFold: {mirnafold}')

Found 7 new sequences in non-conserved region ðŸ˜…
Vmir: 4 miRNAFold: 9


In [131]:
sorted(new_non_conserved)

['AGCAAGUGCACUUGGAAAACUUCAAGAUGUGGUCAACCAAAAUGCACAAGCU',
 'ATGCGGAATTATATAGGACAGAATAATCAGCAACACAGTTGCTGATTCTCTTCCTGTTCCAAGCAT',
 'CAGAGUAGUAGUACUUUCUUUUGAACUUCUACAUGCACCAGCAACUGUUUGUGGACCUAAAAAGUCUACUAAUUUG',
 'GGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACC',
 'TTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAA',
 'UGCAACGAUACCGAUACAAGCCUCACUCCCUUUCGGAUGGCUUAUUGUUGGCGUUGCA',
 'UUGACUUUCAGGUUACUAUAGCAGAGAUAUUACUAAUUAUUAUGAGGACUUUUAAAGUUUCCAUUUGGAAUCUUGAUUACAUCAUAAACCUCAUAAUUAAAAAUUUAUCUAAGUCACUAACUGAGAAUAAAUAUUCUCAA']

In [129]:
sorted(list(non_conserved_region_mirnafold.sequence))

['AAAGUAUGAGCAGUAUAUAAAAUGGCCAUGGUACAUUUGGCUAGGUUUUAUAGCUGGCUUGAUUGCCAUAGUAAUGGUGACAAUUAUGCUUU',
 'AGCAAGUGCACUUGGAAAACUUCAAGAUGUGGUCAACCAAAAUGCACAAGCU',
 'CAGAGUAGUAGUACUUUCUUUUGAACUUCUACAUGCACCAGCAACUGUUUGUGGACCUAAAAAGUCUACUAAUUUG',
 'CUUUUCUUAUGGACCUUGAAGGAAAACAGGGUAAUUUCAAAAAUCUUAGGGAAUUUGUGUUUAAGAAUAUUGAUGGUUAUUUUAAAAUAUAUUCUAAGCACACGCCUAUUAAUUUAGUGCGUGAUCUCCCUCAGGGUUUUUCGGCUUUAG',
 'GCAUCAUUUUCCACUUUUAAGUGUUAUGGAGUGUCUCCUACUAAAUUAAAUGAUCUCUGCUUUACUAAUGUCUAUGCAGAUUCAUUUGUAAUUAGAGGUGAUGAAGUCAGACAAAUCGCUCCAGGGCAAACUGGAAAGAUUGC',
 'GGAAGUUCAAGAACUUUACUCUCCAAUUUUUCUUAUUGUUGCGGCAAUAGUGUUUAUAACACUUUGCUUCACACUCAAAAGAAAGACAGAAUGAUUGAACUUUCAUUAAUUGACUUCU',
 'GUAACUUCUUCAAUUGUCAUUACUUCAGGUGAUGGCACAACAAGUCCUAUUUCUGAACAUGACUACCAGAUUGGUGGUUAUACUGAAAAAUGGGAAUCUGGAGUAAAAGACUGUGUUGUAUUACACAGUUAC',
 'UCAGACAAGAGGAAGUUCAAGAACUUUACUCUCCAAUUUUUCUUAUUGUUGCGGCAAUAGUGUUUAUAACACUUUGCUUCACACUCAAAAGAAAGACAGAAUGAUUGAACUUUCAUUAAUUGA',
 'UCAUUUGUAAUUAGAGGUGAUGAAGUCAGACAAAUCGCUCCAGGGCAAACUGGAAAGAUUGCUGAUUAUAAU

In [130]:
sorted(list(non_conserved_region_vmir.sequence))

['ATGCGGAATTATATAGGACAGAATAATCAGCAACACAGTTGCTGATTCTCTTCCTGTTCCAAGCAT',
 'ATTGTGTGAATTTGGACATGTTCTTCAGGCTCATCAACAATTTTATTGTAGATGAAGAAGGTAACATGTTCAACACCAGT',
 'GGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACC',
 'TTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAA']

In [78]:
non_conserved

Unnamed: 0,sequence,mfe,mfeis,class
0,ATGCGGAATTATATAGGACAGAATAATCAGCAACACAGTTGCTGAT...,-23.9,-0.919231,Real
1,ATTGTGTGAATTTGGACATGTTCTTCAGGCTCATCAACAATTTTAT...,-25.0,-0.892857,Real
2,GGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGC...,-21.5,-0.895833,Real
3,TTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCC...,-18.200001,-0.866667,Pseudo
4,CUUUUCUUAUGGACCUUGAAGGAAAACAGGGUAAUUUCAAAAAUCU...,-42.9,-0.858,Real
5,GCAUCAUUUUCCACUUUUAAGUGUUAUGGAGUGUCUCCUACUAAAU...,-47.9,-0.903774,Real
6,UCAUUUGUAAUUAGAGGUGAUGAAGUCAGACAAAUCGCUCCAGGGC...,-27.3,-0.880645,Real
7,CAGAGUAGUAGUACUUUCUUUUGAACUUCUACAUGCACCAGCAACU...,-25.2,-0.9,Pseudo
8,AGCAAGUGCACUUGGAAAACUUCAAGAUGUGGUCAACCAAAAUGCA...,-19.3,-0.877273,Pseudo
9,AAAGUAUGAGCAGUAUAUAAAAUGGCCAUGGUACAUUUGGCUAGGU...,-28.1,-0.851515,Real
