In [2]:
import numpy as np
import pandas as pd

In [7]:
# Mature miRNAs derived from the hairpins predicted by miRNAFold.
mature_mirnas_ours = pd.read_csv(
    filepath_or_buffer="../mature-miRNAs/mature-mirnas-from-mirnafold.csv"
)
# Preview the first rows.
mature_mirnas_ours.head(n=5)

Unnamed: 0,sequence,strand
0,AUGUCUACUUAGCUGUCUUUGA,5'
1,CUUCUGGUAAUCUAUUACUAGA,5'
2,UACCAUAUUGGGUAGUGCUUUA,3'
3,UACUAAAUUAAAUGAUCUCUGC,5'
4,AGUGUUUUAAAUGAUAUCCUUU,3'


In [9]:
# Reference miRNAs published with the paper.
# Source: https://www.frontiersin.org/articles/10.3389/fgene.2020.00765/full#supplementary-material
# File: "Data_Sheet_3_Epigenetic Regulator miRNA Pattern Differences Among SARS-CoV, SARS-CoV-2, and SARS-CoV-2
# World-Wide Isolates Delineated the Mystery Behi.xlsx"
# Only the SARS-CoV-2 column was kept and exported to CSV.
# The first line of that CSV is skipped, so the second line supplies the header.
mirnas_paper = pd.read_csv(
    filepath_or_buffer="../mature-miRNAs/papers/miRNA_paper.csv",
    skiprows=1,
)
mirnas_paper.head(n=5)

Unnamed: 0,miRNA SARS-CoV-2
0,>5'stem-miRNA 102
1,CAAGGGCUGGUGAAGCUGCUAA
2,>5'stem-miRNA 103
3,UAGGUGAGUUAGGUGAUGUUAG
4,>5'stem-miRNA 110


In [10]:
# The paper's column is FASTA-like: rows alternate between a '>' header line
# (e.g. ">5'stem-miRNA 102") and the sequence that belongs to it. Pair every
# sequence with the header directly above it.
raw_column = 'miRNA SARS-CoV-2'

# Guard: after reshaping, the raw column no longer exists, so re-running this
# cell leaves the frame untouched instead of failing on the reshaped data.
if raw_column in mirnas_paper.columns:
    # Even positions are headers, odd positions are sequences. The frame comes
    # straight from read_csv, so position and index label coincide.
    stem = mirnas_paper.iloc[0::2].rename(columns={raw_column: 'stem'})
    sequence_rows = mirnas_paper.iloc[1::2]

    # Fail loudly if the file does not strictly alternate header/sequence;
    # otherwise sequences would be silently paired with the wrong header.
    layout_ok = (
        len(stem) == len(sequence_rows)
        and stem['stem'].astype(str).str.startswith('>').all()
        and not sequence_rows[raw_column].astype(str).str.startswith('>').any()
    )
    if not layout_ok:
        raise ValueError(
            "miRNA_paper.csv is expected to alternate '>' header rows and sequence rows"
        )

    # Keep the original (odd) index labels of the sequence rows; attach the
    # header text as a plain 1-D array so it is aligned by position.
    # NOTE(review): 'strand' holds the full header text (e.g. ">5'stem-miRNA 102"),
    # not just 5'/3' as in mature_mirnas_ours -- confirm this is intended.
    mirnas_paper = (
        sequence_rows
        .rename(columns={raw_column: 'sequence'})
        .assign(strand=stem['stem'].to_numpy())
    )

mirnas_paper.head()

Unnamed: 0,sequence,strand
1,CAAGGGCUGGUGAAGCUGCUAA,>5'stem-miRNA 102
3,UAGGUGAGUUAGGUGAUGUUAG,>5'stem-miRNA 103
5,AAAGUCCUCAGAAUACAAAGGU,>5'stem-miRNA 110
7,UGGUGUAUACGUUGUCUUUGGA,>5'stem-miRNA 116
9,AUACUAUAGCUAAUUAUGCUAA,>5'stem-miRNA 123


In [11]:
# Keep only the sequences that occur both in the paper's list and in our
# miRNAFold-derived list (inner join on the sequence string).
common_mirnas = pd.merge(
    mirnas_paper,
    mature_mirnas_ours,
    on='sequence',
    how='inner',
)

In [12]:
# Report how many of our mature miRNAs also appear in the paper.
print(
    f"{len(common_mirnas)} miRNAs out of {len(mature_mirnas_ours)} that we obtained, "
    "are same as in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7381279/pdf/fgene-11-00765.pdf"
)

50 miRNAs out of 75 that we obtained, are same as in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7381279/pdf/fgene-11-00765.pdf


 # Target prediction

* ## miRNAFold pipeline

The following predictions were obtained with the [miRDB](http://mirdb.org/custom.html) target prediction tool. For each miRNA from `mature-mirnas-from-mirnafold.csv`, predictions with a score `>80` were selected ([as the authors suggest](http://mirdb.org/faq.html#How_to_interpret_the_target_prediction_score)).

In [15]:
# miRDB target predictions for the miRNAFold-derived mature miRNAs.
target_df = pd.read_csv(
    filepath_or_buffer="../targets/mirdb_targets_for_mirnafold.csv"
)
target_df.head(n=5)

Unnamed: 0,Target Rank,Sequence,Gene Symbol,Gene Description
0,100,AUGUCUACUUAGCUGUCUUUGA,SPIN1,spindlin 1
1,95,AUGUCUACUUAGCUGUCUUUGA,PTPN11,"protein tyrosine phosphatase, non-receptor typ..."
2,95,AUGUCUACUUAGCUGUCUUUGA,UFM1,ubiquitin fold modifier 1
3,94,AUGUCUACUUAGCUGUCUUUGA,ASB15,ankyrin repeat and SOCS box containing 15
4,94,AUGUCUACUUAGCUGUCUUUGA,SPIN2A,spindlin family member 2A


Selecting unique genes:

In [61]:
# One row per distinct gene symbol, in order of first appearance.
# The output column is named 'Gene symbol' (lower-case "s"); the later
# merge between the two pipelines uses that name as its key.
unique_miRNAFold_targets = pd.DataFrame(
    {'Gene symbol': target_df['Gene Symbol'].unique()}
)

In [64]:
# Report the number of distinct target genes for the miRNAFold pipeline.
print(
    f"With miRDB have discovered {len(unique_miRNAFold_targets)} unique targets "
    "for the miRNAs filtered from miRNAFold predicted precursors."
)

With miRDB have discovered 8197 unique targets for the miRNAs filtered from miRNAFold predicted precursors.


In [66]:
# Write the gene symbols as a bare list: one per line, no header, no index.
# unique_miRNAFold_targets is already a DataFrame, so it is written directly.
unique_miRNAFold_targets.to_csv(
    path_or_buf="list_targets_by_miRDB_for_miRNAFold",
    index=False,
    header=False,
)

* ## VMir pipeline

In [19]:
# Mature miRNAs produced by the VMir pipeline.
mature_mirnas_vmir = pd.read_csv(
    filepath_or_buffer="../mature-miRNAs/mature-mirnas-from-vmir.csv"
)
mature_mirnas_vmir.head(n=5)

Unnamed: 0,sequence,strand
0,AUUUUUAUAUAAACCAAAAACU,5'
1,AGUGCAUCUUGAUCCUCAUAAC,3'
2,AUGAAGAAGGUAACAUGUUCAA,3'
3,UCAACAAUUUUAUUGUAGAUGA,5'
4,ACUGUGUUAUGUAUGCAUCAGC,3'


In [50]:
# Load the miRDB output for the VMir-derived mature miRNAs.
vmir_target_df = pd.read_csv(
    filepath_or_buffer="../targets/mirdb_targets_for_vmir.csv"
)
vmir_target_df.head(n=5)

Unnamed: 0,Target Rank,Sequence,Gene Symbol,Gene Description
0,1,AUUUUUAUAUAAACCAAAAACU,KLHL5,kelch like family member 5
1,2,AUUUUUAUAUAAACCAAAAACU,TWSG1,twisted gastrulation BMP signaling modulator 1
2,3,AUUUUUAUAUAAACCAAAAACU,RBMS3,RNA binding motif single stranded interacting ...
3,4,AUUUUUAUAUAAACCAAAAACU,PLOD2,"procollagen-lysine,2-oxoglutarate 5-dioxygenase 2"
4,5,AUUUUUAUAUAAACCAAAAACU,DGKH,diacylglycerol kinase eta


In [59]:
# One row per distinct gene symbol, in order of first appearance.
# The output column is named 'Gene symbol' (lower-case "s"); the later
# merge between the two pipelines uses that name as its key.
unique_vmir_targets = pd.DataFrame(
    {'Gene symbol': vmir_target_df['Gene Symbol'].unique()}
)

In [52]:
# Report the number of distinct target genes for the VMir pipeline.
print(
    f"With miRDB have discovered {len(unique_vmir_targets)} unique targets "
    "for the miRNAs filtered from VMir predicted precursors."
)

With miRDB have discovered 4582 unique targets for the miRNAs filtered from VMir predicted precursors.


In [81]:
# Write the gene symbols as a bare list: one per line, no header, no index.
# unique_vmir_targets is already a DataFrame, so it is written directly.
unique_vmir_targets.to_csv(
    path_or_buf="list_targets_by_miRDB_for_VMir",
    index=False,
    header=False,
)

In [72]:
# Gene symbols predicted as targets in both pipelines (inner join on the symbol).
common_targets = pd.merge(
    unique_vmir_targets,
    unique_miRNAFold_targets,
    on='Gene symbol',
    how='inner',
)

In [73]:
# Report how many target genes the two pipelines share.
print(
    f'There are {len(common_targets)} common targets obtained from both pipelines.'
)

There are 3338 common targets obtained from both pipelines.


In [74]:
# Write the shared gene symbols as a bare list: one per line, no header, no index.
common_targets.to_csv(
    path_or_buf="common_targets",
    index=False,
    header=False,
)