In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load the two raw inputs.
# The negative file is read as a single-column table; the column is renamed to
# 'seq_name' even though it holds both record-header lines and sequence lines.
# NOTE(review): read_csv uses the first line of the file as the column header by
# default, so that line never appears as a data row — confirm the file really
# starts with a header line and not with the first record.
pre_negative = pd.read_csv('dataset/human_MiRNA_DATA.csv')
pre_negative.columns = ['seq_name']
# Positive examples; the real column names sit in the first data row and are
# promoted to the header in a later cell.
pre_positive = pd.read_csv('dataset/positive_dataset.csv')

In [3]:
# Peek at the first ten raw rows of the negative file.
pre_negative.head(10)

Unnamed: 0,seq_name
0,>cel-let-7-5p MIMAT0000001 Caenorhabditis eleg...
1,UGAGGUAGUAGGUUGUAUAGUU
2,>cel-let-7-3p MIMAT0015091 Caenorhabditis eleg...
3,CUAUGCAAUUUUCUACCUUACC
4,>cel-lin-4-5p MIMAT0000002 Caenorhabditis eleg...
5,UCCCUGAGACCUCAAGUGUGA
6,>cel-lin-4-3p MIMAT0015092 Caenorhabditis eleg...
7,ACACCUGGGCUCUCCGGGUACC
8,>cel-miR-1-5p MIMAT0020301 Caenorhabditis eleg...
9,CAUACUUCCUUACAUGCCCAUA


In [4]:
# str.find reports where '>' occurs in each row: 0 when the row starts with '>'
# (a record-header line) and -1 when '>' is absent (a sequence line).
pre_negative = pre_negative.assign(
    is_seq_name=pre_negative['seq_name'].str.find('>')
)

In [5]:
# Same preview, now including the is_seq_name marker column.
pre_negative.head(10)

Unnamed: 0,seq_name,is_seq_name
0,>cel-let-7-5p MIMAT0000001 Caenorhabditis eleg...,0
1,UGAGGUAGUAGGUUGUAUAGUU,-1
2,>cel-let-7-3p MIMAT0015091 Caenorhabditis eleg...,0
3,CUAUGCAAUUUUCUACCUUACC,-1
4,>cel-lin-4-5p MIMAT0000002 Caenorhabditis eleg...,0
5,UCCCUGAGACCUCAAGUGUGA,-1
6,>cel-lin-4-3p MIMAT0015092 Caenorhabditis eleg...,0
7,ACACCUGGGCUCUCCGGGUACC,-1
8,>cel-miR-1-5p MIMAT0020301 Caenorhabditis eleg...,0
9,CAUACUUCCUUACAUGCCCAUA,-1


In [6]:
# Split the single raw column in two: rows whose marker is 0 are record headers,
# rows whose marker is -1 are the sequences themselves. Both results are Series
# that keep the original row labels.
seq_names = pre_negative.loc[pre_negative['is_seq_name'].eq(0), 'seq_name']
seq = pre_negative.loc[pre_negative['is_seq_name'].eq(-1), 'seq_name']

In [7]:
# First ten record-header lines.
seq_names.head(10)

0     >cel-let-7-5p MIMAT0000001 Caenorhabditis eleg...
2     >cel-let-7-3p MIMAT0015091 Caenorhabditis eleg...
4     >cel-lin-4-5p MIMAT0000002 Caenorhabditis eleg...
6     >cel-lin-4-3p MIMAT0015092 Caenorhabditis eleg...
8     >cel-miR-1-5p MIMAT0020301 Caenorhabditis eleg...
10    >cel-miR-1-3p MIMAT0000003 Caenorhabditis eleg...
12    >cel-miR-2-5p MIMAT0020302 Caenorhabditis eleg...
14    >cel-miR-2-3p MIMAT0000004 Caenorhabditis eleg...
16    >cel-miR-34-5p MIMAT0000005 Caenorhabditis ele...
18    >cel-miR-34-3p MIMAT0015093 Caenorhabditis ele...
Name: seq_name, dtype: object

In [8]:
# First ten sequence lines.
seq.head(10)

1      UGAGGUAGUAGGUUGUAUAGUU
3      CUAUGCAAUUUUCUACCUUACC
5       UCCCUGAGACCUCAAGUGUGA
7      ACACCUGGGCUCUCCGGGUACC
9      CAUACUUCCUUACAUGCCCAUA
11      UGGAAUGUAAAGAAGUAUGUA
13     CAUCAAAGCGGUGGUUGAUGUG
15    UAUCACAGCCAGCUUUGAUGUGC
17     AGGCAGUGUGGUUAGCUGGUUG
19     ACGGCUACCUUCACUGCCACCC
Name: seq_name, dtype: object

In [33]:
# Empty frame that fixes the column layout of the negative set; the three
# columns are populated one by one in the following cells.
negative_dataset = pd.DataFrame(columns = ['seq_name', 'sequence', 'is_mulberry_miRNA'])

In [34]:
# A plain list is assigned so the values are placed positionally instead of
# being aligned on the Series' original row labels.
negative_dataset['seq_name'] = seq_names.tolist()

In [35]:
# Positional assignment of the sequences (plain list, no index alignment);
# pandas raises if the length differs from the rows already in the frame.
negative_dataset['sequence'] = seq.tolist()

In [36]:
# Every row of this frame receives label 0 (presumably "not a mulberry miRNA" — confirm).
negative_dataset['is_mulberry_miRNA'] = 0

In [37]:
# Check the assembled negative set.
negative_dataset.head()

Unnamed: 0,seq_name,sequence,is_mulberry_miRNA
0,>cel-let-7-5p MIMAT0000001 Caenorhabditis eleg...,UGAGGUAGUAGGUUGUAUAGUU,0
1,>cel-let-7-3p MIMAT0015091 Caenorhabditis eleg...,CUAUGCAAUUUUCUACCUUACC,0
2,>cel-lin-4-5p MIMAT0000002 Caenorhabditis eleg...,UCCCUGAGACCUCAAGUGUGA,0
3,>cel-lin-4-3p MIMAT0015092 Caenorhabditis eleg...,ACACCUGGGCUCUCCGGGUACC,0
4,>cel-miR-1-5p MIMAT0020301 Caenorhabditis eleg...,CAUACUUCCUUACAUGCCCAUA,0


In [14]:
# Use the first data row as the column names and rebuild the frame from the
# remaining rows (fresh default index).
# NOTE(review): pre_positive is rebound from itself, so running this cell twice
# would consume one more row as a header — confirm it runs exactly once per load.
header = pre_positive.iloc[0]
pre_positive = pd.DataFrame(pre_positive.values[1:], columns = header)

In [15]:
# Check that the column names are now in place.
pre_positive.head()

Unnamed: 0,mi_rna,l_5p,l_3p,l_reads,f_5p,f_3p,f_reads,b_5p,b_3p,b_reads,seq_5p_5_3,seq_3p_5_3,seq_premiRNA_5_3,premirna_location,mfe_kcal_per_mol_of premirna
0,mno-miRn1,1,0,1,2,0,2,10,0,10,CGTTCCCGTGGCGGTTGCAGGGC,-,TGTCGCCGTTCCCGTGGCGGTTGCAGGGCGAAGCTCTCTTCGTGAT...,C10452211:48:120:-,-46.6
1,mno-miRn2-1,18,0,18,11,0,11,5,0,5,TCTGATGGGGGCTGAAAATCA,-,GAAAGTAAGCTCTGATGGGGGCTGAAAATCAGCCCCCATCAGCCGC...,C10661187:10:94:-,-60.7
2,mno-miRn2-2,18,0,18,11,0,11,5,0,5,TCTGATGGGGGCTGAAAATCA,-,GATTGTCAGCTCTGATGGGGGCTGAAAATCAGTCCCCATCAGCCCC...,scaffold485:876133:876219:+,-57.1
3,mno-miRn2-3,18,0,18,11,0,11,5,0,5,TCTGATGGGGGCTGAAAATCA,-,TCAGCTCTGATGGGGGCTGAAAATCAGCCCCCATCAGCCCCCATGG...,C10789358:1:133:+,-98.8
4,mno-miRn2-4,0,18,18,0,11,11,0,5,5,-,TCTGATGGGGGCTGAAAATCA,CCCAATGGGGGCTGATTTTTAGCCCCCATTGGGGATGAAAGTCAGC...,scaffold1623:55201:55277:-,-60.9


In [16]:
# One two-column table per arm: miRNA name plus the 5p or the 3p sequence column.
p5_pos = pre_positive.loc[:, ['mi_rna', 'seq_5p_5_3']]
p3_pos = pre_positive.loc[:, ['mi_rna', 'seq_3p_5_3']]

In [17]:
# Keep only rows whose sequence cell is not the '-' placeholder
# (which appears to mark an arm with no sequence).
p5_pos = p5_pos[p5_pos['seq_5p_5_3'].ne('-')]
p3_pos = p3_pos[p3_pos['seq_3p_5_3'].ne('-')]

In [18]:
# Give both arm tables the same column names as the negative set.
for frame in (p5_pos, p3_pos):
    frame.columns = ['seq_name', 'sequence']

In [19]:
# Stack the 5p and 3p tables into one positive set with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same result and works on every pandas version.
# NOTE(review): the preview shows the same sequence under several names
# (mno-miRn2-1/-2/-3) — confirm duplicates are intended in the positive set.
positive_dataset = pd.concat([p5_pos, p3_pos], ignore_index = True)

In [20]:
# Every row of the positive set receives label 1.
positive_dataset['is_mulberry_miRNA'] = 1

In [21]:
# Check the labelled positive set.
positive_dataset.head()

Unnamed: 0,seq_name,sequence,is_mulberry_miRNA
0,mno-miRn1,CGTTCCCGTGGCGGTTGCAGGGC,1
1,mno-miRn2-1,TCTGATGGGGGCTGAAAATCA,1
2,mno-miRn2-2,TCTGATGGGGGCTGAAAATCA,1
3,mno-miRn2-3,TCTGATGGGGGCTGAAAATCA,1
4,mno-miRn3,ATATGAGGAAAGAATGATGAT,1


In [22]:
# Row count of the positive set.
print("Total positive examples: ", positive_dataset.shape[0])

Total positive examples:  480


In [23]:
# Full (unbalanced) dataset: positives followed by all negatives, re-indexed 0..n-1.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
miRNA_dataset = pd.concat([positive_dataset, negative_dataset], ignore_index = True)

In [24]:
# Shuffle all rows and renumber them. The seed makes the row order — and hence
# the arrays saved from this frame — reproducible across kernel restarts; it
# matches the random_state already used when sub-sampling the negatives.
miRNA_dataset = miRNA_dataset.sample(frac = 1, random_state = 10).reset_index(drop = True)

In [25]:
# Check the shuffled dataset.
miRNA_dataset.head()

Unnamed: 0,seq_name,sequence,is_mulberry_miRNA
0,>gma-miR4995 MIMAT0021013 Glycine max miR4995,AGGCAGUGGCUUGGUUAAGGG,0
1,>cli-miR-138-2-3p MIMAT0038524 Columba livia m...,GCUAUUUCACUACACCAGGGU,0
2,>aca-miR-5455-5p MIMAT0022081 Anolis carolinen...,UUGAAAAAGAAGUCCCUAGUUC,0
3,>rno-miR-511-3p MIMAT0017358 Rattus norvegicus...,AAUGUGUAGCAAAAGACAGGA,0
4,>csi-miR390a-5p MIMAT0014092 Citrus sinensis m...,AAGCUCAGGAGGGAUAGCGCC,0


In [26]:
# Row count of the combined dataset.
print("Total datapoints: ", miRNA_dataset.shape[0])

Total datapoints:  49365


In [27]:
def seq_padding(seq, max_length):
    """Right-pad ``seq`` with 'X' until it is ``max_length`` characters long.

    A sequence that is already ``max_length`` or longer is returned unchanged,
    because a non-positive repeat count yields an empty pad.
    """
    missing = max_length - len(seq)
    return seq + 'X' * missing

In [28]:
def char2vec(dataset):
    """Encode the 'sequence' column of ``dataset`` as a padded integer-code matrix.

    Each sequence is right-padded with 'X' to the length of the longest one and
    every character is replaced by its code: A=1, C=2, T=3, U=3, G=4, X=5.
    T and U deliberately share a code, so DNA- and RNA-style spellings of the
    same sequence encode identically.

    Parameters
    ----------
    dataset : mapping with a 'sequence' entry exposing ``.values``
        (e.g. a pandas DataFrame) whose items are strings.

    Returns
    -------
    numpy.ndarray of shape (n_sequences, longest_length), float dtype
        (the default dtype of ``np.zeros``). An empty dataset yields a (0, 0) array.

    Raises
    ------
    KeyError
        If a sequence contains a character outside the code table.
    """
    base2idx = {'A':1,'C':2, 'T':3,'G':4,'U':3,'X':5}
    sequences = dataset['sequence'].values
    n_seqs = len(sequences)
    # max() on an empty iterable raises ValueError; default=0 lets an empty
    # dataset encode to an empty matrix instead of crashing.
    max_seq_len = max((len(s) for s in sequences), default=0)
    dset = np.zeros((n_seqs, max_seq_len))
    for row, sequence in enumerate(sequences):
        # str.ljust right-pads with 'X' and leaves full-length sequences untouched.
        padded = sequence.ljust(max_seq_len, 'X')
        dset[row, :] = [base2idx[base] for base in padded]
    return dset

In [142]:
# Encode every sequence of the full shuffled dataset; row i of the matrix
# corresponds to row i of miRNA_dataset.
dset_matrix = char2vec(miRNA_dataset)

In [143]:
# (number of sequences, length of the longest sequence after padding)
dset_matrix.shape

(49365, 34)

In [146]:
# Label vector taken from the same shuffled frame, so it lines up row-for-row
# with the encoded matrix.
labels = miRNA_dataset.loc[:, 'is_mulberry_miRNA'].to_numpy()

In [149]:
# Persist the labels and the encoded matrix of the full dataset (labels first).
for npy_path, array in (
    ('./dataset/labels.npy', labels),
    ('./dataset/data_matrix.npy', dset_matrix),
):
    np.save(npy_path, array)

In [28]:
# Raw array of the negative-set sequences.
# NOTE(review): `seqs` is not referenced by any later cell visible here — possibly a leftover.
seqs = negative_dataset['sequence'].values

In [38]:
# Trim the negative dataset to the size of the positive set so the two classes
# are balanced. The sample size is derived from positive_dataset rather than
# hard-coded, so it stays correct if the positive set changes; the fixed seed
# keeps the chosen rows reproducible.
trimmed_neg = negative_dataset.sample(n = len(positive_dataset), random_state=10)

In [39]:
# First ten sampled negatives (row labels are their positions in negative_dataset).
trimmed_neg.head(10)

Unnamed: 0,seq_name,sequence,is_mulberry_miRNA
31845,>ppc-miR-8336-5p MIMAT0033218 Pristionchus pac...,UACUUGAACUGCAUAUUUCGGA,0
16521,>aly-miR3443-5p MIMAT0017719 Arabidopsis lyrat...,UUCUAAUCCCGCCAUGCACCG,0
6566,>ppt-miR1063f MIMAT0005187 Physcomitrella pate...,CAUCUUGGAGUACUGCAUCUU,0
40042,>hsa-miR-10396a-5p MIMAT0041623 Homo sapiens m...,GGCGGGGCUCGGAGCCGGG,0
25689,>gga-miR-6710-3p MIMAT0025820 Gallus gallus mi...,AAACUGUUCUCUUCCAUCUAG,0
26814,>cme-miR396c MIMAT0026146 Cucumis melo miR396c,UUCCACAGCUUUCUUGAACUU,0
28585,>ipu-miR-99b MIMAT0029533 Ictalurus punctatus ...,AACCCGUAGAUCCGAUCUUGUGA,0
12806,>bta-miR-2468 MIMAT0012058 Bos taurus miR-2468,AUAGGAACAUGGAAGAUUGUCA,0
24768,>gma-miR166o MIMAT0024897 Glycine max miR166o,UCGGACCAGGCUUCAUUCCCC,0
31208,>ssa-miR-27c-2-5p MIMAT0032580 Salmo salar miR...,AGAGCUUAGCUAAUUGGUGAGC,0


In [40]:
# Balanced dataset: all positives plus the sampled negatives, re-indexed 0..n-1.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
miRNA_dataset_trimmed = pd.concat([positive_dataset, trimmed_neg], ignore_index = True)
# Seeded shuffle so the saved balanced arrays are reproducible across runs.
miRNA_dataset_trimmed = miRNA_dataset_trimmed.sample(frac = 1, random_state = 10).reset_index(drop = True)
miRNA_dataset_trimmed[:5]

Unnamed: 0,seq_name,sequence,is_mulberry_miRNA
0,>hsa-miR-647 MIMAT0003317 Homo sapiens miR-647,GUGGCUGCACUCACUUCCUUC,0
1,mno-miRn200,AAGTGACAGAAAGAGTGATGCTA,1
2,>eca-miR-1912 MIMAT0013201 Equus caballus miR-...,UACCCAGAGCGUGCAGUGUGAA,0
3,mno-miRn88,TCTGATTTAGAATTATGGTT,1
4,mno-miRn112,TCAGTGAAGTGAACGAGGCCT,1


In [41]:
# Row count of the balanced dataset.
print("Total examples in trimmed dataset: ", miRNA_dataset_trimmed.shape[0])

Total examples in trimmed dataset:  960


In [42]:
# Encode the balanced dataset. Padding length is computed per call, so the
# width of this matrix can differ from the full-dataset matrix.
dset_mat_trimmed = char2vec(miRNA_dataset_trimmed)

In [43]:
# (number of sequences, length of the longest sequence in the balanced set)
dset_mat_trimmed.shape

(960, 27)

In [44]:
# Label vector of the balanced set, in the same row order as its encoded matrix.
labels_trimmed = miRNA_dataset_trimmed.loc[:, 'is_mulberry_miRNA'].to_numpy()

In [45]:
# Persist the labels and the encoded matrix of the balanced dataset (labels first).
for npy_path, array in (
    ('./dataset/labels_trimmed_equal.npy', labels_trimmed),
    ('./dataset/data_matrix_trimmed_equal.npy', dset_mat_trimmed),
):
    np.save(npy_path, array)