# Notes on using miRBench

In [1]:
import miRBench  # importing prints TensorFlow informational log messages (shown below); they can be ignored

2024-09-11 11:51:52.018707: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-11 11:51:52.020004: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-11 11:51:52.044960: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-11 11:51:52.045644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Accessing datasets

Three collections of datasets are available. 

In [2]:
# List the names of the available dataset collections
miRBench.dataset.list_datasets()

['AGO2_CLASH_Hejret2023',
 'AGO2_eCLIP_Klimentova2022',
 'AGO2_eCLIP_Manakov2022']

Each collection is available in one or more splits (train and/or test) and in several class imbalance ratios; not every collection provides every split or ratio.

In [27]:
# full=True also reports, for each collection, the available splits and the ratios offered for each split
miRBench.dataset.list_datasets(full=True)

{'AGO2_CLASH_Hejret2023': {'splits': {'train': {'ratios': ['10']},
   'test': {'ratios': ['1', '10', '100']}}},
 'AGO2_eCLIP_Klimentova2022': {'splits': {'test': {'ratios': ['1',
     '10',
     '100']}}},
 'AGO2_eCLIP_Manakov2022': {'splits': {'train': {'ratios': ['1', '10', '100']},
   'test': {'ratios': ['1', '10', '100']}}}}

A dataset can be loaded directly as a pandas DataFrame, or downloaded to a specified path on your machine.

In [41]:
# Choose a dataset: collection name, split and class imbalance ratio.
# The valid combinations are listed by `miRBench.dataset.list_datasets(full=True)`.
dataset_name = "AGO2_CLASH_Hejret2023"
split = "test"
ratio = "1"  # ratios are given as strings, e.g. "1", "10", "100"

In [35]:
# Load the chosen dataset as a pandas DataFrame; a locally cached copy is reused when present
df = miRBench.dataset.get_dataset_df(dataset_name, split=split, ratio=ratio)
df.head()

Using cached dataset /home/steph/.miRBench/datasets/AGO2_CLASH_Hejret2023/1/test/dataset.tsv


Unnamed: 0,noncodingRNA,gene,label
0,TCCGAGCCTGGGTCTCCCTCTT,GGGTTTAGGGAAGGAGGTTCGGAGACAGGGAGCCAAGGCCTCTGTC...,1
1,TGCGGGGCTAGGGCTAACAGCA,GCTTCCCAAGTTAGGTTAGTGATGTGAAATGCTCCTGTCCCTGGCC...,1
2,CCCACTGCCCCAGGTGCTGCTGG,TCTTTCCAAAATTGTCCAGCAGCTTGAATGAGGCAGTGACAATTCT...,1
3,TGAGGGGCAGAGAGCGAGACTTT,CAGAACTGGGATTCAAGCGAGGTCTGGCCCCTCAGTCTGTGGCTTT...,1
4,CAAAGTGCTGTTCGTGCAGGTAG,TTTTTTCCCTTAGGACTCTGCACTTTATAGAATGTTGTAAAACAGA...,1


In [25]:
# Save the chosen dataset to an explicit location (here ./dataset.tsv in the working directory)
miRBench.dataset.download_dataset(dataset_name, download_path='./dataset.tsv', split=split, ratio=ratio)

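If no download path is given, the data are downloaded to (and cached in) the `$HOME/.miRBench/datasets` directory, under a separate subdirectory for each dataset, ratio and split. The path of the cached file can be retrieved as follows.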

In [36]:
# Get the path of the locally cached dataset file (returned as a pathlib path)
miRBench.dataset.get_dataset_path(dataset_name, split=split, ratio=ratio)

Using cached dataset /home/steph/.miRBench/datasets/AGO2_CLASH_Hejret2023/1/test/dataset.tsv


PosixPath('/home/steph/.miRBench/datasets/AGO2_CLASH_Hejret2023/1/test/dataset.tsv')

## Accessing models

There are 11 predictor tools available. 

In [6]:
# List the names of the available predictors
miRBench.predictor.list_predictors()

['CnnMirTarget_Zheng2020',
 'RNACofold',
 'miRNA_CNN_Hejret2023',
 'miRBind_Klimentova2022',
 'TargetNet_Min2021',
 'Seed8mer',
 'Seed7mer',
 'Seed6mer',
 'Seed6merBulgeOrMismatch',
 'TargetScanCnn_McGeary2019',
 'InteractionAwareModel_Yang2024']

## Encoding data

Each tool requires its own encoder to prepare the dataset. Pass the tool name to `miRBench.encoder.get_encoder` to get the matching encoder.

In [42]:
# Choose a tool: any name returned by `miRBench.predictor.list_predictors()`
tool = "miRBind_Klimentova2022"

In [39]:
# Get the encoder that matches the chosen tool and apply it to the dataset
encoder = miRBench.encoder.get_encoder(tool)
# NOTE(review): `input` shadows the Python built-in `input()` for the rest of the session.
# Renaming it (e.g. to `encoded_input`) also requires updating the prediction cell that reads it.
input = encoder(df)

# Preview the first 10 encoded samples
input[:10]

array([[[[0.],
         [1.],
         [1.],
         ...,
         [1.],
         [0.],
         [1.]],

        [[0.],
         [1.],
         [1.],
         ...,
         [1.],
         [0.],
         [1.]],

        [[0.],
         [1.],
         [1.],
         ...,
         [1.],
         [0.],
         [1.]],

        ...,

        [[0.],
         [1.],
         [1.],
         ...,
         [1.],
         [0.],
         [1.]],

        [[0.],
         [1.],
         [1.],
         ...,
         [1.],
         [0.],
         [1.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]],


       [[[0.],
         [0.],
         [1.],
         ...,
         [1.],
         [0.],
         [0.]],

        [[0.],
         [1.],
         [0.],
         ...,
         [0.],
         [0.],
         [1.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [1.],
         [0.]],

        ...,

        [[0.],
 

## Getting predictions

Pass your encoded dataset to an instance of your predictor to get your predictions. 

In [40]:
# Instantiate the predictor for the chosen tool and run it on the encoded dataset
predictor = miRBench.predictor.get_predictor(tool)
predictions = predictor(input)  # `input` is the encoded dataset from the encoding cell, not the built-in

# Preview the first 10 predictions
predictions[:10]



array([0.689916  , 0.15220629, 0.07301959, 0.43757904, 0.34360746,
       0.20519173, 0.09550288, 0.7929826 , 0.14150581, 0.05329493],
      dtype=float32)