## Imports

In [2]:
# Imports and environment setup for the rest of the notebook.
import sys, os
import numpy as np

# Import the rouskinhf package.
# The parent directory is appended to sys.path before the import below
# (presumably so the package is importable from a repository checkout; confirm).
sys.path.append('..')
from rouskinhf import import_dataset, DataFolder, setup_env

# Make sure the environment is configured before using rouskinhf.
# OPTION 1: load the env file from inside the notebook with the dotenv extension
# (`env` is presumably the path of the env file; confirm).
%load_ext dotenv
%dotenv env
# OPTION 2: pass the setting directly to setup_env.
# NOTE(review): this label originally read "source the env file in the terminal",
# but the line below is a Python call; confirm the intended wording.
setup_env(RNASTRUCTURE_PATH='/Users/ymdt/src/RNAstructure/exe')

# Create a datafolder from local files

These methods will allow you to process your data and create a datafolder from it. The accepted formats are:
- DREEM output
- fasta
- set of CTs
- already formatted json + info.json

**Make sure to change the paths to your own paths!**

### From DREEM output

In [54]:
# (Re)load autoreload in mode 2 so edits to the package source are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Create a datafolder from a DREEM output file (change the paths to your own).
datafolder = DataFolder.from_dreem_output(
    name='pri-miRNA-test', # datafolder name; original note: "name of the input data by default" (presumably the default when omitted; confirm)
    path_in='/Users/ymdt/Downloads/pri-miRNA.json', # path to the input data
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', # output location passed to the builder
    # predict_structure=True,   (argument left disabled in this example)
    tqdm=False, # presumably turns the progress bar off; confirm
    generate_npy=True # presumably also writes the .npy files; confirm
    )

Over a total of 513 datapoints, there are:
    - 513 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms


### From a data.json file

In [25]:
# (Re)load autoreload in mode 2 so edits to the package source are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Create a datafolder from an already formatted json file (change the paths to your own).
datafolder = DataFolder.from_data_json(
    name='ribonanza_shape', # datafolder name; original note: "name of the input data by default" (presumably the default when omitted; confirm)
    path_in='/Users/ymdt/src/das_data/data/shape_data.json', # path to the input json file
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', # output location passed to the builder
    predict_structure=False, # structure prediction switched off
    predict_dms=False, # DMS prediction switched off
    tqdm=True, # a "Parsing json file" progress bar appears in this cell's output
    generate_npy=True # presumably also writes the .npy files; confirm
    )

Parsing json file: 100%|██████████| 16674/16674 [00:00<00:00, 26797.20it/s]


Over a total of 16674 datapoints, there are:
    - 16674 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 0 datapoints removed because of low AUROC (<0.8)


### From a list of CT files

In [None]:
# (Re)load autoreload in mode 2 so edits to the package source are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Create a datafolder from a folder of CT files (change the paths to your own).
datafolder = DataFolder.from_ct_folder(
    name='from_ct_folder', # datafolder name; original note: "name of the input data by default" (presumably the default when omitted; confirm)
    path_in='/Users/ymdt/src/rouskinhf/data/input_files_for_testing/ct_files', # folder holding the CT files
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', # output location passed to the builder
    predict_dms=False, # NOTE(review): original note said DMS prediction "won't take the structure from the ct files into account"; confirm against from_ct_folder
    tqdm=True, # presumably shows a progress bar; confirm
    generate_npy=True, # presumably also writes the .npy files; confirm
    )  


### From fasta

In [2]:
# Create a datafolder from a fasta file (change the paths to your own).
datafolder = DataFolder.from_fasta(
    name= 'sequences', # datafolder name; original note: "name of the input data by default" (presumably the default when omitted; confirm)
    path_in = '/Users/ymdt/src/rouskinhf/data/input_files_for_testing/sequences.fasta', # path to the input fasta file
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', # output location passed to the builder
    predict_structure=True, # structure prediction requested
    predict_dms=True, # DMS prediction requested
    tqdm=True, # a "Parsing fasta file" progress bar appears in this cell's output
    generate_npy=True, # presumably writes the .npy file loaded below; confirm
    )
# Load the array stored at the path returned by get_dms_npy(). As the last expression
# of the cell, its value is displayed as the cell output.
# allow_pickle=True lets np.load read object arrays; NumPy unpickles them, so only
# load files you trust.
np.load(datafolder.get_dms_npy(), allow_pickle=True)

Parsing fasta file: 100%|██████████| 5/5 [00:01<00:00,  3.77it/s]

Over a total of 5 datapoints, there are:
    - 2 valid datapoints
    - 1 invalid datapoints (ex: sequence with non-regular characters)
    - 1 datapoints with the same reference
    - 1 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms





array([array([3.3176258e-01, 8.6194777e-01, 9.8495054e-01, 9.9978286e-01,
              9.9995619e-01, 9.9994540e-01, 9.9702460e-01, 3.2988770e-10,
              1.0113345e-08, 9.3018784e-09, 2.1481183e-10, 9.9695349e-01,
              9.9997377e-01, 9.9998462e-01, 9.9979717e-01, 9.8487574e-01,
              8.6187601e-01, 3.3190918e-01], dtype=float32)              ,
       array([2.3766817e-01, 8.4576339e-01, 9.8692513e-01, 9.9954951e-01,
              9.9981171e-01, 9.9981171e-01, 9.9900937e-01, 7.4752042e-04,
              1.4492057e-03, 5.6404242e-11, 0.0000000e+00, 0.0000000e+00,
              1.4492056e-03, 9.9931222e-01, 9.9998856e-01, 9.9998885e-01,
              9.9991626e-01, 9.8716962e-01, 8.4594077e-01, 2.3697025e-01],
             dtype=float32)                                               ],
      dtype=object)

### From an existing local datafolder

In [4]:
# Load an existing datafolder from a local path.
datafolder = DataFolder.from_local(
    name = '/Users/ymdt/src/rouskinhf/data/datafolders/for_testing',
)

# OR: get the dataset by name with import_dataset.
# Both statements run when the cell is executed, so the assignment below
# replaces the value assigned above; keep only the one you need.

datafolder = import_dataset(
    name = 'for_testing',
    data = 'DMS', # can be 'DMS' or 'structure'
    force_download=False # if True, will download the data even if it already exists locally
)

No folder found in data/input_files/pri-miRNA


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)ffe09d71b9/data.json:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Downloading (…)ffe09d71b9/info.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

Parsing json file: 100%|██████████| 503/503 [00:00<00:00, 25099.76it/s]


Over a total of 503 datapoints, there are:
    - 503 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 0 datapoints removed because of low AUROC (<0.8)
Using data from HuggingFace Hub for pri-miRNA


# Push a local datafolder to Hugging Face

### 1. Create a repository if it does not exist

In [26]:
# Create the Hugging Face repository for the current datafolder.
# Find more arguments here: https://huggingface.co/docs/huggingface_hub/guides/repository#create-a-repository
datafolder.create_repo(
    exist_ok=True, # presumably: do not fail when the repository already exists; see the guide linked above
    private=True # presumably: create the repository as private; see the guide linked above
)

### 2. Push the datafolder to Hugging Face

In [55]:

# Upload the current datafolder to Hugging Face.
# Find more arguments here: https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder
future = datafolder.upload_folder(
    revision='main', # branch name
    commit_message='Upload demo dataset',
    commit_description='This is a demo dataset',
    run_as_future=True, # the returned object is kept in `future` and queried below
)

# Only the last expression of a cell is displayed, so the value of done() is not
# shown here; the cell output is whatever result() returns.
future.done() # True if the upload is done
future.result() # Wait for the upload to complete (blocking action)

'https://huggingface.co/datasets/rouskinlab/pri-miRNA-test/tree/main/'

### 3. Check that the datafolder is on Hugging Face

Take a look at https://huggingface.co/rouskinlab

# Explore the datafolder object

In [None]:
# Print a summary of the current `datafolder`: its repr, its main folder, the
# contents of the generated files and the paths of those files.
# `np` comes from the imports cell at the top of the notebook.
import json

print(datafolder)
print('main folder:',datafolder.get_main_folder())

# Contents of the .npy files. allow_pickle=True lets np.load read object arrays;
# NumPy unpickles them, so only load files you trust.
print('sequences.npy file:\n',np.load(datafolder.get_sequences_npy(), allow_pickle=True))
print('base_pairs.npy file:\n',np.load(datafolder.get_base_pairs_npy(), allow_pickle=True))
print('dms.npy file:\n',np.load(datafolder.get_dms_npy(), allow_pickle=True))

# Contents of the json file. The `with` block closes the file handle as soon as
# the content has been parsed instead of leaving it open.
with open(datafolder.get_json(), 'r') as json_file:
    print('json file:\n', json.load(json_file))

# Paths of the individual files.
print('dms.npy file:',datafolder.get_dms_npy())
print('json file:',datafolder.get_json())
print('source files:',datafolder.get_source_files())
print('info file:',datafolder.get_info_file())

In [61]:
# Import two datasets by name. Each result is indexed with 'DMS' in the cells below.
data_pri = import_dataset(
    name = 'pri-miRNA-test',
    data = 'DMS', # can be 'DMS' or 'structure'
    force_download=False # if True, will download the data even if it already exists locally
)

# Same call for the 'utr' dataset.
data_utr = import_dataset(
    name = 'utr',
    data = 'DMS', # can be 'DMS' or 'structure'
    force_download=False # if True, will download the data even if it already exists locally
)


Using local data for: pri-miRNA-test
Force download from HuggingFace Hub


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing json file: 100%|██████████| 1234/1234 [00:00<00:00, 3009.75it/s]


Over a total of 1234 datapoints, there are:
    - 1234 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 0 datapoints removed because of low AUROC (<0.8)
Using data from HuggingFace Hub for utr


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1234,) + inhomogeneous part.

In [60]:
# Show the first entry of the 'DMS' field of the pri-miRNA-test dataset
data_pri['DMS'][0]

array([-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0,
       -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0,
       -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0,
       -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, 0.4699999988079071,
       -1000.0, 0.5659999847412109, 0.39500001072883606,
       0.32100000977516174, 0.3240000009536743, -1000.0, -1000.0, -1000.0,
       -1000.0, 0.5049999952316284, -1000.0, -1000.0, 0.6269999742507935,
       0.13699999451637268, -1000.0, -1000.0, -1000.0, -1000.0,
       0.15700000524520874, 0.4350000023841858, 0.37400001287460327,
       0.21299999952316284, 0.32600000500679016, -1000.0, -1000.0,
       0.31299999356269836, 0.5450000166893005, 0.6480000019073486,
       0.05400000140070915, 0.05299999937415123, -1000.0, -1000.0,
       -1000.0, 0.0820000022649765, -1000.0, -1000.0, -1000.0, -1000.0,
       0.36899998784065247, -1000.0, 0.15700000524520874, -1000.0,
       -1000.0, 0.03099999949336052, 0

In [51]:
# Show the whole 'DMS' field of the utr dataset (long output; index it as in the previous cell for a shorter view)
data_utr['DMS']

array([array([-1.00e+03, -1.00e+03, -1.00e+03,  2.66e-01, -1.00e+03, -1.00e+03,
               3.18e-01,  3.45e-01, -1.00e+03,  3.98e-01, -1.00e+03, -1.00e+03,
              -1.00e+03, -1.00e+03,  5.51e-01, -1.00e+03, -1.00e+03,  4.65e-01,
              -1.00e+03, -1.00e+03, -1.00e+03, -1.00e+03,  3.96e-01, -1.00e+03,
              -1.00e+03,  6.44e-01,  5.99e-01, -1.00e+03,  6.31e-01,  1.17e-01,
               1.20e-01, -1.00e+03, -1.00e+03,  6.33e-01,  3.20e-02,  2.80e-02,
               7.80e-02, -1.00e+03,  5.00e-02,  2.50e-01,  2.78e-01,  7.45e-01,
              -1.00e+03, -1.00e+03, -1.00e+03,  2.78e-01,  8.18e-01, -1.00e+03,
               9.73e-01, -1.00e+03,  9.67e-01,  7.66e-01, -1.00e+03,  7.60e-02,
              -1.00e+03,  5.61e-01, -1.00e+03,  9.01e-01, -1.00e+03, -1.00e+03,
               3.50e-02, -1.00e+03, -1.00e+03, -1.00e+03, -1.00e+03,  5.48e-01,
               6.80e-02, -1.00e+03, -1.00e+03, -1.00e+03,  4.41e-01,  6.88e-01,
              -1.00e+03,  5.49e-01, -1.0