## Imports

In [22]:
import sys, os
import numpy as np

# Import the rouskinhf package
sys.path.append('..')
from rouskinhf import import_dataset, DataFolder, setup_env

# make sure to source the env file
# OPTION 1: source the env file in the notebook
%load_ext dotenv
%dotenv env
# OPTION 2: source the env file in the terminal
if False:
    setup_env(RNASTRUCTURE_PATH='/Users/yvesmartin/lib/RNAstructure/exe', 
                RNASTRUCTURE_TEMP_FOLDER='/Users/yvesmartin/src/rouskinhf/temp',
                DATA_FOLDER='/Users/yvesmartin/src/rouskinhf/data/datafolders',
            )

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


# Create a datafolder from local files

These methods will allow you to process your data and create a datafolder from it. The accepted formats are:
- DREEM output
- fasta
- set of CTs
- already formatted json + info.json

**Make sure to change the paths to your own paths!**

### From DREEM output

In [23]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_dreem_output(
    name='pri-miRNA', # name of the input data by default
    path_in='/Users/yvesmartin/Downloads/pri-miRNA.json', # path to the input data
    path_out='/Users/yvesmartin/src/rouskinhf/data/datafolders', 
    predict_structure=True,
    # predict_dms=False,
    tqdm=True,
    generate_npy=True
    )

Parsing dreem output file:   1%|          | 6/513 [00:02<02:52,  2.94it/s]

KeyboardInterrupt: 

### From a data.json file

In [20]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_data_json(
    name='ribonanza', # name of the input data by default
    path_in='/Users/yvesmartin/src/rouskinhf/data/ribonanza.json', 
    path_out='/Users/yvesmartin/src/rouskinhf/data/datafolders', 
    predict_structure=False,
    predict_dms=False,
    tqdm=True,
    generate_npy=True
    )

Parsing json file: 100%|██████████| 49001/49001 [00:03<00:00, 14230.55it/s]


Over a total of 49001 datapoints, there are:
    - 49001 valid datapoints
    - 0 invalid datapoints (ex: sequence with non-regular characters)
    - 0 datapoints with the same reference
    - 0 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms


### From a list of CT files

In [None]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_ct_folder(
    name='bpRNA', # name of the input data by default
    path_in='/Users/ymdt/Downloads/bpRNA', 
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', 
    predict_dms=False, # won't take the structure from the ct files into account
    tqdm=True, 
    generate_npy=True,
    )  


### From fasta

In [5]:
# Build a datafolder from a fasta file, requesting both structure and DMS prediction.
datafolder = DataFolder.from_fasta(
    # dataset name (original note: "name of the input data by default")
    name='sequences',
    path_in='/Users/yvesmartin/src/rouskinhf/data/input_files_for_testing/test_sequences.fasta',
    path_out='/Users/yvesmartin/src/rouskinhf/data/datafolders',
    predict_dms=True,
    predict_structure=True,
    generate_npy=True,
    tqdm=True,
)

# Load the file reported by get_dms_npy() and show it as the cell output.
# allow_pickle=True is kept from the original call (the displayed array has
# dtype=object); only use it on files you trust.
dms_npy_path = datafolder.get_dms_npy()
np.load(dms_npy_path, allow_pickle=True)



Parsing fasta file: 100%|██████████| 5/5 [00:00<00:00,  7.16it/s]

Over a total of 5 datapoints, there are:
    - 0 valid datapoints
    - 1 invalid datapoints (ex: sequence with non-regular characters)
    - 1 datapoints with the same reference
    - 1 duplicate sequences with the same structure / dms
    - 0 duplicate sequences with different structure / dms
    - 2 datapoints removed because of low AUROC (<0.8)





array([array([3.3176258e-01, 8.6194777e-01, 9.8495054e-01, 9.9978286e-01,
              9.9995619e-01, 9.9994540e-01, 9.9702460e-01, 3.2988770e-10,
              1.0113345e-08, 9.3018784e-09, 2.1481183e-10, 9.9695349e-01,
              9.9997377e-01, 9.9998462e-01, 9.9979717e-01, 9.8487574e-01,
              8.6187601e-01, 3.3190918e-01], dtype=float32)              ,
       array([2.3766817e-01, 8.4576339e-01, 9.8692513e-01, 9.9954951e-01,
              9.9981171e-01, 9.9981171e-01, 9.9900937e-01, 7.4752042e-04,
              1.4492057e-03, 5.6404242e-11, 0.0000000e+00, 0.0000000e+00,
              1.4492056e-03, 9.9931222e-01, 9.9998856e-01, 9.9998885e-01,
              9.9991626e-01, 9.8716962e-01, 8.4594077e-01, 2.3697025e-01],
             dtype=float32)                                               ],
      dtype=object)

### From an existing local datafolder

#### Load the datafolder locally

In [18]:
# Open a datafolder that already exists on disk, identified by its path.
datafolder = DataFolder.from_local(name='/Users/yvesmartin/src/rouskinhf/data/datafolders/for_testing')

Using local data for: for_testing


#### Or load the data directly

In [None]:
# Load the data of the 'for_testing' dataset directly.
# With force_download=True the data would be downloaded even if it already
# exists locally; False keeps any local copy.
data = import_dataset(name='for_testing', force_download=False)

# Push a local datafolder to Hugging Face

### 1. Create a repository if it does not exist

In [13]:
# Create the repository for this datafolder; exist_ok=True and private=True are
# passed through. Further arguments are listed at:
# https://huggingface.co/docs/huggingface_hub/guides/repository#create-a-repository
datafolder.create_repo(private=True, exist_ok=True)

### 2. Push the datafolder to Hugging Face

In [14]:

# Find more arguments here: https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder
# With run_as_future=True the call hands back a future instead of the final result.
future = datafolder.upload_folder(
    revision='main', # branch name
    commit_message='init commit',
    commit_description='',
    run_as_future=True,
)

# Non-blocking status check: True if the upload is done. It is printed because
# a bare `future.done()` in the middle of a cell is evaluated and then
# discarded, so the status would never be shown.
print('upload done:', future.done())

# Wait for the upload to complete (blocking action). As the last expression of
# the cell, its return value is displayed as the cell output.
future.result()

'https://huggingface.co/datasets/rouskinlab/for_testing/tree/main/'

### 3. Check that the datafolder is on Hugging Face

Take a look at https://huggingface.co/rouskinlab

# Explore the datafolder object

In [21]:
import json

import numpy as np

# Text representation of the datafolder object and the folder it points to.
print(datafolder)
print('main folder:',datafolder.get_main_folder())

# Contents of the generated .npy files.
# allow_pickle=True is kept from the original calls (the arrays shown are
# arrays of per-sequence arrays); only use it on files you trust, since
# unpickling can execute arbitrary code.
print('sequences.npy file:\n',np.load(datafolder.get_sequences_npy(), allow_pickle=True))
print('base_pairs.npy file:\n',np.load(datafolder.get_base_pairs_npy(), allow_pickle=True))
print('dms.npy file:\n',np.load(datafolder.get_dms_npy(), allow_pickle=True))

# Contents of the json file. The file is read inside a `with` block so the
# handle is closed right away instead of being left open by a bare open().
with open(datafolder.get_json(), 'r') as json_file:
    print('json file:\n', json.load(json_file))

# Locations of the individual files.
print('dms.npy file:',datafolder.get_dms_npy())
print('json file:',datafolder.get_json())
# print('source files:',datafolder.get_source_files())
print('info file:',datafolder.get_info_file())

LoadDatafolderFromLocal(name='/Users/yvesmartin/src/rouskinhf/data/datafolders/for_testing')
main folder: /Users/yvesmartin/src/rouskinhf/data/datafolders/for_testing
sequences.npy file:
 [array([1, 1, 1, 2, 2, 2, 2, 1, 3, 4, 1, 3, 3, 3, 3, 4, 4, 4])
 array([4, 4, 4, 2, 2, 2, 2, 4, 1, 3, 4, 1, 4, 3, 3, 3, 3, 1, 1, 1])]
base_pairs.npy file:
 [array([[ 0, 17],
        [ 1, 16],
        [ 2, 15],
        [ 3, 14],
        [ 4, 13],
        [ 5, 12],
        [ 6, 11]]) array([[ 1, 18],
                          [ 2, 17],
                          [ 3, 16],
                          [ 4, 15],
                          [ 5, 14],
                          [ 6, 13]])]
dms.npy file:
 [array([0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
        0.], dtype=float32)
 array([1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
        0., 0., 0.], dtype=float32)                                        ]
json file:
 {'RF02271.fa.csv_1': {'sequence': 'AAACCCCAGUAG