## Imports

In [None]:
import sys, os
import numpy as np

# Import the rouskinhf package
sys.path.append('..')
from rouskinhf import import_dataset, DataFolder, setup_env

# make sure to source the env file
# OPTION 1: source the env file in the notebook
%load_ext dotenv
%dotenv env
# OPTION 2: source the env file in the terminal
if False:
    setup_env(RNASTRUCTURE_PATH='/Users/yvesmartin/lib/RNAstructure/exe', 
                RNASTRUCTURE_TEMP_FOLDER='/Users/yvesmartin/src/rouskinhf/temp',
                DATA_FOLDER='/Users/yvesmartin/src/rouskinhf/data/datafolders',
            )

# Create a datafolder from local files

These methods will allow you to process your data and create a datafolder from it. The accepted formats are:
- DREEM output
- fasta
- set of CTs
- already formatted json + info.json

**Make sure to change the paths to your own paths!**

### From DREEM output

In [None]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_dreem_output(
    name='pri-miRNA', # name of the input data by default
    path_in='/Users/yvesmartin/Downloads/pri-miRNA.json', # path to the input data
    path_out='/Users/yvesmartin/src/rouskinhf/data/datafolders', 
    predict_structure=True,
    # predict_dms=False,
    tqdm=True,
    generate_npy=True
    )

### From a data.json file

In [None]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_data_json(
    name='ribonanza', # name of the input data by default
    path_in='/Users/yvesmartin/src/rouskinhf/data/ribonanza.json', 
    path_out='/Users/yvesmartin/src/rouskinhf/data/datafolders', 
    predict_structure=False,
    predict_dms=False,
    tqdm=True,
    generate_npy=True
    )

### From a list of CT files

In [None]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_ct_folder(
    name='bpRNA', # name of the input data by default
    path_in='/Users/ymdt/Downloads/bpRNA', 
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', 
    predict_dms=False, # won't take the structure from the ct files into account
    tqdm=True, 
    generate_npy=True,
    )  


### From fasta

In [None]:
# Create a datafolder from a FASTA file, with both structure and DMS
# prediction switched on.
datafolder = DataFolder.from_fasta(
    name="sequences",  # name of the input data by default
    path_in="/Users/yvesmartin/src/rouskinhf/data/input_files_for_testing/test_sequences.fasta",
    path_out="/Users/yvesmartin/src/rouskinhf/data/datafolders",
    predict_structure=True,
    predict_dms=True,
    tqdm=True,
    generate_npy=True,
)
# Last expression of the cell, so the notebook displays the loaded array.
# allow_pickle=True lets numpy unpickle object arrays: only load trusted files.
np.load(datafolder.get_dms_npy(), allow_pickle=True)

### From an existing local datafolder

#### Load the datafolder locally

In [None]:
# Open a datafolder that already exists on disk; `name` is given here as
# the full local path of the datafolder.
datafolder = DataFolder.from_local(
    name="/Users/yvesmartin/src/rouskinhf/data/datafolders/for_testing",
)

#### Or load the data directly

In [None]:
# Load the dataset content directly, without going through a DataFolder.
data = import_dataset(
    name="for_testing",
    force_download=False,  # if True, will download the data even if it already exists locally
)

# Push a local datafolder to Hugging Face

### 1. Create a repository if it does not exist

In [None]:
# Create the repository for the current `datafolder` object.
# Find more arguments here: https://huggingface.co/docs/huggingface_hub/guides/repository#create-a-repository
datafolder.create_repo(exist_ok=True, private=True)

### 2. Push the datafolder to Hugging Face

In [None]:

# Start the upload of the current `datafolder`; with run_as_future=True the
# call hands back a future object instead of the final result.
# Find more arguments here: https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder
future = datafolder.upload_folder(
    revision="main",  # branch name
    commit_message="init commit",
    commit_description="",
    run_as_future=True,
)

# `done()` is True if the upload is done; its value is not displayed here
# because it is not the last expression of the cell.
future.done()
# Wait for the upload to complete (blocking action).
future.result()

### 3. Check that the datafolder is on Hugging Face

Take a look at https://huggingface.co/rouskinlab

# Explore the datafolder object

In [None]:
import numpy as np
import json

# Overview of the datafolder object and the folder it lives in.
print(datafolder)
print('main folder:', datafolder.get_main_folder())

# Content of the generated .npy files.
# allow_pickle=True lets numpy unpickle object arrays: only load trusted files.
print('sequences.npy file:\n', np.load(datafolder.get_sequences_npy(), allow_pickle=True))
print('base_pairs.npy file:\n', np.load(datafolder.get_base_pairs_npy(), allow_pickle=True))
print('dms.npy file:\n', np.load(datafolder.get_dms_npy(), allow_pickle=True))

# Content of the JSON file. The file is opened in a `with` block so the
# handle is closed as soon as the content has been parsed.
with open(datafolder.get_json(), 'r') as json_file:
    json_content = json.load(json_file)
print('json file:\n', json_content)

# Values returned by the path getters themselves (not the file contents).
print('dms.npy file:', datafolder.get_dms_npy())
print('json file:', datafolder.get_json())
# print('source files:',datafolder.get_source_files())
print('info file:', datafolder.get_info_file())