## Imports

In [None]:
# make sure to source the env file before importing rouskinhf
%load_ext dotenv
%dotenv env
import sys, os
sys.path.append('..')
from rouskinhf import import_dataset, DataFolder
import numpy as np

# Create a datafolder from local files

These methods let you process your data and create a datafolder from it. The accepted input formats are:
- DREEM output
- fasta
- a set of CT files
- an already formatted json file + info.json

**Make sure to change the paths to your own paths!**

### From DREEM output

In [None]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_dreem_output(
    name='from_dreem_output', # name of the input data by default
    path_in='/Users/ymdt/src/rouskinhf/data/input_files_for_testing/dreem_output.json', 
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', 
    predict_structure=True,
    tqdm=False,
    generate_npy=True
    )

### From a list of CT files

In [None]:
%reload_ext autoreload
%autoreload 2
datafolder = DataFolder.from_ct_folder(
    name='from_ct_folder', # name of the input data by default
    path_in='/Users/ymdt/src/rouskinhf/data/input_files_for_testing/ct_files', 
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders', 
    predict_dms=False, # won't take the structure from the ct files into account
    tqdm=True, 
    generate_npy=True,
    )  


### From fasta

In [None]:
# Build a datafolder from a fasta file, with both prediction flags switched on.
# The paths are specific to the original author's machine: replace them with your own.
datafolder = DataFolder.from_fasta(
    path_in='/Users/ymdt/src/rouskinhf/data/input_files_for_testing/sequences.fasta',
    path_out='/Users/ymdt/src/rouskinhf/data/datafolders',
    name='sequences',  # per the original note: defaults to the name of the input data
    generate_npy=True,
    tqdm=True,
    predict_dms=True,
    predict_structure=True,
)

# Display the generated DMS array as the cell output.
# allow_pickle=True unpickles objects on load: only use it on files you trust.
np.load(datafolder.get_dms_npy(), allow_pickle=True)

### From an existing local datafolder

In [None]:
# Option 1: open a datafolder that already exists on disk (replace the path with your own).
datafolder = DataFolder.from_local(name='/Users/ymdt/src/rouskinhf/data/datafolders/for_testing')

# Option 2 (alternative to option 1): look the dataset up by name.
# When the whole cell is run, this assignment replaces the value set by option 1.
# NOTE(review): confirm that import_dataset returns an object exposing the DataFolder
# methods (create_repo, upload_folder, get_*) that the following cells call on `datafolder`.
datafolder = import_dataset(
    force_download=False,  # if True, will download the data even if it already exists locally
    data='DMS',  # can be 'DMS' or 'structure'
    name='for_testing',
)

# Push a local datafolder to Hugging Face

### 1. Create a repository if it does not exist

In [None]:
# create_repo takes more arguments than shown here; they are listed at
# https://huggingface.co/docs/huggingface_hub/guides/repository#create-a-repository
# exist_ok=True presumably tolerates an already existing repository -- see the guide above.
datafolder.create_repo(private=True, exist_ok=True)

### 2. Push the datafolder to Hugging Face

In [None]:

# Start uploading the datafolder; upload_folder takes more arguments than shown here, see
# https://huggingface.co/docs/huggingface_hub/guides/upload#upload-a-folder
future = datafolder.upload_folder(
    revision='main', # branch name
    commit_message='Upload demo dataset',
    commit_description='This is a demo dataset',
    run_as_future=True,
)

# A notebook cell only echoes its last expression, so a bare `future.done()` here would be
# evaluated and silently dropped. Print it to actually show the status.
print('upload done:', future.done()) # True if the upload is done

# Wait for the upload to complete (blocking action); the returned value is the cell output.
future.result()

### 3. Check that the datafolder is on Hugging Face

Take a look at https://huggingface.co/rouskinlab

# Explore the datafolder object

In [None]:
# numpy is already imported in the first cell; both imports are kept here so that this
# cell also runs on its own.
import numpy as np
import json

# Summary of the datafolder object and the location of its main folder.
print(datafolder)
print('main folder:',datafolder.get_main_folder())

# Contents of the generated .npy files.
# allow_pickle=True unpickles objects on load: only use it on files you trust.
print('sequences.npy file:\n',np.load(datafolder.get_sequences_npy(), allow_pickle=True))
print('base_pairs.npy file:\n',np.load(datafolder.get_base_pairs_npy(), allow_pickle=True))
print('dms.npy file:\n',np.load(datafolder.get_dms_npy(), allow_pickle=True))

# Read the json through a context manager so the file handle is closed right after
# parsing (a bare json.load(open(...)) leaves the handle open).
with open(datafolder.get_json(), 'r') as json_file:
    json_content = json.load(json_file)
print('json file:\n', json_content)

# Values returned by the remaining getters (file locations rather than file contents).
print('dms.npy file:',datafolder.get_dms_npy())
print('json file:',datafolder.get_json())
print('source files:',datafolder.get_source_files())
print('info file:',datafolder.get_info_file())