In [1]:
import pandas as pd

from rouskinhf import get_dataset

In [2]:
# Collect every sequence found in the datasets the test set must not overlap
# with. Each dataset is turned into a frame and transposed so that one
# reference corresponds to one row, exposing a 'sequence' column.
seq = [
    sequence
    for name in ['RNAstralign', 'bpRNA', 'ribo500-blast']
    for sequence in pd.DataFrame(get_dataset(name)).T['sequence'].to_list()
]

# archiveII is the candidate test set, laid out the same way.
data = get_dataset('archiveII')
archive = pd.DataFrame(data).T

# Drop every archiveII entry whose sequence already appears in `seq`.
is_unseen = ~archive.sequence.isin(seq)
data_test = archive[is_unseen]

# Keep only sequences of at most 2000 nucleotides.
is_short_enough = data_test.sequence.str.len() <= 2000
data_test = data_test[is_short_enough]
data_test

Unnamed: 0,sequence,structure
5s_Saccharopolyspora-erythraea-1,GUUGUCGGUGGCGAUAGCGGUGGGGAAACGCCCGGUCCCUUUCCGA...,"[[0, 121], [1, 120], [2, 119], [4, 117], [5, 1..."
5s_Thermotoga-neapolitana-1,UCCCCGGGUGCCGAUACUGGGGCGGGAAACACCCGGUUCCAUUCCG...,"[[1, 120], [2, 119], [3, 118], [4, 117], [5, 1..."
16s_H,CCGCCCGUCAAAGCACCCGAGUGAGGUCCGGAUGAGGCCACCACAC...,"[[2, 87], [5, 83], [6, 82], [10, 77], [11, 76]..."
23s_E,AGCGUACACGGUGGAUGCCCUGGCAGUCAGAGGCGAUGAAGGACGU...,"[[1, 511], [2, 510], [3, 509], [4, 508], [5, 5..."
16s_H_1,AAUAGGUUUGGUCCUAGCCUUUCUAUUAGCUCUUAGUAAGAUUACA...,"[[3, 19], [4, 18], [5, 17], [6, 16], [7, 15], ..."
...,...,...
5s_Spirobolus-sp-1,GUCAACGGCCAUACUACGUUGAAAACACCAGUUCUCGUCUGAUCAC...,"[[0, 117], [1, 116], [2, 115], [3, 114], [4, 1..."
23s_T_6,AGCCGGCGACUCACGGUCGUGGGCGAGCUUAAGCCGUUGAGGCGGA...,"[[1, 706], [2, 705], [3, 704], [4, 703], [5, 7..."
23s_H_6,AGGAAAGUUGAGGAGAGCUGUCCCUAGUACGAGAGGACCGGGAUGG...,"[[1, 156], [2, 155], [3, 154], [4, 153], [5, 1..."
23s_B_6,AGAGGGGUGAGAAUCCCUCCACCGAAUGCCUAAGGGUUCCUGAGGA...,"[[1, 18], [2, 17], [3, 16], [4, 15], [5, 14], ..."


In [3]:
# The family is the part of each reference name before the first underscore
# (e.g. '5s_Spirobolus-sp-1' -> '5s').
# `assign` returns a new frame instead of writing a column into the
# boolean-filtered `data_test`, which avoids pandas' SettingWithCopyWarning and
# keeps this cell safe to re-run.
data_test = data_test.assign(
    family=[ref.split('_')[0] for ref in data_test.index]
)

# Share of each family in the test set, in percent.
data_test.family.value_counts(normalize=True) * 100

5s        60.845070
16s       26.197183
23s        8.450704
RNaseP     2.535211
grp1       1.690141
tRNA       0.281690
Name: family, dtype: float64

## Convert to rouskinHF

In [4]:
import envbash
# Source the ../.env file through envbash before importing from rouskinhf again.
# NOTE(review): presumably this exports the environment variables (credentials,
# data paths) that rouskinhf needs for the conversion and upload below — confirm
# which variables are required, and whether they must be set before rouskinhf
# is first imported at the top of the notebook.
envbash.load.load_envbash('../.env')
from rouskinhf import convert, upload_dataset, get_dataset

## RouskinHF filtering

In [5]:
from rouskinhf import convert, dump_json

# Write the filtered test set to disk as one record per reference
# ({reference: {column: value}}), then hand that file to rouskinhf.
json_path = 'data/archiveII_blast.json'
records = data_test.to_dict(orient='index')
dump_json(records, json_path)

# Convert the JSON dump with rouskinhf's filtering enabled; the call is given
# 'data' as its output path and 'archiveII_blast' as the dataset name.
data = convert('json', json_path, name='archiveII_blast', path_out='data', filter=True)

Parsing json file: 100%|██████████| 355/355 [00:00<00:00, 68295.47it/s]

Over a total of 355 datapoints, there are:
### OUTPUT
- ALL: 355 valid datapoints
- INCLUDED: 0 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape





## Upload to HuggingFace

In [6]:
from rouskinhf import upload_dataset

# Push the converted dataset file; the commit message records the length
# filter applied earlier in this notebook.
converted_json = 'data/archiveII_blast/data.json'
upload_dataset(converted_json, commit_message='removed sequences > 2000 nt', exist_ok=True)