In [15]:
# list datasets for yack
from rouskinhf import *
import pandas as pd

datasets = ["zuber", 'bpRNA','RNAstralign','archiveII',"ribo500-blast"]

# Load each source dataset into its own frame (one row per top-level key of
# the mapping returned by get_dataset) and concatenate them in a single call.
# Concatenating inside the loop would re-copy every accumulated row on each
# iteration.
frames = []
for name in datasets:
    ds = pd.DataFrame.from_dict(get_dataset(name), orient='index')
    frames.append(ds)

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(frames, axis=0) if frames else pd.DataFrame()

# Move the row labels out of the index into a regular 'reference' column,
# leaving a fresh 0..n-1 integer index behind.
data = data.reset_index().rename(columns={'index': 'reference'})

In [16]:
# remove sequences below 10nt
sequence_length = data['sequence'].str.len()
print("Remove {} sequences below 10nt".format((sequence_length < 10).sum()))
# .copy() gives an independent frame, so the reference renaming further down
# writes to `data` itself and not to a slice of the previous frame.
data = data[sequence_length >= 10].copy()

# remove sequences with no structure
structure_length = data['structure'].str.len()
has_no_structure = structure_length == 0
print("Remove {} sequences with no structure".format(has_no_structure.sum()))
print(data.loc[has_no_structure, 'reference'])
data = data[structure_length > 0].copy()

# rename non-unique references
# Every row whose reference occurs more than once gets '__<row label>'
# appended. Row labels are unique, so the suffix disambiguates the rows.
# This is done with one vectorized assignment instead of a per-row loop.
is_duplicated = data['reference'].duplicated(keep=False)
row_labels = data.index.to_series().astype(str)
data.loc[is_duplicated, 'reference'] = (
    data.loc[is_duplicated, 'reference'] + '__' + row_labels[is_duplicated]
)

# A suffixed name could in principle collide with an existing reference;
# fail loudly rather than silently dropping rows in to_dict() below.
assert data['reference'].is_unique

data = data.set_index('reference')
dump_json(data.to_dict(orient='index'), 'datasets.json')

Remove 28 sequences below 10nt
Remove 392 sequences with no structure
1312     bpRNA_CRW_45342
1390     bpRNA_CRW_48649
1410     bpRNA_CRW_44793
1430     bpRNA_CRW_46034
1470     bpRNA_CRW_41561
              ...       
86573             Schi_3
95386           srp_Crit
96469              16s_M
97378         srp_Schi_4
97548        srp_Leis_19
Name: reference, Length: 392, dtype: object


In [17]:
# Run the merged 'datasets.json' file through convert() under the name 'yack',
# then write whatever it returns to yack.json.
yack_dataset = convert('json', 'datasets.json', name='yack')
dump_json(yack_dataset, path="yack.json")

Parsing json file: 100%|██████████| 144230/144230 [00:04<00:00, 31667.10it/s]


Over a total of 144230 datapoints, there are:
### OUTPUT
- ALL: 125259 valid datapoints
- INCLUDED: 0 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 18971 duplicate sequences with the same structure / dms / shape
- 0 datapoints removed because of low AUROC (<0.8)


In [19]:
# Commit message shared by every upload_dataset() call in this notebook.
commit_message = "remove sequences below 10nt and sequences with no structure"

In [20]:
# Upload the yack.json file written earlier in this notebook.
# NOTE(review): exist_ok=True presumably allows updating an already existing
# remote dataset instead of failing; confirm against rouskinhf.
upload_dataset('yack.json', exist_ok=True, commit_message=commit_message)

yack.json:   0%|          | 0.00/262M [00:00<?, ?B/s]

## Yack train / valid

In [21]:
from rouskinhf import *

# VALID:
# pick 4096 random references from the dataset
import numpy as np

# Fixed seed so the train/valid split is identical on every run of the notebook.
SPLIT_SEED = 0
VALID_SIZE = 4096

data = get_dataset('yack')
split_rng = np.random.default_rng(SPLIT_SEED)
# replace=False draws distinct references; this raises ValueError if the
# dataset holds fewer than VALID_SIZE entries.
valid_refs = split_rng.choice(list(data.keys()), size=VALID_SIZE, replace=False)

# save valid dataset
data_valid = {k: data[k] for k in valid_refs}
dump_json(data_valid, 'yack_valid.json')
upload_dataset('yack_valid.json', exist_ok=True, commit_message=commit_message)

# TRAIN:
# keep only the references that are not in the valid dataset
# Membership is tested against a set of plain strings: `k in ndarray` scans
# the whole array for every key, whereas a set lookup is constant time.
valid_ref_set = set(valid_refs.tolist())
data_train = {k: data[k] for k in data if k not in valid_ref_set}

# The two splits must partition the dataset exactly.
assert len(data_train) + len(data_valid) == len(data)

dump_json(data_train, 'yack_train.json')
upload_dataset('yack_train.json', exist_ok=True, commit_message=commit_message)

yack_train.json:   0%|          | 0.00/253M [00:00<?, ?B/s]