In [1]:
import pandas as pd
import numpy as np
from rouskinhf import get_dataset, convert, upload_dataset
import torch

import os

import sys
sys.path.append('..')

In [2]:
data = get_dataset('RNAStralign', force_download=True)
data = pd.DataFrame(data).T[['sequence', 'structure', 'family']]
data = data.drop_duplicates(subset='sequence', keep='first')

RNAStralign: Downloading dataset from HuggingFace Hub...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data.json:   0%|          | 0.00/65.0M [00:00<?, ?B/s]

RNAStralign: Download complete. File saved at data/RNAStralign/data.json


In [3]:
data['family'] = data.apply(lambda x: x['family'].split('_')[0], axis=1)
data.loc[data['family'] == 'group', 'family'] = 'Group_I_intron'
data.family.value_counts()

family
16S               9550
5S                9323
tRNA              6432
Group_I_intron     680
SRP                440
tmRNA              404
RNaseP             155
telomerase          37
Name: count, dtype: int64

In [None]:
from seismic2dreem.dump import dump_json

# Upload each split
for family in data.family.unique():

    df = data.loc[data.family == family, :]
    
    folder_name = f'data/RNAStralign_{family}'
    if not os.path.exists(folder_name): os.mkdir(folder_name)
    df.T.to_json(os.path.join(folder_name,'data.json'), orient='columns', indent=2)

    upload_dataset(
        os.path.join(folder_name,'data.json'),
        commit_message='corrected duplicate names',
        exist_ok=True,
)

In [10]:
# Upload a small part of the dataset as validation
df = data.sample(3000)
folder_name = f'data/RNAStralign_validation'
if not os.path.exists(folder_name): os.mkdir(folder_name)
df.T.to_json(os.path.join(folder_name,'data.json'), orient='columns', indent=2)

upload_dataset(
    os.path.join(folder_name,'data.json'),
    commit_message='first commit',
    exist_ok=True,
)

data.json:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

In [9]:
# list directories
os.listdir('data')

['RNAStralign_Group_I_intron',
 'RNAStralign_5S',
 'RNAStralign_telomerase',
 'RNAStralign_SRP',
 'RNAStralign_tmRNA',
 'RNAStralign_RNaseP',
 'RNAStralign',
 'RNAStralign_16S',
 'RNAStralign_tRNA']

In [3]:
def get_base_pairs_nt(sequence, structure):

    base_pairs_nt = []

    for pair in structure:
        pair = [sequence[pair[0]], sequence[pair[1]]]
        pair.sort()
        base_pairs_nt.append(''.join(pair))

    return base_pairs_nt

In [4]:
from tqdm import tqdm

all_base_pairs = []

for seq, struct in tqdm(zip(data['sequence'], data['structure'])):
    all_base_pairs += get_base_pairs_nt(seq, struct)

np.unique(all_base_pairs, return_counts=True)

27082it [00:00, 37751.41it/s]


(array(['AA', 'AC', 'AG', 'AU', 'CC', 'CG', 'CU', 'GG', 'GU', 'UU'],
       dtype='<U2'),
 array([  10416,   34523,   92471,  943569,    5225, 2373086,   10718,
          17122,  544813,   30972]))

In [5]:
import plotly.graph_objects as go

unique_base_pairs, counts = np.unique(all_base_pairs, return_counts=True)

fig = go.Figure(data=[go.Bar(x=unique_base_pairs, y=counts)])
fig.update_layout(title='Base pairs frequency', xaxis_title='Base pair', yaxis_title='Frequency')
fig.update_yaxes(type="log")
fig.show()