## Ribo500

Ribo500 is a subset of Ribonanza. It contains only the references with more than 500 reads and a signal-to-noise ratio (S/N) greater than 1.

In [1]:
import json
import pandas as pd
import rouskinhf as rhf
import numpy as np
import json
import tqdm
# Register tqdm's pandas integration so that `DataFrame.progress_apply`
# (used below when masking the signals) is available and shows a progress bar.
tqdm.tqdm.pandas()

In [2]:
# Load data
# Read the cleaned Ribonanza JSON inside a context manager so the file handle
# is closed as soon as parsing finishes (a bare `open(...)` would leave it
# open until the object is garbage-collected).
# NOTE(review): `data` is not used by the filtering below — confirm it is
# still needed here.
with open('data/clean_ribonanza.json') as fh:
    data = json.load(fh)
df_all = pd.read_csv('~/data/ribonanza/train_data.csv')

# filter
# Keep only the rows with more than 500 reads and a signal-to-noise above 1.
high_quality = (df_all['reads'] > 500) & (df_all['signal_to_noise'] > 1)
df = df_all[high_quality].copy()

# Rename the id column to `reference`, then keep a single row per
# (reference, experiment_type). Sorting by descending read count first means
# `keep='first'` retains the row with the most reads.
df = (
    df.rename(columns={'sequence_id': 'reference'})
    .sort_values('reads', ascending=False)
    .drop_duplicates(['reference', 'experiment_type'], keep='first')
)

In [3]:
# get the signals
# Collect every column whose name starts with 'reactivity_0' and turn each row
# of those columns into a plain Python list.
# NOTE(review): the prefix presumably excludes other `reactivity_*` columns
# (e.g. error columns) — verify against the CSV header.
reactivity_cols = [name for name in df.columns if name.startswith('reactivity_0')]
signals = df[reactivity_cols].values.tolist()

# From here on `df` only carries the identifying columns plus the signal.
# This overwrites `df`, so the cell must be run exactly once on the frame
# produced by the filtering cell.
df = df[['reference', 'sequence', 'experiment_type']].copy()
df['signal'] = signals


def _mask_signal(row):
    """Return the row's signal with unusable positions replaced by -1000.

    A value is kept only when the base at the same position is 'A' or 'C'
    and the value is not NaN. `zip` stops at the shorter input, so the result
    never has more entries than the sequence has bases.

    NOTE(review): the A/C-only mask is applied to every row regardless of
    `experiment_type` — confirm this is intended for the 2A3_MaP rows too.
    """
    masked = []
    for value, base in zip(row['signal'], row['sequence']):
        if base in 'AC' and not np.isnan(value):
            masked.append(value)
        else:
            masked.append(-1000.)
    return masked


df['signal'] = df.progress_apply(_mask_signal, axis=1)
df

100%|██████████| 357388/357388 [00:16<00:00, 21420.05it/s]


Unnamed: 0,reference,sequence,experiment_type,signal
1638157,3e6c574877dd,GGGAACGACUCGAGUAGAGUCGAAAAAUGACGGGUCCUCUGAGCUU...,DMS_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
1625526,ebe90364da34,GGGAACGACUCGAGUAGAGUCGAAAAACUUUUCGACUCUGAUACCA...,DMS_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
1627292,3e6c574877dd,GGGAACGACUCGAGUAGAGUCGAAAAAUGACGGGUCCUCUGAGCUU...,2A3_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
1622797,ebe90364da34,GGGAACGACUCGAGUAGAGUCGAAAAACUUUUCGACUCUGAUACCA...,2A3_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
1636948,04b293ab47f6,GGGAACGACUCGAGUAGAGUCGAAAAGAGAGCGUAGCACACACACA...,DMS_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
...,...,...,...,...
947668,6395be7c964b,GGGAACGACUCGAGUAGAGUCGAAAACGGCGAGCGUCCCGGACCGG...,DMS_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
612108,ea1da33f1fd1,GGGAACGACUCGAGUAGAGUCGAAAAGCUAAUAUGCUAGAUGUCUC...,2A3_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
1587570,6dce49c4a068,GGGAACGACUCGAGUAGAGUCGAAAACAGCGGACCAGCAAAGCAUG...,DMS_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
1475958,bbea23ef018b,GGGAACGACUCGAGUAGAGUCGAAAAUGGCCUGCGUACUGGCGCUC...,2A3_MaP,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."


In [4]:
# Split the long table by experiment type and put the two signals side by
# side: the 2A3_MaP signal becomes `shape`, the DMS_MaP signal becomes `dms`.
is_shape = df['experiment_type'] == '2A3_MaP'
is_dms = df['experiment_type'] == 'DMS_MaP'

shape_part = df[is_shape].rename(columns={'signal': 'shape'})[['sequence', 'shape', 'reference']]
dms_part = df[is_dms].rename(columns={'signal': 'dms'})[['sequence', 'dms', 'reference']]

# The outer merge keeps references that were measured with only one of the
# two experiments; the missing signal is left as NaN.
df_out = shape_part.merge(
    dms_part,
    on=['reference', 'sequence'],
    how='outer',
).set_index('reference')

df_out

Unnamed: 0_level_0,shape,sequence,dms
reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3e6c574877dd,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",GGGAACGACUCGAGUAGAGUCGAAAAAUGACGGGUCCUCUGAGCUU...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
ebe90364da34,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",GGGAACGACUCGAGUAGAGUCGAAAAACUUUUCGACUCUGAUACCA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
04b293ab47f6,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",GGGAACGACUCGAGUAGAGUCGAAAAGAGAGCGUAGCACACACACA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
438571322af3,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",GGGAACGACUCGAGUAGAGUCGAAAAUAGUGCGUGGACGCACCCAC...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
6ea263220d58,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",GGGAACGACUCGAGUAGAGUCGAAAAGAGCGUCGCACACACGCAGA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
...,...,...,...
e7e971271ffb,,GGGAACGACUCGAGUAGAGUCGAAAAUUCUUUAGUAAUUACCUAAA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
74b7990f9332,,GGGAACGACUCGAGUAGAGUCGAAAAGAUAUGGACUCAGGGAGUGA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
d5ef12473c37,,GGGAACGACUCGAGUAGAGUCGAAAAGGUGAAUGUGUUCCUGAGCC...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."
9873f40d0f30,,GGGAACGACUCGAGUAGAGUCGAAAAGCCGUUUUCAAUGUGUAGAU...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ..."


In [5]:
# Load data/bpp.json into a DataFrame; with orient='index' the top-level JSON
# keys become the row labels, which the next merge aligns on.
# NOTE(review): presumably keyed by reference id — verify against the file.
bpp = pd.read_json(
    'data/bpp.json',
    orient='index',
)

In [6]:
# Attach the bpp columns to each reference by aligning on the index. The left
# merge keeps every row of `df_out`; references missing from `bpp` get NaN.
# This reassigns `df_out` from itself, so re-running the cell would merge the
# bpp columns a second time.
df_out = df_out.merge(
    bpp,
    how='left',
    left_index=True,
    right_index=True,
)

In [7]:
# Turn the frame into a {reference: {column: value}} mapping and write it out.
# `to_dict(orient='index')` requires the index (the references) to be unique.
records_by_reference = df_out.to_dict(orient='index')
rhf.dump_json(records_by_reference, 'data/ribo500.json')

In [8]:
from rouskinhf import convert

# Run rouskinhf's converter on the JSON written above. In the recorded run it
# parses 209121 datapoints and reports 124811 as valid, the rest being removed
# for low AUROC (<0.8).
# NOTE(review): this rebinds `data`, which earlier held the parsed
# clean_ribonanza.json — confirm the earlier value is no longer needed.
data = convert('json', 'data/ribo500.json')

Parsing json file: 100%|██████████| 209121/209121 [00:14<00:00, 14093.01it/s]


Over a total of 209121 datapoints, there are:
### OUTPUT
- ALL: 124811 valid datapoints
- INCLUDED: 0 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape
- 84310 datapoints removed because of low AUROC (<0.8)


In [9]:
from rouskinhf import upload_dataset
import envbash
# Load the variables defined in ../.env before uploading.
# NOTE(review): presumably this provides the credentials `upload_dataset`
# needs — confirm; the file itself is not shown here.
envbash.load.load_envbash('../.env')

# Upload the dataset file. `exist_ok=True` is passed through to
# `upload_dataset`.
# NOTE(review): ./data/ribo500/data.json is presumably the output of the
# `convert` step above — verify.
upload_dataset(
    datapath='./data/ribo500/data.json',
    exist_ok=True,
)

data.json:   0%|          | 0.00/408M [00:00<?, ?B/s]