# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import tqdm
# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.tqdm.pandas()

# Load the raw training table; the path is relative to the user's home directory.
df_all = pd.read_csv('~/data/ribonanza/train_data.csv')


In [None]:
# Quality filter: keep the rows with more than 100 reads and a signal-to-noise above 0.75.
df = df_all.loc[lambda t: (t['reads'] > 100) & (t['signal_to_noise'] > 0.75)].copy()

### Compute signal and clip it

In [None]:

# Per-position reactivity columns only: the 'reactivity_error_*' columns share the
# 'reactivity_' prefix, so they have to be excluded explicitly.
react_cols = [c for c in df.columns if c.startswith('reactivity_') and 'error' not in c]
vals = df[react_cols].values
len_seq = df['sequence'].apply(len).values
# Clip the reactivities to [0, 1]. np.clip leaves NaN untouched, so missing
# positions are still NaN here and receive the -1000 sentinel right after.
vals = np.clip(vals, 0, 1)
vals[np.isnan(vals)] = -1000.
# Trim every row to the length of its own sequence. A plain list of arrays is
# assigned on purpose: np.array(..., dtype=object) collapses into a 2-D array
# whenever all rows happen to have the same length, and a 2-D array cannot be
# stored in a single column.
df['signal'] = [vals[i, :len_seq[i]] for i in range(len(vals))]

### Merge duplicate sequences

In [None]:
# Row count before merging, kept for the summary printed after the merge.
l = df.shape[0]
# Highest read count first.
df.sort_values(by='reads', ascending=False, inplace=True)

def nanmean(x, weights, axis=0):
    """Weighted mean of ``x`` along ``axis`` that skips NaN entries.

    A NaN entry of ``x`` is left out of both the weighted sum and the sum of
    weights, so each result is normalised by the weights of the values that
    were actually used. A slice without any valid entry yields NaN.

    Args:
        x: array of values, possibly containing NaN.
        weights: array that broadcasts against ``x``.
        axis: axis along which the mean is taken.

    Returns:
        Array of weighted means with ``axis`` reduced.

    >>> nanmean(np.array([[1,2,3],[np.nan, 10, np.nan]]), np.array([1,3]).reshape(-1,1), axis=0)
    array([1., 8., 3.])
    """
    valid = ~np.isnan(x)
    weighted_sum = np.nansum(weights * x, axis=axis)
    # Weights of the NaN entries are blanked out so that nansum ignores them.
    weight_sum = np.nansum(np.where(valid, weights, np.nan), axis=axis)
    # A slice with no valid entry is 0/0: the NaN result is intended (callers map
    # it back to their own sentinel), so the RuntimeWarning is only noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        return weighted_sum / weight_sum

def average_signal(df, x):
    """Collapse the signals of one group into a single read-weighted signal.

    ``x`` is the group's 'signal' Series; the weights are the 'reads' values of
    ``df`` at the same index labels. Entries equal to the -1000 sentinel count
    as missing while averaging, and positions for which the average is NaN are
    set back to -1000. A group made of a single row is returned unchanged.
    """
    if len(x) != 1:
        stacked = np.stack(x.values)
        stacked[stacked == -1000.] = np.nan
        read_weights = df.loc[x.index, 'reads'].values.reshape(-1, 1)
        merged = nanmean(stacked, weights=read_weights, axis=0)
        merged[np.isnan(merged)] = -1000.
        return merged
    return x.values[0]

# make a weighted average of the signals for the same sequence and experiment_type
# (np.array builds a fresh array out of every stored signal first)
df['signal'] = df['signal'].apply(np.array)
# One row per (sequence_id, sequence, experiment_type): 'signal' is the read-weighted
# average returned by average_signal and 'reads' is summed over the group.
temp = df.groupby(['sequence_id','sequence', 'experiment_type']).agg({'signal': lambda x: average_signal(df, x), 'reads': 'sum'}).reset_index()
# Attach the aggregated 'signal'/'reads' to the original rows; the pre-merge versions of
# those two columns receive the '_remove' suffix and are dropped on the next line.
df = pd.merge(df, temp, on=['sequence_id','sequence', 'experiment_type'], suffixes=('_remove', ''), how='right')
df.drop(columns=[c for c in df.columns if c.endswith('_remove')], inplace=True)
# Every row of a group now carries the same aggregated values, so a single row per group
# is kept. All other columns (e.g. reactivity_error_*) are simply those of the kept row.
df.drop_duplicates(subset=['sequence_id','sequence', 'experiment_type'], inplace=True)

# l is the row count recorded before the merge.
print("ALL: We merge {}/{} duplicate sequences\n{} sequences left".format(l-len(df), l, len(df)))

### Add error

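Note: for merged signals, the error is not recomputed. It is copied from a single one of the merged rows, so it overestimates the error of the averaged signal.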

In [None]:
# Per-position error columns, trimmed to the length of each sequence.
react_cols = [name for name in df.columns if name.startswith('reactivity_error_')]
len_seq = df['sequence'].apply(len).values
vals = df[react_cols].values
# Missing errors get the -1000 sentinel; everything is rounded to 4 decimals.
np.putmask(vals, np.isnan(vals), -1000.)
vals = np.round(vals, 4)
vals = np.array([row[:n] for row, n in zip(vals, len_seq)], dtype=object)
df['error'] = vals.tolist()

### Aggregate signals by sequence and dump datasets

In [None]:
import json

def dump_df(df, path):
    """Write one JSON entry per sequence_id, combining its DMS and SHAPE rows.

    The 'DMS_MaP' rows contribute 'dms' / 'error_dms' and the '2A3_MaP' rows
    contribute 'shape' / 'error_shape'. Both sides are outer-joined on
    (sequence, sequence_id), so a sequence measured with only one experiment
    lacks the other pair of keys. The file holds a single JSON object, written
    with one entry per line.
    """
    id_cols = ['sequence_id', 'sequence']
    dms_part = df[df['experiment_type'] == 'DMS_MaP'].rename(columns={'signal': 'dms'})
    shape_part = df[df['experiment_type'] == '2A3_MaP'].rename(columns={'signal': 'shape'})
    merged = pd.merge(
        dms_part[id_cols + ['dms', 'error']],
        shape_part[id_cols + ['shape', 'error']],
        on=['sequence', 'sequence_id'],
        suffixes=('_dms', '_shape'),
        how='outer',
    )
    merged = merged.drop_duplicates(subset=['sequence_id'], keep='first')
    records = merged.set_index('sequence_id').to_dict(orient='index')

    out = {}
    for ref, fields in records.items():
        kept = {}
        for name, value in fields.items():
            # a float NaN marks a field that does not exist for this sequence
            if type(value) == float and np.isnan(value):
                continue
            kept[name] = value.tolist() if type(value) == np.ndarray else value
        out[ref] = kept

    with open(path, 'w') as f:
        f.write('{\n')
        for position, (ref, fields) in enumerate(out.items()):
            # every entry but the first is preceded by the separator
            if position > 0:
                f.write(',\n')
            # [1:-1] strips the braces of the one-entry object
            f.write(json.dumps({ref: fields})[1:-1])
        f.write('\n}')

# Write the combined DMS / SHAPE entries of df to disk.
dump_df(df, 'data/ribonanza.json')

## Analyse eternafold structure prediction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import tqdm

# Root folder that is walked for the per-sequence '.txt' base-pair files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
p = '/Users/yvesmartin/data/ribonanza/Ribonanza_bpp_files/extra_data'
# Path of one example file; it is not referenced in the cells visible here.
ex = '/Users/yvesmartin/data/ribonanza/Ribonanza_bpp_files/extra_data/3/0/1/0a90ac1768b5.txt'

# Helpers that read the scored base-pair files and threshold them into pair lists

def read_file(path):
    """Load a space-separated, header-less file into a DataFrame with columns b1, b2 and score."""
    table = pd.read_csv(path, header=None, sep=' ')
    return table.set_axis(['b1', 'b2', 'score'], axis=1)

def df_to_bpp(df, thresh=0.5):
    """Turn a table of scored base pairs into a tuple of 0-indexed pairs.

    Args:
        df: DataFrame with 1-indexed positions in 'b1' and 'b2' and a 'score' column.
        thresh: only the pairs whose score is strictly greater than this are kept.

    Returns:
        A tuple of (b1 - 1, b2 - 1) tuples (empty when no pair passes the
        threshold), or None, after printing an error, when some base would be
        paired more than once.
    """
    df = df[df['score'] > thresh]
    # Each base may take part in at most one pair. Looking at b1 and b2 separately
    # would miss a base that is b1 of one pair and b2 of another, so the positions
    # of both columns are pooled before counting the distinct ones.
    paired_bases = set(df['b1']).union(df['b2'])
    if len(paired_bases) != 2 * len(df):
        print('Error: some bases are paired more than once')
        return
    return tuple(zip(df['b1']-1, df['b2']-1 )) # 0-indexing

def iterate_folder(p):
    """Walk ``p`` recursively and yield (reference, full path) for every '.txt' file.

    The reference is the part of the file name that precedes its first dot.
    """
    for folder, _subfolders, names in os.walk(p):
        for name in names:
            if not name.endswith(".txt"):
                continue
            yield name.split('.')[0], os.path.join(folder, name)

# Collect (reference, pairs) for every file that yields a usable, non-empty structure.
out = []
for ref, path in tqdm.tqdm(iterate_folder(p), total=len(list(iterate_folder(p))), desc='Reading files', colour='green'):
    # A dedicated name is used so that the preprocessing frame `df` is not overwritten.
    pairs_df = read_file(path)
    bpp = df_to_bpp(pairs_df)
    # df_to_bpp returns None for an inconsistent pairing, so None has to be tested
    # before asking for a length; empty structures are skipped as well.
    if bpp is None or len(bpp) == 0:
        continue
    out.append((ref, bpp))

# The object is written by hand, one entry per line.
with open('data/bpp.json', 'w') as f:
    f.write('{\n')
    for idx, (k, v) in enumerate(out):
        # [1:-1] strips the braces of the one-entry object
        f.write(json.dumps({k:{'structure':v}})[1:-1])
        if idx < len(out)-1:
            f.write(',\n')
    f.write('\n}')

# Read the file back (this also checks that it parses); the context manager
# closes the handle, which a bare open() inside json.load would leave open.
with open('data/bpp.json') as f:
    out = json.load(f)

## Merge the two datasets by the sequence

In [8]:
import pandas as pd
# Both files are indexed by the sequence reference; the structures are attached
# to the reactivity entries with a left join on that index.
data = pd.read_json('data/ribonanza.json', orient='index')
bpp = pd.read_json('data/bpp.json', orient='index')

data = data.merge(bpp, how='left', left_index=True, right_index=True)
data

Unnamed: 0,sequence,dms,error_dms,shape,error_shape,structure
0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
0001eb71980d,GGGAACGACUCGAGUAGAGUCGAAAAAAAGGCUUAUCAAGAGAGGU...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",,,"[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
0002156decc7,GGGAACGACUCGAGUAGAGUCGAAAAGAAGAAGACCGGCAUCAUCG...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
...,...,...,...,...,...,...
ffe9e6bff7c1,GGGAACGACUCGAGUAGAGUCGAAAACCUCCAGGUGAUCAAUUUAA...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
fff1a84815c9,GGGAACGACUCGAGUAGAGUCGAAAAGACCAGGGCUUUAACUGCAG...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
fff75b55b2e0,GGGAACGACUCGAGUAGAGUCGAAAAUUACAGUGACUAAUGGACGC...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
fffd37f4d330,GGGAACGACUCGAGUAGAGUCGAAAAAAAGAUAUGGAGGAGAGUAA...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."


In [11]:
# Display the merged table again.
data

Unnamed: 0,sequence,dms,error_dms,shape,error_shape,structure
0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
0001eb71980d,GGGAACGACUCGAGUAGAGUCGAAAAAAAGGCUUAUCAAGAGAGGU...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",,,"[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
0002156decc7,GGGAACGACUCGAGUAGAGUCGAAAAGAAGAAGACCGGCAUCAUCG...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
...,...,...,...,...,...,...
ffe9e6bff7c1,GGGAACGACUCGAGUAGAGUCGAAAACCUCCAGGUGAUCAAUUUAA...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
fff1a84815c9,GGGAACGACUCGAGUAGAGUCGAAAAGACCAGGGCUUUAACUGCAG...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
fff75b55b2e0,GGGAACGACUCGAGUAGAGUCGAAAAUUACAGUGACUAAUGGACGC...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."
fffd37f4d330,GGGAACGACUCGAGUAGAGUCGAAAAAAAGAUAUGGAGGAGAGUAA...,,,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ..."


In [9]:
import numpy as np
def post_process(attributes, filterAC=False):
    """Clean one entry before it is serialised to JSON.

    Steps, applied in this order:
      1. every list except 'structure' is rounded to 4 decimals and every float
         NaN is replaced by None (``attributes`` is updated in place);
      2. if ``filterAC`` is set, the values of 'dms' and 'error_dms' are replaced
         by -1000 at the positions whose base in 'sequence' is neither A nor C;
      3. a new dict is returned without the None values and the empty lists.

    Args:
        attributes: dict of the fields of one entry; it must contain 'sequence'
            when ``filterAC`` is set and a DMS field is present.
        filterAC: whether to mask the DMS fields outside of A/C positions.

    Returns:
        A new dict holding only the fields that carry a value.
    """
    # Step 1: normalise the values. This has to be finished before the A/C filter
    # runs, otherwise a missing (NaN) DMS field would be zipped as if it were a list.
    for k, v in attributes.items():
        if type(v) == list and k != 'structure':
            attributes[k] = [round(x, 4) for x in v]
        elif isinstance(v, float) and np.isnan(v):
            attributes[k] = None

    # Step 2: mask the DMS fields once, after normalisation.
    if filterAC:
        for key in ('dms', 'error_dms'):
            track = attributes.get(key)
            if track is not None:
                attributes[key] = [x if s in 'AC' else -1000. for x, s in zip(track, attributes['sequence'])]

    # Step 3: drop whatever carries no information.
    return {
        k: v for k, v in attributes.items()
        if v is not None
        and not (type(v) == list and len(v) == 0)
        and not (isinstance(v, float) and np.isnan(v))
    }

In [17]:
import json
import numpy as np
import tqdm

# Write the dataset twice: once as is, and once with post_process's A/C filter on the DMS fields.
for path, filterAC in (('data/clean_ribonanza.json', False), ('data/clean_ribonanza_AC.json', True)):
    with open(path, 'w') as f:
        f.write('{\n')
        for idx, (k, v) in tqdm.tqdm(enumerate(data.to_dict(orient='index').items()), total=len(data), desc=path, colour='green'):
            # every entry but the first is preceded by the separator
            if idx > 0:
                f.write(',\n')
            # [1:-1] strips the braces of the one-entry object
            f.write(json.dumps({k:post_process(v, filterAC)})[1:-1])
        f.write('\n}')

data/clean_ribonanza_AC.json: 100%|[32m██████████[0m| 302377/302377 [01:40<00:00, 3016.89it/s]


# Filter using rouskinHF

In [21]:

from rouskinhf import convert

# Run rouskinhf's convert on both cleaned JSON files, with filter=True.
for path, name in (
    ('data/clean_ribonanza.json', 'ribonanza'),
    ('data/clean_ribonanza_AC.json', 'ribonanza_AC'),
):
    convert('json', path, name=name, path_out='data', filter=True)

Parsing json file: 100%|██████████| 302377/302377 [00:24<00:00, 12351.22it/s]


Over a total of 302377 datapoints, there are:
### OUTPUT
- ALL: 162155 valid datapoints
- INCLUDED: 0 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape
- 140222 datapoints removed because of low AUROC (<0.8)


Parsing json file: 100%|██████████| 302377/302377 [00:23<00:00, 13109.33it/s]


Over a total of 302377 datapoints, there are:
### OUTPUT
- ALL: 162155 valid datapoints
- INCLUDED: 0 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape
- 140222 datapoints removed because of low AUROC (<0.8)


## Reintegrate error

In [2]:
import json
# References present in data/ribonanza/data.json. The context manager closes the
# handle, which a bare open() inside json.load would leave open.
with open('data/ribonanza/data.json') as f:
    refs = set(json.load(f).keys())

In [11]:
import json
import pandas as pd
import numpy as np
for file in ('ribonanza', 'ribonanza_AC'):
    # keep only the entries whose reference is in refs; post_process drops the NaN fields
    data = pd.read_json('./data/clean_{}.json'.format(file), orient='index')
    data = data.loc[data.index.isin(refs)]
    with open('./data/{}/data.json'.format(file), 'w') as f:
        f.write('{\n')
        for idx, (k, v) in enumerate(data.to_dict(orient='index').items()):
            # every entry but the first is preceded by the separator
            if idx > 0:
                f.write(',\n')
            f.write(json.dumps({k:post_process(v)})[1:-1])
        f.write('\n}')

## Push to HF

In [12]:
from rouskinhf import upload_dataset
import envbash
# Load the variables defined in ../.env.
# NOTE(review): presumably the credentials needed by the upload below; confirm.
envbash.load.load_envbash('../.env')

# Upload the A/C-filtered dataset.
upload_dataset(
    datapath='./data/ribonanza_AC/data.json',
    exist_ok=True,
)

data.json:   0%|          | 0.00/935M [00:00<?, ?B/s]

In [13]:
# Upload the dataset written without the A/C filter.
upload_dataset(
    datapath='./data/ribonanza/data.json',
    exist_ok=True,
)

data.json:   0%|          | 0.00/893M [00:00<?, ?B/s]