In [1]:
import pandas as pd
import numpy as np
import torch

In [27]:
def read_fasta(path):
    """Read a three-lines-per-record FASTA-like file into a DataFrame.

    Every record must span exactly three consecutive lines: a header line
    containing ``>`` followed by the reference name, the sequence, and the
    structure. Blank lines at the end of the file are ignored.

    :param path: Path of the file to read
    :return: DataFrame with the columns ``reference``, ``sequence``
        (upper-cased) and ``structure``, one row per record
    :raises ValueError: If the number of lines is not a multiple of three or
        if a header line does not contain ``>``
    """
    with open(path) as f:
        lines = [line.strip() for line in f]

    # A final empty line (very common in hand-edited files) is not a record;
    # without this it would be parsed as a header and crash.
    while lines and not lines[-1]:
        lines.pop()

    if len(lines) % 3 != 0:
        raise ValueError(
            f"{path}: expected 3 lines per record (header, sequence, structure), "
            f"got {len(lines)} lines"
        )

    refs = []
    for header in lines[0::3]:
        if '>' not in header:
            raise ValueError(f"{path}: expected a '>' header line, got {header!r}")
        # Keep the text between the first and the second '>' (if any)
        refs.append(header.split('>')[1])

    seqs = [seq.upper() for seq in lines[1::3]]
    structures = lines[2::3]

    return pd.DataFrame({'reference': refs, 'sequence': seqs, 'structure': structures})

def dotbracket2Matrix(dotbracket):
    """
    Build the pairing matrix encoded by a dotbracket string.

    Only '(' and ')' are interpreted; every other character is skipped.
    A '(' that is never closed contributes no pair.

    :param dotbracket: Dotbracket notation
    :return: Pairing matrix of size (len(dotbracket), len(dotbracket)),
        as built by pairList2pairMatrix
    :raises IndexError: If a ')' has no matching '(' before it
    """

    open_positions = []
    pairs = []

    for position, symbol in enumerate(dotbracket):
        if symbol == ')':
            # Closes the most recently opened, still unmatched '('
            pairs.append([open_positions.pop(), position])
        elif symbol == '(':
            open_positions.append(position)

    n_positions = len(dotbracket)
    return pairList2pairMatrix(pairs, n_positions)

def pairList2pairMatrix(pair_list, len_seq):
    """
    Turn a list of index pairs into a symmetric 0/1 pairing matrix.

    :param pair_list: Sequence of [i, j] pairs (cast to int before use)
    :param len_seq: Side length of the square output matrix
    :return: torch tensor of shape (len_seq, len_seq) holding 1.0 at
        [i, j] and [j, i] for every pair and 0 elsewhere
    """
    matrix = torch.zeros((len_seq, len_seq))
    pairs = np.array(pair_list).astype(int)

    # Nothing to mark: an empty pair list yields the all-zero matrix
    if len(pairs) == 0:
        return matrix

    left, right = pairs[:, 0], pairs[:, 1]
    # Fill both triangles so the matrix is symmetric
    matrix[left, right] = 1.0
    matrix[right, left] = 1.0

    return matrix

def dotbracket2Pairs(dotbracket):
    """
    Extract the base pairs encoded by a dotbracket string.

    Only '(' and ')' are interpreted; every other character is skipped.
    A '(' that is never closed contributes no pair.

    :param dotbracket: Dotbracket notation
    :return: list of [opening_index, closing_index] pairs (0-based),
        ordered by opening index
    :raises IndexError: If a ')' has no matching '(' before it
    """

    open_positions = []
    pairs = []

    for position, symbol in enumerate(dotbracket):
        if symbol == ')':
            # Closes the most recently opened, still unmatched '('
            pairs.append([open_positions.pop(), position])
        elif symbol == '(':
            open_positions.append(position)

    # Pairs are collected in closing order; report them in opening order
    return sorted(pairs, key=lambda pair: pair[0])

def compute_f1(pred_matrix, target_matrix, threshold=0.5):
    """
    F1 score between a predicted and a reference pairing matrix.

    The prediction is binarised with ``pred > threshold``; the target is
    used as given.

    :param pred_matrix: Predicted pairing matrix probability  (L,L)
    :param target_matrix: True binary pairing matrix (L,L)
    :param threshold: Cut-off above which a predicted entry counts as paired
    :return: F1 score for this RNA structure as a Python float; 1.0 when
        neither matrix contains any pair
    """

    binary_pred = (pred_matrix > threshold).float()
    total = binary_pred.sum() + target_matrix.sum()

    # No pair predicted and none expected: treated as a perfect match
    if total == 0:
        return 1.0

    overlap = (binary_pred * target_matrix).sum()
    return (2 * overlap / total).item()

In [24]:
# Sanity check: sorting pairs by their opening index.
# The variable is deliberately NOT called `list`, which would shadow the
# builtin for every cell executed afterwards in the same kernel.
example_pairs = [[1, 10], [3, 8], [2, 9], [4, 7]]
example_pairs.sort(key=lambda pair: pair[0])
example_pairs

[[1, 10], [2, 9], [3, 8], [4, 7]]

## Analysis of predictions

In [40]:
# References of the genes to evaluate
refs_sce188 = np.loadtxt('data/sce188.txt', dtype=str)

# RNAstructure predictions, indexed by reference
rnastructure_pred = read_fasta('data/RNAstructure_sce.fasta').set_index('reference').rename(columns={'structure': 'structure_pred'})

# Reference structures: tab-separated file without header (reference, sequence, structure)
ground_truth = pd.read_csv('data/sce_genes_folded.tab', sep='\t', header=None).rename(columns={0: 'reference', 1: 'sequence', 2:'structure_true'}).set_index('reference')

# Join on explicit keys. Without `on=`, merge uses the only shared column
# ('sequence'), which drops the reference and can cross-match different
# genes that happen to have the same sequence.
df_comparison = ground_truth.loc[refs_sce188].reset_index().merge(
    rnastructure_pred.loc[refs_sce188].reset_index(),
    on=['reference', 'sequence'],
    how='inner',
)

# compute_f1 expects (prediction, target)
df_comparison['F1'] = df_comparison.apply(lambda x: compute_f1(dotbracket2Matrix(x['structure_pred']), dotbracket2Matrix(x['structure_true'])), axis=1)
df_comparison['length'] = df_comparison['sequence'].apply(len)

In [41]:
import plotly.express as px
fig = px.scatter(df_comparison, x='length', y='F1', trendline='ols')
fig.show()

# Mean F1 of long vs. short sequences. The two groups partition the data:
# a sequence of exactly `length_cutoff` nucleotides counts as long instead
# of being left out of both means.
length_cutoff = 500
is_long = df_comparison['length'] >= length_cutoff
print(df_comparison.loc[is_long, 'F1'].mean())
print(df_comparison.loc[~is_long, 'F1'].mean())

0.5073022061476001
0.4806272683025741


In [3]:
# Reference and RNAstructure-predicted structures, both indexed by reference
ground_truth_lcRNA = read_fasta('data/lncRNAs.fasta').set_index('reference').rename(columns={'structure': 'structure_true'})

rnastructure_pred_lncRNA = read_fasta('data/RNAstructure_lnc.fasta').set_index('reference').rename(columns={'structure': 'structure_pred'})

# Join on explicit keys instead of relying on whichever columns happen to be shared
df_comparison_lncRNA = ground_truth_lcRNA.reset_index().merge(
    rnastructure_pred_lncRNA.reset_index(),
    on=['reference', 'sequence'],
    how='inner',
)

# compute_f1 expects (prediction, target)
df_comparison_lncRNA['F1'] = df_comparison_lncRNA.apply(lambda x: compute_f1(dotbracket2Matrix(x['structure_pred']), dotbracket2Matrix(x['structure_true'])), axis=1)
df_comparison_lncRNA['length'] = df_comparison_lncRNA['sequence'].apply(len)

import plotly.express as px
# 'reference' is already a column after the merge, so no reset_index is needed
fig = px.scatter(df_comparison_lncRNA, x='length', y='F1', trendline='ols', hover_data=['reference'])
fig.show()

# Mean F1 of long vs. short sequences. The two groups partition the data:
# a sequence of exactly `length_cutoff` nucleotides counts as long instead
# of being left out of both means.
length_cutoff = 500
is_long = df_comparison_lncRNA['length'] >= length_cutoff
print(df_comparison_lncRNA.loc[is_long, 'F1'].mean())
print(df_comparison_lncRNA.loc[~is_long, 'F1'].mean())
print(df_comparison_lncRNA['F1'].mean())
print(df_comparison_lncRNA['F1'].std())


0.42194083239883184
0.3909269869327545
0.4184948495692677
0.12894288306515303


## Create the lncRNA dataset

In [44]:
# Build the dataset frame in one chain: index by reference, replace the
# dot-bracket string by its list of base pairs, and keep only sequences of
# at most 2000 nucleotides.
ground_truth_lcRNA = (
    read_fasta('data/lncRNAs.fasta')
    .set_index('reference')
    .assign(structure=lambda df: df['structure'].apply(dotbracket2Pairs))
    .loc[lambda df: df['sequence'].apply(len) <= 2000]
)
ground_truth_lcRNA

Unnamed: 0_level_0,sequence,structure
reference,Unnamed: 1_level_1,Unnamed: 2_level_1
lincRNAp21_IRAlu_Sense,GGCUGGGCGUGGUGGCUCACGCCUGUAAUCCCACCACUUUGGGAGG...,"[[1, 46], [2, 45], [3, 44], [4, 23], [5, 22], ..."
lincRNAp21_IRAlu_Antisense,UUCUUUUUUUUUUUUUAUUGGAGAUGGAGUCUCACUCUGUUGCUCA...,"[[19, 118], [20, 117], [25, 112], [26, 111], [..."
MEG3,AGCCCCUAGCGCAGACGGCGGAGAGCAGAGAGGGAGCGCGCCUUGG...,"[[1, 195], [2, 194], [3, 193], [4, 192], [5, 1..."
ROX2,TGTTGCGGCATTCGCGGCCTGGTCACACTAAGCTAGGGCTACTTTT...,"[[113, 167], [114, 166], [115, 165], [116, 164..."
NORAD1_23C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[3, 752], [4, 751], [5, 750], [6, 749], [10, ..."
NORAD1_37C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[3, 752], [4, 751], [5, 750], [6, 749], [10, ..."
NORAD1_55C,AGUUCCGGUCCGGCAGAGAUCGCGGAGAGACGCAGAACGCAGCCCG...,"[[1, 1895], [2, 1894], [3, 1893], [4, 1892], [..."
NORAD2_23C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[0, 653], [1, 652], [2, 651], [3, 650], [4, 6..."
NORAD2_37C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[9, 39], [10, 38], [11, 37], [12, 36], [13, 3..."
NORAD2_55C,CCACCUUUGUGAACAGUAUAGUAAUGUCUAUACUUGUUCAAUAGUU...,"[[3, 51], [4, 50], [5, 49], [6, 48], [7, 47], ..."


## Convert to rouskinHF

In [30]:
# Load the environment file located one directory above the working directory
# before rouskinhf is imported.
# NOTE(review): presumably rouskinhf reads its configuration/credentials from
# these environment variables — confirm.
import envbash
envbash.load.load_envbash('../.env')
# NOTE(review): get_dataset is not used in the cells visible here.
from rouskinhf import convert, upload_dataset, get_dataset

## RouskinHF filtering

In [45]:
from rouskinhf import convert, dump_json

# Write the frame as one JSON entry per index value (the reference), each
# holding that row's columns.
dump_json(ground_truth_lcRNA.to_dict(orient='index'), 'data/lncRNA.json')

# Convert the JSON file with rouskinhf, filtering enabled; the summary printed
# by the call reports what was kept, modified and filtered out.
data = convert('json', 'data/lncRNA.json', name='lncRNA', path_out='data', filter=True)

Parsing json file: 100%|██████████| 15/15 [00:00<00:00, 18337.09it/s]

Over a total of 15 datapoints, there are:
### OUTPUT
- ALL: 15 valid datapoints
- INCLUDED: 6 duplicate sequences with different structure / dms / shape
### MODIFIED
- 0 multiple sequences with the same reference (renamed reference)
### FILTERED OUT
- 0 invalid datapoints (ex: sequence with non-regular characters)
- 0 datapoints with bad structures
- 0 duplicate sequences with the same structure / dms / shape





## Upload to Hugging Face

In [47]:
from rouskinhf import upload_dataset

# Push the converted dataset file with rouskinhf.
# NOTE(review): exist_ok=True presumably allows updating an already existing
# remote dataset — confirm against the rouskinhf documentation.
upload_dataset('data/lncRNA/data.json', commit_message='removed super long sequences', exist_ok=True)