In [1]:
import RNA
import pandas as pd
import numpy as np
from rouskinhf import get_dataset

from rnastructure_wrapper import RNAstructure

In [2]:
def pairList2dotbracket(pair_list, length):
    """
    Convert a list of base pairs into a dot-bracket string.

    :param pair_list: Iterable of (i, j) index pairs, used directly as 0-based string positions
    :param length: Length of the returned string
    :return: String of `length` characters with '(' on the lower index of each pair,
             ')' on the higher index and '.' everywhere else

    Only one bracket type is written, so crossing pairs are not distinguishable
    in the output.
    """
    dotbracket = ['.'] * length
    for i, j in pair_list:
        # A pair may be listed in either orientation; the opening bracket must
        # always land on the smaller index or the string is not well formed.
        opening, closing = (i, j) if i <= j else (j, i)
        dotbracket[opening] = '('
        dotbracket[closing] = ')'
    return ''.join(dotbracket)

def dotbracket2pairList(dotbracket):
    """
    Parse a dot-bracket string into a list of (open, close) index pairs.

    :param dotbracket: String in which '(' and ')' mark paired positions
    :return: List of (i, j) tuples with 0-based positions, in the order in
             which the closing brackets are encountered

    Any character other than '(' or ')' is ignored. A ')' without a pending
    '(' raises IndexError (pop from an empty list); a '(' that is never closed
    is dropped silently.
    """
    open_positions = []
    pairs = []
    for position, symbol in enumerate(dotbracket):
        if symbol == ')':
            # Match with the most recently opened, still unmatched bracket
            pairs.append((open_positions.pop(), position))
        elif symbol == '(':
            open_positions.append(position)
    return pairs


def compute_f1(pred_matrix, target_matrix, threshold=0.5):
    """
    F1 score between a predicted and a reference pairing matrix.

    :param pred_matrix: Predicted pairing matrix probability  (L,L)
    :param target_matrix: True binary pairing matrix (L,L)
    :param threshold: Entries of pred_matrix strictly greater than this value
                      are counted as predicted pairs
    :return: F1 score for this RNA structure; 1.0 when the thresholded
             prediction and the target sum to zero
    """
    binarized = pred_matrix > threshold
    total = np.sum(binarized) + np.sum(target_matrix)

    # Nothing predicted and nothing expected: count it as a perfect match
    if total == 0:
        return 1.0

    overlap = np.sum(binarized * target_matrix)
    return 2 * overlap / total
    
def pairList2pairMatrix(pair_list, len_seq):
    """
    Build a symmetric pairing matrix from a list of base pairs.

    :param pair_list: Sequence of (i, j) index pairs; values are cast to int
    :param len_seq: Number of rows and columns of the returned matrix
    :return: (len_seq, len_seq) float array with 1.0 at [i, j] and [j, i]
             for every pair and 0.0 elsewhere
    """
    pairs = np.array(pair_list).astype(int)
    matrix = np.zeros((len_seq, len_seq))

    # No pairs: the all-zero matrix is the answer
    if len(pairs) == 0:
        return matrix

    first, second = pairs[:, 0], pairs[:, 1]
    matrix[first, second] = 1.0
    matrix[second, first] = 1.0  # mirror so the matrix is symmetric
    return matrix

from sklearn.metrics import roc_auc_score
import numpy as np
def auroc_dms(dms: list, structure: str):
    """
    AUROC of the DMS signal as a predictor of unpaired bases.

    :param dms: Per-base DMS values; entries equal to -1000 (UKN) or NaN are
                treated as missing and excluded
    :param structure: Dot-bracket string of the same length; '.' marks an
                      unpaired base, any other character a paired one
    :return: ROC AUC computed over the non-missing positions, or NaN when those
             positions do not contain both paired and unpaired bases
             (including the case where no position is left at all)
    :raises AssertionError: if dms and structure have different lengths
    """

    UKN = -1000

    dms = np.asarray(dms, dtype=float)
    assert len(structure) == len(dms), 'Sequence, dms and structure must have the same length'

    isNotPaired = np.array([1 if c == '.' else 0 for c in structure])
    # NaN compares unequal to UKN, so it has to be excluded explicitly
    mask = (dms != UKN) & ~np.isnan(dms)

    # AUROC needs both classes among the usable bases. `< 2` also covers the
    # fully masked case (zero classes), which would otherwise make
    # roc_auc_score raise.
    if len(np.unique(isNotPaired[mask])) < 2:
        return np.nan

    return roc_auc_score(isNotPaired[mask], dms[mask])

In [15]:
# Fetch the pri-miRNA dataset (force_download=True re-downloads on every run)
# and transpose it into a DataFrame.
data = pd.DataFrame(get_dataset('pri_miRNA', force_download=True)).T

# Per-row AUROC of the DMS signal against the dataset's own structure,
# converted to dot-bracket at the length of the DMS vector.
data['auroc'] = data.apply(
    lambda row: auroc_dms(
        row['dms'],
        pairList2dotbracket(row['structure'], len(row['dms'])),
    ),
    axis=1,
)

pri_miRNA: Downloading dataset from HuggingFace Hub...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

pri_miRNA: Download complete. File saved at data/pri_miRNA/data.json


In [16]:
from tqdm import tqdm
rnastruct_fold = RNAstructure(path='~/RNAstructure/exe/')


# One F1 score per dataset row for each of the three comparisons below
F1_constraints = []
F1_noConstraints = []
F1_ShapeKnots = []

for i, row in tqdm(data.iterrows(), total=len(data)):

    len_seq = len(row['sequence'])
    # Pairing matrix of the dataset's structure, shared by the two comparisons
    # that score a prediction against it
    reference_matrix = pairList2pairMatrix(row['structure'], len_seq)

    # ViennaRNA (with DMS soft constraints) vs dataset structure
    vienna_rna = RNA.fold_compound(row['sequence'])
    # ViennaRNA reads the reactivity vector as 1-based: element 0 is ignored and
    # element k is applied to nucleotide k. Prepend a dummy entry so that dms[0]
    # lands on the first base instead of being skipped (and so that the library
    # does not read one element past the end of the list).
    vienna_rna.sc_add_SHAPE_deigan([-999.0] + list(row['dms']), 2.11, 0)
    vienna_struct = dotbracket2pairList(vienna_rna.mfe()[0])

    F1_constraints.append(compute_f1(pairList2pairMatrix(vienna_struct, len_seq),
                                     reference_matrix))

    # RNAstructure Fold vs ViennaRNA (both without constraints)
    # The -1 shifts the returned base-pair indices down by one before they are
    # used as 0-based matrix indices.
    fold_struct = np.array(rnastruct_fold.fold(row['sequence'], output_format='basepairs', mfe_only=True)['basepairs'])-1

    vienna_rna = RNA.fold_compound(row['sequence'])
    vienna_struct = dotbracket2pairList(vienna_rna.mfe()[0])

    F1_noConstraints.append(compute_f1(pairList2pairMatrix(fold_struct, len_seq),
                                       pairList2pairMatrix(vienna_struct, len_seq)))

    # ShapeKnots (with DMS, pseudoknots=True) vs dataset structure
    shapeknots_struct = np.array(rnastruct_fold.fold(row['sequence'], dms=row['dms'],
                                                     output_format='basepairs', mfe_only=True, pseudoknots=True)['basepairs'])-1

    F1_ShapeKnots.append(compute_f1(pairList2pairMatrix(shapeknots_struct, len_seq),
                                    reference_matrix))


data['F1_constraints'] = F1_constraints
data['F1_noConstraints'] = F1_noConstraints
data['F1_ShapeKnots'] = F1_ShapeKnots
    

100%|██████████| 1098/1098 [12:08:21<00:00, 39.80s/it]  


In [17]:
## F1 score vs AUROC for the three comparisons, one subplot each.
## NOTE: the ViennaRNA panels are suspected to reflect an issue with the
## ViennaRNA side of the pipeline (unconfirmed).

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# (data column, legend label, colour of the mean line), in subplot order
panels = [
    ('F1_constraints', 'With DMS', 'blue'),
    ('F1_noConstraints', 'Without DMS', 'red'),
    ('F1_ShapeKnots', 'ShapeKnots', 'green'),
]

fig = make_subplots(rows=1, cols=len(panels))
for col, (column, label, mean_color) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=data['auroc'], y=data[column], name=label, opacity=0.5, mode='markers'), col=col, row=1)
    # Dashed horizontal line at the mean F1 of this panel
    fig.add_hline(y=np.mean(data[column]), line_dash='dash', line_color=mean_color, annotation_text='mean', col=col, row=1)

# `xaxis_title` in update_layout only labels the first subplot; label every x-axis
fig.update_xaxes(title_text='AUROC')

# Kept as the last expression so the figure is rendered as the cell output
fig.update_layout(title='ViennaRNA vs RNAstructure Fold on the pri-miRNA dataset', yaxis_title='F1 score',
                  width=1200, height=600)

In [20]:
# ShapeKnots is treated as the reliable predictor: report its mean F1 and the
# share of rows whose F1 is exactly 1
shapeknots_f1 = data['F1_ShapeKnots']
mean_f1 = np.mean(shapeknots_f1)
perfect_fraction = (shapeknots_f1 == 1).sum() / len(shapeknots_f1)
print(f"Average F1 score: {mean_f1:.2f}, fraction equal to 1: {perfect_fraction:.2f}")

Average F1 score: 0.98, fraction equal to 1: 0.72
