In [1]:
import pandas as pd
import numpy as np
import torch

import os

In [2]:
def ListofPairs2pairMatrix(pairs, length):
    """
    Convert a list of base pairs into a symmetric binary pairing matrix.

    :param pairs: Sequence of index pairs (i, j); may be empty
    :param length: Size L of the square output matrix
    :return: int32 tensor of shape (L, L) with 1 at [i, j] and [j, i] for every pair
    """
    pairs = np.array(pairs)
    # Allocate directly as int32 so the empty and non-empty paths return the same dtype.
    matrix = torch.zeros((length, length), dtype=torch.int32)

    # `size` (rather than `len`) also catches empty 2-D inputs such as [[]].
    if pairs.size == 0:
        return matrix

    # Fill both triangles so the matrix is symmetric.
    matrix[pairs[:, 0], pairs[:, 1]] = 1
    matrix[pairs[:, 1], pairs[:, 0]] = 1

    return matrix


def compute_f1(pred_matrix, target_matrix, threshold=0.5):
    """
    Compute precision, recall and F1 score of a predicted pairing matrix.

    :param pred_matrix: Predicted pairing matrix probability  (L,L)
    :param target_matrix: True binary pairing matrix (L,L)
    :param threshold: Entries of pred_matrix strictly above this value count as predicted pairs
    :return: [precision, recall, F1] as Python floats for this RNA structure.
             If both matrices are empty, all three are 1.0. If only one of them is
             empty, the undefined ratio (precision or recall) is reported as 0.0
             instead of NaN, and F1 is 0.0.
    """

    pred_binary = (pred_matrix > threshold).float()

    TP = torch.sum(pred_binary * target_matrix)  # pairs present in both matrices
    PP = torch.sum(pred_binary)                  # predicted pairs
    P = torch.sum(target_matrix)                 # true pairs
    sum_pair = PP + P

    # Nothing predicted and nothing to predict: count as a perfect match.
    if sum_pair == 0:
        return [1.0, 1.0, 1.0]

    # Guard each ratio separately: a 0/0 division would otherwise yield NaN.
    precision = (TP / PP).item() if PP > 0 else 0.0
    recall = (TP / P).item() if P > 0 else 0.0
    f1 = (2 * TP / sum_pair).item()

    return [precision, recall, f1]

In [81]:
# Load the benchmark predictions and the eFold predictions, bring both to the same
# "array of pairs" representation and stack them into a single table.

def _pairs_to_array(pairs):
    """Stack a non-empty collection of pairs into one array; empty input gives an empty array."""
    if len(pairs) > 0:
        return np.stack(pairs)
    return np.array([])


def _matrix_to_pairs(rows):
    """Turn a stacked matrix into its unique, per-pair sorted list of non-zero positions."""
    # presumably `rows` are the rows of an (L, L) pairing matrix -- verify against the feather file
    positions = np.vstack(np.where(np.stack(rows))).T
    return np.unique(np.sort(positions, axis=1), axis=0)


# Predictions from the benchmarked algorithms (stored metrics are dropped)
algos_table = pd.read_feather('../Figure1/saved_data_plot/results_benchmark_algos.feather').drop(
    columns=['Precision', 'Recall', 'F1', 'MCC', 'length']
)
prediction_algos = algos_table.assign(structure=algos_table['structure'].apply(_pairs_to_array))
ref2dataset = prediction_algos[['reference', 'dataset']].drop_duplicates()

# Predictions from eFold, converted to the same format
efold_table = pd.read_feather('../Figure5/saved_data_plot/results_V1/test_results_PT+FT.feather')
ref2sequence = efold_table[['reference', 'sequence']].drop_duplicates()
prediction_eFold = (
    efold_table
    .assign(structure=efold_table['structure'].apply(_matrix_to_pairs))
    .merge(ref2dataset, on=['reference'])
    .assign(model='eFold')
)

# Attach the sequences to the benchmark table, then append the eFold rows
prediction_algos = pd.concat(
    [prediction_algos.merge(ref2sequence, on=['reference']), prediction_eFold]
)

# Human-readable dataset names
for raw_name, display_name in (('viral_fragments', 'Viral mRNA'),
                               ('lncRNA_nonFiltered', 'Long ncRNA')):
    prediction_algos.loc[prediction_algos['dataset'] == raw_name, 'dataset'] = display_name


In [82]:
# Pairwise similarity between models: for every unordered pair of models, match their
# predictions on the same (reference, sequence, dataset) and score one prediction
# against the other with F1.
# NOTE(review): the recorded output shows more rows for "UFold SPOT-RNA" than for any
# other pair, which would be consistent with duplicated merge keys -- TODO confirm.

model_names = prediction_algos['model'].unique()
merge_keys = ['reference', 'sequence', 'dataset']

# Split the table once per model instead of re-filtering it for every pair.
predictions_by_model = {
    name: prediction_algos[prediction_algos['model'] == name] for name in model_names
}

# Collect the per-pair tables and concatenate once at the end; growing a DataFrame
# with pd.concat inside the loop copies all previous rows on every iteration.
comparisons = []

for i, model1 in enumerate(model_names):
    for model2 in model_names[i+1:]:

        data_comparison = predictions_by_model[model1].merge(
            predictions_by_model[model2], on=merge_keys, suffixes=('_1', '_2')
        )
        print(model1, model2, data_comparison.shape)

        # F1 is the third element returned by compute_f1.
        f1_scores = [
            compute_f1(ListofPairs2pairMatrix(structure_1, len(sequence)),
                       ListofPairs2pairMatrix(structure_2, len(sequence)))[2]
            for structure_1, structure_2, sequence in zip(data_comparison['structure_1'],
                                                          data_comparison['structure_2'],
                                                          data_comparison['sequence'])
        ]
        # Explicit index and dtype keep the column well-formed even for an empty merge.
        data_comparison['F1'] = pd.Series(f1_scores, index=data_comparison.index, dtype=float)

        data_comparison = data_comparison.drop(columns=['structure_1', 'structure_2'])
        comparisons.append(data_comparison)

# pd.concat raises on an empty list, so fall back to an empty frame when there is no pair.
full_data = pd.concat(comparisons) if comparisons else pd.DataFrame()
        

RNAstructure MXFold2 (3907, 7)
RNAstructure EternaFold (3907, 7)
RNAstructure UFold (3896, 7)
RNAstructure SPOT-RNA (3907, 7)
RNAstructure RNAformer (3907, 7)
RNAstructure eFold (3907, 7)
MXFold2 EternaFold (3907, 7)
MXFold2 UFold (3896, 7)
MXFold2 SPOT-RNA (3907, 7)
MXFold2 RNAformer (3907, 7)
MXFold2 eFold (3907, 7)
EternaFold UFold (3896, 7)
EternaFold SPOT-RNA (3907, 7)
EternaFold RNAformer (3907, 7)
EternaFold eFold (3907, 7)
UFold SPOT-RNA (3970, 7)
UFold RNAformer (3896, 7)
UFold eFold (3896, 7)
SPOT-RNA RNAformer (3907, 7)
SPOT-RNA eFold (3907, 7)
RNAformer eFold (3907, 7)


In [86]:
# Plot one heatmap per dataset showing the mean F1 score between each pair of models,
# using plotly, with the (rounded) value written in each cell.

from plotly.subplots import make_subplots

import plotly.graph_objects as go
models = prediction_algos.model.unique()
datasets = list(full_data.dataset.unique())

# Size the subplot grid from the number of datasets instead of assuming at most four.
# At least two rows are kept so the layout is unchanged for up to four datasets.
n_cols = 2
n_rows = max(2, -(-len(datasets) // n_cols))  # ceiling division

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=datasets)

for k, dataset in enumerate(datasets):
    data = full_data[full_data['dataset']==dataset]

    f1_matrix = np.zeros((len(models), len(models)))

    # Cells for which full_data holds no (model_1, model_2) rows get NaN,
    # because the mean of an empty selection is NaN.
    for i, model1 in enumerate(models):
        for j, model2 in enumerate(models):
            f1_matrix[i,j] = data[(data['model_1']==model1) & (data['model_2']==model2)]['F1'].mean()

    # Fill the grid row by row: k -> (row, col), both 1-based.
    fig.add_trace(go.Heatmap(z=f1_matrix, x=models, y=models, 
                             text=f1_matrix.round(2).astype(str), texttemplate="%{text}",
                             showscale=False), row=(k//n_cols)+1, col=(k%n_cols)+1)


# 600 px per row gives the original 1200 px height for the two-row layout.
fig.update_layout(height=600*n_rows, width=1200, 
                  template='plotly_white', font_size=14, font_color='black', font_family='times new roman',)
fig.show()

In [89]:
# Export the figure as PDF. The output folder is created first, because writing the
# file fails when the directory does not exist yet.
output_path = "images/S4/prediction_similarity.pdf"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.write_image(output_path)