## Benchmark different algos on different test sets, for structure prediction

One violin plot with the four test sets (PDB, archiveII_blast, lncRNA, viral_fragments) for each algorithm (RNAstructure, EternaFold, MXFold2, UFold).

Group the violins by algorithm and color them by test set.

**Assigned to**: Alberic

Use Plotly with a white background.

In [1]:
from pathlib import Path

import pandas as pd

# Load the benchmark results and index them by their 'reference' column.
results = pd.read_feather('results_benchmark_algos.feather').set_index('reference')

# Swap the raw dataset identifiers for the labels shown in the summary table
# and in the figure legend.
for raw_name, display_name in (
    ('viral_fragments', 'viral mRNA'),
    ('lncRNA', 'long ncRNA'),
    ('archiveII_blast', 'archiveII'),
):
    results.loc[results['dataset'] == raw_name, 'dataset'] = display_name

In [2]:
# Summary statistics of the F1 score for every (model, dataset) pair.
# F1 is selected before describe() so statistics are not computed for the
# other columns only to be discarded afterwards.
results.groupby(['model', 'dataset'])[['F1']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,F1,F1,F1,F1,F1,F1,F1,F1
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
model,dataset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
EternaFold,PDB,356.0,0.880343,0.220558,0.0,0.879512,0.976744,1.0,1.0
EternaFold,archiveII,355.0,0.599729,0.252134,0.0,0.417617,0.656151,0.807764,0.986301
EternaFold,long ncRNA,15.0,0.447952,0.097185,0.304904,0.387898,0.406389,0.52477,0.669173
EternaFold,viral mRNA,58.0,0.730161,0.168405,0.334076,0.645014,0.745455,0.866734,0.983607
MXFold2,PDB,356.0,0.903932,0.193964,0.0,0.9,1.0,1.0,1.0
MXFold2,archiveII,355.0,0.713831,0.220128,0.0,0.625933,0.784091,0.87678,0.985507
MXFold2,long ncRNA,15.0,0.394327,0.177139,0.055556,0.343794,0.390745,0.441807,0.664122
MXFold2,viral mRNA,58.0,0.692629,0.212328,0.102128,0.564697,0.695934,0.86761,1.0
RNAstructure,PDB,356.0,0.889996,0.216522,0.0,0.896372,1.0,1.0,1.0
RNAstructure,archiveII,355.0,0.554425,0.23953,0.0,0.352941,0.583333,0.753623,1.0


In [3]:
# Create a grouped violin plot with plotly: models on the x axis, one violin
# per dataset within each model, colored by dataset.
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Model whose row count is reported as "N" in each legend entry.
# NOTE(review): assumes every dataset was evaluated with this model — confirm.
LEGEND_COUNT_MODEL = 'RNAstructure'
# Left-to-right order of the models on the x axis.
MODEL_ORDER = ['RNAstructure', 'EternaFold', 'MXFold2', 'UFold']

fig = make_subplots(rows=1, cols=1, shared_xaxes=True)

colors = px.colors.qualitative.Set2
for i, dataset in enumerate(results['dataset'].unique()):
    results_dataset = results[results['dataset'] == dataset]
    # Number of rows of this dataset scored with the reference model.
    n_sequences = int((results_dataset['model'] == LEGEND_COUNT_MODEL).sum())
    fig.add_trace(go.Violin(
        x=results_dataset['model'], y=results_dataset['F1'],
        name=f'{dataset} (N={n_sequences})',
        # Cycle through the palette so that having more datasets than
        # palette entries cannot raise an IndexError.
        marker_color=colors[i % len(colors)],
        meanline_visible=True, points=False))

fig.update_layout(
                    # title='F1 score distribution for each model and dataset',
                  yaxis_title='F1 score', xaxis_title='Model',
                  # 'group' places the violins of one model side by side.
                  violinmode='group', yaxis_range=[0, 1],
                  width=1000, height=370,
                  template='plotly_white', font_size=15, font_color='black',)
fig.update_xaxes(categoryorder='array', categoryarray=MODEL_ORDER)
fig.show()

In [5]:
# Save the figure as PDF and EPS.
# Create the output folder first so the export does not fail on a fresh
# checkout where images/ is missing.
Path("images").mkdir(parents=True, exist_ok=True)
fig.write_image("images/a_algo_benchmark.pdf")
fig.write_image("images/a_algo_benchmark.eps")