In [1]:
import pandas as pd

In [2]:
from sklearn.metrics import roc_auc_score
import numpy as np
def auroc_dms(dms: list, structure: str):
    """Measure how well DMS values separate unpaired from paired positions.

    Every position is labelled 1 when ``structure`` has ``'.'`` there and 0
    otherwise. Those labels, together with the DMS values, are handed to
    ``roc_auc_score`` after positions carrying the unknown sentinel have been
    dropped.

    Args:
        dms: Per-position DMS values. Positions equal to ``-1000`` are
            treated as unknown and excluded from the score.
        structure: Structure string of the same length as ``dms``; ``'.'``
            marks an unpaired position, any other character a paired one.

    Returns:
        The ROC AUC, or ``np.nan`` when fewer than two label classes remain
        after masking (empty input, every position unknown, all paired, or
        all unpaired) because the AUC is undefined in those cases.

    Raises:
        AssertionError: If ``dms`` and ``structure`` differ in length.
    """
    UKN = -1000  # sentinel marking positions without a usable DMS value

    dms = np.array(dms)
    assert len(structure) == len(dms), 'Sequence, dms and structure must have the same length'

    is_unpaired = np.array([symbol == '.' for symbol in structure], dtype=int)
    mask = (dms != UKN)

    # The AUC needs both classes among the kept positions. Checking for
    # "fewer than two" (rather than "exactly one") also covers the case where
    # nothing is left after masking, which previously fell through and made
    # roc_auc_score raise on empty input.
    if np.unique(is_unpaired[mask]).size < 2:
        return np.nan

    return roc_auc_score(is_unpaired[mask], dms[mask])

def basepairs2dot(basepairs: list, length: int):
    """Render a list of base pairs as a bracket string of ``length`` characters.

    Args:
        basepairs: Iterable of pairs; element 0 of each pair is the index that
            receives ``'('`` and element 1 the index that receives ``')'``.
        length: Number of characters in the returned string.

    Returns:
        A string in which every index not named by a pair is ``'.'``.
    """
    symbols = list('.' * length)
    for opening, closing in ((bp[0], bp[1]) for bp in basepairs):
        symbols[opening] = '('
        symbols[closing] = ')'
    return ''.join(symbols)

In [3]:
def _load_with_auroc(json_path: str) -> pd.DataFrame:
    """Read one dataset from JSON and add a per-row ``auroc_dms`` column.

    The frame is transposed right after loading (presumably the JSON is keyed
    by record, so the transpose puts one record per row; confirm against the
    files). For each row, the ``structure`` base pairs are turned into a
    bracket string as long as ``sequence`` and scored against ``dms``.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The transposed frame with the extra ``auroc_dms`` column.
    """
    frame = pd.read_json(json_path).T
    frame['auroc_dms'] = frame.apply(
        lambda row: auroc_dms(
            row['dms'],
            basepairs2dot(row['structure'], len(row['sequence'])),
        ),
        axis=1,
    )
    return frame


# Both datasets go through the same helper so the two pipelines cannot drift apart.
data_mRNA = _load_with_auroc('saved_data_plot/full_human_mRNA.json')

data_primiRNA = _load_with_auroc('saved_data_plot/full_primiRNA.json')

In [4]:
# Horizontal violin plots of the per-row AUROC, one violin per dataset.

import plotly.graph_objects as go

fig = go.Figure()

# (values, legend name, line colour) for each violin. Traces are added in this
# order, which matches the order of the original two add_trace calls.
violin_specs = [
    (data_primiRNA['auroc_dms'], 'pri-miRNA', '#EF553B'),
    (data_mRNA['auroc_dms'], 'mRNA', '#00CC96'),
]

for auroc_values, trace_name, line_colour in violin_specs:
    fig.add_trace(go.Violin(
        x=auroc_values,
        name=trace_name,
        line_color=line_colour,
        box_visible=False,
        points=False,
        meanline_visible=True
    ))

fig.update_layout(
    xaxis_range=[0, 1],
    template='plotly_white', font_color='black',
    # Was 'time new romans', which is not a real font family, so the renderer
    # fell back to its default font. The comma-separated list gives fallbacks
    # in case Times New Roman is not installed.
    font_family='Times New Roman, Times, serif',
    font_size=20,
    xaxis_title='AUROC between DMS and structure model',
    width=1000,
    height=450
)

fig.show()


In [6]:
# Export the figure as a PDF. The target directory is created first so the
# export does not fail on a fresh checkout where images/ does not exist yet.
from pathlib import Path

AUROC_FIGURE_PATH = "images/e_auroc_distribution.pdf"
Path(AUROC_FIGURE_PATH).parent.mkdir(parents=True, exist_ok=True)
fig.write_image(AUROC_FIGURE_PATH)