In [1]:
import os
from os.path import join
from os import listdir
import pandas as pd
import numpy as np
import tqdm
import pandas as pd
import rouskinhf

## Plot showing the quality of our datasets

Compare replicates:
- R2 score between normalized DMS signals (first row of subplot)
- F1 score between replicate + RNAstructure (second row)

Then bootstrap from the DMS signal by adding noise proportional to the confidence interval, and predict the structure from the noisy DMS signal. Compute the F1 score between the structures (third row of the subplot).

Use pri_miRNA and human_mRNA (previously called UTR). Combine them in one plot, or separate them into two columns if they are really different.

**Assigned to**: Yves

Use Plotly, with a white background.

## Build dataset (no need to re-run)

In [2]:
# Build the replicate table used by the quality plots.
#
# Reference names of the two datasets as published on Hugging Face. Within this
# cell `refs` is only consumed by the disabled filter further down.
refs = list(rouskinhf.get_dataset('human_mRNA').keys()) + list(rouskinhf.get_dataset('pri_miRNA').keys())

# Load the per-replicate tables from local feather files, drop their stored
# 'index' column and tag every row with the dataset it belongs to.
pri = pd.read_feather('saved_data_plot/pri_miRNA_normalized.feather').drop(columns=['index'])
pri['dataset'] = 'pri-miRNA'
mrna = pd.read_feather('saved_data_plot/mRNA_normalized.feather').drop(columns=['index'])
mrna['dataset'] = 'mRNA'
# NOTE: concat is called without ignore_index, so each frame keeps its own row
# labels and the same label can appear once per dataset.
df = pd.concat([pri, mrna])

# Disabled filter: would drop 'Untreated' rows and keep only references present on HF.
# df = df[df['replicate'] != 'Untreated']
# df = df[df['reference'].isin(refs)].reset_index(drop=True)

# For every row, store how many rows share its (reference, dataset, plate) key.
df['number_of_replicates'] = df.groupby(['reference', 'dataset', 'plate'])['reference'].transform('count')

# Per dataset: {number_of_replicates value: number of rows carrying that value}.
ref_per_sample_per_dataset = {}
for dataset in df['dataset'].unique():
    loc_df = df[df['dataset']==dataset]
    # Keys of the counted Series are unwrapped with k[0] (presumably 1-tuples,
    # because value_counts receives a list of columns).
    ref_per_sample_per_dataset[dataset] = {k[0]: v for k, v in dict(loc_df.value_counts(['number_of_replicates']).sort_index()).items()} 

# Keep only rows whose (reference, dataset, plate) key occurs exactly twice:
# keys with fewer OR more than two rows are dropped.
# The printed numbers are row counts of `df`, not counts of distinct references.
l = len(df)
df = df[df['number_of_replicates'] == 2]
print("drop {}/{} references".format(l - len(df), l))


drop 1649/5107 references


In [3]:
# Display the filtered replicate table (the rendered preview elides the middle rows).
df

Unnamed: 0,sample,reference,plate,replicate,sequence,sub_rate,coverage,n_reads,dataset,number_of_replicates
0,1-5_1_B_S2_L001,hsa-mir-6803,0,B,CAGAGATTCAACCGTCCTGCCTGCTCACCTGTGTGGAGCGGCCTCA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",35767.00,pri-miRNA,2
1,1-5_1_B_S2_L001,hsa-mir-4757,0,B,CAGAGATTCAACCGTCCTGGTGACGTCACGAAGTTACGTAATGGAA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",46551.50,pri-miRNA,2
2,1-5_1_B_S2_L001,hsa-mir-6804,0,B,CAGAGATTCAACCGTCCTGAGGCCCCTCCCCGTGGGGCAGGTGGCA...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",13801.75,pri-miRNA,2
3,1-5_1_B_S2_L001,hsa-mir-649,0,B,CAGAGATTCAACCGTCCTGCTCACCTCAGCCTCCCAAAGTGATGGG...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",76881.25,pri-miRNA,2
4,1-5_1_B_S2_L001,hsa-mir-4750,0,B,CAGAGATTCAACCGTCCTGACTTCTGAGCCCCCAGAGGGCCAGCCC...,"[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",23549.25,pri-miRNA,2
...,...,...,...,...,...,...,...,...,...,...
2914,Un_5--1-5_12_B_S1_L001,ENSG00000129862.7,12,B,GGGGCAGCCTGGAGTTAGTCGACCGTTGCGAGACGTTGAGCTGCGG...,"[-1000.0, -1000.0, -1000.0, -1000.0, 0.4074819...","[-1000.0, -1000.0, -1000.0, -1000.0, 6932.675,...",12622.25,mRNA,2
2917,Un_5--1-5_12_B_S1_L001,ENSG00000129824.16,12,B,GGGCTCTTCCGTCGCAGAGTTTCGCCATGGCCCGGGGCCCCAAGAA...,"[-1000.0, -1000.0, -1000.0, 0.1979055170023161...","[-1000.0, -1000.0, -1000.0, 13385.9, -1000.0, ...",21021.75,mRNA,2
2918,Un_5--1-5_12_B_S1_L001,ENSG00000136819.15,12,B,GGGAGTGCTGTGGCCTTGCTGGTGGGAGAGAAGGTACAAGAGGAGA...,"[-1000.0, -1000.0, -1000.0, 0.3659597865170987...","[-1000.0, -1000.0, -1000.0, 13904.875, -1000.0...",24730.25,mRNA,2
2923,Un_5--1-5_12_B_S1_L001,ENSG00000137947.12,12,B,GGGCTGCTTTCGGTGTGTCTGTTGTGTCTTGTTGCGGGCACCGCAG...,"[-1000.0, -1000.0, -1000.0, 0.1865048385746620...","[-1000.0, -1000.0, -1000.0, 22774.225, -1000.0...",30728.00,mRNA,2


## Compute scores

In [5]:
import plotly.graph_objects as go

# Make "plotly_white" the default template for every figure created afterwards,
# and set that template's font stack to Helvetica Neue with Helvetica / Arial /
# generic sans-serif as fallbacks.
import plotly.io as pio
pio.templates.default = "plotly_white"
pio.templates["plotly_white"].layout.font.family = "Helvetica Neue, Helvetica, Arial, sans-serif"

def compute_score_between_replicates(df, score):
    """Score replicate A against replicate B for every (reference, plate) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'reference', 'plate', 'replicate' and
        'dataset'.
    score : callable
        Called as ``score(first, second)`` with the two sub-frames of a group
        (the rows whose 'replicate' is 'A', respectively 'B'). It must expose
        a ``name`` attribute, which becomes the name of the score column.

    Returns
    -------
    pandas.DataFrame
        Two rows per group (replicate 'A' first, then 'B') with the columns
        'reference', 'plate', 'replicate', ``score.name`` and 'dataset'.
    """
    grouped = df.groupby(['reference', 'plate'])
    scores = []
    # Use the real number of groups as the progress-bar total: a reference that
    # appears on several plates produces several groups, so counting unique
    # references would under-report the amount of work.
    for (ref, plate), group in tqdm.tqdm(grouped, total=grouped.ngroups):
        replicates = {label: group[group['replicate'] == label] for label in ('A', 'B')}
        # Taken from the first row of the group.
        dataset = group['dataset'].values[0]
        # Score in both directions so that every replicate gets its own row.
        for label, other in (('A', 'B'), ('B', 'A')):
            scores.append({
                'reference': ref,
                'plate': plate,
                'replicate': label,
                score.name: score(replicates[label], replicates[other]),
                'dataset': dataset,
            })

    return pd.DataFrame(scores)


import plotly
colors = plotly.colors.qualitative.Plotly

def violin_plot(df, scores, score, min_X):
    """Draw one violin per dataset for the distribution of a replicate score.

    Parameters
    ----------
    df
        Not used by the function body; kept so existing calls keep working.
    scores : pandas.DataFrame
        Needs a 'dataset' column and a column named ``score.name``.
    score
        Object exposing ``name``; used to pick the score column and to build
        the title and the x-axis label.
    min_X
        Lower bound of the x axis (the upper bound is fixed at 1).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure holding one violin trace per dataset; score values are
        placed on the x axis.
    """
    def framed_axis():
        # Light-grey axis line mirrored on the opposite side, grid hidden.
        return dict(
            showline=True,
            linewidth=2,
            linecolor='lightgrey',
            mirror=True,
            showgrid=False,
            gridcolor='white',
            gridwidth=2,
        )

    fig = go.Figure()
    for name, dataset in scores.groupby('dataset'):
        # 'mRNA' uses palette entry 2, every other dataset uses entry 1.
        palette_index = 2 if name == 'mRNA' else 1
        trace = go.Violin(
            x=dataset[score.name],
            name=name,
            meanline_visible=True,
            marker=dict(color=colors[palette_index]),
            points=False,  # do not draw individual points / outliers
        )
        fig.add_trace(trace)

    layout_options = dict(
        title='{} score distribution between replicates'.format(score.name),
        xaxis_title=score.name + ' score',
        bargap=0.2,
        bargroupgap=0.1,
        xaxis_range=[min_X, 1],
        showlegend=False,
        width=800,
        paper_bgcolor='white',  # background of the whole figure
        plot_bgcolor='white',  # background of the plotting area
        margin=dict(l=50, r=50, t=50, b=50),
        xaxis=framed_axis(),
        yaxis=framed_axis(),
        font=dict(size=18),
    )
    fig.update_layout(**layout_options)

    # TODO: annotate the median value of each dataset on the plot.
    return fig
    
def normalize(x, mask=None):
    """Scale the valid entries of a signal by their 90th percentile, clipped to [0, 1].

    Parameters
    ----------
    x : array-like of numbers
        Input signal; ``-1000.`` marks entries that carry no value. The input
        is copied and never modified.
    mask : boolean array-like, optional
        Selects the entries to normalise. Defaults to ``x != -1000.``.

    Returns
    -------
    numpy.ndarray
        Float copy of ``x`` in which the selected entries are replaced by
        ``clip(value / p90, 0, 1)``; all other entries are left untouched.
    """
    # Float copy: with an integer input the scaled values would be truncated
    # when written back into the array.
    x = np.array(x, dtype=float)
    mask = x != -1000. if mask is None else mask
    valid = x[mask]
    if valid.size == 0:
        # Nothing to normalise (and a percentile of nothing is undefined).
        return x
    # The percentile must only see valid entries; including the -1000 sentinel
    # drags the scale down (possibly below zero) and wipes out the signal.
    per90 = np.percentile(valid, 90)
    if per90 > 0:
        x[mask] = np.clip(valid / per90, 0, 1)
    else:
        # Degenerate scale: use the limit of clip(v / p, 0, 1) for p -> 0+
        # instead of dividing by zero (positive values -> 1, the rest -> 0).
        x[mask] = (valid > 0).astype(float)
    return x


def pearson_score(x, y):
    """Pearson correlation of two signals over the positions valid in both.

    Parameters
    ----------
    x, y : numpy.ndarray or pandas.DataFrame
        Either two arrays holding the signals directly, or two frames whose
        first 'sub_rate' cell holds the signal (array or list). ``-1000.``
        marks positions without a value.

    Returns
    -------
    float
        The correlation coefficient, or NaN when a frame has no row or when
        fewer than two positions are valid in both signals.
    """
    if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
        A, B = x, y
    else:
        if not len(x['sub_rate'].values) or not len(y['sub_rate'].values):
            return np.nan
        A, B = x['sub_rate'].values[0], y['sub_rate'].values[0]
    # Cells may hold plain lists; without this conversion `A != -1000.` would be
    # a single bool instead of an element-wise mask.
    A, B = np.asarray(A, dtype=float), np.asarray(B, dtype=float)
    mask = (A != -1000.) & (B != -1000.)
    if mask.sum() < 2:
        # A correlation needs at least two shared points; returning NaN here
        # avoids calling corrcoef on empty / single-element input.
        return np.nan
    return np.corrcoef(A[mask], B[mask])[0, 1]

pearson_score.name = 'pearson'


# Score every replicate pair with the Pearson correlation and draw the violins
# on an x axis running from 0 to 1; the figure is then resized to 800x300.
fig = violin_plot(df, compute_score_between_replicates(df, pearson_score), pearson_score, 0.).update_layout(width=800, height=300)#.show()
# Replace the generic x-axis label that violin_plot set.
fig.update_xaxes(
    title = 'Pearson score distribution between replicates',
)
# Blank out the figure title that violin_plot set.
fig.update_layout(title="")
fig.show()
# Export the figure as a PDF.
# NOTE(review): this import repeats the earlier `import plotly.io as pio`.
# Presumably the 'images/' directory must already exist and a static-image
# export backend (e.g. kaleido) must be installed — confirm.
import plotly.io as pio
pio.write_image(fig, 'images/c_data_quality.pdf')

  0%|          | 0/1729 [00:00<?, ?it/s]

100%|██████████| 1729/1729 [00:00<00:00, 2755.03it/s]
