In [6]:
import pandas as pd
import plotly.express as px

# Curation statistics: one row per dataset, one column per curation stage.
curation_counts = pd.read_csv('data_curation_stats.csv')
df = curation_counts.set_index('Dataset')
df



Unnamed: 0_level_0,Fetched from online repository,Reads > 500 and S/N > 1,AUROC,BLAST,Duplicates,Non-ACGU characters,Sequences below 10nt,No structure,Redundancy across databases
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
zuber,1450,0,0,0,123,20,0,0,164
archiveII,3975,0,0,0,489,0,0,4,977
RNAstralign,37149,0,0,0,6066,3949,0,3,24213
bpRNA,102318,0,0,74617,0,0,0,135,24833
Ribonanza,806573,597452,84310,16398,0,0,0,1,108412


In [13]:

# Size that the per-dataset final counts plus the 'Multi-databases' remainder add up to.
# NOTE(review): 161991 is carried over verbatim from the original cell -- confirm its provenance.
N_FINAL_SEQUENCES = 161991

# Columns whose values are used as-is; every other column is subtracted from the
# running count while walking the stages left to right.
FIRST_STAGE = 'Fetched from online repository'
LAST_STAGE = 'Redundancy across databases'

# Long-format table for plotting: one [Dataset, Stage, Count] record per dataset and stage.
records = []
for dataset, row in df.iterrows():
    for stage in row.index:
        if stage in (FIRST_STAGE, LAST_STAGE):
            val = row[stage]
        else:
            # Relies on FIRST_STAGE being the first column so that `val` is initialised.
            val = val - row[stage]
        records.append([dataset, stage, val])

df_expanded = pd.DataFrame(records, columns=['Dataset', 'Stage', 'Count'])

# Whatever is not accounted for by a single dataset at the last stage is shown as
# its own 'Multi-databases' series. The name `n_redundant_sequences` is reused by
# the annotation code further down, so it is kept.
is_last_stage = df_expanded['Stage'] == LAST_STAGE
n_redundant_sequences = N_FINAL_SEQUENCES - df_expanded.loc[is_last_stage, 'Count'].sum()

# NOTE(review): the explicit index labels 10 and 11 interleave these two rows with the
# existing ones once the frame is sorted by index -- presumably to control where the
# 'Multi-databases' trace ends up in the plot; confirm before changing.
multi_databases = pd.DataFrame(
    {
        'Dataset': 'Multi-databases',
        'Stage': ['No structure', LAST_STAGE],
        'Count': [0, n_redundant_sequences],
    },
    index=[10, 11],
)

df_expanded = pd.concat([df_expanded, multi_databases]).sort_index()


# Fill pattern per series: only Ribonanza is hatched, every other series gets no pattern.
pattern_by_dataset = dict.fromkeys(df.index)
pattern_by_dataset['Ribonanza'] = 'x'
pattern_by_dataset['Multi-databases'] = None

# Stacked area chart on a log y axis, stages ordered as the columns of `df`.
fig = px.area(
    df_expanded,
    x="Stage",
    y="Count",
    color="Dataset",
    log_y=1,
    title='',
    category_orders={'Stage': df.columns.tolist()},
    line_shape=None,
    pattern_shape='Dataset',
    pattern_shape_map=pattern_by_dataset,
)

fig.update_xaxes(title_text='Data curation stage')
fig.update_yaxes(title_text='Count of sequences')

# add annotations
# add count of sequences per dataset at the beginning and end of each line
import numpy as np

def add_annotation(fig, dataset, count, position, yshift=0, xanchor='right'):
    """Add an arrow-less text label of the form '<dataset> (N =<count>)' to ``fig``.

    Args:
        fig: object exposing ``add_annotation(**kwargs)``.
        dataset: text placed in front of the count.
        count: number shown with thousands separators; its base-10 logarithm
            is used as the y coordinate of the label.
        position: x coordinate of the label.
        yshift: forwarded unchanged to ``fig.add_annotation``.
        xanchor: forwarded unchanged to ``fig.add_annotation``.
    """
    label = dataset + ' (N ={:,})'.format(count)
    fig.add_annotation(
        x=position,
        y=np.log10(count),
        text=label,
        showarrow=False,
        yshift=yshift,
        xanchor=xanchor,
        font={'size': 12},
    )

# Hand-tuned vertical offsets per label, passed as `yshift` to each annotation.
yshift_start = {'zuber': -27, 'archiveII': -5, 'bpRNA': -5, 'RNAstralign': -20, 'Ribonanza': -20, 'Multi-databases': 0}
yshift_end = {'zuber': -10, 'archiveII': -13, 'bpRNA': 13, 'RNAstralign': -15, 'Ribonanza': 4, 'Multi-databases': -10}

first_stage = df.columns[0]
last_stage = df.columns[-1]

# Label every dataset with its count at the first stage.
for dataset, n_first in df[first_stage].items():
    add_annotation(fig, dataset, n_first, first_stage, yshift=yshift_start[dataset], xanchor='left')

# Label every dataset with its count at the last stage; bpRNA is handled
# separately below because it is displayed under the name 'bpRNA90'.
for dataset, n_last in df[last_stage].items():
    if dataset != 'bpRNA':
        add_annotation(fig, dataset, n_last, last_stage, yshift=yshift_end[dataset], xanchor='left')
add_annotation(fig, 'bpRNA90', df.loc['bpRNA', last_stage], last_stage, yshift=yshift_end['bpRNA'], xanchor='left')
add_annotation(fig, 'Multi-databases', n_redundant_sequences, last_stage, yshift=yshift_end['Multi-databases'], xanchor='left')

# Totals at both ends of the chart.
# NOTE(review): the last-stage total sums only the rows of `df`, so the
# 'Multi-databases' count is not part of it -- confirm that this is intended.
add_annotation(fig, 'Total', df[last_stage].sum(), last_stage, yshift=20, xanchor='left')
add_annotation(fig, 'Total', df[first_stage].sum(), first_stage, yshift=20, xanchor='left')

# Vertical grid lines at every stage.
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey')

# Black 1px axis line mirrored on the opposite side, i.e. a frame around the plot area.
axis_frame = dict(showline=True, linewidth=1, linecolor='black', mirror=True)

# Size, legend placement, background, frame and typography in a single layout update.
fig.update_layout(
    height=500,
    width=800,
    # horizontal legend just above the plot, right-aligned
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    # transparent plot background
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=axis_frame,
    yaxis=axis_frame,
    # Helvetica, black, size 15 for every text element
    font_family="Helvetica",
    font_color="black",
    font_size=15,
    title_font_family="Helvetica",
    title_font_color="black",
    title_font_size=15,
    legend_title_font_color="black",
    legend_title_font_size=15,
)

fig.show()

# Export the figure as a PDF; the target directory must already exist.
import plotly.io as pio
pio.write_image(fig, 'images/S3/data_curation_stats_log.pdf')

In [30]:
import pandas as pd
import plotly.express as px

# Curation statistics of the test sets: one row per dataset, one column per stage.
# Note: this rebinds `df`, replacing the frame read from 'data_curation_stats.csv'.
curation_counts_test = pd.read_csv('data_curation_stats_test.csv')
df = curation_counts_test.set_index('Dataset')
df


Unnamed: 0_level_0,Fetched from online repository,Reads < 3'000,No abnormally high mutations,Good coverage,Unique sequences,AUROC > 0.8
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Human mRNA,7115,4502,3848,2929,1523,1456
Pri-miRNA,2516,2179,2179,2178,1107,1105


In [61]:
import plotly.graph_objects as go
import plotly

colors = plotly.colors.qualitative.Plotly

# (row label in `df`, legend name, bar colour), in drawing order.
bar_specs = [
    ('Pri-miRNA', 'pri-miRNA', colors[1]),
    ('Human mRNA', 'mRNA', colors[2]),
]

# One bar per curation stage for each dataset, with the count printed above the bar.
fig = go.Figure()
for row_label, legend_name, bar_color in bar_specs:
    stage_counts = df.loc[row_label]
    fig.add_trace(
        go.Bar(
            x=df.columns,
            y=stage_counts,
            name=legend_name,
            marker_color=bar_color,
            text=stage_counts,
            textposition='outside',
        )
    )

fig.update_layout(
    xaxis_title='Data curation stage',
    yaxis_title='Count of sequences',
    template='plotly_white',
    font=dict(size=20, family='Times New Roman'),
    # horizontal legend placed inside the plot area
    legend=dict(orientation="h", yanchor="bottom", y=.82, xanchor="right", x=1),
    # extend the y range to 1.2x the largest value in df
    yaxis=dict(range=[0, 1.2*df.max().max()]),
    height=500,
    width=1000,
)
# no horizontal grid lines
fig.update_yaxes(showgrid=False)

# scale=5 for a high-resolution PNG; the target directory must already exist
fig.write_image('images/S3/data_curation_stats.png', scale=5)

In [39]:

# Long-format table for the variant of the curation figure that leaves Ribonanza out.
#
# The counts are re-read from disk instead of taken from the global `df` because
#   (a) an earlier cell in this notebook rebinds `df` to 'data_curation_stats_test.csv',
#       which has no 'Redundancy across databases' column, and
#   (b) this cell itself narrows `df` at the end, so re-running it on the narrowed
#       frame would shrink the sum below and change the 'Multi-databases' count.
df_all = pd.read_csv('data_curation_stats.csv').set_index('Dataset')

# Size that the per-dataset final counts plus the 'Multi-databases' remainder add up to.
# NOTE(review): 161991 is carried over verbatim from the original cell -- confirm its provenance.
N_FINAL_SEQUENCES = 161991

# Columns whose values are used as-is; every other column is subtracted from the
# running count while walking the stages left to right.
FIRST_STAGE = 'Fetched from online repository'
LAST_STAGE = 'Redundancy across databases'

records = []
for dataset, row in df_all.iterrows():
    for stage in row.index:
        if stage in (FIRST_STAGE, LAST_STAGE):
            val = row[stage]
        else:
            # Relies on FIRST_STAGE being the first column so that `val` is initialised.
            val = val - row[stage]
        records.append([dataset, stage, val])

df_expanded = pd.DataFrame(records, columns=['Dataset', 'Stage', 'Count'])

# Computed over ALL datasets (Ribonanza included) before Ribonanza is dropped.
# The name `n_redundant_sequences` is reused by the annotation code further down.
is_last_stage = df_expanded['Stage'] == LAST_STAGE
n_redundant_sequences = N_FINAL_SEQUENCES - df_expanded.loc[is_last_stage, 'Count'].sum()

# NOTE(review): the explicit index labels 10 and 11 interleave these two rows with the
# existing ones once the frame is sorted by index -- presumably to control where the
# 'Multi-databases' trace ends up in the plot; confirm before changing.
multi_databases = pd.DataFrame(
    {
        'Dataset': 'Multi-databases',
        'Stage': ['No structure', LAST_STAGE],
        'Count': [0, n_redundant_sequences],
    },
    index=[10, 11],
)

df_expanded = pd.concat([df_expanded, multi_databases])
df_expanded = df_expanded[df_expanded['Dataset'] != 'Ribonanza'].sort_index()

# The rest of this cell iterates over `df`, so it must not contain Ribonanza.
df = df_all[df_all.index != 'Ribonanza']


# Fill pattern per series: 'x' hatching is mapped to Ribonanza, no pattern for the rest.
pattern_by_dataset = dict.fromkeys(df.index)
pattern_by_dataset['Ribonanza'] = 'x'
pattern_by_dataset['Multi-databases'] = None

# Stacked area chart on a log y axis, stages ordered as the columns of `df`.
fig = px.area(
    df_expanded,
    x="Stage",
    y="Count",
    color="Dataset",
    log_y=1,
    title='Data curation statistics',
    category_orders={'Stage': df.columns.tolist()},
    line_shape=None,
    pattern_shape='Dataset',
    pattern_shape_map=pattern_by_dataset,
)

fig.update_xaxes(title_text='Data curation stage')
fig.update_yaxes(title_text='Count of sequences')

# add annotations
# add count of sequences per dataset at the beginning and end of each line
import numpy as np

def add_annotation(fig, dataset, count, position, yshift=0, xanchor='right'):
    """Add an arrow-less text label of the form '<dataset> (N =<count>)' to ``fig``.

    Args:
        fig: object exposing ``add_annotation(**kwargs)``.
        dataset: text placed in front of the count.
        count: number shown with thousands separators; its base-10 logarithm
            is used as the y coordinate of the label.
        position: x coordinate of the label.
        yshift: forwarded unchanged to ``fig.add_annotation``.
        xanchor: forwarded unchanged to ``fig.add_annotation``.
    """
    label = dataset + ' (N ={:,})'.format(count)
    fig.add_annotation(
        x=position,
        y=np.log10(count),
        text=label,
        showarrow=False,
        yshift=yshift,
        xanchor=xanchor,
        font={'size': 12},
    )

# Hand-tuned vertical offsets per label, passed as `yshift` to each annotation.
yshift_start = {'zuber': -27, 'archiveII': -5, 'bpRNA': -5, 'RNAstralign': -20, 'Ribonanza': -20, 'Multi-databases': 0}
yshift_end = {'zuber': -10, 'archiveII': -13, 'bpRNA': 13, 'RNAstralign': -15, 'Ribonanza': 4, 'Multi-databases': -10}

first_stage = df.columns[0]
last_stage = df.columns[-1]

# Label every dataset with its count at the first stage.
for dataset, n_first in df[first_stage].items():
    add_annotation(fig, dataset, n_first, first_stage, yshift=yshift_start[dataset], xanchor='left')

# Label every dataset with its count at the last stage. Ribonanza is skipped, and
# bpRNA is handled separately below because it is displayed under the name 'bpRNA90'.
for dataset, n_last in df[last_stage].items():
    if dataset not in ('bpRNA', 'Ribonanza'):
        add_annotation(fig, dataset, n_last, last_stage, yshift=yshift_end[dataset], xanchor='left')
add_annotation(fig, 'bpRNA90', df.loc['bpRNA', last_stage], last_stage, yshift=yshift_end['bpRNA'], xanchor='left')
add_annotation(fig, 'Multi-databases', n_redundant_sequences, last_stage, yshift=yshift_end['Multi-databases'], xanchor='left')

# Totals at both ends of the chart.
# NOTE(review): the last-stage total sums only the rows of `df`, so the
# 'Multi-databases' count is not part of it -- confirm that this is intended.
add_annotation(fig, 'Total', df[last_stage].sum(), last_stage, yshift=20, xanchor='left')
add_annotation(fig, 'Total', df[first_stage].sum(), first_stage, yshift=20, xanchor='left')

# Vertical grid lines at every stage.
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey')

# Black 1px axis line mirrored on the opposite side, i.e. a frame around the plot area.
axis_frame = dict(showline=True, linewidth=1, linecolor='black', mirror=True)

# Size, legend placement, background and frame in a single layout update.
fig.update_layout(
    height=500,
    width=800,
    # horizontal legend just above the plot, right-aligned
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    # transparent plot background
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=axis_frame,
    yaxis=axis_frame,
)

fig.show()

# Export the figure as a PDF into the current working directory.
import plotly.io as pio
pio.write_image(fig, 'data_curation_stats_log.pdf')

In [126]:
# Ad-hoc check: display the name of the first column of `df`.
df.columns[0]


'Fetched from online repository'

In [5]:
import plotly.express as px
import plotly.graph_objects as go

# Upper bound applied to the bill amounts before plotting.
threshold = 40

# Plotly's bundled sample dataset, with 'total_bill' capped at the threshold.
data = px.data.tips()
capped_bill = data['total_bill'].clip(upper=threshold)
data['total_bill'] = capped_bill

# Single violin of the capped values with an inner box plot.
violin = go.Violin(
    y=data['total_bill'],
    box_visible=True,
    line_color='black',
    fillcolor='lightseagreen',
    opacity=0.6,
)
fig = go.Figure(data=[violin])

# NOTE(review): the x axis is titled 'Day' although the trace is given no x values.
fig.update_layout(
    title='Violin Plot with Capped Values',
    xaxis_title='Day',
    yaxis_title='Total Bill',
)

fig.show()


In [128]:
# Sanity check: keep the last row of each dataset in `df_expanded` and sum the counts.
df_expanded.drop_duplicates('Dataset', keep='last')['Count'].sum()

161991