## Plot of the length distribution for our datasets vs the rest

One plot with three subplots:
- First row is length distribution of UTR
- Second row is for pri-miRNA
- Third row is all the rest of data (Ribo, bpRNA, ArchiveII, RNAstralign, synthetic, zuber ...)

Use shared x axis

**Assigned to**: Colin

Use Plotly, with a white background

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import json
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
def get_lengths(data):
    """Return the length of every entry's ``'sequence'`` value.

    ``data`` is iterated for its keys, and each entry is read as
    ``data[key]['sequence']``; the lengths are returned as a list in
    iteration order.
    """
    return [len(data[key]['sequence']) for key in data]

In [3]:
import rouskinhf

# Map each dataset name to the list of its sequence lengths.
lengths = dict(
    (name, get_lengths(rouskinhf.get_dataset(name)))
    for name in ('pri_miRNA', 'human_mRNA')
)

In [5]:
import os

import plotly.io as pio
from rouskinhf import get_dataset

# Overlaid length histograms for the pri-miRNA and human mRNA datasets.
# Both traces share the x axis; each has its own count axis (pri-miRNA on the
# left, mRNA on the right), coloured to match its trace. The figure is written
# to images/a_length_distribution.pdf and then displayed.

# Trace / axis colours: red for pri-miRNA, green for mRNA.
pri_color = '#EF553B'
mRNA_color = '#00CC96'

# Upper bounds of the left (pri-miRNA) and right (mRNA) count axes.
max_y2 = 70
max_y1 = 1400
# NOTE(review): nticks is not used below; tick positions are set explicitly
# through `tickvals`. Kept so the name stays defined for any other cell.
nticks = 3

output_path = 'images/a_length_distribution.pdf'


def _count_axis(title, color, tickvals, upper, **extra):
    """Build the layout dict for one count axis drawn in `color`.

    `extra` is forwarded unchanged, e.g. the overlay/placement options of the
    secondary axis.
    """
    return dict(
        # `title.font` replaces the deprecated `titlefont` attribute, which
        # recent plotly releases no longer accept.
        title=dict(
            text=title,
            font=dict(color=color, family='helvetica light', size=20),
        ),
        tickvals=tickvals,
        tickfont=dict(color=color, size=20),
        range=[0, upper],
        **extra,
    )


fig = go.Figure()

# pri-miRNA lengths on the primary (left) y axis.
data = pd.DataFrame(get_dataset('pri_miRNA')).T
fig.add_trace(go.Histogram(
    x=data['sequence'].str.len(),
    nbinsx=50,
    opacity=1,
    name='pri-miRNA',
    yaxis="y",
    # Wide outline in the bar colour; presumably to thicken the bars visually.
    marker=dict(color=pri_color, line=dict(width=13, color=pri_color)),
))

# mRNA lengths on the secondary (right) y axis.
data = pd.DataFrame(get_dataset('human_mRNA')).T
fig.add_trace(go.Histogram(
    x=data['sequence'].str.len(),
    nbinsx=50,
    opacity=1,
    name='mRNA',
    yaxis="y2",
    marker=dict(color=mRNA_color),
))

fig.update_layout(
    template='plotly_white',
    bargap=0.1,
    font_family='helvetica light',
    font_size=16,
    width=600,
    height=450,
    # The coloured axis titles identify the traces, so no legend is shown.
    showlegend=False,
    xaxis=dict(
        title=dict(text='Lengths of sequences (nt)'),
        range=[200, 1070],
    ),
    yaxis=_count_axis("pri-miRNA count", pri_color, [0, 600, 1200], max_y1),
    yaxis2=_count_axis(
        "mRNA count", mRNA_color, [0, 30, 60], max_y2,
        anchor="free",
        overlaying="y",
        side="right",
        position=1,
    ),
)

# Remove gridlines and zero lines from both y axes.
fig.update_yaxes(showgrid=False, zeroline=False)

# Export to PDF; create the output directory first so the write cannot fail
# on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pio.write_image(fig, output_path)

fig.show()