In [2]:
import json
import pandas as pd
import numpy as np

## Ribonanza

In [3]:
def rename_families(x):
    if x == '5s' or x == '16s' or x == '23s' or 'rRNA' in x:
        return 'rRNA'
    if x == 'grp1' or x == 'group_I_intron':
        return 'Introns'
    if x == 'grp2' or 'intron' in x:
        return 'Introns'
    if x == 'srp' or x == 'SRP':
        return 'SRP'
    if x == 'telomerase':
        return 'telomerase'
    if x == 'RNaseP':
        return 'rRNA'
    else: 
        return x

## RNAStralign

In [4]:
data = json.load(open('../RNAStralign/data.json'))
from rouskinhf import convert
data = convert(
                'ct',
                file_or_folder='../RNAStralign/data',
                name='RNAStralign',
                filter=0
            )

new_data = {}
refs = {}
for ref, attr in data.items():
    if '__####' in ref:
        family, ref = ref.split('__####')
        attr['family'] = family
    if not ref in refs:
        refs[ref] = 0
    else:
        while f'{ref}_{refs[ref]}' in refs:
            refs[ref] += 1
        ref = f'{ref}_{refs[ref]}'
    new_data[ref] = attr
    
data = new_data
families = pd.DataFrame.from_dict(data, orient='index')['family'].apply(lambda x: x.split('__')[0].replace('_database', '')).apply(rename_families)
fam_rnastralign = pd.DataFrame(families.value_counts()).rename(columns={'count': 'RNAStralign'})
len_rnastralign = pd.DataFrame.from_dict(data, orient='index')['sequence'].apply(len)
fam_rnastralign

Parsing ct files: 100%|██████████| 37149/37149 [00:22<00:00, 1625.11it/s]


Drop 3949 datapoints with None values (null sequence or reference)


Unnamed: 0_level_0,RNAStralign
family,Unnamed: 1_level_1
rRNA,21904
tRNA,9234
Introns,453
SRP,164
tmRNA,133
telomerase,37


## archiveII

In [85]:
import os
fam_archivII = {}
len_archivII = []
for file in os.listdir('/Users/yvesmartin/data/archiveII'):
    if not file.endswith('.seq'):
        continue
    seq = open(f'/Users/yvesmartin/data/archiveII/{file}').read().split('\n')[2]
    family = file.split('_')[0]
    fam_archivII[family] = fam_archivII.get(family, 0) + 1
    len_archivII.append(len(seq))
len(len_archivII)
fam_archivII

{'5s': 1283,
 'tRNA': 557,
 'srp': 928,
 'tmRNA': 462,
 'RNaseP': 454,
 'grp1': 115,
 '23s': 35,
 '16s': 110,
 'telomerase': 37,
 'grp2': 11}

In [86]:
fam_archivII_new = {}
for k, v in fam_archivII.items():
    fam_archivII_new[rename_families(k)] = fam_archivII_new.get(rename_families(k), 0) + v
fam_archivII = pd.DataFrame.from_dict(fam_archivII_new, orient='index').rename({0: 'ArchiveII'}, axis=1)
fam_archivII

Unnamed: 0,ArchiveII
rRNA,1882
tRNA,557
SRP,928
tmRNA,462
Introns,126
telomerase,37


In [88]:
assert len(len_archivII) == fam_archivII['ArchiveII'].sum()

## bpRNA (from paper)

In [89]:
import pandas as pd
from io import StringIO

# Provided data
data = """
RNA Type,bpRNA-1m,bpRNA-1m(90)
Transfer RNA,35622,3383
16S Ribosomal RNA,17641,1067
5S Ribosomal RNA,477,607
Signal Recognition Particle RNA,1603,388
Ribonuclease P RNA,1425,605
Transfer Messenger RNA,161,449
Group I Intron,237,123
23S Ribosomal RNA,191,72
Hammerhead Ribozyme,186,77
Group II Intron,131,101
"""

# Create a DataFrame
bprna = pd.read_csv(StringIO(data), delimiter=',')#.set_index('RNA Type')
def rename_families_bp(x):
    return {
    'Transfer RNA': 'tRNA',
    '16S Ribosomal RNA': 'rRNA',
    '5S Ribosomal RNA': 'rRNA',
    'Signal Recognition Particle RNA': 'Other',
    'Ribonuclease P RNA': 'Other',
    'Transfer Messenger RNA': 'tmRNA',
    'Group I Intron': 'Introns',
    '23S Ribosomal RNA': 'rRNA',
    'Hammerhead Ribozyme': 'rRNA',
    'Group II Intron': 'Introns',
    }.get(x, x)
bprna['RNA Type'] = bprna['RNA Type'].apply(rename_families_bp)
bprna = bprna.groupby('RNA Type').sum()
bprna

Unnamed: 0_level_0,bpRNA-1m,bpRNA-1m(90)
RNA Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Introns,368,224
Other,3028,993
rRNA,18495,1823
tRNA,35622,3383
tmRNA,161,449


### bpRNA from data

In [97]:
from rouskinhf import convert, get_dataset

data = pd.DataFrame.from_dict(get_dataset('bpRNA'), orient='index').drop_duplicates('sequence')
len_bprna = data['sequence'].apply(len)
len(len_bprna)

bprna = pd.DataFrame({
    'tRNA': 35622,
    "rRNA": 17641 + 477 + 1425 + 191 + 186,
    'SRP': 1603,
    'Introns': 237+131,
    'tmRNA': 161,
}, index=['bpRNA-1m']).T
bprna

Unnamed: 0,bpRNA-1m
tRNA,35622
rRNA,19920
SRP,1603
Introns,368
tmRNA,161


In [91]:
len_bprna90 = [len(l) for l in open('/Users/yvesmartin/data/bpRNA/bpRNA_1m_90.fasta').read().split('\n')[1::2]]

# Aggregate

In [98]:
df = pd.concat([bprna, fam_rnastralign, fam_archivII], axis=1)
# df.columns = ['bpRNA-1m',  'RNAStralign', 'ArchiveII']
# df = df.fillna(0).astype(int)
df_family = df.copy()
# df_family.loc['total'] = df_family.sum()
# df_family.loc['total', 'bpRNA-1m'] = len(bprna)
# df_family.loc['total', 'bpRNA-1m(90)'] = len(len_bprna90)
# df_family.loc['total', 'RNAStralign'] = len(len_rnastralign)
# df_family.loc['total', 'ArchiveII'] = len(len_archivII)
df_family.loc['total'] = df_family.sum()
# others = total - df_family.sum()
# df_family.loc['Other'] = df_family.loc['total'] - np.nansum(df_family.loc[[c for c in df_family.index if c != 'total']].values, axis=0)
# 
# assert df_family.loc[df_family.index[:-1]].sum().sum() == df_family.loc['total'].sum()
df_family.drop('total', inplace=True)
df_family


Unnamed: 0,bpRNA-1m,RNAStralign,ArchiveII
tRNA,35622.0,9234.0,557.0
rRNA,19920.0,21904.0,1882.0
SRP,1603.0,164.0,928.0
Introns,368.0,453.0,126.0
tmRNA,161.0,133.0,462.0
telomerase,,37.0,37.0


In [99]:
# one value per bin
def make_histograms(l):
    min_val = 0
    max_val = 4400
    bin_size = 100
    bins = np.arange(min_val + bin_size/2, max_val - bin_size/2, bin_size)
    hist, bins = np.histogram(l, bins=max_val//bin_size, range=(min_val, max_val))
    return hist, bins
hists = {
    'bpRNA-1m': make_histograms(len_bprna),
    # 'bpRNA-1m(90)': make_histograms(len_bprna90),
    'RNAStralign': make_histograms(len_rnastralign),
    'ArchiveII': make_histograms(len_archivII),
}

### Plot as piecharts

In [102]:
horizontal_spacing = 0.08
vertical_spacing = 0.2
height = 500
width = 1300

In [105]:

import plotly.graph_objects as go
from plotly.subplots import make_subplots



# first row is piechart of family distribution
# second row is histogram of sequence length distribution
# I want it to look like a paper figure
fig = make_subplots(rows=2, cols=3, specs=[[{'type': 'domain'}]*3, [{'type': 'histogram'}]*3], 
                    subplot_titles=["{} (N={:,})".format(name, len(l)) for name, l in zip(
                        df.columns,
                        [len_bprna, len_rnastralign, len_archivII])],
                    vertical_spacing=vertical_spacing,
                    horizontal_spacing=horizontal_spacing,
                    row_heights=[0.5, 0.2],
)

for i in fig['layout']['annotations']:
    i['font'] = dict(size=22)
    
for i, name in enumerate(hists.keys()):
    fig.add_trace(go.Pie(
        labels=df_family.index,
        values=df_family[name],
        name=name,
        textinfo=f'percent',
        textposition='inside',
        showlegend=i==2,
        sort=True,
        # legennd location
        # domain={'x': [0.0, 0.25], 'y': [0.5, 1.0]} if i == 0 else {'x': [0.25, 0.5], 'y': [0.5, 1.0]} if i == 1 else {'x': [0.5, 0.75], 'y': [0.5, 1.0]} if i == 2 else {'x': [0.75, 1.0], 'y': [0.5, 1.0]},
    ), row=1, col=i+1)
    fig.add_trace(go.Bar(
        x=hists[name][1][:-1],
        y=hists[name][0],
        name=name,
        showlegend=False,
        marker_color='rgb(0, 0, 0)',
    ), row=2, col=i+1)
    # fig.update_yaxes(row=2, col=i+1,  range=[0, 50000])
    fig.update_xaxes(row=2, col=i+1, range=[-100, 2000], tick0=0, dtick=2000)
    # make bar width constant
    # add horitontal lines
    fig.update_layout(barmode='overlay', bargap=0.1, bargroupgap=0.1)

# write 'sequence length' at the bottom, centered
fig.add_annotation(dict(
    x=0.5,
    y=-0.20,
    text='Sequence length (bin size: 100)',
    showarrow=False,
    font=dict(size=22),
    xref='paper',
    yref='paper',
    xanchor='center',
    yanchor='bottom',
))

fig.add_annotation(dict(
    x=-0.06,
    y=0.63,
    text='Families',
    showarrow=False,
    font=dict(size=22),
    xref='paper',
    yref='paper',
    xanchor='center',
    yanchor='bottom',
    textangle=-90
), 
)

fig.add_annotation(dict(
    x=-0.06,
    y=0.03,
    text='Count',
    showarrow=False,
    font=dict(size=22),
    xref='paper',
    yref='paper',
    xanchor='center',
    yanchor='bottom',
    textangle=-90
), 
)

fig.update_layout(
    height=height,
    width=width,
    title_x=0.5,
    title_y=0.95,
    font_size=20,
    font_family='Arial',
    font_color='black',
    legend_font_size=20,
    legend_font_family='Arial',
    legend_font_color='black',
    legend_x=1.03,
    legend_y=0.95,
    # legend_orientation='h',
    legend_traceorder='normal',
    legend_bordercolor='black',
    template="plotly_white",

)






In [72]:
df

Unnamed: 0,bpRNA,RNAStralign,archiveII
tRNA,35622.0,9234,557
rRNA,19920.0,21904,1882
SRP,1603.0,164,928
Introns,368.0,453,126
tmRNA,161.0,133,462
telomerase,,37,37
