In [2]:
import json
import pandas as pd
import numpy as np

## Characterize the length and family distributions of commonly used databases

We need a subplot with one database per column: PDB, bpRNA, and RNAStralign
- On the first row, a pie chart of the family distribution: No need to have all possible families, check the biggest one used (probably the ones used by RNAstralign)
- One the second row, a histogram of the length distribution

**Assigned to**: Yves

Use Ploty, and a white background

## Ribonanza

In [3]:
def rename_families(x):
    if x == '5s' or x == '16s' or x == '23s' or 'rRNA' in x:
        return 'rRNA'
    if x == 'grp1' or x == 'group_I_intron':
        return 'Introns'
    if x == 'grp2' or 'intron' in x:
        return 'Introns'
    if x == 'srp' or x == 'SRP':
        return 'SRP'
    if x == 'telomerase':
        return 'telomerase'
    if x == 'RNaseP':
        return 'rRNA'
    else: 
        return x

## RNAStralign

In [4]:
import rouskinhf
data = rouskinhf.get_dataset('RNAstralign')
families_rnastralign = pd.DataFrame.from_dict(data, orient='index')['family'].apply(lambda x: x.split('__')[0].replace('_database', '')).apply(rename_families)
fam_rnastralign = families_rnastralign.value_counts()
len_rnastralign = pd.DataFrame.from_dict(data, orient='index')['sequence'].apply(len)
df_rnastralign = pd.merge(families_rnastralign,  pd.DataFrame.from_dict(data, orient='index')['sequence'], left_index=True, right_index=True)

df_rnastralign

Unnamed: 0,family,sequence
B06573,rRNA,UUUAAGGUGGCUAUAGCAACGGGGCUCACCUCUUCCCAUUCCGAAC...
tdbD00004108,tRNA,GGGGAAUUAGCUCAAAAGGUAGAGCGCUCGCUUUGCAUGUGAGAGG...
tdbD00004299,tRNA,GGCGCGUUGACAGAUUGGUUAUGUAGCGGAUUGCAAAUCCGCCUAG...
B04476,rRNA,CGCCUGACGACCAUAGCGAGUUGGUCCCACUCCUUCCCAUCCCGAA...
Z38006,rRNA,GCUCAGGACGAACGCUGGCGGCGUGCUUAACACAUGCAAGUCGAAC...
...,...,...
E00929,rRNA,GCCAACGUCCAUACCAUGCUGAAUACACCGGUUCUCGUCCGAUCAC...
tdbD00004309,tRNA,GGCCGAGUAGCAAAAUGGUUAUGCAGCGGAUUGCAAAUCCGCCUAC...
AF176322,rRNA,GUUUGAUCCUGGCUCAGGAUGAACGCUGGCGGCGUGCCUAAUACAU...
B04677,rRNA,UCUAGUGGCGAUGCGUCCAGGGGUCACACCCGUUCUCAUCCCGAAC...


## archiveII

In [5]:
data = rouskinhf.get_dataset('archiveII')
families_archivII = pd.DataFrame.from_dict(data, orient='index').reset_index()['index'].apply(lambda x: x.split('_')[0]).apply(rename_families)
fam_archivII = families_archivII.value_counts()
len_archivII = pd.DataFrame.from_dict(data, orient='index')['sequence'].apply(len)
df_archivII = pd.merge(families_archivII,  pd.DataFrame.from_dict(data, orient='index')['sequence'].reset_index(drop=True), left_index=True, right_index=True).rename(columns={'index': 'family'})
df_archivII

Unnamed: 0,family,sequence
0,rRNA,UGCCUGGCGGCCAUAGUGCGGUGGUCCCACCUGACCCCAUGCCGAA...
1,rRNA,GCCUACGGCCACACCACACUGAAUGCGCCUGAUCUCGUCUGAUCUC...
2,SRP,GCUGGCGGGCCCCUUCGCAUGGUUCGGCGGUGAAUCUGGUCAGGUC...
3,rRNA,UUGCUUGGCGACCAUAGCGUUUUGGACCCACCUGAAUCCAUUCCGA...
4,tmRNA,GGGGGCGUUCUUGGAUUCGACGGGGAUUGCGAGGCUCGAGGUGCAU...
...,...,...
3481,tRNA,GGUUCUCUGGCCGAGUGGUCUAAGGCGCAUGGUUAAGGUCCAUGUC...
3482,rRNA,AGAGGGGUGAGAAUCCCUCCACCGAAUGCCUAAGGGUUCCUGAGGA...
3483,rRNA,UACCUGGUUGAUCCUGCCAGUAGCAUAUGCUUGUCUCAAAGAUUAA...
3484,rRNA,GAUCUGGUGGCCAUGGCGGGGCGCAAUCACCCGAUCCCAUCCCGAA...


## bpRNA (own analysis)


In [6]:
df_bprna = pd.read_feather('saved_data_plot/bpRNA.feather')
df_bprna['family'] = df_bprna['reference'].apply(lambda x: x.split('_')[1])
df_bprna

Unnamed: 0,reference,sequence,length,family
0,bpRNA_CRW_9331,CGUGUCGUGAGAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUU...,420,CRW
1,bpRNA_CRW_16343,CCUGACGACCAUAGCGAGCGGGUCCCACCUGACCCCAUGCCGAACU...,116,CRW
2,bpRNA_CRW_13525,CUUUGACGUUAGCGGCGGACGGGUGAGUAACACGUGGAUAACCUAC...,989,CRW
3,bpRNA_CRW_14876,GAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAGCGAACAG...,1484,CRW
4,bpRNA_CRW_50032,GUCAGGAUAGCUCAGUUGGUAGAGCAGAGGACUGAAAAUC,40,CRW
...,...,...,...,...
102314,bpRNA_CRW_15161,GAUCCUGGCUCAGGACGAACGCUGGCGGCGUGCCUAAUACAUGCAA...,1537,CRW
102315,bpRNA_CRW_22308,GGGGGCUUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGG...,73,CRW
102316,bpRNA_RFAM_19577,UGUGCAUCGUGGUCAAAUGCUCAGACUCCUGUGGUGGCUGCUCAUG...,81,RFAM
102317,bpRNA_RFAM_23681,CCCAAAGGUUCCCUCAGGCUGAAUGGAAACCAGCCAGAGAGUGUAA...,108,RFAM


### bpRNA/labeled

In [7]:
valid_labels = list(fam_archivII.keys()) + list(fam_rnastralign.keys()) + ['SPR', 'RNP', 'ncRNA']
df_bprna['family'] = df_bprna['family'].apply(lambda x: 'ncRNA' if x == 'PDB' else x)
df_labeled = df_bprna[df_bprna['family'].isin(valid_labels)].copy()
df_labeled['family'].value_counts()

family
SRP      959
tmRNA    728
ncRNA    669
SPR      623
RNP      466
Name: count, dtype: int64

### bpRNA/unlabeled

In [8]:
df_un = df_bprna[~df_bprna['family'].isin(valid_labels)].copy()
df_un = df_un[['sequence', 'length']].reset_index().rename(columns={'index': 'reference'})
len(df_un)

98874

### Add labels from external databases through sequence matching

#### Rfam fasta files

In [9]:
rfam = pd.read_csv('/Users/yvesmartin/src/supermodels-data/rfam/data/rfam.csv')
rfam = rfam[['ref_desc', 'sequence', 'family_name', 'full_family_name', 'clan_name']]
rfam['sequence'] = rfam['sequence'].apply(lambda x: x.replace('T', 'U'))

In [10]:
df_bprna = pd.merge(df_un, rfam, on='sequence', how='left')
df_bprna.sort_values(by='family_name', inplace=True, na_position='last')
df_bprna.drop_duplicates(subset=['reference'], inplace=True, keep='first')
df_bprna

Unnamed: 0,reference,sequence,length,ref_desc,family_name,full_family_name,clan_name
826485,69039,CCUUCAUUGGUUUACCUCAAACCUGUUGUGAUGUAAGUUAAUGAAG...,96,"Streptococcus sanguinis SK36, complete genome.",23S-methyl,23S methyl RNA motif,
792602,66386,CAUUCGUUGGUUUAAAUCAAACCUGUUAUGAUUUAAGUUAGCGAGU...,99,"Streptococcus pyogenes M1 GAS, complete genome.",23S-methyl,23S methyl RNA motif,
50361,4686,UUUUCAUUGGUUUUUAUCAGGUUCCUGUUCUGAUAAAAGUUAGUGA...,105,Enterococcus durans ATCC 6056 genomic scaffold...,23S-methyl,23S methyl RNA motif,
342385,28544,GAUUUGUUAGUUUAAAUCAAACCUGUUAUGAUUUAAGUUAACAAAC...,99,"Streptococcus agalactiae 2603V/R, complete gen...",23S-methyl,23S methyl RNA motif,
463875,39125,GAUUUGUUAGUUUAAAUCAAACCUGUUAUGAUUUAAGCUAACAAAC...,99,Streptococcus agalactiae H36B s_agalactiae_h36...,23S-methyl,23S methyl RNA motif,
...,...,...,...,...,...,...,...
1211827,102312,GAAUUCCACGUGUAGCGGUGAAAUGCGUAGAGAUGUGGAGGAACAC...,804,,,,
1211828,102313,GGGCUUGUAGCUCAGUUGGUUAGAGCGCGCGCUUGAUAAGCGUGAG...,77,,,,
1211829,102314,GAUCCUGGCUCAGGACGAACGCUGGCGGCGUGCCUAAUACAUGCAA...,1537,,,,
1211885,102317,CCCAAAGGUUCCCUCAGGCUGAAUGGAAACCAGCCAGAGAGUGUAA...,108,,,,


#### CRW 

In [11]:
data = pd.DataFrame.from_dict(json.load(open('/Users/yvesmartin/src/supermodels-data/CRW/crw.json')), orient='index').reset_index().rename(columns={'index': 'reference', 'family':'family_name'})[['sequence', 'family_name']]
df_bprna = pd.merge(df_bprna, data, on='sequence', how='left')
df_bprna['family_name'] = df_bprna.apply(lambda x: x['family_name_x'] if pd.isnull(x['family_name_y']) else x['family_name_y'], axis=1)
df_bprna.drop(['family_name_x', 'family_name_y'], axis=1, inplace=True)
df_bprna['family_name'].value_counts()
df_bprna

Unnamed: 0,reference,sequence,length,ref_desc,full_family_name,clan_name,family_name
0,69039,CCUUCAUUGGUUUACCUCAAACCUGUUGUGAUGUAAGUUAAUGAAG...,96,"Streptococcus sanguinis SK36, complete genome.",23S methyl RNA motif,,23S-methyl
1,66386,CAUUCGUUGGUUUAAAUCAAACCUGUUAUGAUUUAAGUUAGCGAGU...,99,"Streptococcus pyogenes M1 GAS, complete genome.",23S methyl RNA motif,,23S-methyl
2,4686,UUUUCAUUGGUUUUUAUCAGGUUCCUGUUCUGAUAAAAGUUAGUGA...,105,Enterococcus durans ATCC 6056 genomic scaffold...,23S methyl RNA motif,,23S-methyl
3,28544,GAUUUGUUAGUUUAAAUCAAACCUGUUAUGAUUUAAGUUAACAAAC...,99,"Streptococcus agalactiae 2603V/R, complete gen...",23S methyl RNA motif,,23S-methyl
4,39125,GAUUUGUUAGUUUAAAUCAAACCUGUUAUGAUUUAAGCUAACAAAC...,99,Streptococcus agalactiae H36B s_agalactiae_h36...,23S methyl RNA motif,,23S-methyl
...,...,...,...,...,...,...,...
99533,102312,GAAUUCCACGUGUAGCGGUGAAAUGCGUAGAGAUGUGGAGGAACAC...,804,,,,
99534,102313,GGGCUUGUAGCUCAGUUGGUUAGAGCGCGCGCUUGAUAAGCGUGAG...,77,,,,
99535,102314,GAUCCUGGCUCAGGACGAACGCUGGCGGCGUGCCUAAUACAUGCAA...,1537,,,,
99536,102317,CCCAAAGGUUCCCUCAGGCUGAAUGGAAACCAGCCAGAGAGUGUAA...,108,,,,


In [12]:
translation_family_name = {
    'sRNA': ['sRNA', 'rli', 'rivX', 'CC', 'STnc', 'sau-', 'Atu_', 'GlsR', '6C', 'ArcZ', 'Bsr', 'C0', 'CyaR_RyeE', 'whalefall-1', 'tpke11', 'tfoR',\
        't44', 'sro', 'FsrA', 'sraA', 'ryfA', 'GadY', 'GcvB', 'Hgc', 'IS009', 'InvR', 'MtlS', 'OmrA-B', 'OrzO-P', 'OxyS', 'PrrF', 'Qrr', 'RybB', 'RydC', 'SgrS',\
        'Spot_42', 'Sra'
            ],
    'tRNA': ['tRNA', 'TLS-PK'],
    'rRNA': ['rRNA', '5s', '5S_rRNA', '16s', '16S_rRNA', '23s', '23S_rRNA', 'RF_site', 'ribozyme', 'GOLLD'],
    'ncRNA': ['ncRNA', 'rdlD', 'NRON', 'CopA', 'DicF', 'uc_338', 'Dicty_Class_I_RNA', 'DsrA', 'FourU', 'srg1', 'sok', 'symR', 'sar', 'rncO', \
            'rydB', 'msr', 'IS102', 'IS128', 'MicC', 'MicF', 'NrrF', 'NsiR1', 'Plasmid_RNAIII','RNA-OUT', 'RNAI','RUF', \
             'RprA', 'Rsa', 'SprD'
                ],
    'group_I_intron': ['grp1'],
    'group_II_intron': ['grp2', 'group-II'],
    'SRP': ['srp', 'SRP'],
    'RNaseP': ['RNaseP'],
    'crRNA': ['CRISPR'],
    'RNP': ['HACA'],
    'snRNA': ['snRNA', 'sn', 'Gl_U', 'SNORA', 'U1', 'U7', 'U3', 'VA', 'SCARNA'],
    'microRNA': ['mir-', 'MIR', 'lsy-6'],
    'virus': ['virus', 'CuYV_BPYV', 'SPCSV', 'HAV', 'BMV3_UPD', 'Rubella_3', 'HIV', 'HBV', 'IRES'],
    'tmRNA': ['tmRNA'],
    'mRNA': ['SAM', 'mini-ykkC', 'FIE3'],
    'CRE': ['sucA', 'Antizyme_FSE', 'CAESAR', 'ylbH', 'yjdF', 'ydaO-yuaA', 'ybhL', 'wcaG', 'G-CSF_SLDE', 'GABA3', 'GAIT', 'speF', 'GP_knot', 
            'Gurken', 'K_chan_RES', 'Mg_sensor', 'PyrR', 'RtT', 'SECIS_'                                   ],
    'other': ['IMES-3', 'isrK', 'isrL', 'DapZ', 'ppoRNA', 'DNA', 'RyhB'],
    'telomerase': ['tp2'],
    'motif': ['MS2', 'OLE', 'PYLIS_'],
}

translation_clan_name = {
    'snRNA': ['7SK'],
    'ncRNA': ['Csr_Rsm_clan', 'FinP-traJ','Glm', 'suhB'],
    'sRNA': ['RyeA-RyeB', 'LhrC'],
    'RNaseP': ['RNaseP'],
    'mRNA': ['SL'],
}

translation_family_description = {
    'mRNA': ['riboswitch', 'UTR', 'mRNA', 'promoter', ],
    'snRNA': ['spliceosomal', 'Small nucleolar', 'snoRNA', 'Small Nucleolar RNA', 'small nucleolar'],
    'sRNA': ['sRNA', 'small RNA', 'AniS', 'Anti-Q RNA', 'Hfq binding', 'antisense','anti-sense', 'Antisense', 'anti-toxin', 'Short', 'antitoxin'],
    'virus': ['virus'],
    'microRNA': ['microRNA'],
    'tRNA': ['tRNA'],
    'tmRNA': ['tmRNA'],
    'crRNA': ['CRISPR'],
    'telomerase': ['telomerase'],
    '23S_rRNA': ['23S'],
    '6S_rRNA': ['6S'],
    'motif': ['pseudoknot', 'Hammerhead', 'motif', 'Stem loopII regulatory element in POLB', 'Pseudoknot'],
    'CRE': ['Alpha operon ribosome binding site', 'leader', 'cis-regulatory element', 'element'],
    'rRNA': ['rRNA', 'Ribosomal', 'ribosomal', 'ribozyme', 'pRNA', 'ribosome'],
    'ncRNA': ['Y RNA', 'thermometer', 'noncoding', 'SscA', 'Non-coding'], 
    'group_I_intron': ['Group I'],
}

def clean_family(row):
    x = row['clan_name']
    if not pd.isna(x):
        for family, v in translation_clan_name.items():
            for vv in v:
                if vv in x:
                    return family

    x = row['family_name']
    if pd.isna(x):
        return 'other'
    for family, v in translation_family_name.items():
        for vv in v:
            if vv in x:
                return family
            
    full_family_name = row['full_family_name']
    if pd.isna(full_family_name):
        return 'other'
    for family, keywords in translation_family_description.items():
        for kw in keywords:
            if kw in full_family_name:
                return family     
    return 'other'

def rename_families_bp2(x):
    if x == '5s' or x == '16s' or x == '23s' or 'rRNA' in x:
        return 'rRNA'
    if x == 'grp1' or x == 'group_I_intron' or x == 'Introns':
        return 'Introns'
    if x == 'grp2' or 'intron' in x:
        return 'Introns'
    if x == 'srp' or x == 'SRP':
        return 'Other'
    if x == 'telomerase':
        return 'telomerase'
    if x == 'RNaseP':
        return 'rRNA'
    if x == 'snRNA' or x == 'microRNA' or x == 'CRE' or x == 'sRNA':
        return 'sRNA'
    if x in ['virus', 'motif', 'other']:
        return 'Other'
    else: 
        return x


df_bprna['family'] = df_bprna.apply(clean_family, axis=1)
df_bprna['length'] = df_bprna['sequence'].apply(len)
df_bprna.set_index('reference', inplace=True)
df_bprna = pd.concat([df_labeled, df_bprna])[['sequence', 'family']]
df_bprna['family'] = df_bprna['family'].apply(rename_families_bp2)
df_bprna.family.value_counts()

family
Other         64916
sRNA          16143
tRNA          10033
rRNA           3456
mRNA           2688
ncRNA          1602
Introns        1538
tmRNA          1361
SPR             623
RNP             525
telomerase       98
Name: count, dtype: int64

In [13]:
# add ribonanza
import rouskinhf
df_ribo = pd.DataFrame.from_dict(rouskinhf.get_dataset('ribo500-blast'), orient='index').reset_index().rename(columns={'index': 'reference'})
df_ribo['database'] = 'Ribonanza'
df_ribo['family'] = 'Ribonanza'

In [14]:
df_bprna['database'] = 'bpRNA'
df_rnastralign['database'] = 'RNAStralign'
df_archivII['database'] = 'archiveII'
df_final = pd.concat([df_bprna, df_rnastralign, df_archivII, df_ribo])
df_final['length'] = df_final['sequence'].apply(len)
df_final.family.value_counts()

family
Other         64916
Ribonanza     46060
rRNA          24206
tRNA          16962
sRNA          16143
mRNA           2688
Introns        2379
tmRNA          2165
ncRNA          1602
SRP            1170
SPR             623
RNP             525
telomerase      172
Name: count, dtype: int64

In [15]:
df_final

Unnamed: 0,sequence,family,database,reference,structure,dms,shape,length
22,CGGUGGCGCGUGCCUGUAGUCCCAGCUACUCGGGAGGCUGAGGCUG...,Other,bpRNA,,,,,288
33,AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCUCGGCGAUUG...,RNP,bpRNA,,,,,411
78,ACAGAUUGUAGCUUAAUCACAAAGCAUCUGGCCUACACCCAGAAGA...,SPR,bpRNA,,,,,72
85,GGCAGAGAGAGCCCAGUUCCCGUGCCCGAGACGGGCAUGAGGAAAG...,RNP,bpRNA,,,,,433
86,GACUGUAAUGGUCUAACGGUGAAGGCGUUCAAACCCGUUCAACCGC...,Other,bpRNA,,,,,298
...,...,...,...,...,...,...,...,...
46055,GGGAACGACUCGAGUAGAGUCGAAAAGAUAUGGAUCGGUGUGGGAG...,Ribonanza,Ribonanza,936c659efa3d,"[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",177
46056,GGGAACGACUCGAGUAGAGUCGAAAAGAUAUGGACCUGCUACCCAC...,Ribonanza,Ribonanza,9a0027005c92,"[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",177
46057,GGGAACGACUCGAGUAGAGUCGAAAACUUUACCAACCACCACAAAC...,Ribonanza,Ribonanza,96104a2c5fe6,"[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",155
46058,GGGAACGACUCGAGUAGAGUCGAAAAGAAAGAAAGGCUCUGUGGCA...,Ribonanza,Ribonanza,5b6f71620c10,"[[5, 21], [6, 20], [7, 19], [8, 18], [9, 17], ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...","[-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, ...",177


In [16]:
# one value per bin
def make_histograms(l):
    min_val = 0
    max_val = 4400
    bin_size = 100
    bins = np.arange(min_val + bin_size/2, max_val - bin_size/2, bin_size)
    hist, bins = np.histogram(l, bins=max_val//bin_size, range=(min_val, max_val))
    return hist, bins


hists = {
    dataset: make_histograms(df_final[df_final['database'] == dataset]['length']) for dataset in ['bpRNA', 'RNAStralign', 'archiveII']
}

hists

{'bpRNA': (array([57662, 20176,  2935,  3013,  1282,   984,   503,   626,   931,
           417,   159,   155,   389,  2374,  7804,  2548,    37,    99,
           129,    27,    22,    13,     3,     1,     2,     4,     4,
            25,   136,   310,    66,    20,     7,    19,    17,    25,
             9,    10,     5,    17,     6,     0,     6,     6]),
  array([   0.,  100.,  200.,  300.,  400.,  500.,  600.,  700.,  800.,
          900., 1000., 1100., 1200., 1300., 1400., 1500., 1600., 1700.,
         1800., 1900., 2000., 2100., 2200., 2300., 2400., 2500., 2600.,
         2700., 2800., 2900., 3000., 3100., 3200., 3300., 3400., 3500.,
         3600., 3700., 3800., 3900., 4000., 4100., 4200., 4300., 4400.])),
 'RNAStralign': (array([6490, 9389,  424,  936,  831,  523,  297,  488,  732,  212,   80,
           79,  185, 1087, 4223, 1100,    3,    1,    2,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0, 

### Plot as piecharts

In [17]:
horizontal_spacing = 0.08
vertical_spacing = 0.2
height = 650
width = 1200

In [18]:

import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_final['length'] = df_final['sequence'].apply(len) 
df_final['family'] = df_final['family'].fillna('Other').apply(rename_families_bp2)   

# sort families by count
for family in df_final.family.unique():
    df_final.loc[df_final['family'] == family, 'family_count'] = df_final[df_final['family'] == family].shape[0]
    
for family_name in df_final.family.unique():
    df_final.loc[df_final['family'] == family_name, 'family'] = family_name.split(' (')[0] + ' (N={:,})'.format(df_final[df_final['family'] == family_name].shape[0])
    
df_final.sort_values(by='family_count', ascending=False, inplace=True)

df_family = df_final.groupby('database').apply(lambda x: x['family'].value_counts()).unstack().fillna(0).astype(int).T.copy()

df_family['total'] = df_family.sum(axis=1)
df_family.sort_values(by='total', ascending=False, inplace=True)
df_family.drop(columns=['total', 'Ribonanza'], inplace=True)

df_family = df_family[['bpRNA', 'RNAStralign', 'archiveII']]
df_family = df_family.loc[df_family.index != 'Ribonanza (N=46,060)']

df_family

database,bpRNA,RNAStralign,archiveII
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Other (N=66,086)",64916,445,725
"rRNA (N=24,206)",3456,19027,1723
"tRNA (N=16,962)",10033,6436,493
"sRNA (N=16,143)",16143,0,0
"mRNA (N=2,688)",2688,0,0
"Introns (N=2,379)",1538,737,104
"tmRNA (N=2,165)",1361,400,404
"ncRNA (N=1,602)",1602,0,0
SPR (N=623),623,0,0
RNP (N=525),525,0,0


In [19]:


# show length distribution for each family
import plotly.express as px
import plotly.graph_objects as go

# first row is piechart of family distribution
# second row is histogram of sequence length distribution
# I want it to look like a paper figure

if 'bpRNA-1m(90)' in hists.keys():
    del hists['bpRNA-1m(90)'] 

fig = make_subplots(rows=2, cols=3, specs=[[{'type': 'domain'}]*3, [{'type': 'histogram'}]*3], 
                    subplot_titles=["{} (N={:,})".format(name, np.nansum(df_family[name]).astype(int)) for name in df_family.columns] + ['']*3,
                    vertical_spacing=vertical_spacing,
                    horizontal_spacing=horizontal_spacing,
                    row_heights=[0.5, 0.2],
)

for i in fig['layout']['annotations']:
    i['font'] = dict(size=22)
    

for i, name in enumerate(df_family.columns):
    fig.add_trace(go.Pie(
        labels=df_family.index,
        values=df_family[name],
        name=name,
        textinfo=f'percent',
        texttemplate='%{percent}',
        textposition='inside',
        # colors=px.colors.qualitative.Plotly + [px.colors.qualitative.Plotly[-1]]*10000,
        showlegend=i==0 and name !='Ribonanza',
        # colors
        marker_colors=px.colors.qualitative.Plotly + [px.colors.qualitative.Plotly[-1]]*10000,
        # sort=True,
        # legennd location
        # domain={'x': [0.0, 0.25], 'y': [0.5, 1.0]} if i == 0 else {'x': [0.25, 0.5], 'y': [0.5, 1.0]} if i == 1 else {'x': [0.5, 0.75], 'y': [0.5, 1.0]} if i == 2 else {'x': [0.75, 1.0], 'y': [0.5, 1.0]},
    ), row=1, col=i+1)
    fig.add_trace(go.Bar(
        x=hists[name][1][:-1],
        y=hists[name][0],
        name=name,
        showlegend=False,
        marker_color='rgb(0, 0, 0)',
    ), row=2, col=i+1)
    # fig.update_yaxes(row=2, col=i+1,  range=[0, 50000])
    fig.update_xaxes(row=2, col=i+1, range=[-100, 2000], tick0=0, dtick=2000)
    # make bar width constant
    # add horitontal lines
    fig.update_layout(barmode='overlay', bargap=0.1, bargroupgap=0.1)

# write 'sequence length' at the bottom, centered
fig.add_annotation(dict(
    x=0.5,
    y=-0.15,
    text='Sequence length (bin size: 100)',
    showarrow=False,
    font=dict(size=22),
    xref='paper',
    yref='paper',
    xanchor='center',
    yanchor='bottom',
))

fig.add_annotation(dict(
    x=-0.06,
    y=0.03,
    text='Count',
    showarrow=False,
    font=dict(size=22),
    xref='paper',
    yref='paper',
    xanchor='center',
    yanchor='bottom',
    textangle=-90
), 
)

fig.update_layout(
    height=height,
    width=width,
    title_x=0.5,
    title_y=0.95,
    font_size=20,
    font_family='helvetica',
    font_color='black',
    legend_font_size=20,
    legend_font_family='helvetica',
    legend_font_color='black',
    legend_x=1.03,
    legend_y=0.95,
    # legend_orientation='h',
    legend_traceorder='normal',
    legend_bordercolor='black',
    template="plotly_white",
)

fig.update_layout(
    font_family="helvetica",
    font_color="black",
    title_font_family="helvetica",
    title_font_color="black",
    legend_title_font_color="black",
    legend_font_color="black",
    xaxis_title_font_family="helvetica",
    yaxis_title_font_family="helvetica",
    xaxis_tickfont_family="helvetica",
    yaxis_tickfont_family="helvetica",
    legend_font_family="helvetica",
)

fig.show()

# save to pdf
import plotly.io as pio
pio.write_image(fig, 'images/a_family_available_data.pdf')


In [20]:
df = df_final

# Post processing

# only keep bpRNA from rouskinhf

bpRNA =pd.DataFrame.from_dict(rouskinhf.get_dataset('bpRNA-1m'), orient='index')

df[(df['database'] != 'bpRNA') | (df['sequence'].isin(bpRNA['sequence']))]

# Remove duplicates
df.drop_duplicates(subset=['sequence'], inplace=True)


# rewrite family
df['family'] = df['family'].apply(lambda x: x.split(' (')[0])

for family in df.family.unique():
    df.loc[df['family'] == family, 'family_count'] = df[df['family'] == family].shape[0]
    
df['family'] = df.apply(lambda x: x['family'] + ' (N={:,})'.format(int(x['family_count'])), axis=1)
df

df[['sequence','family','database','family_count']].to_csv('../Supplementary/saved_data_plot/family_distribution.tsv', sep='\t', index=False)

In [21]:
df

Unnamed: 0,sequence,family,database,reference,structure,dms,shape,length,family_count
22,CGGUGGCGCGUGCCUGUAGUCCCAGCUACUCGGGAGGCUGAGGCUG...,"Other (N=41,052)",bpRNA,,,,,288,41052.0
44585,GGUCUGUAGCUCAGUUGGUUAGAGCGCACCCCUGAUAAGGGUGAGG...,"Other (N=41,052)",bpRNA,,,,,76,41052.0
44600,UACAGACCAAGUUAUUAAGAGCUAUUGGUGGAUGCCUUGGCAUUGA...,"Other (N=41,052)",bpRNA,,,,,2942,41052.0
44599,UCGGGAUAGCUCAGCUGGUAGAGCAGAGGACUGAAAAUC,"Other (N=41,052)",bpRNA,,,,,39,41052.0
44598,AGUCGAGCGAAUGGAUUAAGAGCUUGCUCUUAUGAAGUUAGCGGCG...,"Other (N=41,052)",bpRNA,,,,,1413,41052.0
...,...,...,...,...,...,...,...,...,...
76826,AUAUCCCCGCAAAUUCAUUCUGUUUGCAUUCAAACAGUCAUUCAAC...,telomerase (N=97),bpRNA,,,,,164,97.0
17556,AGUUUCUCGAUAAUUGAUCUGUAGAAUCUGUCAAGCAAAACCCCAA...,telomerase (N=97),bpRNA,,,,,189,97.0
21726,AGUCGGCGGAAAUCAGUCAGUCAUAGCGCUGUCAACAAAACCCCAA...,telomerase (N=97),bpRNA,,,,,186,97.0
43897,GGUAUUCCUUAUCUAUCUAAUACGUUGAGUUAUCUGGAACCUGUCC...,telomerase (N=97),bpRNA,,,,,112,97.0
