In [1]:
import json
import pandas as pd
import numpy as np

## Ribonanza

In [2]:
def rename_families(x):
    """Translate a dataset-specific family tag into the shared vocabulary.

    The rRNA tags ('5s', '5S_rRNA', '16s', '16S_rRNA', '23s', '23S_rRNA')
    all become 'rRNA'; 'grp1', 'grp2' and 'srp' become 'group_I_intron',
    'group_II_intron' and 'SRP'. Any other value is returned unchanged.
    """
    aliases = (
        (('5s', '5S_rRNA', '16s', '16S_rRNA', '23s', '23S_rRNA'), 'rRNA'),
        (('grp1',), 'group_I_intron'),
        (('grp2',), 'group_II_intron'),
        (('srp',), 'SRP'),
    )
    for raw_tags, canonical in aliases:
        if any(x == tag for tag in raw_tags):
            return canonical
    return x

## RNAStralign

In [3]:
# Read the RNAStralign records inside a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open('../RNAStralign/data.json') as fh:
    data = json.load(fh)

# The 'family' field is reduced to the text before its first '__', has any
# '_database' substring removed, and is then mapped through rename_families.
families = (
    pd.DataFrame.from_dict(data, orient='index')['family']
    .apply(lambda x: x.split('__')[0].replace('_database', ''))
    .apply(rename_families)
)
fam_rnastralign = families.value_counts()
fam_rnastralign

family
rRNA              18893
tRNA               6436
group_I_intron      511
SRP                 193
tmRNA               170
RNaseP               46
telomerase           37
Name: count, dtype: int64

## archiveII

In [4]:
# Read the archiveII records inside a context manager so the file handle
# is closed as soon as the JSON has been parsed.
with open('../archiveII/data/archiveII/data.json') as fh:
    data = json.load(fh)

# The family tag is taken from each record key (the text before the first
# '_') and mapped through rename_families.
families = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index()['index']
    .apply(lambda x: x.split('_')[0])
    .apply(rename_families)
)
fam_archivII = families.value_counts()
fam_archivII

index
rRNA               1294
SRP                 725
tRNA                493
RNaseP              429
tmRNA               404
group_I_intron       93
telomerase           37
group_II_intron      11
Name: count, dtype: int64

## bpRNA

In [5]:
# Read the bpRNA records inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open('../bpRNA/data/bpRNA/data.json') as fh:
    data = json.load(fh)

# One row per record; the record key becomes the 'reference' column.
df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index()
    .rename(columns={'index': 'reference'})
)
# The second '_'-separated field of the reference is used as the family tag.
df['family'] = df['reference'].apply(lambda x: x.split('_')[1])
# Every 'T' in a sequence is rewritten as 'U'.
df['sequence'] = df['sequence'].apply(lambda x: x.replace('T', 'U'))
# Reassign instead of mutating in place so the cell reads as a single lineage.
df = df.set_index('reference')
print(len(df))

66715


In [6]:
# Show how many rows of df carry each value of the 'family' column.
df['family'].value_counts()

family
RFAM     39601
CRW      24552
SRP        723
PDB        533
SPR        483
RNP        419
tmRNA      404
Name: count, dtype: int64

### bpRNA/labeled

In [7]:
# Labels treated as already assigned: every family counted for archiveII and
# RNAStralign, plus the three tags 'SPR', 'RNP' and 'ncRNA'.
valid_labels = [*fam_archivII.keys(), *fam_rnastralign.keys(), 'SPR', 'RNP', 'ncRNA']

# Rows tagged 'PDB' are relabelled 'ncRNA'; every other tag is kept as is.
df['family'] = df['family'].where(df['family'] != 'PDB', 'ncRNA')

# Keep an independent copy of the rows whose tag is one of the valid labels.
has_valid_label = df['family'].isin(valid_labels)
df_labeled = df.loc[has_valid_label].copy()
df_labeled['family'].value_counts()

family
SRP      723
ncRNA    533
SPR      483
RNP      419
tmRNA    404
Name: count, dtype: int64

### bpRNA/unlabeled

In [8]:
# Rows whose family tag is not a valid label: keep only the sequence and
# move the index back into a regular column.
lacks_valid_label = ~df['family'].isin(valid_labels)
df_un = (
    df.loc[lacks_valid_label, ['sequence']]
    .reset_index()
    .rename(columns={'index': 'reference'})
)
len(df_un)

64153

In [9]:
# Display the table of entries that still lack a valid family label.
df_un

Unnamed: 0,reference,sequence
0,bpRNA_CRW_9331,CGUGUCGUGAGAUGUUGGGUUAAGUCCCGCAACGAGCGCAACCCUU...
1,bpRNA_CRW_16343,CCUGACGACCAUAGCGAGCGGGUCCCACCUGACCCCAUGCCGAACU...
2,bpRNA_CRW_14876,GAUGAACGCUGGCGGCGUGCCUAAUACAUGCAAGUCGAGCGAACAG...
3,bpRNA_CRW_50032,GUCAGGAUAGCUCAGUUGGUAGAGCAGAGGACUGAAAAUC
4,bpRNA_CRW_40524,AGGGCCUAUAGCUCAGUUGGUUAGAGCACACGCCUGAUAAGCGUGA...
...,...,...
64148,bpRNA_CRW_43706,GGGUCUGUAGCUCAGUCGGUUAGAGCAGGGGACUCAUAAUCCCUUG...
64149,bpRNA_CRW_15161,GAUCCUGGCUCAGGACGAACGCUGGCGGCGUGCCUAAUACAUGCAA...
64150,bpRNA_RFAM_19577,UGUGCAUCGUGGUCAAAUGCUCAGACUCCUGUGGUGGCUGCUCAUG...
64151,bpRNA_RFAM_23681,CCCAAAGGUUCCCUCAGGCUGAAUGGAAACCAGCCAGAGAGUGUAA...


### Add labels from external databases through sequence matching

#### Rfam fasta files

In [10]:
# NOTE(review): this is a machine-specific absolute path, unlike the relative
# paths used by the other loaders in this notebook — confirm the
# repository-relative location and switch to it before sharing.
rfam = pd.read_csv('/Users/yvesmartin/src/supermodels-data/rfam/data/rfam.csv')

# Keep only the columns used downstream. The explicit .copy() makes the
# result an independent frame, so the column assignment below cannot trigger
# pandas' SettingWithCopyWarning.
rfam = rfam[['ref_desc', 'sequence', 'family_name', 'full_family_name', 'clan_name']].copy()

# Rewrite every 'T' as 'U'. The vectorised string accessor leaves missing
# sequences as missing instead of raising on them.
rfam['sequence'] = rfam['sequence'].str.replace('T', 'U', regex=False)

In [11]:
# Left-join the unlabeled entries with Rfam on the exact sequence, order the
# rows by family name with missing names last, then keep the first row found
# for each reference.
df = (
    df_un.merge(rfam, on='sequence', how='left')
    .sort_values(by='family_name', na_position='last')
    .drop_duplicates(subset=['reference'], keep='first')
)
df

Unnamed: 0,reference,sequence,ref_desc,family_name,full_family_name,clan_name
182156,bpRNA_RFAM_23070,CCUUCAUUGGUUUACCUCAAACCUGUUGUGAUGUAAGUUAAUGAAG...,"Streptococcus sanguinis SK36, complete genome.",23S-methyl,23S methyl RNA motif,
184249,bpRNA_RFAM_23072,CGUUUGGCGGUCGAUAUCAGCGUUUAACUGUUAGCGGCAGACAAGU...,"Lactobacillus brevis ATCC 367, complete genome.",23S-methyl,23S methyl RNA motif,
126285,bpRNA_RFAM_23065,CGUUUGGUAGUUAACAUCGACAUGUCGUUGGUGACUACCGAGUUGU...,Lactobacillus plantarum WCFS1 complete genome,23S-methyl,23S methyl RNA motif,
34815,bpRNA_RFAM_23059,UUUUCAUUGGUUUUUAUCAGGUUCCUGUUCUGAUAAAAGUUAGUGA...,Enterococcus villorum ATCC 700913 genomic scaf...,23S-methyl,23S methyl RNA motif,
206677,bpRNA_RFAM_23077,UUUUCCCUAACUUUUAUCAGAAUACUUUUUGAUAAAAGCUAGUGAU...,Lactobacillus sakei strain 23K complete genome.,23S-methyl,23S methyl RNA motif,
...,...,...,...,...,...,...
223807,bpRNA_CRW_54700,AAAGACUCAGUCCUAACCUUACUAUUGGUUUUUGCUAGACAUAUAC...,,,,
223808,bpRNA_CRW_43706,GGGUCUGUAGCUCAGUCGGUUAGAGCAGGGGACUCAUAAUCCCUUG...,,,,
223809,bpRNA_CRW_15161,GAUCCUGGCUCAGGACGAACGCUGGCGGCGUGCCUAAUACAUGCAA...,,,,
223830,bpRNA_RFAM_23681,CCCAAAGGUUCCCUCAGGCUGAAUGGAAACCAGCCAGAGAGUGUAA...,,,,


#### CRW 

In [12]:
# Read the CRW records inside a context manager so the file handle is closed
# as soon as the JSON has been parsed.
with open('../CRW/crw.json') as fh:
    crw_records = json.load(fh)

# Reduce the CRW table to (sequence, family_name) pairs.
data = (
    pd.DataFrame.from_dict(crw_records, orient='index')
    .reset_index()
    .rename(columns={'index': 'reference', 'family': 'family_name'})
    [['sequence', 'family_name']]
)

# Left-join on the exact sequence. Both sides carry 'family_name', so the
# merge yields 'family_name_x' (already in df) and 'family_name_y' (CRW).
df = pd.merge(df, data, on='sequence', how='left')

# Prefer the CRW family when present, otherwise keep the existing one.
# Vectorised fillna replaces the row-wise apply and also works on an empty frame.
df['family_name'] = df['family_name_y'].fillna(df['family_name_x'])
df = df.drop(columns=['family_name_x', 'family_name_y'])
df['family_name'].value_counts()

family_name
tRNA              3620
5S_rRNA            848
RNaseP_bact_a      450
SAM                433
tmRNA              413
                  ... 
CbSR2                1
CbSR14               1
SBWMV2_UPD-PKl       1
CbSR1                1
TB10Cs2H2            1
Name: count, Length: 1604, dtype: int64

In [16]:
# Keyword table matched against row['family_name'] by clean_family.
# Each key is the label that clean_family returns; each value lists the
# substrings searched for with `in`. Order matters: keys are tried in the
# order written here and the first key with a matching substring wins.
# NOTE(review): several keywords are very short ('CC', 'C0', '6C', 'sn',
# 'VA', 'U1', ...) and match any family name containing them — confirm that
# this is intended.
translation_family_name = {
    'sRNA': ['sRNA', 'rli', 'rivX', 'CC', 'STnc', 'sau-', 'Atu_', 'GlsR', '6C', 'ArcZ', 'Bsr', 'C0', 'CyaR_RyeE', 'whalefall-1', 'tpke11', 'tfoR',\
        't44', 'sro', 'FsrA', 'sraA', 'ryfA', 'GadY', 'GcvB', 'Hgc', 'IS009', 'InvR', 'MtlS', 'OmrA-B', 'OrzO-P', 'OxyS', 'PrrF', 'Qrr', 'RybB', 'RydC', 'SgrS',\
        'Spot_42', 'Sra'
            ],
    'tRNA': ['tRNA', 'TLS-PK'],
    'rRNA': ['rRNA', '5s', '5S_rRNA', '16s', '16S_rRNA', '23s', '23S_rRNA', 'RF_site', 'ribozyme', 'GOLLD'],
    'ncRNA': ['ncRNA', 'rdlD', 'NRON', 'CopA', 'DicF', 'uc_338', 'Dicty_Class_I_RNA', 'DsrA', 'FourU', 'srg1', 'sok', 'symR', 'sar', 'rncO', \
            'rydB', 'msr', 'IS102', 'IS128', 'MicC', 'MicF', 'NrrF', 'NsiR1', 'Plasmid_RNAIII','RNA-OUT', 'RNAI','RUF', \
             'RprA', 'Rsa', 'SprD'
                ],
    'group_I_intron': ['grp1'],
    'group_II_intron': ['grp2', 'group-II'],
    'SRP': ['srp', 'SRP'],
    'RNaseP': ['RNaseP'],
    'crRNA': ['CRISPR'],
    'RNP': ['HACA'],
    'snRNA': ['snRNA', 'sn', 'Gl_U', 'SNORA', 'U1', 'U7', 'U3', 'VA', 'SCARNA'],
    'microRNA': ['mir-', 'MIR', 'lsy-6'],
    'virus': ['virus', 'CuYV_BPYV', 'SPCSV', 'HAV', 'BMV3_UPD', 'Rubella_3', 'HIV', 'HBV', 'IRES'],
    'tmRNA': ['tmRNA'],
    'mRNA': ['SAM', 'mini-ykkC', 'FIE3'],
    'CRE': ['sucA', 'Antizyme_FSE', 'CAESAR', 'ylbH', 'yjdF', 'ydaO-yuaA', 'ybhL', 'wcaG', 'G-CSF_SLDE', 'GABA3', 'GAIT', 'speF', 'GP_knot', 
            'Gurken', 'K_chan_RES', 'Mg_sensor', 'PyrR', 'RtT', 'SECIS_'                                   ],
    'other': ['IMES-3', 'isrK', 'isrL', 'DapZ', 'ppoRNA', 'DNA', 'RyhB'],
    'telomerase': ['tp2'],
    'motif': ['MS2', 'OLE', 'PYLIS_'],
}

# Keyword table matched against row['clan_name'] by clean_family. It is
# consulted before the other two tables, so a hit here takes precedence.
translation_clan_name = {
    'snRNA': ['7SK'],
    'ncRNA': ['Csr_Rsm_clan', 'FinP-traJ','Glm', 'suhB'],
    'sRNA': ['RyeA-RyeB', 'LhrC'],
    'RNaseP': ['RNaseP'],
    'mRNA': ['SL'],
}

# Keyword table matched against row['full_family_name'] by clean_family. It
# is only reached when the family name is present but matched nothing in
# translation_family_name. Matching is case-sensitive, hence the spelling
# variants listed side by side.
# NOTE(review): '23S_rRNA' and '6S_rRNA' are returned as labels here but are
# not keys of the other two tables; '6S' is also a substring of '16S' —
# confirm both are intended.
translation_family_description = {
    'mRNA': ['riboswitch', 'UTR', 'mRNA', 'promoter', ],
    'snRNA': ['spliceosomal', 'Small nucleolar', 'snoRNA', 'Small Nucleolar RNA', 'small nucleolar'],
    'sRNA': ['sRNA', 'small RNA', 'AniS', 'Anti-Q RNA', 'Hfq binding', 'antisense','anti-sense', 'Antisense', 'anti-toxin', 'Short', 'antitoxin'],
    'virus': ['virus', 'Salmonella'],
    'microRNA': ['microRNA'],
    'tRNA': ['tRNA'],
    'tmRNA': ['tmRNA'],
    'crRNA': ['CRISPR'],
    'telomerase': ['telomerase'],
    '23S_rRNA': ['23S'],
    '6S_rRNA': ['6S'],
    'motif': ['pseudoknot', 'Hammerhead', 'motif', 'Stem loopII regulatory element in POLB', 'Pseudoknot'],
    'CRE': ['Alpha operon ribosome binding site', 'leader', 'cis-regulatory element', 'element'],
    'rRNA': ['rRNA', 'Ribosomal', 'ribosomal', 'ribozyme', 'pRNA', 'ribosome'],
    'ncRNA': ['Y RNA', 'thermometer', 'noncoding', 'SscA', 'Non-coding'], 
    'group_I_intron': ['Group I'],
}

def _first_label_matching(text, table):
    """Return the first key of `table` with a keyword contained in `text`, else None."""
    for label, keywords in table.items():
        if any(keyword in text for keyword in keywords):
            return label
    return None


def clean_family(row):
    """Assign a coarse family label to one row of the merged annotation table.

    Lookup order, first hit wins:
      1. 'clan_name' against translation_clan_name (skipped when missing);
      2. 'family_name' against translation_family_name;
      3. 'full_family_name' against translation_family_description.

    Returns 'other' when the family name is missing, when the full family
    name is needed but missing, or when no keyword matches.
    """
    clan = row['clan_name']
    if not pd.isna(clan):
        label = _first_label_matching(clan, translation_clan_name)
        if label is not None:
            return label

    name = row['family_name']
    if pd.isna(name):
        return 'other'
    label = _first_label_matching(name, translation_family_name)
    if label is not None:
        return label

    # The description is only read once the two cheaper lookups have failed.
    description = row['full_family_name']
    if pd.isna(description):
        return 'other'
    label = _first_label_matching(description, translation_family_description)
    return 'other' if label is None else label

# Build an annotated copy of the merged table: 'family' is the coarse label
# from clean_family and 'length' is the number of characters in the sequence.
df_temp = df.assign(
    family=df.apply(clean_family, axis=1),
    length=df['sequence'].map(len),
)
print(df_temp.value_counts('family'))

Series([], Name: count, dtype: int64)


In [17]:
## Merge with labeled
# df_labeled is indexed by 'reference', while df_temp carries 'reference' as
# an ordinary column with an integer index. Concatenating them directly mixes
# the two index kinds and leaves the 'reference' column empty for the labeled
# rows, so df_temp is re-indexed first. The guard keeps the cell working if
# df_temp is already indexed by reference.
unlabeled_by_reference = (
    df_temp.set_index('reference') if 'reference' in df_temp.columns else df_temp
)
pd.concat([df_labeled, unlabeled_by_reference])

Unnamed: 0_level_0,sequence,structure,family,ref_desc,full_family_name,clan_name,family_name,length
reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bpRNA_SRP_362,CGGUGGCGCGUGCCUGUAGUCCCAGCUACUCGGGAGGCUGAGGCUG...,"[[0, 9], [1, 8], [19, 34], [20, 33], [21, 32],...",SRP,,,,,
bpRNA_RNP_57,AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCUCGGCGAUUG...,"[[3, 404], [4, 403], [5, 402], [6, 401], [7, 4...",RNP,,,,,
bpRNA_SPR_487,ACAGAUUGUAGCUUAAUCACAAAGCAUCUGGCCUACACCCAGAAGA...,"[[1, 67], [2, 66], [3, 65], [4, 64], [6, 62], ...",SPR,,,,,
bpRNA_RNP_12,GGCAGAGAGAGCCCAGUUCCCGUGCCCGAGACGGGCAUGAGGAAAG...,"[[0, 428], [1, 427], [2, 426], [3, 425], [4, 4...",RNP,,,,,
bpRNA_SRP_327,GACUGUAAUGGUCUAACGGUGAAGGCGUUCAAACCCGUUCAACCGC...,"[[0, 12], [1, 11], [2, 10], [14, 294], [15, 29...",SRP,,,,,
...,...,...,...,...,...,...,...,...
bpRNA_CRW_54700,AAAGACUCAGUCCUAACCUUACUAUUGGUUUUUGCUAGACAUAUAC...,,,,,,16S,971.0
bpRNA_CRW_43706,GGGUCUGUAGCUCAGUCGGUUAGAGCAGGGGACUCAUAAUCCCUUG...,,,,,,,77.0
bpRNA_CRW_15161,GAUCCUGGCUCAGGACGAACGCUGGCGGCGUGCCUAAUACAUGCAA...,,,,,,,1537.0
bpRNA_RFAM_23681,CCCAAAGGUUCCCUCAGGCUGAAUGGAAACCAGCCAGAGAGUGUAA...,,,,,,,108.0
