## External databases

In [129]:
import pandas as pd
from io import StringIO
import rouskinhf


# Boundaries of the sequence-length bins: consecutive pairs form one bin each,
# and the trailing infinity leaves the last bin open-ended.
length_intervals = [10, 200, 500, 1000, 2000, float("inf")]


class Database:
    """One row of the dataset summary table, rendered as tab-separated text.

    The dataset is fetched through ``rouskinhf.get_dataset`` at construction
    time; the sequence count and the per-bin length counts are stored as
    already-formatted strings.
    """

    def __init__(self, name, method, families, stage, source=None):
        """Load dataset ``name`` and derive the fields shown in the table.

        Args:
            name: Dataset name passed to ``rouskinhf.get_dataset``.
            method: Text for the "Method" column.
            families: Text for the "Families" column.
            stage: Text for the "Training stage" column.
            source: Text for the "Source" column; when falsy, ``name`` is
                used instead.
        """
        # The loaded object must support len() and .values(), and every value
        # must expose a 'sequence' entry.
        data = rouskinhf.get_dataset(name)
        self.name = name
        self.method = method
        self.source = source if source else name
        self.N_sequences = self._format_count(len(data))
        self.families = families
        self.stage = stage
        self.lengths = self.make_length_intervals(
            [len(dp['sequence']) for dp in data.values()]
        )

    @staticmethod
    def _format_count(count):
        """Format an integer with an apostrophe as thousands separator."""
        return "{:,}".format(count).replace(',', "'")

    def make_length_intervals(self, lengths):
        """Count ``lengths`` per bin and return the counts joined by tabs.

        A length goes into the first bin whose upper boundary is strictly
        greater than it. Only upper boundaries are compared, so a length
        below ``length_intervals[0]`` is counted in the first bin too.

        Args:
            lengths: Iterable of sequence lengths.

        Returns:
            One formatted count per bin, tab-separated.
        """
        # Sliced once here rather than once per sequence.
        upper_bounds = length_intervals[1:]
        counts = [0] * len(upper_bounds)
        for length in lengths:
            for i, upper in enumerate(upper_bounds):
                if length < upper:
                    counts[i] += 1
                    break
        return '\t'.join(self._format_count(count) for count in counts)

    @staticmethod
    def column_names():
        """Return the tab-separated header matching the layout of ``__str__``.

        Does not depend on instance state, so it can be called on the class
        as well as on an instance.
        """
        fixed = [
            "Training stage",
            "Name on HuggingFace",
            "Source",
            "Method",
            "Number of sequences",
            "Families",
        ]
        # Labels show inclusive bounds, hence the "- 1" on the upper boundary
        # (infinity minus one is still rendered as "inf").
        bins = [
            f"L ∈ [{low}, {high - 1}]"
            for low, high in zip(length_intervals, length_intervals[1:])
        ]
        return '\t'.join(fixed + bins)

    def __str__(self):
        """Return this row as tab-separated text, in header order."""
        return '\t'.join([
            self.stage,
            self.name,
            self.source,
            self.method,
            self.N_sequences,
            self.families,
            self.lengths,
        ])

    def __repr__(self):
        """Use the same tab-separated text as ``__str__``."""
        return str(self)
    
# Rows of the summary table, collected in display order.
databases = []

# Pre-training set built from RNA central sequences.
databases.append(
    Database(
        stage="Pre-training",
        name="rnacentral_synthetic",
        source="Sequences from RNA central",
        method="RNAstructure",
        families="All known families",
    )
)

# Pre-training set from the Ribonanza competition.
databases.append(
    Database(
        stage="Pre-training",
        name="ribo500-blast",
        source="Ribonanza Competition",
        method="RNAstructure + DMS and/or SHAPE",
        families="Unlabelled",
    )
)



In [130]:
# bpRNA-1m: no explicit source, so the Source column falls back to the name.
databases.append(
    Database(
        stage="Pre-training",
        name="bpRNA-1m",
        method="Covariance analysis",
        families="Unlabelled, sRNA, tRNA",
    )
)


In [131]:
# RNAstralign: no explicit source, so the Source column falls back to the name.
databases.append(
    Database(
        stage="Pre-training",
        name="RNAstralign",
        method="Covariance analysis",
        families="rRNA, tRNA",
    )
)


In [132]:
# Fine-tuning set: pri-miRNA.
databases.append(
    Database(
        stage="Fine-tuning",
        name="pri_miRNA",
        source="This work",
        method="RNAstructure + DMS",
        families="pri-miRNA",
    )
)

In [133]:
# Fine-tuning set: human mRNA.
databases.append(
    Database(
        stage="Fine-tuning",
        name="human_mRNA",
        source="This work",
        method="RNAstructure + DMS",
        families="mRNA",
    )
)

In [134]:
# PDB test set: no explicit source, so the Source column falls back to the name.
databases.append(
    Database(
        stage="Testing",
        name="PDB",
        method="NMR, crystallography",
        families="Short non-coding RNA",
    )
)

In [135]:
# Test set: viral RNA fragments.
databases.append(
    Database(
        stage="Testing",
        name="viral_fragments",
        source="Peer-reviewed literature",
        method="RNAstructure + DMS",
        families="Viral RNA",
    )
)

In [136]:
# Test set: long non-coding RNA.
databases.append(
    Database(
        stage="Testing",
        name="lncRNA",
        source="Bugnon and al, 2022",
        method="RNAstructure + DMS",
        families="Long non-coding RNA",
    )
)

In [137]:
# Test set: Archive II.
databases.append(
    Database(
        stage="Testing",
        name="archiveII_blast",
        source="Archive II",
        method="Covariance analysis",
        families="rRNA, tRNA, tmRNA, unlabelled",
    )
)

In [138]:
# Join the header line and one tab-separated line per database, then let
# pandas parse that text back into a DataFrame.
df = pd.read_csv(
    StringIO('\n'.join([databases[0].column_names(), *map(str, databases)])),
    sep='\t',
    index_col=False,
)
# Persist the table as a semicolon-separated file, without the index column.
df.to_csv("saved_data_plot/datasets.csv", sep=';', index=False)
df

Unnamed: 0,Training stage,Name on HuggingFace,Source,Method,Number of sequences,Families,"L ∈ [10, 199]","L ∈ [200, 499]","L ∈ [500, 999]","L ∈ [1000, 1999]","L ∈ [2000, inf]"
0,Pre-training,rnacentral_synthetic,Sequences from RNA central,RNAstructure,226'729,All known families,176'486,49'463,780,0,0
1,Pre-training,ribo500-blast,Ribonanza Competition,RNAstructure + DMS and/or SHAPE,46'060,Unlabelled,46'049,11,0,0,0
2,Pre-training,bpRNA-1m,bpRNA-1m,Covariance analysis,66'715,"Unlabelled, sRNA, tRNA",48'090,6'167,2'829,9'260,369
3,Pre-training,RNAstralign,RNAstralign,Covariance analysis,27'082,"rRNA, tRNA",15'879,2'191,2'252,6'760,0
4,Fine-tuning,pri_miRNA,This work,RNAstructure + DMS,1'098,pri-miRNA,0,1'098,0,0,0
5,Fine-tuning,human_mRNA,This work,RNAstructure + DMS,1'456,mRNA,0,493,882,81,0
6,Testing,PDB,PDB,"NMR, crystallography",356,Short non-coding RNA,343,6,6,1,0
7,Testing,viral_fragments,Peer-reviewed literature,RNAstructure + DMS,58,Viral RNA,11,29,18,0,0
8,Testing,lncRNA,"Bugnon and al, 2022",RNAstructure + DMS,15,Long non-coding RNA,0,2,1,12,0
9,Testing,archiveII_blast,Archive II,Covariance analysis,355,"rRNA, tRNA, tmRNA, unlabelled",242,65,43,5,0


In [139]:
# Centre the column headers (<th>) in the generated table.
pd.set_option('display.colheader_justify', 'center')   # FOR TABLE <th>

# Page template; {table} is replaced by the DataFrame's HTML rendering.
# The stylesheet link lives inside <head>, and the charset is declared because
# the column headers contain the non-ASCII character "∈".
html_string = '''
<html>
  <head>
    <meta charset="utf-8"/>
    <title>HTML Pandas Dataframe with CSS</title>
    <link rel="stylesheet" type="text/css" href="df_style.css"/>
  </head>
  <body>
    {table}
  </body>
</html>
'''

# OUTPUT AN HTML FILE
# Explicit UTF-8 so the write does not depend on the platform's default
# encoding and matches the charset declared in the page.
with open('saved_data_plot/datasets.html', 'w', encoding='utf-8') as f:
    f.write(html_string.format(table=df.to_html(classes='mystyle', index=False)))
    