## External databases

In [113]:
import pandas as pd
from io import StringIO
import rouskinhf


length_intervals = [10, 200, 500, 1000, 2000, float("inf")]

class Database:
    def __init__(self, name, method,source, families, stage):
        data = rouskinhf.get_dataset(name)
        self.name = name
        self.method = method
        self.source = source
        self.N_sequences = "{:,}".format(len(data))
        self.families = families
        self.stage = stage
        self.lengths = self.make_length_intervals([len(dp['sequence']) for dp in data.values()])
        
    def make_length_intervals(self, lengths):
        intervals = [0] * (len(length_intervals)-1)
        for length in lengths:
            for i, interval in enumerate(length_intervals[1:]):
                if length < interval:
                    intervals[i] += 1
                    break
        return '\t'.join(["{:,}".format(interval) for interval in intervals])
    
    def column_names(self):
        return '\t'.join(["Training stage", "Name on HuggingFace", "Source", "Method", "Number of sequences", "Filtering", "Families"] + [f"L ∈ [{length_intervals[i]}, {length_intervals[i+1]-1}]" for i in range(len(length_intervals)-1)]) 
        
    def __str__(self):
        return '\t'.join([self.stage, self.name, self.source, self.method, self.N_sequences, self.families, self.lengths])
    
    def __repr__(self):
        return self.__str__()
    
databases = []

databases.append(Database(
    name = "rnacentral_synthetic", 
    method="RNAstructure",
    source="RNA central",
    families="All known families",
    stage="Pre-training",
))

databases.append(Database(
    name = "ribo500-blast", 
    method="RNAstructure + DMS and/or SHAPE",
    source="Ribonanza Competition",
    families="",
    stage="Pre-training",
))



In [114]:
databases.append(Database(
    name = "bpRNA-1m", 
    method="Covariance analysis",
    source="Published structures",
    families="Unlabeled, sRNA, tRNA",
    stage="Pre-training",
))


In [115]:
databases.append(Database(
    name = "RNAstralign", 
    method="Covariance analysis",
    source="Published structures",
    families="rRNA, tRNA",
    stage="Pre-training",
))


In [116]:
databases.append(Database(
    name = "pri_miRNA", 
    method="RNAstructure + DMS",
    source="This work",
    families="pri-miRNA",
    stage="Fine-tuning",
))

In [117]:
databases.append(Database(
    name = "human_mRNA", 
    method="RNAstructure + DMS",
    source="This work",
    families="mRNA",
    stage="Fine-tuning",
))

In [118]:
databases.append(Database(
    name = "PDB", 
    method="NMR, crystallography",
    source="Published 3D structures",
    families="Short non-coding RNA",
    stage="Testing",
))

In [119]:
databases.append(Database(
    name = "viral_fragments", 
    method="RNAstructure + DMS",
    source="TODO",
    families="Viral RNA",
    stage="Testing",
))

In [120]:
databases.append(Database(
    name = "lncRNA", 
    method="RNAstructure + DMS",
    source="TODO",
    families="Long non-coding RNA",
    stage="Testing",
))

In [122]:
df = pd.read_csv(StringIO('\n'.join([databases[0].column_names()]+[database.__str__() for database in databases])), sep='\t', index_col=False)
df.to_csv("datasets.csv", sep=',', index=False)
df

Unnamed: 0,Training stage,Name on HuggingFace,Source,Method,Number of sequences,Filtering,Families,"L ∈ [10, 199]","L ∈ [200, 499]","L ∈ [500, 999]","L ∈ [1000, 1999]","L ∈ [2000, inf]"
0,Pre-training,rnacentral_synthetic,RNA central,RNAstructure,226729,All known families,176486,49463,780,0,0,
1,Pre-training,ribo500-blast,Ribonanza Competition,RNAstructure + DMS and/or SHAPE,46060,,46049,11,0,0,0,
2,Pre-training,bpRNA-1m,Published structures,Covariance analysis,66715,"Unlabeled, sRNA, tRNA",48090,6167,2829,9260,369,
3,Pre-training,RNAstralign,Published structures,Covariance analysis,27082,"rRNA, tRNA",15879,2191,2252,6760,0,
4,Fine-tuning,pri_miRNA,This work,RNAstructure + DMS,1098,pri-miRNA,0,1098,0,0,0,
5,Fine-tuning,human_mRNA,This work,RNAstructure + DMS,1456,mRNA,0,493,882,81,0,
6,Testing,PDB,Published 3D structures,"NMR, crystallography",356,Short non-coding RNA,343,6,6,1,0,
7,Testing,viral_fragments,TODO,RNAstructure + DMS,58,Viral RNA,11,29,18,0,0,
8,Testing,lncRNA,TODO,RNAstructure + DMS,15,Long non-coding RNA,0,2,1,12,0,
