## External databases

In [91]:
import pandas as pd
from io import StringIO
import rouskinhf


# Sequence-length bucket boundaries: each pair of consecutive entries delimits
# one bucket, and the final float("inf") leaves the last bucket open-ended.
length_intervals = [10, 200, 500, 1000, 2000, float("inf")]


class Database:
    """One row of the dataset summary table.

    Holds the descriptive fields supplied by the caller together with two
    values derived from the dataset itself: the formatted number of entries
    and the per-bucket sequence-length counts.
    """

    def __init__(self, name, method, source, families, filtering, stage):
        """Load dataset *name* through ``rouskinhf.get_dataset`` and summarise it.

        The loaded object is used as a mapping whose values each expose a
        ``'sequence'`` entry; only the length of that entry is read.
        """
        records = rouskinhf.get_dataset(name)

        # Descriptive fields are stored exactly as given.
        self.stage = stage
        self.name = name
        self.source = source
        self.method = method
        self.filtering = filtering
        self.families = families

        # Derived fields are stored as display strings, not numbers.
        self.N_sequences = f"{len(records):,}"
        sequence_lengths = [len(record['sequence']) for record in records.values()]
        self.lengths = self.make_length_intervals(sequence_lengths)

    def make_length_intervals(self, lengths):
        """Return tab-separated, thousands-formatted counts of *lengths* per bucket.

        A length is assigned to the first bucket whose upper boundary is
        strictly greater than it.
        """
        upper_bounds = length_intervals[1:]
        counts = [0 for _ in upper_bounds]
        for seq_len in lengths:
            # NOTE(review): only upper boundaries are tested, so a length below
            # length_intervals[0] is still counted in the first bucket.
            bucket = next(
                (idx for idx, bound in enumerate(upper_bounds) if seq_len < bound),
                None,
            )
            if bucket is not None:
                counts[bucket] += 1
        return '\t'.join(f"{count:,}" for count in counts)

    def column_names(self):
        """Return the tab-separated header line matching the output of ``__str__``."""
        fixed_columns = [
            "Training stage",
            "Name on HuggingFace",
            "Source",
            "Method",
            "Number of sequences",
            "Filtering",
            "Families",
        ]
        # Upper boundaries are exclusive, hence the "- 1" in the printed range.
        bucket_columns = [
            f"L ∈ [{low}, {high - 1}]"
            for low, high in zip(length_intervals, length_intervals[1:])
        ]
        return '\t'.join(fixed_columns + bucket_columns)

    def __str__(self):
        """Return this entry as one tab-separated row, in ``column_names`` order."""
        fields = (
            self.stage,
            self.name,
            self.source,
            self.method,
            self.N_sequences,
            self.filtering,
            self.families,
            self.lengths,
        )
        return '\t'.join(fields)

    def __repr__(self):
        """Use the same tab-separated row as ``__str__``."""
        return str(self)
    
# Entries of the summary table, kept in insertion order.
databases = []

# Keyword arguments below are listed in the order of the exported columns.
databases.append(Database(
    stage="Pre-training",
    name="rnacentral_synthetic",
    source="RNA central",
    method="RNAstructure",
    filtering="Diverse subset across families",
    families="All known families",
))

# The families field is intentionally passed as an empty string for this entry.
databases.append(Database(
    stage="Pre-training",
    name="ribo500-blast",
    source="Ribonanza Competition",
    method="RNAstructure + DMS and/or SHAPE",
    filtering="500+ reads, AUROC > 0.8, BLAST",
    families="",
))



In [92]:
# Pre-training entry; keyword arguments follow the exported column order.
databases.append(Database(
    stage="Pre-training",
    name="bpRNA-1m",
    source="Published structures",
    method="Covariance analysis",
    filtering="BLAST",
    families="Unlabeled, sRNA, tRNA",
))


In [93]:
# Pre-training entry; keyword arguments follow the exported column order.
databases.append(Database(
    stage="Pre-training",
    name="RNAstralign",
    source="Published structures",
    method="Covariance analysis",
    filtering="BLAST",
    families="rRNA, tRNA",
))


In [94]:
# Fine-tuning entry; keyword arguments follow the exported column order.
databases.append(Database(
    stage="Fine-tuning",
    name="pri_miRNA",
    source="This work",
    method="RNAstructure + DMS",
    filtering="Reads < 3000, AUROC > 0.8, BLAST",
    families="pri-miRNA",
))

In [95]:
# Fine-tuning entry; keyword arguments follow the exported column order.
databases.append(Database(
    stage="Fine-tuning",
    name="human_mRNA",
    source="This work",
    method="RNAstructure + DMS",
    filtering="Reads < 3000, AUROC > 0.8, BLAST",
    families="mRNA",
))

In [96]:
# Testing entry; keyword arguments follow the exported column order.
# NOTE(review): the "TODO" filtering placeholder is exported verbatim.
databases.append(Database(
    stage="Testing",
    name="PDB",
    source="Published 3D structures",
    method="NMR, crystallography",
    filtering="TODO",
    families="Short non-coding RNA",
))

In [98]:
# Testing entry; keyword arguments follow the exported column order.
# NOTE(review): the "TODO" source and filtering placeholders are exported verbatim.
databases.append(Database(
    stage="Testing",
    name="viral_fragments",
    source="TODO",
    method="RNAstructure + DMS",
    filtering="TODO",
    families="Viral RNA",
))

In [99]:
# Testing entry; keyword arguments follow the exported column order.
# NOTE(review): the "TODO" source and filtering placeholders are exported verbatim.
databases.append(Database(
    stage="Testing",
    name="lncRNA",
    source="TODO",
    method="RNAstructure + DMS",
    filtering="TODO",
    families="Long non-coding RNA",
))

In [102]:
# Build the summary table from the tab-separated header and one row per
# database, then parse it back into a DataFrame.
df = pd.read_csv(
    StringIO('\n'.join(
        [databases[0].column_names()] + [str(database) for database in databases]
    )),
    sep='\t',
    index_col=False,
)
# The output file carries a .tsv extension, so write it tab-separated (it was
# previously written with commas, which did not match the file name). Several
# cells also contain commas themselves, which tabs leave unambiguous.
df.to_csv("datasets.tsv", sep='\t', index=False)