# Imports

In [1]:
import json
import urllib
import urllib.error
import warnings

import pandas as pd

warnings.filterwarnings('ignore')

# Define paths

In [2]:
REPO_URL="https://raw.githubusercontent.com/sinc-lab/rna-llm-folding/refs/heads/main"
DATA_PATH=f"{REPO_URL}/data"
RESULTS_PATH=f"{REPO_URL}/results"

# Labels and common methods

In [3]:
# Embedding identifiers; each one is passed as `emb_name` and interpolated
# into the results paths by the make_list* functions.
llms = [
    "RiNALMo",
    "ERNIE-RNA",
    "rna-msm",
    "rnabert",
    "rnafm",
    "one-hot",
    "RNAErnie",
]

# Display labels for the identifiers whose directory name differs from the
# label written to the CSVs; identifiers missing here are used as-is.
llm_names = dict([
    ("rna-msm", "RNA-MSM"),
    ("rnafm", "RNA-FM"),
    ("rnabert", "RNABERT"),
])

In [4]:
## UTILS from sincFold
# Opening/closing symbol pairs accepted in dot-bracket structures. Each pair
# is decoded independently by dot2bp, so every closer must be unique.
MATCHING_BRACKETS = [
    ["(", ")"],
    ["[", "]"],
    ["{", "}"],
    ["<", ">"],
    ["A", "a"],
    ["B", "b"],  # was ["B", "a"]: "b" was rejected and "a" was listed twice
]

def f1_strict(ref_bp, pre_bp):
    """Strict F1 between reference and predicted base pairs.

    A pair is a hit only when exactly the same pair is present on the other
    side. Pairs are compared by value, so ``[1, 10]`` and ``(1, 10)`` match.

    Args:
        ref_bp: sized iterable of reference base pairs (two-element
            lists/tuples).
        pre_bp: sized iterable of predicted base pairs, same format.

    Returns:
        Tuple ``(recall, precision, f1)`` of floats. When both inputs are
        empty the result is ``(1.0, 1.0, 1.0)``; a metric whose denominator
        is zero is reported as ``0.0``.
    """
    # corner case when there are no positives
    if len(ref_bp) == 0 and len(pre_bp) == 0:
        return 1.0, 1.0, 1.0

    # Hashable lookups make each membership test O(1) instead of scanning the
    # other list for every pair.
    ref_lookup = {tuple(bp) for bp in ref_bp}
    pre_lookup = {tuple(bp) for bp in pre_bp}

    # Counting still walks the original sequences, so repeated pairs are
    # counted once per occurrence.
    ref_hits = sum(1 for bp in ref_bp if tuple(bp) in pre_lookup)
    pre_hits = sum(1 for bp in pre_bp if tuple(bp) in ref_lookup)

    tpr = pre = f1 = 0.0
    if len(ref_bp) > 0:
        tpr = ref_hits / float(len(ref_bp))  # sensitivity (=recall =power)
    if len(pre_bp) > 0:
        pre = pre_hits / float(len(pre_bp))  # precision (=ppv)
    if tpr + pre > 0:
        f1 = 2 * pre * tpr / (pre + tpr)  # F1 score

    return tpr, pre, f1



def fold2bp(struc, xop="(", xcl=")"):
    """Extract the base pairs encoded by a single bracket type.

    Only the symbols `xop` (opening) and `xcl` (closing) are interpreted;
    every other character is skipped. Positions in the result are 1-indexed
    and pairs appear in the order their closing symbol is met.

    Returns a list of [open_position, close_position] pairs, or False when
    the two symbols are unbalanced (different counts, or a closer with no
    pending opener).
    """
    # Different counts can never be balanced.
    if struc.count(xop) != struc.count(xcl):
        return False

    pending = []  # 1-indexed positions of openers still waiting for a closer
    pairs = []
    for position, symbol in enumerate(struc, start=1):
        if symbol == xop:
            pending.append(position)
            continue
        if symbol != xcl:
            continue
        if not pending:
            return False
        pairs.append([pending.pop(), position])
    return pairs


def dot2bp(struc):
    """Convert a dot-bracket structure into a sorted list of base pairs.

    Every bracket type listed in MATCHING_BRACKETS is decoded separately with
    fold2bp and the resulting 1-indexed pairs are merged and sorted.

    Returns False when `struc` contains a symbol other than "." or a known
    bracket, or when fold2bp rejects one of the bracket types whose opening
    symbol is present.
    """
    allowed = {"."}
    for pair in MATCHING_BRACKETS:
        allowed.update(pair)
    if any(symbol not in allowed for symbol in struc):
        return False

    pairs = []
    for pair in MATCHING_BRACKETS:
        opener, closer = pair[0], pair[1]
        # Bracket types whose opener never appears are not decoded at all.
        if opener not in struc:
            continue
        level_pairs = fold2bp(struc, opener, closer)
        if not level_pairs:
            return False
        pairs += level_pairs
    return sorted(pairs)

# Generate kfold F1 scores

In [31]:
def make_list_kfold(emb_name):
  """
  Score one embedding on the five ArchiveII k-fold test partitions.

  For each fold 0-4 the test ids come from ArchiveII_kfold_splits.csv and the
  predictions from the results folder of that fold (preds.csv, or
  preds_test.csv when the first request fails with an HTTP error). References
  and predictions are paired by position and scored with f1_strict.

  Returns a list of [f1 score, LLM display name] rows, one per scored
  prediction across all folds.
  """
  archive = pd.read_csv(f'{DATA_PATH}/ArchiveII.csv', index_col="id")
  fold_table = pd.read_csv(f'{DATA_PATH}/ArchiveII_kfold_splits.csv', index_col="id")
  label = llm_names.get(emb_name, emb_name)

  rows = []
  for fold in range(5):
    is_fold_test = (fold_table.fold==fold) & (fold_table.partition=="test")
    fold_refs = archive.loc[fold_table[is_fold_test].index]

    fold_dir = f"{RESULTS_PATH}/ArchiveII_kfold/{emb_name}/{fold}"
    try:
      fold_preds = pd.read_csv(f"{fold_dir}/preds.csv")
    except urllib.error.HTTPError:
      # Some result folders only provide the alternative file name.
      fold_preds = pd.read_csv(f"{fold_dir}/preds_test.csv")

    expected = fold_refs["base_pairs"].tolist()
    predicted = fold_preds["base_pairs"].tolist()
    assert len(expected) == len(predicted)

    for expected_json, predicted_json in zip(expected, predicted):
      score = f1_strict(json.loads(expected_json), json.loads(predicted_json))[2]
      rows.append([score, label])
  return rows

# Generate famfold F1 scores

In [58]:
def make_list_famfold(emb_name):
  """
  Score one embedding on every ArchiveII family-fold partition.

  The family of a sequence is the part of its id before the first "_". For
  each family, the predictions stored under that family's results folder
  (preds.csv, or preds_test.csv when the first request fails with an HTTP
  error) are paired by position with that family's rows of ArchiveII.csv and
  scored with f1_strict.

  Returns a list of [f1 score, LLM display name, RNA family] rows.
  """
  archive = pd.read_csv(f"{DATA_PATH}/ArchiveII.csv")
  archive["fam"] = archive["id"].str.split("_").str[0]
  label = llm_names.get(emb_name, emb_name)

  rows = []
  for fam in archive["fam"].unique():
    fam_path = f"{RESULTS_PATH}/ArchiveII_famfold/{emb_name}/{fam}"
    try:
      fam_preds = pd.read_csv(f"{fam_path}/preds.csv")
    except urllib.error.HTTPError:
      fam_preds = pd.read_csv(f"{fam_path}/preds_test.csv")

    fam_refs = archive[archive["fam"] == fam]
    expected = fam_refs["base_pairs"].tolist()
    predicted = fam_preds["base_pairs"].tolist()
    assert len(expected) == len(predicted)

    for expected_json, predicted_json in zip(expected, predicted):
      score = f1_strict(json.loads(expected_json), json.loads(predicted_json))[2]
      rows.append([score, label, fam])
  return rows

# Generate PDB-RNA, bpRNA and bpRNA-new F1 scores

In [59]:
def make_list(emb_name, dataset):
  """
  Compute per-prediction F1 scores of one embedding on the PDB-RNA or bpRNA
  test partition.

  Args:
    emb_name: embedding identifier as used in the results paths.
    dataset: "PDB-RNA" (partition "test") or "bpRNA" (partition "TS0", plus
      the additional "new" partition).

  Returns:
    A pair (f1_list, f1_list_2) of lists of [f1 score, LLM display name]
    rows. f1_list_2 holds the bpRNA "new" partition results and is empty for
    PDB-RNA.

  Raises:
    ValueError: if `dataset` is not one of the supported names.
    AssertionError: if references and predictions differ in length.
  """
  test_partition = {"PDB-RNA": "test", "bpRNA": "TS0"}
  if dataset not in test_partition:
    raise ValueError(
      f"Unsupported dataset {dataset!r}; expected one of {sorted(test_partition)}"
    )

  df = pd.read_csv(f'{DATA_PATH}/{dataset}.csv', index_col="id")
  splits = pd.read_csv(f'{DATA_PATH}/{dataset}_splits.csv', index_col="id")
  label = llm_names.get(emb_name, emb_name)

  def score_rows(refs, preds):
    """Pair references and predictions by position and score each pair."""
    ref_bps = refs["base_pairs"].tolist()
    pred_bps = preds["base_pairs"].tolist()
    assert len(ref_bps) == len(pred_bps)
    return [
      [f1_strict(json.loads(ref_bp), json.loads(pred_bp))[2], label]
      for ref_bp, pred_bp in zip(ref_bps, pred_bps)
    ]

  test = df.loc[splits.partition==test_partition[dataset]]

  # The file-name fallback only concerns the main predictions file. Keeping
  # the try block this narrow prevents a failure while loading the "new"
  # predictions from silently swapping preds.csv for preds_test.csv.
  dataset_results_path = f"{RESULTS_PATH}/{dataset}/{emb_name}"
  try:
    preds = pd.read_csv(f"{dataset_results_path}/preds.csv")
  except urllib.error.HTTPError:
    preds = pd.read_csv(f"{dataset_results_path}/preds_test.csv")

  if dataset=="bpRNA":
    new_test = df.loc[splits.partition=="new"]
    # The original try branch was missing the "/" between the embedding
    # folder and the file name.
    new_preds = pd.read_csv(f"{RESULTS_PATH}/{dataset}_new/{emb_name}/preds_new_test.csv")

  f1_list = score_rows(test, preds)
  f1_list_2 = []
  if dataset=="bpRNA":
    f1_list_2 = score_rows(new_test, new_preds)
  return f1_list, f1_list_2

# Generate baseline F1 for ArchiveII kfold, PDB-RNA, bpRNA and bpRNA-new

In [34]:
def make_baseline_list(baseline_method, dataset, variant=""):
  """
  Compute the F1 scores of a baseline folding method on PDB-RNA, bpRNA
  (including the "new" partition) or ArchiveII.

  The baseline results store dot-bracket strings in a `folding` column; they
  are converted to base pairs with dot2bp, paired by position with the
  reference rows and scored with f1_strict.

  Args:
    baseline_method: prefix of the baseline result file name
      (e.g. "linearpartitionC").
    dataset: "PDB-RNA", "bpRNA" or "ArchiveII".
    variant: suffix appended to the dataset name in the splits file name and
      in the results folder (e.g. "_kfold").

  Returns:
    A pair (f1_list, f1_list_2) of lists of plain F1 floats. f1_list_2 holds
    the bpRNA "new" partition results and is empty for the other datasets.

  Raises:
    ValueError: if `dataset` is unsupported or a predicted folding is not a
      valid dot-bracket structure.
    AssertionError: if references and predictions differ in length.
  """
  test_partition = {"PDB-RNA": "test", "bpRNA": "TS0"}
  if dataset != "ArchiveII" and dataset not in test_partition:
    raise ValueError(
      f"Unsupported dataset {dataset!r}; expected 'ArchiveII' or one of {sorted(test_partition)}"
    )

  def score_foldings(refs, preds, source):
    """Convert each folding to base pairs and score it against its reference."""
    pred_foldings = preds["folding"].tolist()
    ref_bps = refs["base_pairs"].tolist()
    assert len(ref_bps) == len(pred_foldings)
    scores = []
    for row, (ref_bp, pred_folding) in enumerate(zip(ref_bps, pred_foldings)):
      pred_bp = dot2bp(pred_folding)
      # dot2bp signals a malformed structure with False; fail with a clear
      # message instead of a TypeError from inside f1_strict.
      if pred_bp is False:
        raise ValueError(
          f"Invalid dot-bracket folding at row {row} of {source}: {pred_folding!r}"
        )
      _, _, f1 = f1_strict(json.loads(ref_bp), pred_bp)
      scores.append(f1)
    return scores

  df = pd.read_csv(f'{DATA_PATH}/{dataset}.csv', index_col="id")
  if dataset=='ArchiveII':
    # The whole dataset is scored, so the splits file is not needed.
    test = df
  else:
    splits = pd.read_csv(f'{DATA_PATH}/{dataset}{variant}_splits.csv', index_col="id")
    test = df.loc[splits.partition==test_partition[dataset]]
    if dataset=="bpRNA":
      new_test = df.loc[splits.partition=="new"]

  preds_url = f"{RESULTS_PATH}/{dataset}{variant}/{baseline_method}_{dataset}.csv"
  preds = pd.read_csv(preds_url)
  if dataset=="bpRNA":
    new_preds_url = f"{RESULTS_PATH}/{dataset}_new/{baseline_method}_{dataset}-new.csv"
    new_preds = pd.read_csv(new_preds_url)

  f1_list = score_foldings(test, preds, preds_url)
  f1_list_2 = []
  if dataset=='bpRNA':
    f1_list_2 = score_foldings(new_test, new_preds, new_preds_url)

  return f1_list, f1_list_2

# Generate baseline F1 for ArchiveII famfold

In [42]:
def make_baseline_list_famfold(baseline_method):
  """
  Compute the F1 scores of a baseline folding method on ArchiveII, grouped by
  RNA family.

  The family of a sequence is the part of its id before the first "_". The
  baseline foldings are read from the ArchiveII_kfold results folder and
  split by family; within each family they are paired by position with the
  reference rows of ArchiveII.csv.

  Args:
    baseline_method: prefix of the baseline result file name
      (e.g. "linearpartitionC").

  Returns:
    A list of [f1 score, family name] rows.

  Raises:
    ValueError: if a predicted folding is not a valid dot-bracket structure.
    AssertionError: if a family has a different number of references and
      predictions.
  """
  df = pd.read_csv(f'{DATA_PATH}/ArchiveII.csv')
  df["fam"] = df["id"].str.split("_").str[0]
  preds = pd.read_csv(f"{RESULTS_PATH}/ArchiveII_kfold/{baseline_method}_ArchiveII.csv")
  preds["fam"] = preds["id"].str.split("_").str[0]

  f1_list = []
  for fam in df["fam"].unique():
    test = df[df["fam"] == fam]
    preds_fam = preds[preds["fam"] == fam]
    pred_ids = preds_fam["id"].tolist()
    pred_foldings = preds_fam["folding"].tolist()
    ref_bps = test["base_pairs"].tolist()
    # NOTE(review): rows are paired by position within the family; confirm
    # both files list the sequences in the same order.
    assert len(ref_bps) == len(pred_foldings)

    for ref_bp, pred_id, pred_folding in zip(ref_bps, pred_ids, pred_foldings):
      # CONVERT FOLDING TO BP FORMAT
      pred_bp = dot2bp(pred_folding)
      # dot2bp signals a malformed structure with False; fail with a clear
      # message instead of a TypeError from inside f1_strict.
      if pred_bp is False:
        raise ValueError(
          f"Invalid dot-bracket folding for {pred_id!r}: {pred_folding!r}"
        )
      _, _, f1 = f1_strict(json.loads(ref_bp), pred_bp)
      f1_list.append([f1,fam])
  return f1_list

# Save csv files

In [36]:
# Score every embedding in `llms` on the ArchiveII k-fold test partitions and
# save one row per scored prediction (columns F1, LLM) to a local CSV.
all_kfold = []
for llm in llms:
  all_kfold.extend(make_list_kfold(llm))
kfold_df = pd.DataFrame(all_kfold, columns=['F1', 'LLM'])
kfold_df.to_csv("ArchiveII_kfold_scores.csv",index=False,header=True)

In [60]:
# Score every embedding in `llms` on the ArchiveII family-fold partitions and
# save one row per scored prediction (columns F1, LLM, fam) to a local CSV.
all_famfold = []
for llm in llms:
  all_famfold.extend(make_list_famfold(llm))
famfold_df = pd.DataFrame(all_famfold, columns=['F1', 'LLM', 'fam'])
famfold_df.to_csv("ArchiveII_famfold_scores.csv",index=False,header=True)

In [46]:
# Score every embedding in `llms` on the PDB-RNA test partition. The second
# list returned by make_list is only filled for bpRNA, so it is discarded.
all_pdb = []
for llm in llms:
  llm_pdb, _ = make_list(llm, "PDB-RNA")
  all_pdb.extend(llm_pdb)
pdb_df = pd.DataFrame(all_pdb, columns=['F1', 'LLM'])
pdb_df.to_csv("pdb-rna_scores.csv",index=False,header=True)

In [55]:
# Score every embedding in `llms` on bpRNA: make_list returns the "TS0"
# partition scores first and the "new" partition scores second.
all_bpRNA = []
all_bpRNA_new = []
for llm in llms:
  llm_bpRNA, llm_bpRNA_new = make_list(llm, "bpRNA")
  all_bpRNA.extend(llm_bpRNA)
  all_bpRNA_new.extend(llm_bpRNA_new)

# One CSV per partition, both with columns F1 and LLM.
bprna_df = pd.DataFrame(all_bpRNA, columns=['F1', 'LLM'])
bprna_new_df = pd.DataFrame(all_bpRNA_new, columns=['F1', 'LLM'])
bprna_df.to_csv("bpRNA_scores.csv",index=False,header=True)
bprna_new_df.to_csv("bpRNA_new_scores.csv",index=False,header=True)

In [56]:
# Baseline scores on ArchiveII, read from the "_kfold" results folder; the
# second returned list is only filled for bpRNA, so it is discarded.
baseline_kfold, _ = make_baseline_list("linearpartitionC", "ArchiveII", "_kfold")
baseline_kfold_df = pd.DataFrame(baseline_kfold, columns=['F1'])
baseline_kfold_df.to_csv("ArchiveII_baseline_scores.csv",index=False,header=True)

In [17]:
# Baseline scores on bpRNA: first list is the "TS0" partition, second list
# the "new" partition; each is saved to its own single-column (F1) CSV.
baseline_bprna, baseline_bprna_new = make_baseline_list("linearpartitionC", "bpRNA")
baseline_bprna_df = pd.DataFrame(baseline_bprna, columns=['F1'])
baseline_bprna_df.to_csv("bpRNA_baseline_scores.csv",index=False,header=True)
baseline_bprna_new_df = pd.DataFrame(baseline_bprna_new, columns=['F1'])
baseline_bprna_new_df.to_csv("bpRNA_new_baseline_scores.csv",index=False,header=True)

In [22]:
# Baseline scores on the PDB-RNA test partition; the second returned list is
# only filled for bpRNA, so it is discarded.
baseline_pdb, _ = make_baseline_list("linearpartitionC", "PDB-RNA")
baseline_pdb_df = pd.DataFrame(baseline_pdb, columns=['F1'])
baseline_pdb_df.to_csv("PDB-RNA_baseline_scores.csv",index=False,header=True)

In [57]:
# Baseline scores on ArchiveII grouped by RNA family (columns F1, fam).
baseline_famfold = make_baseline_list_famfold("linearpartitionC")
baseline_famfold_df = pd.DataFrame(baseline_famfold, columns=['F1', 'fam'])
baseline_famfold_df.to_csv("ArchiveII_famfold_baseline_scores.csv",index=False,header=True)