In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import (
    average_precision_score,roc_auc_score
)

In [6]:
# Integer position lists; the metric helpers compare the `IMGT_bis` column
# against `cdrs` to decide which threshold applies to a row.
# NOTE(review): the names suggest IMGT-numbered CDR1/2/3 boundaries — confirm.
# NOTE(review): `all` shadows the built-in all(); kept as-is so existing
# references to this name keep working.
all = [*range(129)]
cdr1 = [*range(25, 41)]    # 25..40 inclusive
cdr2 = [*range(54, 68)]    # 54..67 inclusive
cdr3 = [*range(103, 120)]  # 103..119 inclusive
cdrs = [*cdr1, *cdr2, *cdr3]

def get_f1_scores(predictions, column="prediction", threshold1=0.5, threshold2=0.5):
    """Return the F1 score of thresholded scores against the ``labels`` column.

    Rows are split by whether ``IMGT_bis`` belongs to the module-level ``cdrs``
    list. A score is turned into a positive call (1) when it is ``>=`` the
    threshold of its group, otherwise 0.

    Args:
        predictions: DataFrame holding ``IMGT_bis``, ``labels`` and ``column``.
        column: Name of the column with the scores to binarize.
        threshold1: Cut-off for rows whose ``IMGT_bis`` is in ``cdrs``.
        threshold2: Cut-off for all remaining rows.

    Returns:
        The value of ``f1_score`` computed over both groups together.
    """
    binary_calls = []
    true_labels = []

    # Same two filters as before, evaluated in the same order
    # (rows inside ``cdrs`` first, then everything else).
    for region_filter, cutoff in (
        ("IMGT_bis in @cdrs", threshold1),
        ("IMGT_bis not in @cdrs", threshold2),
    ):
        region_rows = predictions.query(region_filter)
        binary_calls.extend((region_rows[column] >= cutoff).astype(int).tolist())
        true_labels.extend(region_rows["labels"].tolist())

    return f1_score(true_labels, binary_calls)


def get_mcc_scores(predictions, column="prediction", threshold1=0.5, threshold2=0.5):
    """Return the Matthews correlation coefficient of thresholded scores.

    Rows are split by whether ``IMGT_bis`` belongs to the module-level ``cdrs``
    list. A score is turned into a positive call (1) when it is ``>=`` the
    threshold of its group, otherwise 0; the calls are then compared with the
    ``labels`` column.

    Args:
        predictions: DataFrame holding ``IMGT_bis``, ``labels`` and ``column``.
        column: Name of the column with the scores to binarize.
        threshold1: Cut-off for rows whose ``IMGT_bis`` is in ``cdrs``.
        threshold2: Cut-off for all remaining rows.

    Returns:
        The value of ``matthews_corrcoef`` computed over both groups together.
    """
    binary_calls = []
    true_labels = []

    # Rows inside ``cdrs`` are collected first, then the remaining rows.
    for region_filter, cutoff in (
        ("IMGT_bis in @cdrs", threshold1),
        ("IMGT_bis not in @cdrs", threshold2),
    ):
        region_rows = predictions.query(region_filter)
        binary_calls.extend((region_rows[column] >= cutoff).astype(int).tolist())
        true_labels.extend(region_rows["labels"].tolist())

    return matthews_corrcoef(true_labels, binary_calls)
def get_ap_scores(predictions, column="prediction"):
    """Return the average precision of ``column`` against the ``labels`` column.

    Args:
        predictions: DataFrame holding ``labels`` and ``column``.
        column: Name of the column with the (un-thresholded) scores.

    Returns:
        The value of ``average_precision_score(labels, scores)``.
    """
    scores = predictions[column].tolist()
    truth = predictions["labels"].tolist()
    return average_precision_score(truth, scores)
def get_roc_scores(predictions, column="prediction"):
    """Return the ROC AUC of ``column`` against the ``labels`` column.

    Args:
        predictions: DataFrame holding ``labels`` and ``column``.
        column: Name of the column with the (un-thresholded) scores.

    Returns:
        The value of ``roc_auc_score(labels, scores)``.
    """
    scores = predictions[column].tolist()
    truth = predictions["labels"].tolist()
    return roc_auc_score(truth, scores)

In [7]:

def get_ap_roc_f1_mcc_df(llm_path1, llm_path2, threshold=0.5, column="labels_llm2"):
    """Compute per-PDB F1 and MCC for a pair of prediction CSVs and average them.

    The two files are joined on ``pdb``, ``IMGT`` and ``chain_type``. After the
    join, ``labels_llm1`` (labels of the first file) is used as ground truth
    and ``column`` is used as the score to threshold. With the default
    ``column="labels_llm2"`` the labels of the second file are scored against
    the labels of the first; pass ``column="prediction_llm1"`` to score the
    first file's own predictions.

    NOTE: despite its name, this function only returns F1 and MCC; no average
    precision or ROC AUC is computed here.

    Args:
        llm_path1: Path to the first CSV. Provides the ground-truth ``labels``.
        llm_path2: Path to the second CSV. Must contain ``pdb``, ``IMGT``,
            ``chain_type``, ``prediction`` and ``labels``.
        threshold: Cut-off passed as both ``threshold1`` and ``threshold2`` to
            ``get_f1_scores`` / ``get_mcc_scores``.
        column: Column of the merged frame to use as the score.

    Returns:
        ``(f1_dict, mcc_dict)``; each dict has the key ``"llm"`` (mean of the
        per-PDB scores) and ``"metric"`` (``"f1"`` or ``"mcc"``).
    """
    # Read IMGT as text in both files. With dtype inference, a file that
    # happens to contain no insertion-code letters is parsed as integers, which
    # breaks the `.str` clean-up below and can make the merge keys of the two
    # files incomparable.
    predictions_llm1 = pd.read_csv(llm_path1, dtype={"IMGT": str})
    predictions_llm2 = pd.read_csv(llm_path2, dtype={"IMGT": str})

    predictions_llm = pd.merge(
        predictions_llm1,
        predictions_llm2[['pdb', 'IMGT', 'chain_type', 'prediction', 'labels']],
        on=["pdb", "IMGT", "chain_type"],
        how="left",
        suffixes=("_llm1", "_llm2"),
    )
    # Drop rows with any missing value (this includes rows of the first file
    # that found no partner in the second file). Copy so that the column
    # assignments below operate on an independent frame.
    predictions_llm = predictions_llm.dropna().copy()
    predictions_llm["labels"] = predictions_llm["labels_llm1"]
    predictions_llm["prediction"] = predictions_llm[column]
    # Remove a single trailing letter from the IMGT identifier and convert the
    # remainder to an integer so it can be compared with the `cdrs` list.
    predictions_llm["IMGT_bis"] = (
        predictions_llm["IMGT"].str.replace(r'[a-zA-Z]$', '', regex=True).astype(int)
    )

    # Score every PDB entry separately with the same threshold for both groups.
    f1_llm_list = []
    mcc_llm_list = []
    for _, df_pdb in predictions_llm.groupby("pdb"):
        f1_llm_list.append(
            get_f1_scores(df_pdb, column="prediction", threshold1=threshold, threshold2=threshold)
        )
        mcc_llm_list.append(
            get_mcc_scores(df_pdb, column="prediction", threshold1=threshold, threshold2=threshold)
        )

    # Average the results across pdb groups.
    f1_dict = {"llm": np.mean(f1_llm_list), "metric": "f1"}
    mcc_dict = {"llm": np.mean(mcc_llm_list), "metric": "mcc"}

    return f1_dict, mcc_dict


# Paragraph dataset

### upper bound

In [None]:
from tqdm import tqdm

# No `column` is passed, so get_ap_roc_f1_mcc_df uses its default
# ("labels_llm2"): the labels of the limits2 file are scored against the
# labels of the limits1 file.
records = []

for seed in tqdm(range(1, 2)):  # single seed: 1
    llm_path1 = f"/home/athenes/Paraplume/benchmark/paragraph/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_paragraph_test_limits1.csv"
    llm_path2 = f"/home/athenes/Paraplume/benchmark/paragraph/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_paragraph_test_limits2.csv"

    f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(llm_path1, llm_path2, threshold=0.5)
    # Tag both result dicts with the seed, then keep them (F1 first, MCC second).
    for each in (f1_dict, mcc_dict):
        each.update(seed=seed)
    records += [f1_dict, mcc_dict]

# Mean per metric over all records; the numeric `seed` column is averaged too.
final_paragraph_limit = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_paragraph_limit)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.12s/it]

             llm  seed
metric                
f1      0.953282   1.0
mcc     0.949408   1.0





### Paraplume

In [None]:
from tqdm import tqdm

# `column="prediction_llm1"`: the predictions stored in the limits1 file are
# thresholded and scored against that file's labels, once per seed.
records = []

for seed in tqdm(range(1, 17)):  # seeds 1..16
    llm_path1 = f"/home/athenes/Paraplume/benchmark/paragraph/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_paragraph_test_limits1.csv"
    llm_path2 = f"/home/athenes/Paraplume/benchmark/paragraph/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_paragraph_test_limits2.csv"

    f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
        llm_path1, llm_path2, threshold=0.5, column="prediction_llm1"
    )
    # Tag both result dicts with the seed, then keep them (F1 first, MCC second).
    for each in (f1_dict, mcc_dict):
        each.update(seed=seed)
    records += [f1_dict, mcc_dict]

# Mean per metric over all seeds; the numeric `seed` column is averaged too.
final_paragraph_model = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_paragraph_model)

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:11<00:00,  1.39it/s]

             llm  seed
metric                
f1      0.710647   8.5
mcc     0.686340   8.5





# MIPE dataset

### upper bound

In [None]:
from tqdm import tqdm

# No `column` is passed, so get_ap_roc_f1_mcc_df uses its default
# ("labels_llm2"): the labels of the limits2 file are scored against the
# labels of the limits1 file.
records = []

for seed in tqdm(range(1, 2)):  # single seed: 1
    for cv in range(1):  # single sub-directory: 0
        llm_path1 = f"/home/athenes/Paraplume/benchmark/mipe/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_mipe_test_limits1.csv"
        llm_path2 = f"/home/athenes/Paraplume/benchmark/mipe/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_mipe_test_limits2.csv"

        f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(llm_path1, llm_path2, threshold=0.5)
        # Tag both result dicts with the seed (the `cv` index is not recorded).
        for each in (f1_dict, mcc_dict):
            each.update(seed=seed)
        records += [f1_dict, mcc_dict]

# Mean per metric over all records; the numeric `seed` column is averaged too.
final_mipe_limit = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_mipe_limit)

100%|██████████| 1/1 [00:00<00:00,  5.16it/s]

             llm  seed
metric                
f1      0.961295   1.0
mcc     0.958439   1.0





### paraplume

In [None]:
from tqdm import tqdm

# `column="prediction_llm1"`: the predictions stored in the limits1 file are
# thresholded and scored against that file's labels, for every seed and every
# `cv` sub-directory.
records = []

for seed in tqdm(range(1, 6)):  # seeds 1..5
    for cv in range(5):  # sub-directories 0..4
        llm_path1 = f"/home/athenes/Paraplume/benchmark/mipe/250310_final/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_mipe_test_limits1.csv"
        llm_path2 = f"/home/athenes/Paraplume/benchmark/mipe/250310_final/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_mipe_test_limits2.csv"

        f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
            llm_path1, llm_path2, threshold=0.5, column="prediction_llm1"
        )
        # Tag both result dicts with the seed (the `cv` index is not recorded).
        for each in (f1_dict, mcc_dict):
            each.update(seed=seed)
        records += [f1_dict, mcc_dict]

# Mean per metric over every (seed, cv) record; `seed` is averaged as well.
final_mipe_model = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_mipe_model)

100%|██████████| 5/5 [00:04<00:00,  1.19it/s]

             llm  seed
metric                
f1      0.711872   3.0
mcc     0.689328   3.0





# Pecan dataset

### upper bound

In [None]:
from tqdm import tqdm

# No `column` is passed, so get_ap_roc_f1_mcc_df uses its default
# ("labels_llm2"): the labels of the limits2 file are scored against the
# labels of the limits1 file.
records = []

for seed in tqdm(range(1, 2)):  # single seed: 1
    llm_path1 = f"/home/athenes/Paraplume/benchmark/pecan/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_pecan_test_limits1.csv"
    llm_path2 = f"/home/athenes/Paraplume/benchmark/pecan/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_pecan_test_limits2.csv"

    f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(llm_path1, llm_path2, threshold=0.5)
    # Tag both result dicts with the seed, then keep them (F1 first, MCC second).
    for each in (f1_dict, mcc_dict):
        each.update(seed=seed)
    records += [f1_dict, mcc_dict]

# Mean per metric over all records; the numeric `seed` column is averaged too.
final_pecan_limit = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_pecan_limit)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.67it/s]

             llm  seed
metric                
f1      0.947263   1.0
mcc     0.943651   1.0





### paraplume

In [None]:
from tqdm import tqdm

# `column="prediction_llm1"`: the predictions stored in the limits1 file are
# thresholded and scored against that file's labels, once per seed.
records = []

for seed in tqdm(range(1, 4)):  # seeds 1..3
    llm_path1 = f"/home/athenes/Paraplume/benchmark/pecan/250310_final/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_pecan_test_limits1.csv"
    llm_path2 = f"/home/athenes/Paraplume/benchmark/pecan/250310_final/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_pecan_test_limits2.csv"

    f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
        llm_path1, llm_path2, threshold=0.5, column="prediction_llm1"
    )
    # Tag both result dicts with the seed, then keep them (F1 first, MCC second).
    for each in (f1_dict, mcc_dict):
        each.update(seed=seed)
    records += [f1_dict, mcc_dict]

# Mean per metric over all seeds; the numeric `seed` column is averaged too.
final_pecan_model = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_pecan_model)

100%|██████████| 3/3 [00:01<00:00,  1.74it/s]

             llm  seed
metric                
f1      0.662878   2.0
mcc     0.637215   2.0



