In [11]:
import pandas as pd
from sklearn.metrics import (
    average_precision_score,roc_auc_score
)
import numpy as np
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score


In [12]:
# Position lists compared against the integer `IMGT_bis` column further down.
# NOTE(review): `all` shadows Python's builtin `all()` for the rest of the
# kernel session; the name is kept in case other cells rely on it.
all = [*range(129)]
cdr1 = [*range(25, 41)]    # positions 25..40 inclusive
cdr2 = [*range(54, 68)]    # positions 54..67 inclusive
cdr3 = [*range(103, 120)]  # positions 103..119 inclusive
cdrs = [*cdr1, *cdr2, *cdr3]
from sklearn.metrics import f1_score

def get_f1_scores(predictions, column="prediction", threshold1=0.5, threshold2=0.5):
    """Return the F1 score after binarising scores with region-specific cut-offs.

    Rows whose ``IMGT_bis`` value appears in the module-level ``cdrs`` list are
    binarised with ``threshold1``; every other row is binarised with
    ``threshold2``.  A score equal to its cut-off counts as a positive call.

    Args:
        predictions: DataFrame with ``IMGT_bis``, ``labels`` and ``column``.
        column: name of the column holding the scores to binarise.
        threshold1: cut-off applied to rows inside ``cdrs``.
        threshold2: cut-off applied to rows outside ``cdrs``.

    Returns:
        The value of ``f1_score(labels, binarised_scores)`` over all rows.
    """
    inside_cdr = predictions["IMGT_bis"].isin(cdrs)

    binary_calls = []
    truth = []
    # CDR rows first, then the remaining rows, each with its own cut-off.
    for rows, cutoff in (
        (predictions[inside_cdr], threshold1),
        (predictions[~inside_cdr], threshold2),
    ):
        binary_calls.extend((rows[column] >= cutoff).astype(int).tolist())
        truth.extend(rows["labels"].tolist())

    return f1_score(truth, binary_calls)


def get_mcc_scores(predictions, column="prediction", threshold1=0.5, threshold2=0.5):
    """Return the Matthews correlation coefficient with region-specific cut-offs.

    Rows whose ``IMGT_bis`` value appears in the module-level ``cdrs`` list are
    binarised with ``threshold1`` and all other rows with ``threshold2``
    (score >= cut-off gives 1, otherwise 0).  The two parts are concatenated,
    CDR rows first, and scored together against the ``labels`` column.

    Args:
        predictions: DataFrame with ``IMGT_bis``, ``labels`` and ``column``.
        column: name of the column holding the scores to binarise.
        threshold1: cut-off applied to rows inside ``cdrs``.
        threshold2: cut-off applied to rows outside ``cdrs``.

    Returns:
        The value of ``matthews_corrcoef(labels, binarised_scores)``.
    """
    is_cdr = predictions["IMGT_bis"].isin(cdrs)
    cdr_part = predictions[is_cdr]
    rest_part = predictions[~is_cdr]

    called = (
        (cdr_part[column] >= threshold1).astype(int).tolist()
        + (rest_part[column] >= threshold2).astype(int).tolist()
    )
    truth = cdr_part["labels"].tolist() + rest_part["labels"].tolist()

    return matthews_corrcoef(truth, called)


def get_ap_scores(predictions, column="prediction"):
    """Return the average precision of ``predictions[column]`` against ``labels``.

    No thresholding is applied; the raw scores are passed to
    ``average_precision_score`` as plain Python lists.
    """
    scores = predictions[column].tolist()
    truth = predictions["labels"].tolist()
    return average_precision_score(truth, scores)
def get_roc_scores(predictions, column="prediction"):
    """Return the ROC AUC of ``predictions[column]`` against ``labels``.

    No thresholding is applied; the raw scores are passed to
    ``roc_auc_score`` as plain Python lists.
    """
    scores = predictions[column].tolist()
    truth = predictions["labels"].tolist()
    return roc_auc_score(truth, scores)


# LLM / Paragraph / combined

In [13]:

def get_ap_roc_f1_mcc_df(llm_path, paragraph_path, threshold1=0.5, threshold2=0.5):
    """Score LLM, Paragraph and combined predictions with per-pdb averaged metrics.

    Both CSV files are loaded and the Paragraph ``prediction`` column is
    left-joined onto the LLM rows on ``(pdb, IMGT, chain_type)``.  Three score
    columns are then evaluated:

    * ``prediction_llm``        - the LLM score;
    * ``prediction_paragraph``  - the Paragraph score, or 0 where the join
      found no Paragraph row;
    * ``prediction``            - the Paragraph score where one exists,
      otherwise the LLM score ("Combined").

    AP, ROC AUC, F1 and MCC are computed separately for every ``pdb`` group and
    averaged over the groups.

    Args:
        llm_path: CSV with at least ``pdb``, ``IMGT``, ``chain_type``,
            ``prediction`` and ``labels`` columns.
        paragraph_path: CSV with at least ``pdb``, ``IMGT``, ``chain_type``
            and ``prediction`` columns.
        threshold1: cut-off used for the Paragraph scores (all rows) and for
            the CDR rows of the combined scores.
        threshold2: cut-off used for the LLM scores (all rows) and for the
            non-CDR rows of the combined scores.

    Returns:
        ``(ap_dict, roc_dict, f1_dict, mcc_dict)``.  Each dict holds the mean
        metric under the keys ``"Combined"``, ``"paragraph"`` and ``"llm"``
        plus a ``"metric"`` key naming the metric.
    """
    # Read IMGT as text in both files.  With dtype inference a file without
    # insertion codes is parsed as int64, which breaks the `.str` accessor
    # below and makes the merge fail when the two files infer different dtypes.
    imgt_as_text = {"IMGT": str}

    predictions_llm = pd.read_csv(llm_path, dtype=imgt_as_text)
    # Integer position with one trailing letter (if any) stripped.
    predictions_llm["IMGT_bis"] = (
        predictions_llm["IMGT"].str.replace(r'[a-zA-Z]$', '', regex=True).astype(int)
    )
    # Only the join keys and the score are taken from the Paragraph file, so no
    # IMGT_bis column is derived for it.
    predictions_paragraph = pd.read_csv(paragraph_path, dtype=imgt_as_text)

    combined = pd.merge(
        predictions_llm,
        predictions_paragraph[['pdb', 'IMGT', 'chain_type', 'prediction']],
        on=["pdb", "IMGT", "chain_type"],
        how="left",
        suffixes=("_llm", "_paragraph"),
    )

    has_paragraph = combined["prediction_paragraph"].notna()
    # Combined score: Paragraph where available, LLM otherwise.
    combined["prediction"] = np.where(
        has_paragraph,
        combined["prediction_paragraph"],
        combined["prediction_llm"],
    )
    # Paragraph-only score: rows without a Paragraph prediction count as 0.
    combined["prediction_paragraph"] = np.where(
        has_paragraph,
        combined["prediction_paragraph"],
        0,
    )

    # (result key, score column, cut-off for CDR rows, cut-off for other rows).
    # The order fixes the key order of the returned dicts.
    methods = (
        ("Combined", "prediction", threshold1, threshold2),
        ("paragraph", "prediction_paragraph", threshold1, threshold1),
        ("llm", "prediction_llm", threshold2, threshold2),
    )
    metric_names = ("ap", "roc", "f1", "mcc")
    per_pdb = {
        metric: {key: [] for key, _, _, _ in methods} for metric in metric_names
    }

    for _, df_pdb in combined.groupby("pdb"):
        for key, column, cdr_cutoff, other_cutoff in methods:
            per_pdb["ap"][key].append(get_ap_scores(df_pdb, column=column))
            per_pdb["roc"][key].append(get_roc_scores(df_pdb, column=column))
            per_pdb["f1"][key].append(
                get_f1_scores(df_pdb, column=column,
                              threshold1=cdr_cutoff, threshold2=other_cutoff)
            )
            per_pdb["mcc"][key].append(
                get_mcc_scores(df_pdb, column=column,
                               threshold1=cdr_cutoff, threshold2=other_cutoff)
            )

    def _summarise(metric):
        # Mean over pdb groups for every method, tagged with the metric name.
        summary = {key: np.mean(values) for key, values in per_pdb[metric].items()}
        summary["metric"] = metric
        return summary

    ap_dict = _summarise("ap")
    roc_dict = _summarise("roc")
    f1_dict = _summarise("f1")
    mcc_dict = _summarise("mcc")

    return ap_dict, roc_dict, f1_dict, mcc_dict


# PECAN

In [14]:
from tqdm import tqdm
# One metric dict per (seed, metric) pair, collected over seeds 1..16.
records = []

for seed in tqdm(range(1, 17)):
    llm_path = f"/home/athenes/Paraplume/benchmark/pecan/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_test_set.csv"
    paragraph_path = f"/home/athenes/Paraplume/benchmark/pecan/3D/{seed}/prediction_.csv"

    ap_dict, roc_dict, f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
        llm_path, paragraph_path, threshold1=0.5, threshold2=0.5
    )
    # Tag every metric dict with the seed it came from before storing it.
    for each in (ap_dict, roc_dict, f1_dict, mcc_dict):
        each.update(seed=seed)
        records.append(each)

# Average over seeds per metric; the numeric `seed` column is averaged too.
final_pecan = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_pecan)

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [01:03<00:00,  3.98s/it]

        Combined  paragraph       llm  seed
metric                                     
ap      0.771552   0.754739  0.730465   8.5
f1      0.696988   0.694492  0.681540   8.5
mcc     0.674910   0.671838  0.656573   8.5
roc     0.964925   0.940270  0.963271   8.5





# Paragraph

In [15]:
from tqdm import tqdm
# One metric dict per (seed, metric) pair, collected over seeds 1..16.
records = []

for seed in tqdm(range(1, 17)):
    llm_path = f"/home/athenes/Paraplume/benchmark/paragraph/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/prediction_test_set.csv"
    paragraph_path = f"/home/athenes/Paraplume/benchmark/paragraph/3D/{seed}/prediction_.csv"

    ap_dict, roc_dict, f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
        llm_path, paragraph_path, threshold1=0.5, threshold2=0.5
    )
    # Tag every metric dict with the seed it came from before storing it.
    for each in (ap_dict, roc_dict, f1_dict, mcc_dict):
        each.update(seed=seed)
        records.append(each)
# Average over seeds per metric; the numeric `seed` column is averaged too.
final_paragraph = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_paragraph)

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [01:30<00:00,  5.64s/it]

        Combined  paragraph       llm  seed
metric                                     
ap      0.790771   0.769627  0.757530   8.5
f1      0.704156   0.699992  0.700768   8.5
mcc     0.683468   0.678120  0.676167   8.5
roc     0.968110   0.939005  0.966174   8.5





# MIPE

In [16]:
from tqdm import tqdm
# One metric dict per (seed, cv fold, metric), over seeds 1..5 and folds 0..4.
records = []

for seed in tqdm(range(1, 6)):
    for cv in range(5):
        llm_path = f"/home/athenes/Paraplume/benchmark/mipe/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_test_set.csv"
        paragraph_path = f"/home/athenes/Paraplume/benchmark/mipe/3D/seed{seed}/{cv}/prediction_.csv"

        ap_dict, roc_dict, f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
            llm_path, paragraph_path, threshold1=0.5, threshold2=0.5
        )
        # Tag every metric dict with its seed and fold before storing it.
        for each in (ap_dict, roc_dict, f1_dict, mcc_dict):
            each.update(seed=seed, cv=cv)
            records.append(each)

# Average over all runs per metric; `seed` and `cv` are averaged as well.
final_mipe = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_mipe)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:40<00:00,  8.13s/it]

        Combined  paragraph       llm  seed   cv
metric                                          
ap      0.753108   0.742379  0.715676   3.0  2.0
f1      0.662914   0.663246  0.651207   3.0  2.0
mcc     0.648357   0.648532  0.632226   3.0  2.0
roc     0.963797   0.943297  0.962228   3.0  2.0





# MIPE, paragraph method only with abb3 models

In [17]:
from tqdm import tqdm
# Same seed/fold sweep as the MIPE run, but reading `prediction_abb3.csv` for
# the Paragraph side and using threshold1=0.734.
records = []

for seed in tqdm(range(1, 6)):
    for cv in range(5):
        llm_path = f"/home/athenes/Paraplume/benchmark/mipe/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_test_set.csv"
        paragraph_path = f"/home/athenes/Paraplume/benchmark/mipe/3D/seed{seed}/{cv}/prediction_abb3.csv"

        ap_dict, roc_dict, f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
            llm_path, paragraph_path, threshold1=0.734, threshold2=0.5
        )
        # Tag every metric dict with its seed and fold before storing it.
        for each in (ap_dict, roc_dict, f1_dict, mcc_dict):
            each.update(seed=seed, cv=cv)
            records.append(each)

# Average over all runs per metric; only the Paragraph column is reported.
final_mipe = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_mipe["paragraph"])

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:40<00:00,  8.12s/it]

metric
ap     0.689409
f1     0.617336
mcc    0.596099
roc    0.936910
Name: paragraph, dtype: float64





# MIPE, paragraph method with crystal structures

In [None]:
from tqdm import tqdm
# Same seed/fold sweep as the MIPE run, reading `prediction_.csv` for the
# Paragraph side and using threshold1=0.734.
records = []

for seed in tqdm(range(1, 6)):
    for cv in range(5):
        llm_path = f"/home/athenes/Paraplume/benchmark/mipe/250526/lr-0.00005_dr-0.4,0.4,0.4_mk-0.4_bs-16_dim1-2000,1000,500_alphas-4,5,6_pen-0.00001_weight_1_emb_all_seed_{seed}/{cv}/prediction_test_set.csv"
        paragraph_path = f"/home/athenes/Paraplume/benchmark/mipe/3D/seed{seed}/{cv}/prediction_.csv"

        ap_dict, roc_dict, f1_dict, mcc_dict = get_ap_roc_f1_mcc_df(
            llm_path, paragraph_path, threshold1=0.734, threshold2=0.5
        )
        # Tag every metric dict with its seed and fold before storing it.
        for each in (ap_dict, roc_dict, f1_dict, mcc_dict):
            each.update(seed=seed, cv=cv)
            records.append(each)

# Average over all runs per metric; only the Paragraph column is reported.
final_mipe = (
    pd.DataFrame.from_records(records)
    .groupby("metric")
    .mean()
)
print(final_mipe["paragraph"])

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:40<00:00,  8.07s/it]

metric
ap     0.742379
f1     0.651322
mcc    0.634459
roc    0.943297
Name: paragraph, dtype: float64



