In [48]:
import pandas as pd
import numpy as np
from fastembed import TextEmbedding
from sklearn.metrics.pairwise import cosine_similarity

def calculate_column_metrics_with_label_similarity(
    file_path,
    df_ground_truth: pd.DataFrame,
    df_predicted: pd.DataFrame,
    similarity_threshold: float = 0.7,
    embedding_cache_dir: str = "./embeddings_model_cache",
    embedding_model: str = "BAAI/bge-large-en-v1.5",
) -> dict:
    """Pair ground-truth column labels with predicted column labels.

    Matching happens in two passes:

    1. Exact pass: labels are compared after lower-casing and stripping
       surrounding whitespace.
    2. Similarity pass: labels left over on both sides are embedded with
       ``TextEmbedding`` and compared by cosine similarity. Ground-truth
       labels are processed in column order; for each one the candidates are
       visited from most to least similar and the first candidate is accepted
       that (a) reaches ``similarity_threshold``, (b) has not been taken by an
       earlier ground-truth label and (c) has this ground-truth label as its
       own most similar ground-truth label.

    Progress information is printed to stdout (pair counts, every candidate
    inspected in the similarity pass, and the final list of matches).

    Args:
        file_path: Not read by this implementation; kept so existing call
            sites that pass it positionally keep working.
        df_ground_truth: Frame whose column labels are the reference labels.
        df_predicted: Frame whose column labels are matched against them.
        similarity_threshold: Minimum cosine similarity for a similarity match.
        embedding_cache_dir: Passed to ``TextEmbedding`` as ``cache_dir``.
        embedding_model: Passed to ``TextEmbedding`` as ``model_name``.

    Returns:
        A dict with three lists of ``(ground_truth_label, predicted_label)``
        tuples: ``"exact_pairs"``, ``"similarity_pairs"`` and ``"matches"``
        (exact pairs followed by similarity pairs).
    """
    gt_labels = list(df_ground_truth.columns)
    pred_labels = list(df_predicted.columns)

    # Normalised label -> original label. ``str()`` keeps non-string column
    # labels (e.g. integer positions) from raising on ``.lower()``.
    gt_norm = {str(lbl).lower().strip(): lbl for lbl in gt_labels}
    pred_norm = {str(lbl).lower().strip(): lbl for lbl in pred_labels}

    exact_pairs = [
        (gt_lbl, pred_norm[norm_lbl])
        for norm_lbl, gt_lbl in gt_norm.items()
        if norm_lbl in pred_norm
    ]

    matched_gt = {gt for gt, _ in exact_pairs}
    matched_pred = {pred for _, pred in exact_pairs}

    remaining_gt = [lbl for lbl in gt_labels if lbl not in matched_gt]
    remaining_pred = [lbl for lbl in pred_labels if lbl not in matched_pred]

    print("exact pairs:", len(exact_pairs))
    print("remaining gt:", len(remaining_gt))

    sim_pairs = []
    # The embedding model is only instantiated when both sides still have
    # unmatched labels.
    if remaining_gt and remaining_pred:
        model = TextEmbedding(
            model_name=embedding_model,
            cache_dir=embedding_cache_dir,
        )
        gt_embeds = np.vstack(list(model.embed([str(lbl) for lbl in remaining_gt])))
        pred_embeds = np.vstack(list(model.embed([str(lbl) for lbl in remaining_pred])))
        sim_matrix = cosine_similarity(gt_embeds, pred_embeds)

        used_pred_idx = set()
        for i, gt_lbl in enumerate(remaining_gt):
            # Candidates for this ground-truth label, most similar first.
            for j in np.argsort(sim_matrix[i])[::-1]:
                print(f"{gt_lbl} -> {remaining_pred[j]} (similarity: {sim_matrix[i, j]})")

                if sim_matrix[i, j] < similarity_threshold or j in used_pred_idx:
                    continue
                # Accept only if this ground-truth label is also the best
                # ground-truth label for the candidate (column-wise argmax).
                if np.argmax(sim_matrix[:, j]) == i:
                    sim_pairs.append((gt_lbl, remaining_pred[j]))
                    used_pred_idx.add(j)
                    break

    matches = exact_pairs + sim_pairs

    print(matches)

    # The signature promises a dict; previously the function fell off the end
    # and handed ``None`` back to the caller.
    return {
        "exact_pairs": exact_pairs,
        "similarity_pairs": sim_pairs,
        "matches": matches,
    }



In [49]:
import pandas as pd
import numpy as np
import sys
import os

# NOTE(review): absolute, machine-specific path; the project import below only
# resolves where this directory exists. Presumably this is the repository root
# of the project - confirm and consider deriving it from a configurable base.
sys.path.append('/Users/sebastian/Documents/Bachelor Thesis/sparql-rag-agent/sparql-rag-agent')


# This import rebinds the name `calculate_column_metrics_with_label_similarity`,
# so the call further down runs the project module's implementation rather than
# the function defined inline in cell In [48].
from experiments.utilities.result_metric import calculate_column_metrics_with_label_similarity

# Seed NumPy's global RNG so the two shuffles below give the same column order
# on every run.
np.random.seed(42)

# Toy ground-truth frame. The commented-out entries are columns that were
# disabled for this demo.
df_ground_truth = pd.DataFrame({
    #"Disease": ["Alzheimer's Disease", "Parkinson's Disease", "Huntington's Disease"],
    #"Gene": ["APP", "SNCA", "HTT"],
    "Organism": ["Human", "Human", "Human"],
    #"Location": ["Brain", "Brain", "Brain"],
    "human": ["gene_1", "gene_2", "gene_3"],  
    "compound": ["Donepezil", "Levodopa", "Tetrabenazine"],  
    "gene_name": ["Amyloid precursor protein", "Alpha-synuclein", "Huntingtin"],  
    "target": ["Acetylcholinesterase", "Dopamine receptor", "VMAT2"],
    "clinical phase": ["Approved", "Approved", "Phase III"]
})

# Toy predicted frame: every column label differs from its ground-truth
# counterpart. The cell values are the same except in the last column
# ("development stage" vs. "clinical phase"), where they differ as well.
df_predicted = pd.DataFrame({
    #"disease": ["Alzheimer's Disease", "Parkinson's Disease", "Huntington's Disease"],
    #"gene_symbol": ["APP", "SNCA", "HTT"],  
    "species": ["Human", "Human", "Human"],  
    #"location": ["Brain", "Brain", "Brain"],
    "homo sapiens": ["gene_1", "gene_2", "gene_3"],
    "drug": ["Donepezil", "Levodopa", "Tetrabenazine"],  
    "gene": ["Amyloid precursor protein", "Alpha-synuclein", "Huntingtin"], 
    "protein target": ["Acetylcholinesterase", "Dopamine receptor", "VMAT2"],
    "development stage": ["Market", "Market", "Late-stage"]
})
# Shuffle the column order of each frame independently so that positional
# alignment between the two frames cannot be relied on.
gt_cols = list(df_ground_truth.columns)
pred_cols = list(df_predicted.columns)
np.random.shuffle(gt_cols)
np.random.shuffle(pred_cols)

# Re-select the columns in the shuffled order (rebinds both names).
df_ground_truth = df_ground_truth[gt_cols]
df_predicted = df_predicted[pred_cols]

print("Ground truth columns (random order):", df_ground_truth.columns.tolist())
print("Predicted columns (random order):", df_predicted.columns.tolist())

# Placeholder file name handed to the metric function as its first argument.
file_path = "test_file.json"

metrics = calculate_column_metrics_with_label_similarity(
    file_path,
    df_ground_truth,
    df_predicted,
    similarity_threshold=0.5  # Lowered for demo purposes
)

print("Column-wise metrics with label similarity:")
print(metrics)

Ground truth columns (random order): ['Organism', 'human', 'clinical phase', 'compound', 'target', 'gene_name']
Predicted columns (random order): ['gene', 'species', 'homo sapiens', 'drug', 'development stage', 'protein target']
[calculate_column_metrics_with_label_similarity] Calculating metrics for file: test_file.json
exact pairs: 0
remaining gt: 6
Organism -> species (similarity: 0.768045961856842)
human -> homo sapiens (similarity: 0.8240918517112732)
clinical phase -> development stage (similarity: 0.7203159332275391)
compound -> drug (similarity: 0.695464551448822)
target -> protein target (similarity: 0.7543721795082092)
gene_name -> gene (similarity: 0.7999933958053589)
gt_tuples: [('Human', 'gene_3', 'Phase III', 'Tetrabenazine', 'VMAT2', 'Huntingtin'), ('Human', 'gene_1', 'Approved', 'Donepezil', 'Acetylcholinesterase', 'Amyloid precursor protein'), ('Human', 'gene_2', 'Approved', 'Levodopa', 'Dopamine receptor', 'Alpha-synuclein')]
pred_tuples: [('Human', 'gene_3', 'Late-st

In [50]:
import pandas as pd
import numpy as np
import time
from fastembed import TextEmbedding
from sklearn.metrics.pairwise import cosine_similarity

# Ground-truth frame for the embedding-model comparison: ten columns, three
# rows each. Only the column labels are embedded by the helpers below.
df_ground_truth = pd.DataFrame({
    "Organism": ["Human", "Human", "Human"],
    "human": ["gene_1", "gene_2", "gene_3"],  
    "compound": ["Donepezil", "Levodopa", "Tetrabenazine"],  
    "gene_name": ["Amyloid precursor protein", "Alpha-synuclein", "Huntingtin"],  
    "target": ["Acetylcholinesterase", "Dopamine receptor", "VMAT2"],
    "clinical phase": ["Approved", "Approved", "Phase III"],
    "uniprot id": ["P12345", "P67890", "P24680"],
    "uniprotName": ["Protein 1", "Protein 2", "Protein 3"],
    "patentURI": ["http://example.com/patent1", "http://example.com/patent2", "http://example.com/patent3"],
    "swisslipid": ["http://example.com/swisslipid1", "http://example.com/swisslipid2", "http://example.com/swisslipid3"],
})

# Predicted frame: same number of columns, each with a different label than
# its ground-truth counterpart. Note the deliberately confusable labels
# "id" / "uniprot" against ground truth "uniprot id" / "uniprotName".
df_predicted = pd.DataFrame({
    "species": ["Human", "Human", "Human"],  
    "homo sapiens": ["gene_1", "gene_2", "gene_3"],
    "drug": ["Donepezil", "Levodopa", "Tetrabenazine"],  
    "gene": ["Amyloid precursor protein", "Alpha-synuclein", "Huntingtin"], 
    "protein target": ["Acetylcholinesterase", "Dopamine receptor", "VMAT2"],
    "development stage": ["Market", "Market", "Late-stage"], 
    "id": ["P12345", "P67890", "P24680"],
    "uniprot": ["Protein 1", "Protein 2", "Protein 3"],
    "patent": ["http://example.com/patent1", "http://example.com/patent2", "http://example.com/patent3"],
    "lipid": ["http://example.com/lipid1", "http://example.com/lipid2", "http://example.com/lipid3"],
})


# Reference answer key: ground-truth column label -> the predicted column
# label that counts as its correct match. Every key is a column of
# df_ground_truth and every value a column of df_predicted.
true_mappings = {
    "Organism": "species",
    "human": "homo sapiens",
    "compound": "drug",
    "gene_name": "gene",
    "target": "protein target",
    "clinical phase": "development stage",
    "uniprot id": "id",
    "uniprotName": "uniprot",
    "patentURI": "patent",
    "swisslipid": "lipid",
}

In [51]:

class FastEmbedModelWrapper:
    """Pairs a ``TextEmbedding`` instance with the model name it was built from.

    Attributes are plain and public:
    ``model_name`` is the name passed to the constructor and ``model`` is the
    ``TextEmbedding`` object created from it.
    """

    def __init__(self, model_name: str) -> None:
        """Remember ``model_name`` and build ``TextEmbedding(model_name=...)``."""
        self.model_name = model_name
        self.model = TextEmbedding(model_name=model_name)

    def __str__(self) -> str:
        """Return the model name, so printing the wrapper shows which model it holds."""
        return self.model_name

In [52]:

def calculate_column_similarity_scores(df_ground_truth, df_predicted, embedding_model, true_mappings):
    """Score every (ground-truth label, predicted label) pair by cosine similarity.

    Column labels of both frames are embedded through
    ``embedding_model.model.embed`` and compared with ``cosine_similarity``.
    One line per pair is printed to stdout.

    Args:
        df_ground_truth: Frame whose column labels form the rows of the
            similarity matrix.
        df_predicted: Frame whose column labels form the columns.
        embedding_model: Object exposing ``.model.embed(list_of_labels)``.
        true_mappings: Mapping from ground-truth label to the predicted label
            regarded as its correct counterpart.

    Returns:
        Dict keyed by ``"<gt label> <-> <pred label>"``. Each value holds
        ``"similarity"`` (float), ``"is_true_pair"`` (bool: the predicted
        label equals ``true_mappings.get(gt label)``) and
        ``"is_highest_match"`` (bool: the predicted label has the highest
        similarity in that ground-truth label's row). An empty dict is
        returned when either frame has no columns.
    """
    gt_labels = list(df_ground_truth.columns)
    pred_labels = list(df_predicted.columns)

    similarity_scores = {}

    # np.vstack raises on an empty sequence, so there is nothing to embed or
    # compare when one side has no columns.
    if not gt_labels or not pred_labels:
        return similarity_scores

    gt_embeds = np.vstack(list(embedding_model.model.embed(gt_labels)))
    pred_embeds = np.vstack(list(embedding_model.model.embed(pred_labels)))

    sim_matrix = cosine_similarity(gt_embeds, pred_embeds)

    for i, gt_lbl in enumerate(gt_labels):
        # Plain int so the comparison below yields a Python bool, matching the
        # plain float / bool stored in the other two fields.
        highest_idx = int(np.argmax(sim_matrix[i]))
        true_match = true_mappings.get(gt_lbl)

        for j, pred_lbl in enumerate(pred_labels):
            pair_key = f"{gt_lbl} <-> {pred_lbl}"
            is_true_pair = (pred_lbl == true_match)
            is_highest_match = (j == highest_idx)

            similarity_scores[pair_key] = {
                "similarity": float(sim_matrix[i, j]),
                "is_true_pair": is_true_pair,
                "is_highest_match": is_highest_match,
            }

            print(f"{gt_lbl} <-> {pred_lbl}: {sim_matrix[i, j]:.4f} {'(TRUE PAIR)' if is_true_pair else ''} {'(HIGHEST)' if is_highest_match else ''}")

    return similarity_scores

In [53]:

def evaluate_embedding_models(df_ground_truth, df_predicted, model_names, true_mappings):
    """Run the label-similarity scoring once per embedding model and summarise it.

    For each name in ``model_names`` a ``FastEmbedModelWrapper`` is created,
    ``calculate_column_similarity_scores`` is timed, and the outcome is
    compared with ``true_mappings``.

    Returns:
        Dict keyed by model name. Each value contains ``"similarity_scores"``
        (the raw per-pair scores), ``"accuracy"`` (share of ``true_mappings``
        entries whose pair is flagged as highest match; 0 when the mapping is
        empty), ``"avg_true_pair_similarity"`` (mean similarity over pairs
        flagged as true pairs; 0 when there are none), ``"correct_matches"``,
        ``"total_pairs"`` and ``"elapsed_time"`` (seconds spent in the scoring
        call only; building the wrapper is not included).
    """
    report = {}

    for name in model_names:
        print(f"\nTesting model: {name}")
        wrapper = FastEmbedModelWrapper(name)

        # Only the scoring call sits between the two clock reads.
        started_at = time.time()
        scores = calculate_column_similarity_scores(
            df_ground_truth,
            df_predicted,
            embedding_model=wrapper,
            true_mappings=true_mappings
        )
        seconds = time.time() - started_at

        # A reference pair counts as correct when it exists in the scores and
        # is marked as the best candidate for its ground-truth label.
        expected_keys = [f"{gt_col} <-> {pred_col}" for gt_col, pred_col in true_mappings.items()]
        hits = sum(
            1
            for key in expected_keys
            if key in scores and scores[key]["is_highest_match"]
        )
        expected_count = len(true_mappings)

        if expected_count > 0:
            hit_rate = hits / expected_count
        else:
            hit_rate = 0

        true_sims = [entry["similarity"] for entry in scores.values() if entry["is_true_pair"]]
        if true_sims:
            mean_true_sim = np.mean(true_sims)
        else:
            mean_true_sim = 0

        report[name] = {
            "similarity_scores": scores,
            "accuracy": hit_rate,
            "avg_true_pair_similarity": mean_true_sim,
            "correct_matches": hits,
            "total_pairs": expected_count,
            "elapsed_time": seconds
        }

    return report

In [54]:

def create_comparison_table(results, true_mappings):
    """Build a per-label table comparing each model's best match with the reference.

    Args:
        results: Dict keyed by model name; each value must contain a
            ``"similarity_scores"`` dict keyed by
            ``"<gt label> <-> <pred label>"`` whose entries carry
            ``"similarity"`` and ``"is_highest_match"``.
        true_mappings: Mapping from ground-truth label to the predicted label
            regarded as correct.

    Returns:
        A ``pd.DataFrame`` with one row per entry of ``true_mappings``. Columns
        are ``"Ground Truth"``, ``"True Match"`` and, per model,
        ``"<model> Best Match"`` (``None`` if no pair is flagged as highest),
        ``"<model> Best Sim"``, ``"<model> True Sim"`` (0 if the reference
        pair is absent) and ``"<model> Correct"``.
    """
    comparison_data = []

    for gt_col, true_pred_col in true_mappings.items():
        row = {"Ground Truth": gt_col, "True Match": true_pred_col}
        # Every key belonging to this ground-truth label starts with this text.
        key_prefix = f"{gt_col} <-> "

        for model_name, model_result in results.items():
            scores = model_result["similarity_scores"]

            highest_match = None
            highest_sim = 0
            for pair, info in scores.items():
                if pair.startswith(key_prefix) and info["is_highest_match"]:
                    # Slice off the known prefix rather than splitting on the
                    # separator: splitting truncated predicted labels that
                    # themselves contain " <-> ".
                    highest_match = pair[len(key_prefix):]
                    highest_sim = info["similarity"]
                    break

            true_pair_key = f"{gt_col} <-> {true_pred_col}"
            true_pair_sim = scores[true_pair_key]["similarity"] if true_pair_key in scores else 0

            row[f"{model_name} Best Match"] = highest_match
            row[f"{model_name} Best Sim"] = highest_sim
            row[f"{model_name} True Sim"] = true_pair_sim
            row[f"{model_name} Correct"] = (highest_match == true_pred_col)

        comparison_data.append(row)

    return pd.DataFrame(comparison_data)

In [55]:

# Embedding models to compare; each name is handed to FastEmbedModelWrapper
# by evaluate_embedding_models.
model_names = [
    "BAAI/bge-large-en-v1.5",
    "BAAI/bge-small-en-v1.5",
    "BAAI/bge-base-en-v1.5"
]

# Score all label pairs once per model. `results` is keyed by model name and
# is reused by the table/summary cells that follow.
results = evaluate_embedding_models(df_ground_truth, df_predicted, model_names, true_mappings)

# Plain-text summary: one block of accuracy, mean true-pair similarity and
# scoring time per model.
print("\n===== MODEL PERFORMANCE SUMMARY =====")
for model_name, result in results.items():
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {result['accuracy']:.2f} ({result['correct_matches']}/{result['total_pairs']} correct matches)")
    print(f"Average True Pair Similarity: {result['avg_true_pair_similarity']:.4f}")
    print(f"Time: {result['elapsed_time']:.2f} seconds")

# Per-label breakdown: one row per ground-truth label with each model's best
# match next to the reference match.
comparison_table = create_comparison_table(results, true_mappings)
print("\n===== DETAILED COMPARISON TABLE =====")
# `display` is not imported in this notebook; presumably it is the rich-output
# helper the Jupyter/IPython kernel injects - it is not defined in plain Python.
display(comparison_table)


Testing model: BAAI/bge-large-en-v1.5
Organism <-> species: 0.7680 (TRUE PAIR) (HIGHEST)
Organism <-> homo sapiens: 0.6935  
Organism <-> drug: 0.6688  
Organism <-> gene: 0.7508  
Organism <-> protein target: 0.6225  
Organism <-> development stage: 0.5285  
Organism <-> id: 0.6857  
Organism <-> uniprot: 0.6066  
Organism <-> patent: 0.6194  
Organism <-> lipid: 0.6767  
human <-> species: 0.7417  
human <-> homo sapiens: 0.8241 (TRUE PAIR) (HIGHEST)
human <-> drug: 0.6882  
human <-> gene: 0.6886  
human <-> protein target: 0.6026  
human <-> development stage: 0.5262  
human <-> id: 0.6675  
human <-> uniprot: 0.5705  
human <-> patent: 0.6284  
human <-> lipid: 0.6689  
compound <-> species: 0.6508  
compound <-> homo sapiens: 0.6104  
compound <-> drug: 0.6955 (TRUE PAIR) (HIGHEST)
compound <-> gene: 0.6562  
compound <-> protein target: 0.6162  
compound <-> development stage: 0.5407  
compound <-> id: 0.6562  
compound <-> uniprot: 0.5347  
compound <-> patent: 0.6869  
compou

Unnamed: 0,Ground Truth,True Match,BAAI/bge-large-en-v1.5 Best Match,BAAI/bge-large-en-v1.5 Best Sim,BAAI/bge-large-en-v1.5 True Sim,BAAI/bge-large-en-v1.5 Correct,BAAI/bge-small-en-v1.5 Best Match,BAAI/bge-small-en-v1.5 Best Sim,BAAI/bge-small-en-v1.5 True Sim,BAAI/bge-small-en-v1.5 Correct,BAAI/bge-base-en-v1.5 Best Match,BAAI/bge-base-en-v1.5 Best Sim,BAAI/bge-base-en-v1.5 True Sim,BAAI/bge-base-en-v1.5 Correct
0,Organism,species,species,0.768046,0.768046,True,species,0.759065,0.759065,True,species,0.750342,0.750342,True
1,human,homo sapiens,homo sapiens,0.824092,0.824092,True,id,0.75887,0.741379,False,homo sapiens,0.767781,0.767781,True
2,compound,drug,drug,0.695465,0.695465,True,drug,0.750018,0.750018,True,drug,0.717764,0.717764,True
3,gene_name,gene,gene,0.799993,0.799993,True,gene,0.893085,0.893085,True,gene,0.831849,0.831849,True
4,target,protein target,protein target,0.754372,0.754372,True,protein target,0.764328,0.764328,True,protein target,0.734279,0.734279,True
5,clinical phase,development stage,development stage,0.720316,0.720316,True,development stage,0.676837,0.676837,True,development stage,0.685265,0.685265,True
6,uniprot id,id,uniprot,0.894538,0.707785,False,uniprot,0.919238,0.670847,False,uniprot,0.889386,0.683332,False
7,uniprotName,uniprot,uniprot,0.897605,0.897605,True,uniprot,0.882805,0.882805,True,uniprot,0.881243,0.881243,True
8,patentURI,patent,patent,0.796791,0.796791,True,patent,0.7782,0.7782,True,patent,0.74311,0.74311,True
9,swisslipid,lipid,lipid,0.760124,0.760124,True,lipid,0.710318,0.710318,True,lipid,0.731899,0.731899,True


In [56]:
def create_summary_dataframe(results):
    """Condense per-model results into a three-column summary table.

    Args:
        results: Dict keyed by model name; each value must provide
            ``"accuracy"`` and ``"avg_true_pair_similarity"``. Other keys are
            ignored.

    Returns:
        A ``pd.DataFrame`` with columns ``"Model"``, ``"Accuracy"`` and
        ``"Avg True Pair Similarity"``, one row per model in the iteration
        order of ``results``.
    """
    return pd.DataFrame(
        [
            {
                "Model": name,
                "Accuracy": stats["accuracy"],
                "Avg True Pair Similarity": stats["avg_true_pair_similarity"],
            }
            for name, stats in results.items()
        ]
    )

# Build the compact per-model summary (model name, accuracy, mean true-pair
# similarity) from `results` and render it as the cell's rich output.
summary_df = create_summary_dataframe(results)
display(summary_df)

Unnamed: 0,Model,Accuracy,Avg True Pair Similarity
0,BAAI/bge-large-en-v1.5,0.9,0.772459
1,BAAI/bge-small-en-v1.5,0.8,0.762688
2,BAAI/bge-base-en-v1.5,0.9,0.752687
