In [11]:
# Test case 1, reference ("ground truth") query.
# Selects protein IRI, mnemonic, recommended name, optional gene label and the
# ChEBI compound/name for reactions involving CHEBI:32395. The Rhea-side triple
# patterns are evaluated remotely through the SERVICE clause.
ground_truth_query = """PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>


SELECT 
    DISTINCT
        ?uniprot
        ?uniprotID
        ?recname
        ?gene
        ?chebi
        ?uniprotName
WHERE {
  SERVICE <https://sparql.rhea-db.org/sparql> {
     VALUES (?chebi) { (CHEBI:32395) }
     ?rhea rh:side/rh:contains/rh:compound ?compound .
     ?compound rh:chebi ?chebi .
     ?chebi up:name ?uniprotName .
  }
  ?uniprot up:annotation/up:catalyticActivity/up:catalyzedReaction ?rhea .
  ?uniprot up:mnemonic ?uniprotID .
  ?uniprot up:recommendedName/up:fullName ?recname .
  OPTIONAL {?uniprot up:encodedBy/skos:prefLabel ?gene .}
}
LIMIT 100
"""

# Endpoint both test-case-1 queries are sent to (the UniProt SPARQL endpoint).
ground_truth_endpoint = "https://sparql.uniprot.org/sparql/"

In [12]:
# Test case 1, "predicted" query.
# Same graph pattern as ground_truth_query; only the variable names and the
# order of the projected columns differ (each rename is noted inline in the
# SPARQL text as "# was ..."). Used to check whether the column-matching
# metrics can pair renamed columns.
modified_query = """PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX rh:    <http://rdf.rhea-db.org/>
PREFIX skos:  <http://www.w3.org/2004/02/skos/core#>
PREFIX up:    <http://purl.uniprot.org/core/>

SELECT DISTINCT
    ?geneSymbol       # was ?gene
    ?compoundId       # was ?chebi
    ?proteinID        # was ?uniprotID
    ?officialName     # was ?recname
    ?protein          # was ?uniprot
    ?compoundName     # was ?uniprotName
WHERE {
  SERVICE <https://sparql.rhea-db.org/sparql> {
    VALUES (?compoundId) { (CHEBI:32395) }
    ?rhea   rh:side/rh:contains/rh:compound  ?cmp .
    ?cmp    rh:chebi                         ?compoundId .
    ?compoundId up:name                      ?compoundName .
  }
  
  ?protein up:annotation/up:catalyticActivity/up:catalyzedReaction  ?rhea .
  ?protein up:mnemonic                                             ?proteinID .
  ?protein up:recommendedName/up:fullName                          ?officialName .
  OPTIONAL {
    ?protein up:encodedBy/skos:prefLabel                           ?geneSymbol .
  }
}
LIMIT 100"""

In [13]:
import os
import sys

# Location of the sparql-rag-agent checkout that provides `entity_indexing`.
# Set the SPARQL_RAG_AGENT_ROOT environment variable to run this notebook on
# another machine; the fallback keeps the original author's local path working.
PROJECT_ROOT = os.environ.get(
    "SPARQL_RAG_AGENT_ROOT",
    '/Users/sebastian/Documents/Bachelor Thesis/sparql-rag-agent/sparql-rag-agent',
)

# Guard so that re-running this cell does not keep appending the same entry.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from entity_indexing.endpoint_loader import query_sparql_wrapper

In [14]:
# Execute both test-case-1 queries against the same endpoint so that any
# difference in the results comes from the query text alone.
results_ground_truth = query_sparql_wrapper(ground_truth_query, ground_truth_endpoint)
results_modified = query_sparql_wrapper(modified_query, ground_truth_endpoint)

In [15]:
import pandas as pd

def sparql_json_to_dataframe(sparql_json):
    """
    Convert a SPARQL JSON result (with 'head' and 'results' keys) to a pandas DataFrame.

    Parameters
    ----------
    sparql_json : dict
        Parsed SPARQL JSON result. Column names are read from
        ``head.vars`` and rows from ``results.bindings``.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``head.vars`` (in that order) and one row per
        usable binding. A variable that is unbound in a row (e.g. from an
        OPTIONAL pattern) yields a missing value in that cell.

    Notes
    -----
    Malformed input never raises:
    * a non-dict ``sparql_json`` prints a warning and yields an empty frame;
    * a missing or null ``head`` / ``results`` section is treated as empty;
    * a binding that is not a dict is skipped with a warning;
    * a cell that is not a dict (e.g. null) is treated as unbound.
    """
    if not isinstance(sparql_json, dict):
        print(f"[sparql_json_to_dataframe] Warning: Expected dict result, got {type(sparql_json)}")
        return pd.DataFrame()

    # `or {}` / `or []` also covers keys that are present but explicitly null,
    # which a plain .get(key, default) would pass through as None.
    columns = (sparql_json.get("head") or {}).get("vars") or []
    bindings = (sparql_json.get("results") or {}).get("bindings") or []

    rows = []
    for binding in bindings:
        if not isinstance(binding, dict):
            print(f"[sparql_json_to_dataframe] Warning: Expected dict in bindings, got {type(binding)}: {binding}")
            continue
        row = {}
        for col in columns:
            cell = binding.get(col)
            # Unbound variables are simply absent from the binding.
            row[col] = cell.get("value") if isinstance(cell, dict) else None
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Turn the raw SPARQL JSON results of test case 1 into DataFrames so the
# column-matching metrics below can compare them.
df_ground_truth = sparql_json_to_dataframe(results_ground_truth)
df_modified = sparql_json_to_dataframe(results_modified)


In [17]:
# Preview the ground-truth result (original column names).
df_ground_truth.head()

Unnamed: 0,uniprot,uniprotID,recname,gene,chebi,uniprotName
0,http://purl.uniprot.org/uniprot/O77809,CP1A2_MACFA,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
1,http://purl.uniprot.org/uniprot/O77810,CP1A2_CALJA,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
2,http://purl.uniprot.org/uniprot/P00184,CP1A1_MOUSE,Cytochrome P450 1A1,Cyp1a1,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
3,http://purl.uniprot.org/uniprot/P00186,CP1A2_MOUSE,Cytochrome P450 1A2,Cyp1a2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
4,http://purl.uniprot.org/uniprot/P00187,CP1A2_RABIT,Cytochrome P450 1A2,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"


In [18]:
# Preview the modified-query result: same values, renamed and reordered columns.
df_modified.head()

Unnamed: 0,geneSymbol,compoundId,proteinID,officialName,protein,compoundName
0,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,CP1A2_MACFA,Cytochrome P450 1A2,http://purl.uniprot.org/uniprot/O77809,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
1,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,CP1A2_CALJA,Cytochrome P450 1A2,http://purl.uniprot.org/uniprot/O77810,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
2,Cyp1a1,http://purl.obolibrary.org/obo/CHEBI_32395,CP1A1_MOUSE,Cytochrome P450 1A1,http://purl.uniprot.org/uniprot/P00184,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
3,Cyp1a2,http://purl.obolibrary.org/obo/CHEBI_32395,CP1A2_MOUSE,Cytochrome P450 1A2,http://purl.uniprot.org/uniprot/P00186,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"
4,CYP1A2,http://purl.obolibrary.org/obo/CHEBI_32395,CP1A2_RABIT,Cytochrome P450 1A2,http://purl.uniprot.org/uniprot/P00187,"(5Z,8Z,11Z,14Z)-eicosatetraenoate"


In [23]:

import pandas as pd
import numpy as np
from fastembed import TextEmbedding
from sklearn.metrics.pairwise import cosine_similarity

def calculate_column_metrics_with_label_similarity(
    file_path: None,
    df_ground_truth: pd.DataFrame,
    df_predicted: pd.DataFrame,
    similarity_threshold: float = 0.7,
    embedding_cache_dir: str = "./embeddings_model_cache",
    embedding_model: str = "BAAI/bge-large-en-v1.5"
) -> dict:
    """
    Score a predicted result table against a ground-truth table after aligning
    their columns by label.

    Columns are paired in two passes:

    1. Exact match on the lower-cased, stripped column label.
    2. For the labels left over, cosine similarity between text embeddings of
       the labels. A pair is accepted only if its similarity is at least
       ``similarity_threshold``, the predicted column is not already taken,
       and the ground-truth label is the best match for that predicted column
       among all remaining ground-truth labels (mutual-best check).

    Rows are then projected onto the matched columns only, converted to
    strings, de-duplicated into sets of tuples, and compared.

    Parameters
    ----------
    file_path
        Only used in the log line; may be None.
    df_ground_truth, df_predicted
        Result tables to compare.
    similarity_threshold
        Minimum cosine similarity for an embedding-based column pair.
    embedding_cache_dir, embedding_model
        Passed to ``fastembed.TextEmbedding``. The model is only instantiated
        when at least one label on each side is left after exact matching.

    Returns
    -------
    dict
        ``{"precision", "recall", "f1_score"}`` computed over the distinct row
        tuples. All three are 0.0 when either frame is empty or no columns
        could be paired. Unmatched columns do not affect the score.
    """

    def _row_tuples(frame: pd.DataFrame, cols: list) -> set:
        # Blank out missing values *before* casting to str. Casting first
        # would turn None/NaN into the strings "None"/"nan", which made the
        # subsequent fillna a no-op and let a missing value compare equal to a
        # literal "None" string.
        subset = frame[cols]
        cleaned = subset.astype(object).where(subset.notna(), "").astype(str)
        return {tuple(row) for row in cleaned.values.tolist()}

    print("[calculate_column_metrics_with_label_similarity] Calculating metrics for file:", file_path)

    if df_ground_truth.empty or df_predicted.empty:
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    gt_labels = list(df_ground_truth.columns)
    pred_labels = list(df_predicted.columns)

    # Pass 1: case-insensitive exact label matches.
    gt_norm = {lbl.lower().strip(): lbl for lbl in gt_labels}
    pred_norm = {lbl.lower().strip(): lbl for lbl in pred_labels}

    exact_pairs = [
        (gt_lbl, pred_norm[norm_lbl])
        for norm_lbl, gt_lbl in gt_norm.items()
        if norm_lbl in pred_norm
    ]

    matched_gt = {gt for gt, _ in exact_pairs}
    matched_pred = {pred for _, pred in exact_pairs}

    remaining_gt = [lbl for lbl in gt_labels if lbl not in matched_gt]
    remaining_pred = [lbl for lbl in pred_labels if lbl not in matched_pred]

    print("exact pairs:", len(exact_pairs))
    print("remaining gt:", len(remaining_gt))

    # Pass 2: embedding similarity for whatever is left on both sides.
    sim_pairs = []
    if remaining_gt and remaining_pred:
        model = TextEmbedding(
            model_name=embedding_model,
            cache_dir=embedding_cache_dir
        )
        gt_embeds = np.vstack(list(model.embed(remaining_gt)))
        pred_embeds = np.vstack(list(model.embed(remaining_pred)))
        # sim_matrix[i, j] = similarity(remaining_gt[i], remaining_pred[j])
        sim_matrix = cosine_similarity(gt_embeds, pred_embeds)

        used_pred_idx = set()
        for i, gt_lbl in enumerate(remaining_gt):
            # Walk candidates from most to least similar.
            for j in np.argsort(sim_matrix[i])[::-1]:

                print(f"{gt_lbl} -> {remaining_pred[j]} (similarity: {sim_matrix[i, j]})")

                if sim_matrix[i, j] < similarity_threshold or j in used_pred_idx:
                    continue
                # Mutual-best check: accept only if this ground-truth label is
                # also the top-scoring one for predicted column j.
                if np.argmax(sim_matrix[:, j]) == i:
                    sim_pairs.append((gt_lbl, remaining_pred[j]))
                    used_pred_idx.add(j)
                    break  # move to next gt_lbl

    matches = exact_pairs + sim_pairs
    if not matches:
        print("no matches found")
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    print("matches:", matches)

    # Same pair order on both sides so tuple positions line up.
    gt_matched_cols = [gt for gt, _ in matches]
    pred_matched_cols = [pred for _, pred in matches]

    gt_tuples = _row_tuples(df_ground_truth, gt_matched_cols)
    pred_tuples = _row_tuples(df_predicted, pred_matched_cols)

    print("gt_tuples:", gt_tuples)
    print("pred_tuples:", pred_tuples)

    if not gt_tuples or not pred_tuples:
        print("no tuples found")
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    tp = len(gt_tuples & pred_tuples)
    precision = tp / len(pred_tuples)
    recall = tp / len(gt_tuples)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return {"precision": precision, "recall": recall, "f1_score": f1}

In [22]:
# Embedding-based column matching with the function's default model.
# The result is bound to a descriptive name instead of `eval`, which would
# shadow Python's builtin eval() for the rest of the kernel session.
metrics_bge_large = calculate_column_metrics_with_label_similarity(
    file_path=None,
    df_ground_truth=df_ground_truth,
    df_predicted=df_modified
)

print(metrics_bge_large)

[calculate_column_metrics_with_label_similarity] Calculating metrics for file: None
exact pairs: 0
remaining gt: 6
uniprot -> proteinID (similarity: 0.6607235074043274)
uniprot -> protein (similarity: 0.6434313058853149)
uniprot -> geneSymbol (similarity: 0.6175891160964966)
uniprot -> compoundId (similarity: 0.5491986274719238)
uniprot -> officialName (similarity: 0.5270262956619263)
uniprot -> compoundName (similarity: 0.49618250131607056)
uniprotID -> proteinID (similarity: 0.7115392684936523)
recname -> officialName (similarity: 0.7237762808799744)
gene -> geneSymbol (similarity: 0.7394564747810364)
chebi -> protein (similarity: 0.576590895652771)
chebi -> proteinID (similarity: 0.5659598112106323)
chebi -> compoundId (similarity: 0.5413426160812378)
chebi -> compoundName (similarity: 0.5274407267570496)
chebi -> geneSymbol (similarity: 0.5257027745246887)
chebi -> officialName (similarity: 0.5113140344619751)
uniprotName -> officialName (similarity: 0.6308444738388062)
uniprotName

In [None]:

# Expected column pairing for reference (modified name -> original name):
#     ?geneSymbol       # was ?gene
#     ?compoundId       # was ?chebi
#     ?proteinID        # was ?uniprotID
#     ?officialName     # was ?recname
#     ?protein          # was ?uniprot
#     ?compoundName     # was ?uniprotName

# Same comparison with the mid-sized embedding model.
# The result is bound to a descriptive name instead of `eval`, which would
# shadow Python's builtin eval() for the rest of the kernel session.
metrics_bge_base = calculate_column_metrics_with_label_similarity(
    file_path=None,
    df_ground_truth=df_ground_truth,
    df_predicted=df_modified,
    embedding_model="BAAI/bge-base-en-v1.5"
)

print(metrics_bge_base)

[calculate_column_metrics_with_label_similarity] Calculating metrics for file: None
exact pairs: 0
remaining gt: 6
uniprot -> geneSymbol (similarity: 0.543870210647583)
uniprot -> compoundName (similarity: 0.5410197973251343)
uniprot -> compoundId (similarity: 0.5391663908958435)
uniprot -> proteinID (similarity: 0.5250837802886963)
uniprot -> protein (similarity: 0.5056989192962646)
uniprot -> officialName (similarity: 0.4674255847930908)
uniprotID -> proteinID (similarity: 0.6716257929801941)
uniprotID -> protein (similarity: 0.5904890298843384)
uniprotID -> compoundId (similarity: 0.5893224477767944)
uniprotID -> geneSymbol (similarity: 0.5620156526565552)
uniprotID -> compoundName (similarity: 0.5254242420196533)
uniprotID -> officialName (similarity: 0.4767494797706604)
recname -> officialName (similarity: 0.6680632829666138)
recname -> compoundName (similarity: 0.6656311750411987)
recname -> compoundId (similarity: 0.5616946816444397)
recname -> geneSymbol (similarity: 0.51423555

In [26]:
# Same comparison with the small embedding model.
# The result is bound to a descriptive name instead of `eval`, which would
# shadow Python's builtin eval() for the rest of the kernel session.
metrics_bge_small = calculate_column_metrics_with_label_similarity(
    file_path=None,
    df_ground_truth=df_ground_truth,
    df_predicted=df_modified,
    embedding_model="BAAI/bge-small-en-v1.5"
)

print(metrics_bge_small)

[calculate_column_metrics_with_label_similarity] Calculating metrics for file: None
exact pairs: 0
remaining gt: 6
uniprot -> compoundId (similarity: 0.59628826379776)
uniprot -> proteinID (similarity: 0.5900821685791016)
uniprot -> protein (similarity: 0.5511031746864319)
uniprot -> compoundName (similarity: 0.5500261783599854)
uniprot -> officialName (similarity: 0.5278188586235046)
uniprot -> geneSymbol (similarity: 0.5209227800369263)
uniprotID -> proteinID (similarity: 0.6868076324462891)
uniprotID -> compoundId (similarity: 0.6858739256858826)
uniprotID -> compoundName (similarity: 0.5876275300979614)
uniprotID -> protein (similarity: 0.5644420981407166)
uniprotID -> geneSymbol (similarity: 0.5314558744430542)
uniprotID -> officialName (similarity: 0.5287464261054993)
recname -> officialName (similarity: 0.7922148108482361)
gene -> protein (similarity: 0.7483997941017151)
chebi -> compoundName (similarity: 0.6143814325332642)
chebi -> compoundId (similarity: 0.6079856753349304)
c

In [31]:
import pandas as pd
from thefuzz import fuzz

def normalized_similarity(s1, s2):
    """Return thefuzz's ``fuzz.ratio`` score for ``s1`` vs ``s2``, rescaled from 0-100 to [0, 1]."""
    percent_score = fuzz.ratio(s1, s2)
    return percent_score / 100

def calculate_column_metrics_with_fuzzy_label_similarity(
    file_path: None,
    df_ground_truth: pd.DataFrame,
    df_predicted: pd.DataFrame,
    similarity_threshold: float = 0.8
) -> dict:
    """
    Score a predicted result table against a ground-truth table after aligning
    their columns by label, using string similarity for renamed columns.

    Columns are paired in two passes:

    1. Exact match on the lower-cased, stripped column label.
    2. For the labels left over, each ground-truth label (in column order)
       greedily takes the not-yet-used predicted label with the highest
       ``normalized_similarity`` score, provided that score is at least
       ``similarity_threshold``.

    Rows are then projected onto the matched columns only, converted to
    strings, de-duplicated into sets of tuples, and compared.

    Parameters
    ----------
    file_path
        Only used in the log line; may be None.
    df_ground_truth, df_predicted
        Result tables to compare.
    similarity_threshold
        Minimum normalized similarity (0-1) for a fuzzy column pair.

    Returns
    -------
    dict
        ``{"precision", "recall", "f1_score"}`` computed over the distinct row
        tuples. All three are 0.0 when either frame is empty or no columns
        could be paired. Unmatched columns do not affect the score.
    """

    def _row_tuples(frame: pd.DataFrame, cols: list) -> set:
        # Blank out missing values *before* casting to str. Casting first
        # would turn None/NaN into the strings "None"/"nan", which made the
        # subsequent fillna a no-op and let a missing value compare equal to a
        # literal "None" string.
        subset = frame[cols]
        cleaned = subset.astype(object).where(subset.notna(), "").astype(str)
        return {tuple(row) for row in cleaned.values.tolist()}

    print("[calculate_column_metrics_with_fuzzy_label_similarity] Calculating metrics for file:", file_path)

    if df_ground_truth.empty or df_predicted.empty:
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    gt_labels = list(df_ground_truth.columns)
    pred_labels = list(df_predicted.columns)

    # Pass 1: case-insensitive exact label matches.
    gt_norm = {lbl.lower().strip(): lbl for lbl in gt_labels}
    pred_norm = {lbl.lower().strip(): lbl for lbl in pred_labels}

    exact_pairs = [
        (gt_lbl, pred_norm[norm_lbl])
        for norm_lbl, gt_lbl in gt_norm.items()
        if norm_lbl in pred_norm
    ]

    matched_gt = {gt for gt, _ in exact_pairs}
    matched_pred = {pred for _, pred in exact_pairs}

    remaining_gt = [lbl for lbl in gt_labels if lbl not in matched_gt]
    remaining_pred = [lbl for lbl in pred_labels if lbl not in matched_pred]

    print("exact pairs:", len(exact_pairs))
    print("remaining gt:", len(remaining_gt))

    # Pass 2: greedy fuzzy matching; each predicted label is used at most once.
    sim_pairs = []
    used_pred = set()
    for gt_lbl in remaining_gt:
        best_match = None
        best_score = 0
        for pred_lbl in remaining_pred:
            if pred_lbl in used_pred:
                continue
            score = normalized_similarity(gt_lbl, pred_lbl)
            print(f"Fuzzy normalized similarity: {gt_lbl} <-> {pred_lbl} = {score:.3f}")
            if score > best_score:
                best_score = score
                best_match = pred_lbl
        if best_match and best_score >= similarity_threshold:
            sim_pairs.append((gt_lbl, best_match))
            used_pred.add(best_match)

    matches = exact_pairs + sim_pairs
    if not matches:
        print("no matches found")
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    print("matches:", matches)

    # Same pair order on both sides so tuple positions line up.
    gt_matched_cols = [gt for gt, _ in matches]
    pred_matched_cols = [pred for _, pred in matches]

    gt_tuples = _row_tuples(df_ground_truth, gt_matched_cols)
    pred_tuples = _row_tuples(df_predicted, pred_matched_cols)

    print("gt_tuples:", gt_tuples)
    print("pred_tuples:", pred_tuples)

    if not gt_tuples or not pred_tuples:
        print("no tuples found")
        return {"precision": 0.0, "recall": 0.0, "f1_score": 0.0}

    tp = len(gt_tuples & pred_tuples)
    precision = tp / len(pred_tuples)
    recall = tp / len(gt_tuples)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return {"precision": precision, "recall": recall, "f1_score": f1}

In [33]:
# String-similarity column matching on test case 1, with the threshold lowered
# from the function default to 0.7.
# The result is bound to a descriptive name instead of `eval`, which would
# shadow Python's builtin eval() for the rest of the kernel session.
metrics_fuzzy = calculate_column_metrics_with_fuzzy_label_similarity(
    file_path=None,
    df_ground_truth=df_ground_truth,
    df_predicted=df_modified,
    similarity_threshold=0.7
)

print(metrics_fuzzy)

[calculate_column_metrics_with_fuzzy_label_similarity] Calculating metrics for file: None
exact pairs: 0
remaining gt: 6
Fuzzy normalized similarity: uniprot <-> geneSymbol = 0.240
Fuzzy normalized similarity: uniprot <-> compoundId = 0.240
Fuzzy normalized similarity: uniprot <-> proteinID = 0.500
Fuzzy normalized similarity: uniprot <-> officialName = 0.110
Fuzzy normalized similarity: uniprot <-> protein = 0.570
Fuzzy normalized similarity: uniprot <-> compoundName = 0.210
Fuzzy normalized similarity: uniprotID <-> geneSymbol = 0.210
Fuzzy normalized similarity: uniprotID <-> compoundId = 0.320
Fuzzy normalized similarity: uniprotID <-> proteinID = 0.670
Fuzzy normalized similarity: uniprotID <-> officialName = 0.100
Fuzzy normalized similarity: uniprotID <-> protein = 0.500
Fuzzy normalized similarity: uniprotID <-> compoundName = 0.190
Fuzzy normalized similarity: recname <-> geneSymbol = 0.350
Fuzzy normalized similarity: recname <-> compoundId = 0.240
Fuzzy normalized similarity

In [36]:
# Test case 2: a single-value aggregate query.
# NOTE: this cell reassigns ground_truth_query, modified_query and
# ground_truth_endpoint from test case 1; re-run the earlier cells to get the
# first test case back.

# Reference query: counts reaction / EC / reviewed-protein links, with the
# UniProt side evaluated through the SERVICE clause.
ground_truth_query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rh:<http://rdf.rhea-db.org/>
PREFIX ec:<http://purl.uniprot.org/enzyme/>
PREFIX up:<http://purl.uniprot.org/core/>

SELECT
  (count(?reaction) as ?reactionEcProteinLinkCount)
WHERE {
  ?reaction rdfs:subClassOf rh:Reaction .
  ?reaction rh:ec ?enzyme
  SERVICE <http://sparql.uniprot.org/sparql> {
    ?protein up:reviewed true .
    ?protein up:enzyme ?enzyme.
  }
}"""

# "Predicted" query: identical except that the count is aliased ?LinkCount.
modified_query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rh:<http://rdf.rhea-db.org/>
PREFIX ec:<http://purl.uniprot.org/enzyme/>
PREFIX up:<http://purl.uniprot.org/core/>

SELECT
  (count(?reaction) as ?LinkCount)
WHERE {
  ?reaction rdfs:subClassOf rh:Reaction .
  ?reaction rh:ec ?enzyme
  SERVICE <http://sparql.uniprot.org/sparql> {
    ?protein up:reviewed true .
    ?protein up:enzyme ?enzyme.
  }
}"""



# Both test-case-2 queries are sent to the Rhea SPARQL endpoint.
ground_truth_endpoint = "http://sparql.rhea-db.org/sparql"

In [37]:
# Run both test-case-2 queries against the same endpoint and convert the JSON
# results to DataFrames. This overwrites the test-case-1 results and frames.
results_ground_truth = query_sparql_wrapper(ground_truth_query, ground_truth_endpoint)
results_modified = query_sparql_wrapper(modified_query, ground_truth_endpoint)
df_ground_truth = sparql_json_to_dataframe(results_ground_truth)
df_modified = sparql_json_to_dataframe(results_modified)


In [39]:
# Preview the ground-truth count (column reactionEcProteinLinkCount).
df_ground_truth.head()

Unnamed: 0,reactionEcProteinLinkCount
0,289934


In [40]:
# Preview the modified-query count (column LinkCount).
df_modified.head()

Unnamed: 0,LinkCount
0,289934


In [38]:
# Embedding-based column matching on test case 2 (default model and threshold).
# The result is bound to a descriptive name instead of `eval`, which would
# shadow Python's builtin eval() for the rest of the kernel session.
metrics_count_embedding = calculate_column_metrics_with_label_similarity(
    file_path=None,
    df_ground_truth=df_ground_truth,
    df_predicted=df_modified
)

print(metrics_count_embedding)

[calculate_column_metrics_with_label_similarity] Calculating metrics for file: None
exact pairs: 0
remaining gt: 1
reactionEcProteinLinkCount -> LinkCount (similarity: 0.794906497001648)
matches: [('reactionEcProteinLinkCount', 'LinkCount')]
gt_tuples: {('289934',)}
pred_tuples: {('289934',)}
{'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}


In [41]:
# String-similarity column matching on test case 2 (default threshold).
# The result is bound to a descriptive name instead of `eval`, which would
# shadow Python's builtin eval() for the rest of the kernel session.
metrics_count_fuzzy = calculate_column_metrics_with_fuzzy_label_similarity(
    file_path=None,
    df_ground_truth=df_ground_truth,
    df_predicted=df_modified
)

print(metrics_count_fuzzy)

[calculate_column_metrics_with_fuzzy_label_similarity] Calculating metrics for file: None
exact pairs: 0
remaining gt: 1
Fuzzy normalized similarity: reactionEcProteinLinkCount <-> LinkCount = 0.510
no matches found
{'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0}
