## 50. Entity creation evaluation

In [4]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score, adjusted_rand_score

In [5]:
from graphdbfunctions import *
from graph_evaluation import *
from namespaces import *

In [6]:
GRAPHDB_HOST = "http://localhost:7200"
GRAPHDB_REPO = 'lhay' # Your repository ID/Name
proxies = {
 "http": None,
 "https": None,
}

In [7]:
def get_clusters(df):
    """
    Returns a dictionary of cluster_id -> set of mention_ids
    """
    clusters = defaultdict(set)
    for _, row in df.iterrows():
        clusters[row['t']].add(row['line'])
    return clusters

In [8]:
def extract_pairs(clusters):
    """
    Convert cluster dict to a set of mention_id pairs (unordered)
    """
    pairs = set()
    for cluster_mentions in clusters.values():
        for m1, m2 in combinations(sorted(cluster_mentions), 2):
            pairs.add(frozenset((m1, m2)))
    return pairs

## 1. Taxpayers

In this section, we aim to asses the taxpayer entity creation task. The algorithm groups mentions of taxpayers that are considered as similar to create **Taxpayer** instances that can be added to the KG.

In [9]:
taxpayers_query = """
PREFIX cad: <http://rdf.geohistoricaldata.org/def/cadastre#>
select * where {
    GRAPH <GRAPH_URI> {
        ?t a cad:Taxpayer .
    	OPTIONAL{?t cad:sourcedFrom ?line.}
        OPTIONAL{?t cad:taxpayerLabel ?label.}
        OPTIONAL{?t cad:taxpayerFirstnames ?firstnames}
        OPTIONAL{?t cad:taxpayerFamilyStatus ?status}
    }
}
"""

In [10]:
gold_triples_t, pred_triples_t = get_triples(taxpayers_query, "http://rdf.geohistoricaldata.org/gold", "http://rdf.geohistoricaldata.org/auto",GRAPHDB_HOST,GRAPHDB_REPO,proxies)

In [11]:
gold_in_pred, gold_not_in_pred, pred_in_gold, pred_not_in_gold = compare_dfs(gold_triples_t, pred_triples_t)

In [12]:
pred_clusters = get_clusters(pred_triples_t)
gold_clusters = get_clusters(gold_triples_t)

In [16]:
pred_clusters

defaultdict(set,
            {'http://rdf.geohistoricaldata.org/id/taxpayer/069cfb9b-d8cc-4a83-8baa-67556d15c179': {'http://rdf.geohistoricaldata.org/id/source/eac9bf3a-d21e-4ddf-b10f-22a09dff39a0_line'},
             'http://rdf.geohistoricaldata.org/id/taxpayer/06ee3332-3587-4e7e-b189-76bc0109f4d8': {'http://rdf.geohistoricaldata.org/id/source/c19d661a-a57e-4495-9faf-fcff26014a62_line'},
             'http://rdf.geohistoricaldata.org/id/taxpayer/0709a782-654c-4a97-9892-72a8673459e5': {'http://rdf.geohistoricaldata.org/id/source/a5b2dfbd-b111-4e6f-b23e-b83bbf22b3d4_line'},
             'http://rdf.geohistoricaldata.org/id/taxpayer/0b84849d-0288-4317-8280-ce8e1b255433': {'http://rdf.geohistoricaldata.org/id/source/7687fcf0-46e6-45c6-b727-e2138c07e20d_line'},
             'http://rdf.geohistoricaldata.org/id/taxpayer/0b848d4d-f1f8-4d91-b0b6-f67b600bd4e9': {'http://rdf.geohistoricaldata.org/id/source/e289c18f-58d3-40d8-a631-55134dc4fb47_line'},
             'http://rdf.geohistoricaldata.

In [13]:
print("Number of clusters in pred:", len(pred_clusters))
print("Number of clusters in gold:", len(gold_clusters))

Number of clusters in pred: 185
Number of clusters in gold: 152


In [28]:
source_pred_clusters = []
for p in pred_clusters:
    # pred_clusters[p] is like {1, 2, 3}, turn it to a list
    ls = []
    for i in pred_clusters[p]:
        ls.append(i)
    source_pred_clusters.append(ls)

source_gold_clusters = []
for g in gold_clusters:
    # gold_clusters[g] is like {1, 2, 3}, turn it to a list
    ls = []
    for i in gold_clusters[g]:
        ls.append(i)
    source_gold_clusters.append(ls)

### 1. Pairs evaluation

In [14]:
# Compute the precision, recall, and F1 score
pred_pairs = extract_pairs(pred_clusters)
gold_pairs = extract_pairs(gold_clusters)
print("Number of pairs in pred:", len(pred_pairs))
print("Number of pairs in gold:", len(gold_pairs))
precision = len(pred_pairs & gold_pairs) / len(pred_pairs) if pred_pairs else 0
recall = len(pred_pairs & gold_pairs) / len(gold_pairs) if gold_pairs else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Number of pairs in pred: 352
Number of pairs in gold: 510
Precision: 0.8068181818181818
Recall: 0.5568627450980392
F1 Score: 0.6589327146171693


### 2. Clusters evaluation

In [34]:
def to_label_vector(clusters, all_uris):
    uri_to_label = {}
    for label, cluster in enumerate(clusters):
        for uri in cluster:
            uri_to_label[uri] = label

    # Assign unique cluster IDs to URIs not present in this clustering
    next_label = len(clusters)
    labels = []
    for uri in all_uris:
        if uri in uri_to_label:
            labels.append(uri_to_label[uri])
        else:
            labels.append(next_label)
            next_label += 1
    return labels

def compute_ari_with_missing(clusters_pred, clusters_gold):
    # All unique URIs across both sets
    all_uris = sorted(set(uri for cluster in clusters_pred + clusters_gold for uri in cluster))

    pred_labels = to_label_vector(clusters_pred, all_uris)
    gold_labels = to_label_vector(clusters_gold, all_uris)

    return adjusted_rand_score(gold_labels, pred_labels)

In [35]:
ari = compute_ari_with_missing(source_pred_clusters, source_gold_clusters)
print("Adjusted Rand Index:", ari)

Adjusted Rand Index: 0.6568837881641771


In [36]:
from sklearn.metrics import (
    adjusted_rand_score,
    normalized_mutual_info_score,
    fowlkes_mallows_score,
    homogeneity_completeness_v_measure
)

def to_label_vector(clusters, all_uris):
    uri_to_label = {}
    for label, cluster in enumerate(clusters):
        for uri in cluster:
            uri_to_label[uri] = label

    next_label = len(clusters)
    labels = []
    for uri in all_uris:
        if uri in uri_to_label:
            labels.append(uri_to_label[uri])
        else:
            labels.append(next_label)
            next_label += 1
    return labels

def compute_clustering_metrics(clusters_pred, clusters_gold):
    all_uris = sorted(set(uri for cluster in clusters_pred + clusters_gold for uri in cluster))
    
    pred_labels = to_label_vector(clusters_pred, all_uris)
    gold_labels = to_label_vector(clusters_gold, all_uris)
    
    metrics = {
        "Adjusted Rand Index": adjusted_rand_score(gold_labels, pred_labels),
        "Normalized Mutual Information": normalized_mutual_info_score(gold_labels, pred_labels),
        "Fowlkes-Mallows Index": fowlkes_mallows_score(gold_labels, pred_labels),
    }
    
    h, c, v = homogeneity_completeness_v_measure(gold_labels, pred_labels)
    metrics["Homogeneity"] = h
    metrics["Completeness"] = c
    metrics["V-Measure"] = v
    
    return metrics

# Example
clusters_gold = [['A', 'B'], ['C'], ['D', 'E']]
clusters_pred = [['A', 'C'], ['B'], ['D', 'E'], ['F']]

metrics = compute_clustering_metrics(source_pred_clusters, source_gold_clusters)
for name, value in metrics.items():
    print(f"{name}: {value:.4f}")

Adjusted Rand Index: 0.6569
Normalized Mutual Information: 0.9485
Fowlkes-Mallows Index: 0.6703
Homogeneity: 0.9724
Completeness: 0.9258
V-Measure: 0.9485
