# 10. Evaluation object type properties/property paths

In [1]:
import glob
import pandas as pd
import requests
from graphdbfunctions import *
from graph_evaluation import *
from great_tables import GT
import polars as pl

In [2]:
# Connection settings handed to every get_triples() call in this notebook.
GRAPHDB_HOST = "http://localhost:7200"
GRAPHDB_REPO = 'lhay'  # your repository ID/name
# Both schemes are mapped to None, i.e. no proxy is configured for either.
proxies = dict.fromkeys(("http", "https"))

#### 0. All triples in triplestore (including blank nodes)

In [3]:
# SPARQL template returning every triple of one named graph, blank nodes included.
# <GRAPH_URI> is a placeholder; presumably get_triples() substitutes each graph IRI
# before sending the query -- confirm in graphdbfunctions.
query_all_triples = prefixes + (
    "\n"
    "SELECT ?s ?p ?o\n"
    "WHERE {GRAPH <GRAPH_URI> {\n"
    " ?s ?p ?o .\n"
    "}}\n"
)

In [4]:
# Run the "all triples" query against the gold graph and the automatically built graph.
gold_triples_all, pred_triples_all = get_triples(
    query_all_triples,
    "http://rdf.geohistoricaldata.org/gold",
    "http://rdf.geohistoricaldata.org/auto",
    GRAPHDB_HOST,
    GRAPHDB_REPO,
    proxies,
)

# Sizes of the two result sets, shown as (gold, pred).
len(gold_triples_all), len(pred_triples_all)

(22521, 22202)

We cannot use the triples directly for evaluation: blank nodes in the two graphs may describe the same resource while carrying different identifiers.

So, the evaluation of relations (triples) is performed using properties or property paths (*?s ?p ?o*) where *?s* and *?o* are not blank nodes.

### 1. Retrieve relations

In [5]:
# SPARQL template for the direct (length-1) relations of one named graph, keeping
# only triples whose subject and object are neither blank nodes nor literals.
# <GRAPH_URI> is a placeholder for the graph IRI.
#
# Blank nodes are tested with isBlank(): STR() is only defined for IRIs and
# literals, so the previous STRSTARTS(STR(?x), "_:") test excluded blank nodes
# only through the type error it raised, which is fragile and engine-dependent.
query = prefixes + """
SELECT ?s ?p ?o
WHERE {GRAPH <GRAPH_URI> {
 ?s ?p ?o

 FILTER (!isBlank(?s))  # Ensures the subject is not a blank node
 FILTER (!isBlank(?o))  # Ensures the object is not a blank node
 FILTER (!isLiteral(?o))
 FILTER (!isLiteral(?s))
}}
"""

We first retrieve the simple triples (property paths of length 1) in both graphs (the gold standard and the automatically built graph).

In [6]:
# Direct relations (no blank nodes, no literals) from the gold and the automatic graph.
gold_triples, pred_triples = get_triples(
    query,
    "http://rdf.geohistoricaldata.org/gold",
    "http://rdf.geohistoricaldata.org/auto",
    GRAPHDB_HOST,
    GRAPHDB_REPO,
    proxies,
)

In [7]:
# Report how many direct relations each graph contributed.
for label, triples in (("gold", gold_triples), ("pred", pred_triples)):
    print(f'Number of triples in {label} : {len(triples)}')

Number of triples in gold : 10870
Number of triples in pred : 10681


Then, we retrieve triples where ?p is a property path of length 2..n and ?s and ?o are not blank nodes. The property paths are listed below.

In [8]:
# Candidate property paths (length >= 2), including the ones ending in
# addr:timeStamp. NOTE(review): this list is not used in the visible cells;
# chained_properties looks like the subset that is actually queried -- confirm.
#
# The last entry used to repeat "rico:hasCreationDate/addr:timeStamp"; it is
# now addr:timeCalendar so the rico:hasCreationDate group mirrors the
# addr:hasTime group and no path is listed twice.
all__ = [
    # plot class
    "cad:hasClasse/cad:hasClasseValue",
    # attributes and their versions
    "addr:hasAttribute/rdf:type",
    "addr:hasAttribute/addr:isAttributeType",
    "addr:hasAttribute/addr:hasAttributeVersion/rdf:type",
    "addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:relatum",
    "addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:locatum",
    "addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotNature",
    "addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotTaxpayer",
    # time instants reached through addr:hasTime
    "addr:hasTime/addr:timeCalendar",
    "addr:hasTime/addr:timePrecision",
    "addr:hasTime/addr:timeStamp",
    "addr:hasTime/rdf:type",
    # time instants reached through rico:hasCreationDate
    "rico:hasCreationDate/rdf:type",
    "rico:hasCreationDate/addr:timeStamp",
    "rico:hasCreationDate/addr:timePrecision",
    "rico:hasCreationDate/addr:timeCalendar",
]

In [9]:
# Property paths (length >= 2) evaluated in this notebook. Each path is written
# as its sequence of steps and joined with "/" to form the SPARQL path expression.
chained_properties = [
    "/".join(steps)
    for steps in (
        ("cad:hasClasse", "cad:hasClasseValue"),
        ("addr:hasAttribute", "rdf:type"),
        ("addr:hasAttribute", "addr:isAttributeType"),
        ("addr:hasAttribute", "addr:hasAttributeVersion", "rdf:type"),
        ("addr:hasAttribute", "addr:hasAttributeVersion", "cad:hasPlotAddress", "addr:relatum"),
        ("addr:hasAttribute", "addr:hasAttributeVersion", "cad:hasPlotAddress", "addr:locatum"),
        ("addr:hasAttribute", "addr:hasAttributeVersion", "cad:hasPlotNature"),
        ("addr:hasAttribute", "addr:hasAttributeVersion", "cad:hasPlotTaxpayer"),
        ("addr:hasTime", "addr:timeCalendar"),
        ("addr:hasTime", "addr:timePrecision"),
        ("addr:hasTime", "rdf:type"),
        ("rico:hasCreationDate", "rdf:type"),
        ("rico:hasCreationDate", "addr:timePrecision"),
        ("rico:hasCreationDate", "addr:timeCalendar"),
    )
]

In [10]:
# SPARQL template for one property path. PROP is a placeholder used twice: as
# the path expression itself and as the string bound to ?p; <GRAPH_URI> is the
# graph placeholder. Presumably choose_property() fills in PROP -- confirm in
# graph_evaluation / graphdbfunctions.
#
# Two fixes compared with the previous template:
#   * the embedded comment claimed a restriction to Landmark subjects that the
#     query never applied;
#   * blank nodes are tested with isBlank() rather than
#     STRSTARTS(STR(?x), "_:"), since STR() is not defined for blank nodes.
chained_query = prefixes + """
SELECT ?s ?p ?o
WHERE {
  GRAPH <GRAPH_URI> {
    # Retrieve every (subject, object) pair connected by the property path
    ?s PROP ?o.
    BIND("PROP" AS ?p )
    FILTER (!isBlank(?s))  # Ensures the subject is not a blank node
    FILTER (!isBlank(?o))  # Ensures the object is not a blank node
    FILTER (!isLiteral(?o))
    FILTER (!isLiteral(?s))
  }
}"""

In [11]:
# Query each property path against both graphs, collecting one result frame
# per path and per graph and logging the sizes as we go.
ls_gold, ls_pred = [], []
for property_ in chained_properties:
    uquery = choose_property(property_, chained_query)
    df_gold, df_pred = get_triples(
        uquery,
        "http://rdf.geohistoricaldata.org/gold",
        "http://rdf.geohistoricaldata.org/auto",
        GRAPHDB_HOST,
        GRAPHDB_REPO,
        proxies,
    )
    for label, frame, bucket in (("gold", df_gold, ls_gold), ("pred", df_pred, ls_pred)):
        bucket.append(frame)
        print(f"Number of triples in {label} for {property_}: {len(frame)}")

Number of triples in gold for cad:hasClasse/cad:hasClasseValue: 30
Number of triples in pred for cad:hasClasse/cad:hasClasseValue: 30
Number of triples in gold for addr:hasAttribute/rdf:type: 1084
Number of triples in pred for addr:hasAttribute/rdf:type: 1050
Number of triples in gold for addr:hasAttribute/addr:isAttributeType: 1084
Number of triples in pred for addr:hasAttribute/addr:isAttributeType: 1050
Number of triples in gold for addr:hasAttribute/addr:hasAttributeVersion/rdf:type: 1084
Number of triples in pred for addr:hasAttribute/addr:hasAttributeVersion/rdf:type: 1050
Number of triples in gold for addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:relatum: 362
Number of triples in pred for addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:relatum: 340
Number of triples in gold for addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:locatum: 362
Number of triples in pred for addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlot

In [12]:
# Stack the per-path frames into a single frame per graph, renumbering rows from 0.
gold_ppath_df, pred_ppath_df = (
    pd.concat(frames, ignore_index=True) for frames in (ls_gold, ls_pred)
)

In [13]:
# Full evaluation sets: direct relations followed by property-path relations.
gold = pd.concat((gold_triples, gold_ppath_df), ignore_index=True)
pred = pd.concat((pred_triples, pred_ppath_df), ignore_index=True)

# Row counts, shown as (gold, pred).
len(gold), len(pred)

(15607, 15258)

In [14]:
from namespaces import prefixes_dict

# Apply every (key -> value) substitution from prefixes_dict, as a plain
# substring replacement, to the s, p and o columns of both frames.
# NOTE(review): the loop names in the original were (prefix, uri); which side
# is the prefix and which the full IRI depends on how prefixes_dict is defined.
for frame in (gold, pred):
    for column in ("s", "p", "o"):
        for old, new in prefixes_dict.items():
            frame[column] = frame[column].str.replace(old, new, regex=False)

In [15]:
# Set to True to dump the compared frames and their mismatches as CSV files
# in the current working directory.
SAVE_FILES = False

# Four-way split of the comparison. NOTE(review): judging by the names, the
# "*_not_in_*" frames hold the rows without a counterpart in the other graph;
# confirm against compare_dfs in graph_evaluation.
gold_in_pred, gold_not_in_pred, pred_in_gold, pred_not_in_gold = compare_dfs(gold, pred)

if SAVE_FILES:
    gold.to_csv("gold.csv", index=False)
    pred.to_csv("pred.csv", index=False)
    # The "non_identical_*" files are meant for the unmatched rows; the cell
    # previously wrote the matched frames (gold_in_pred / pred_in_gold) here.
    gold_not_in_pred.to_csv("non_identical_gold.csv", index=False)
    pred_not_in_gold.to_csv("non_identical_pred.csv", index=False)

### 2. Metrics

The **agreement rate** for relation R is defined as the ratio between the number of GOLD pairs that can be matched with a PREDICTED pair and the total number of GOLD pairs.
Note:
Matching a pair means that each of the two fields in the GOLD pair can be matched with its counterpart in a PREDICTED pair.

The **deficit rate** is the ratio of the number of GOLD pairs that are not found in PREDICTED set to the total number of GOLD pairs.

The **surplus rate** is the ratio of the number of PREDICTED pairs that are not found in GOLD to the total number of PREDICTED pairs.

The **precision** is the ratio of correctly predicted positive observations to the total predicted positives. It indicates how many of the predicted triples are actually correct.

In [16]:
# Global agreement / deficit / surplus over all relations (direct + property paths).
metrics_all = compute_metrics(gold, pred)
metrics_all

{'Agreement': 78.5, 'Deficit': 21.5, 'Surplus': 19.939999999999998}

In [17]:
# Preview of the combined gold relations (columns s, p, o) after the prefix substitution.
gold

Unnamed: 0,s,p,o
0,source:064cc4ee-9d5e-4922-a6dc-954aaf38fab2_FR...,rdf:type,rico:Instantiation
1,source:FRAD094_3P_000275_01_0151_page,rdf:type,rico:Record
2,source:FRAD094_3P_000275_01,rdf:type,rico:Instantiation
3,source:0d2dbad9-68dd-4c66-a44d-87980720bc14_FR...,rdf:type,rico:Instantiation
4,source:FRAD094_3P_000275_01_0160_page,rdf:type,rico:Record
...,...,...,...
15602,source:94_LHAY_RECTIFICATION_1835_1842,addr:hasTime/rdf:type,addr:CrispTimeInstant
15603,event:CADASTRE_LHAY_1842,addr:hasTime/rdf:type,addr:CrispTimeInstant
15604,source:94_LHAY_RECTIFICATION_1835_1842,rico:hasCreationDate/rdf:type,addr:CrispTimeInstant
15605,source:94_LHAY_RECTIFICATION_1835_1842,rico:hasCreationDate/addr:timePrecision,time:Year


### 3. Evaluation for each property or property path

#### Count

In [18]:
# Per-property occurrence counts (column 'p') for the gold and the predicted relations.
gold_df, pred_df = (
    get_property_values_count_df(frame, 'p') for frame in (gold, pred)
)

In [19]:
# Side-by-side counts per property: Count_x comes from gold, Count_y from pred.
# The outer join keeps properties present in only one graph (missing count -> 0);
# rows are ordered by the absolute gap between the two counts, largest first.
df = (
    pd.merge(gold_df, pred_df, on='p', how='outer')
    .fillna(0)
    .assign(Difference=lambda counts: (counts['Count_x'] - counts['Count_y']).abs())
    .sort_values(by='Difference', ascending=False)
)
display(df)

Unnamed: 0,p,Count_x,Count_y,Difference
1,addr:dependsOn,1446,1407,39
13,addr:isChangeType,1446,1407,39
6,addr:hasAttribute/addr:hasAttributeVersion/rdf...,1084,1050,34
7,addr:hasAttribute/addr:isAttributeType,1084,1050,34
8,addr:hasAttribute/rdf:type,1084,1050,34
21,cad:sourcedFrom,1086,1053,33
3,addr:hasAttribute/addr:hasAttributeVersion/cad...,362,340,22
2,addr:hasAttribute/addr:hasAttributeVersion/cad...,362,340,22
22,rdf:type,3121,3100,21
16,addr:locatum,728,712,16


In [20]:
# Properties / property paths taken from column 'p' of the count table built above.
properties = get_list_of_properties(df, 'p')
properties

['addr:dependsOn',
 'addr:isChangeType',
 'addr:hasAttribute/addr:hasAttributeVersion/rdf:type',
 'addr:hasAttribute/addr:isAttributeType',
 'addr:hasAttribute/rdf:type',
 'cad:sourcedFrom',
 'addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:relatum',
 'addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:locatum',
 'rdf:type',
 'addr:locatum',
 'addr:relatum',
 'addr:isLandmarkRelationType',
 'addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotNature',
 'addr:isLandmarkType',
 'addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotTaxpayer',
 'addr:appliedTo',
 'cad:isSourceType',
 'rico:isOrWasComponent',
 'addr:hasTime/addr:timePrecision',
 'addr:hasTime/rdf:type',
 'addr:hasTrace',
 'addr:hasTime/addr:timeCalendar',
 'cad:isEventType',
 'rico:hasCreationDate/addr:timeCalendar',
 'cad:hasClasse/cad:hasClasseValue',
 'rico:hasCreationDate/addr:timePrecision',
 'rico:hasCreationDate/rdf:type',
 'rico:hasOrHadDerivedInstantiation',
 'rico:isOrWasCompo

In [21]:
# Agreement / deficit / surplus computed separately for each property or
# property path. GT and polars are already imported in the first cell, so the
# duplicate imports that used to open this cell were dropped (polars is not
# used here at all).
metrics_by_properties = []
for p in properties:
    # Restrict both sets to the relations carrying this property
    gold_instances = get_property_instances(gold, 'p', p)
    pred_instances = get_property_instances(pred, 'p', p)

    # Same metrics as the global evaluation, on the restricted sets
    metrics = compute_metrics(gold_instances, pred_instances)
    metrics_by_properties.append({
        "Property": p,
        "Agreement": metrics["Agreement"],
        "Deficit": metrics["Deficit"],
        "Surplus": metrics["Surplus"],
    })

# One row per property, columns Property / Agreement / Deficit / Surplus
metrics_df = pd.DataFrame(metrics_by_properties)

In [22]:
# Per-property metrics table (Property, Agreement, Deficit, Surplus).
metrics_df

Unnamed: 0,Property,Agreement,Deficit,Surplus
0,addr:dependsOn,93.91,6.09,3.48
1,addr:isChangeType,93.91,6.09,3.48
2,addr:hasAttribute/addr:hasAttributeVersion/rdf...,95.2,4.8,3.43
3,addr:hasAttribute/addr:isAttributeType,93.54,6.46,3.43
4,addr:hasAttribute/rdf:type,95.2,4.8,3.43
5,cad:sourcedFrom,46.22,53.78,52.33
6,addr:hasAttribute/addr:hasAttributeVersion/cad...,0.0,100.0,100.0
7,addr:hasAttribute/addr:hasAttributeVersion/cad...,90.61,9.39,3.53
8,rdf:type,81.74,18.26,17.71
9,addr:locatum,47.39,52.61,51.54


In [23]:
# Colour-coded rendering of the per-property metrics on a 0-100 scale:
# Agreement runs red -> green, Deficit and Surplus run green -> red.
(
    GT(metrics_df)
    .data_color(
        columns=["Agreement"],
        palette=["red", "orange", "green"],
        domain=[0, 100],
    )
    .data_color(
        columns=["Deficit", "Surplus"],
        palette=["green", "orange", "red"],
        domain=[0, 100],
    )
)

Property,Agreement,Deficit,Surplus
addr:dependsOn,93.91,6.09,3.4799999999999995
addr:isChangeType,93.91,6.09,3.4799999999999995
addr:hasAttribute/addr:hasAttributeVersion/rdf:type,95.2,4.8,3.43
addr:hasAttribute/addr:isAttributeType,93.54,6.460000000000001,3.43
addr:hasAttribute/rdf:type,95.2,4.8,3.43
cad:sourcedFrom,46.22,53.78,52.33
addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:relatum,0.0,100.0,100.0
addr:hasAttribute/addr:hasAttributeVersion/cad:hasPlotAddress/addr:locatum,90.61,9.39,3.53
rdf:type,81.74,18.26,17.71
addr:locatum,47.39,52.61,51.54


In [24]:
# Export the per-property metrics as a LaTeX tabular. The recorded output of
# this cell shows the table printed and then echoed again as the escaped
# return-value string; the trailing semicolon suppresses that second echo.
df_to_latex(metrics_df);

\begin{tabular}{lrrr}
\toprule
Property & Agreement & Deficit & Surplus \\
\midrule
rico:hasCreationDate/addr:timePrecision & 100.000000 & 0.000000 & 0.000000 \\
addr:hasTime/addr:timeCalendar & 100.000000 & 0.000000 & 0.000000 \\
rico:isOrWasInstantiationOf & 100.000000 & 0.000000 & 0.000000 \\
rico:isOrWasIncludedIn & 100.000000 & 0.000000 & 0.000000 \\
rico:isOrWasInstantiation & 100.000000 & 0.000000 & 0.000000 \\
rico:isOrWasComponentOf & 100.000000 & 0.000000 & 0.000000 \\
rico:hasOrHadDerivedInstantiation & 100.000000 & 0.000000 & 0.000000 \\
rico:hasCreationDate/rdf:type & 100.000000 & 0.000000 & 0.000000 \\
rico:isOrWasDigitalInstantiationOf & 100.000000 & 0.000000 & 0.000000 \\
addr:hasTime/rdf:type & 100.000000 & 0.000000 & 0.000000 \\
cad:hasClasse/cad:hasClasseValue & 100.000000 & 0.000000 & 0.000000 \\
cad:isEventType & 100.000000 & 0.000000 & 0.000000 \\
rico:hasCreationDate/addr:timeCalendar & 100.000000 & 0.000000 & 0.000000 \\
addr:hasTime/addr:timePrecision & 100.000

'\\begin{tabular}{lrrr}\n\\toprule\nProperty & Agreement & Deficit & Surplus \\\\\n\\midrule\nrico:hasCreationDate/addr:timePrecision & 100.000000 & 0.000000 & 0.000000 \\\\\naddr:hasTime/addr:timeCalendar & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:isOrWasInstantiationOf & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:isOrWasIncludedIn & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:isOrWasInstantiation & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:isOrWasComponentOf & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:hasOrHadDerivedInstantiation & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:hasCreationDate/rdf:type & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:isOrWasDigitalInstantiationOf & 100.000000 & 0.000000 & 0.000000 \\\\\naddr:hasTime/rdf:type & 100.000000 & 0.000000 & 0.000000 \\\\\ncad:hasClasse/cad:hasClasseValue & 100.000000 & 0.000000 & 0.000000 \\\\\ncad:isEventType & 100.000000 & 0.000000 & 0.000000 \\\\\nrico:hasCreationDate/addr:timeCalendar & 100.000000 & 0.000000 & 0.00000