# 10. Instances

This notebook computes completeness and semantic precision for the instances of the predicted graph by comparing them with the gold-standard graph.

In [21]:
import glob
import pandas as pd
import requests
from graphdbfunctions import *
from graph_evaluation import *
from namespaces import prefixes
from great_tables import GT
import polars as pl

In [22]:
# Connection settings for the GraphDB repository queried by this notebook.
GRAPHDB_HOST = "http://localhost:7200"
GRAPHDB_REPO = "lhay"  # Your repository ID/Name

# Both URL schemes are explicitly mapped to None.
# NOTE(review): presumably forwarded to `requests` so that no proxy is used for the local server — confirm in graphdbfunctions.
proxies = {"http": None, "https": None}

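Before running this notebook for the first time, execute the following SPARQL update on the repository. It replaces every `lrtype:Undefined` landmark relation type in the gold graph with `lrtype:Within`: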
```sparql
PREFIX lrtype: <http://rdf.geohistoricaldata.org/id/codes/address/landmarkRelationType/>
PREFIX addr: <http://rdf.geohistoricaldata.org/def/address#>

DELETE {
  GRAPH <http://rdf.geohistoricaldata.org/gold> {
    ?s addr:isLandmarkRelationType lrtype:Undefined.
  }
}
INSERT {
  GRAPH <http://rdf.geohistoricaldata.org/gold> {
    ?s addr:isLandmarkRelationType lrtype:Within.
  }
}
WHERE {
  GRAPH <http://rdf.geohistoricaldata.org/gold> {
    ?s addr:isLandmarkRelationType lrtype:Undefined.
  }
}
```

### 1. Global metrics

In [23]:
# Plain listing of every (entity, rdf:type) pair in the graph named by the <GRAPH_URI> placeholder.
# NOTE(review): in top-to-bottom execution order `query` is reassigned by the next cell before it
# is passed to get_triples, so this simpler version is never actually run.
query = prefixes + """
select * where {
	GRAPH <GRAPH_URI> {
    ?entity rdf:type ?type.
    FILTER (!STRSTARTS(STR(?entity), "_:"))  # Ensures the subject is not a blank node
}}
"""

In [24]:
# Query returning (?entity, ?type) rows as the UNION of three sub-selects over <GRAPH_URI>:
#   1. every (entity, rdf:type) pair, with the blank-node filter below;
#   2. for each addr:Attribute attached to an addr:Landmark through addr:hasAttribute,
#      one row holding the *landmark* as ?entity and addr:Attribute as ?type;
#   3. for each addr:AttributeVersion reached through addr:hasAttribute / addr:hasAttributeVersion,
#      one row holding the *landmark* as ?entity and addr:AttributeVersion as ?type.
# There is no DISTINCT, so a landmark yields one row per attribute / attribute version.
# <GRAPH_URI> is a placeholder — presumably substituted by get_triples for each graph; confirm.
# NOTE(review): SPARQL 1.1 defines STR() only for literals and IRIs, so blank nodes are likely
# dropped because the FILTER errors rather than through the "_:" test; FILTER(!isBlank(?entity))
# would state the intent directly — confirm behaviour on GraphDB before changing.
query = prefixes + """
select ?entity ?type
where 
    {{select * 
    where {
        GRAPH <GRAPH_URI> {
        ?entity rdf:type ?type.
        FILTER (!STRSTARTS(STR(?entity), "_:"))  # Ensures the subject is not a blank node
    	}}
	} UNION {
	{select * 
    where {
        GRAPH <GRAPH_URI> {
        ?entity_att rdf:type addr:Attribute.
        ?entity addr:hasAttribute ?entity_att.
        ?entity rdf:type addr:Landmark.
        BIND(addr:Attribute as ?type)
    	}}
        }} UNION {
        select * 
    where {
        GRAPH <GRAPH_URI> {
        ?entity_attv rdf:type addr:AttributeVersion.
        ?entity_att addr:hasAttributeVersion ?entity_attv.
        ?entity addr:hasAttribute ?entity_att.
        ?entity rdf:type addr:Landmark.
        BIND(addr:AttributeVersion as ?type)
    	}
        }}}
"""

In [25]:
# Run the same query against the gold graph and the automatically built graph.
gold, pred = get_triples(
    query,
    "http://rdf.geohistoricaldata.org/gold",
    "http://rdf.geohistoricaldata.org/auto",
    GRAPHDB_HOST,
    GRAPHDB_REPO,
    proxies,
)

In [26]:
from namespaces import prefixes_dict
# Apply every (key, value) pair of prefixes_dict as a literal substring replacement
# on both the "entity" and "type" columns of each result frame.
# NOTE(review): the displayed results use prefixed names (e.g. addr:Change), so the dict
# presumably maps namespace IRI -> prefix — confirm in namespaces.py.
for frame in (gold, pred):
    for c in ("entity", "type"):
        for search, replacement in prefixes_dict.items():
            frame[c] = frame[c].str.replace(search, replacement, regex=False)

In [27]:
# Row counts of the two result sets (one row per (entity, type) pair returned by the query).
for label, frame in (("gold", gold), ("pred", pred)):
    print(f'Number of triples in {label} : {len(frame)}')

Number of triples in gold : 5289
Number of triples in pred : 5200


In [28]:
# Set to True to dump the compared tables to CSV files in the working directory.
SAVE_FILES = False

# NOTE(review): per the unpacking names, compare_dfs is assumed to split each side into the rows
# shared with the other graph (*_in_*) and the rows missing from it (*_not_in_*) — confirm in
# graph_evaluation.
gold_in_pred, gold_not_in_pred, pred_in_gold, pred_not_in_gold = compare_dfs(gold, pred)

if SAVE_FILES:
    gold.to_csv("gold_entities.csv", index=False)
    pred.to_csv("pred_entities.csv", index=False)
    # The "non_identical" exports must contain the rows WITHOUT a counterpart in the other graph;
    # the previous code wrote the matching rows (gold_in_pred / pred_in_gold) under these names.
    gold_not_in_pred.to_csv("non_identical_entities_gold.csv", index=False)
    pred_not_in_gold.to_csv("non_identical_entities_pred.csv", index=False)

In [29]:
# Agreement / Deficit / Surplus computed over all (entity, type) rows at once.
metrics_all = compute_metrics(gold, pred)
metrics_all

{'Agreement': 87.26,
 'Deficit': 12.740000000000002,
 'Surplus': 11.940000000000001}

### 2. Evaluation by type (rdf:type)

#### Count

In [30]:
# Per-type counts for each graph, produced by the same helper for gold and pred
gold_df, pred_df = (get_property_df(frame) for frame in (gold, pred))

In [31]:
# Outer-merge the two count tables on the 'type' column (Count_x = gold, Count_y = pred),
# treat a type missing on one side as 0, add the absolute difference of the two counts
# and list the largest differences first.
df = (
    pd.merge(gold_df, pred_df, on='type', how='outer')
    .fillna(0)
    .assign(Difference=lambda counts: abs(counts['Count_x'] - counts['Count_y']))
    .sort_values(by='Difference', ascending=False)
)
display(df)

Unnamed: 0,type,Count_x,Count_y,Difference
2,addr:Change,1446,1407,39
0,addr:Attribute,1084,1050,34
1,addr:AttributeVersion,1084,1050,34
6,cad:Taxpayer,152,185,33
5,addr:LandmarkRelation,728,712,16
4,addr:Landmark,367,373,6
7,rico:Instantiation,395,390,5
3,addr:Event,1,1,0
8,rico:Record,30,30,0
9,rico:RecordSet,2,2,0


#### Compute metrics

In [32]:
# Names of the rdf:type values to evaluate, taken from the "type" column of the count table.
properties = get_list_of_properties(df, "type")
properties

['addr:Change',
 'addr:Attribute',
 'addr:AttributeVersion',
 'cad:Taxpayer',
 'addr:LandmarkRelation',
 'addr:Landmark',
 'rico:Instantiation',
 'addr:Event',
 'rico:Record',
 'rico:RecordSet']

In [33]:
#Filter these df by 'type' column value : 
def filter_df(df, column, column_value):
    """
    Return an independent copy of the rows of `df` whose `column` equals `column_value`.

    The original DataFrame is left untouched and the selected rows keep their index labels.
    """
    row_mask = df[column] == column_value
    return df.loc[row_mask].copy()

In [34]:
from great_tables import GT
import polars as pl

# Score every rdf:type separately: restrict both graphs to that type, then compare them.
metrics_by_properties = []
for p in properties:
    gold_instances = filter_df(gold, "type", p)
    pred_instances = filter_df(pred, "type", p)
    metrics = compute_metrics(gold_instances, pred_instances)

    # One result row per type; Precision / Recall / F1 score columns are currently not reported.
    row = {"Type": p}
    row.update((name, metrics[name]) for name in ("Agreement", "Deficit", "Surplus"))
    row["Actual"] = len(gold_instances)
    row["Predicted"] = len(pred_instances)
    metrics_by_properties.append(row)

# Tabulate the per-type rows
metrics_df = pd.DataFrame(metrics_by_properties)

In [35]:
# Nice display: Agreement uses the palette red -> orange -> green over the 0-100 domain,
# Deficit and Surplus use the reversed palette over the same domain.
(
    GT(metrics_df)
    .data_color(
        columns=["Agreement"],
        palette=["red", "orange", "green"],
        domain=[0, 100],
    )
    .data_color(
        columns=["Deficit", "Surplus"],
        palette=["green", "orange", "red"],
        domain=[0, 100],
    )
)

Type,Agreement,Deficit,Surplus,Actual,Predicted
addr:Change,93.91,6.09,3.4799999999999995,1446,1407
addr:Attribute,95.2,4.8,3.43,1084,1050
addr:AttributeVersion,95.2,4.8,3.43,1084,1050
cad:Taxpayer,60.53,39.47,50.27,152,185
addr:LandmarkRelation,47.39,52.61,51.54,728,712
addr:Landmark,94.28,5.72,7.24,367,373
rico:Instantiation,95.44,4.5600000000000005,3.3300000000000005,395,390
addr:Event,100.0,0.0,0.0,1,1
rico:Record,100.0,0.0,0.0,30,30
rico:RecordSet,100.0,0.0,0.0,2,2


In [36]:
# df_to_latex already prints the LaTeX table; the trailing semicolon keeps the notebook from
# also echoing the returned string as a long escaped one-line repr below it.
df_to_latex(metrics_df);

\begin{tabular}{lrrrrr}
\toprule
Type & Agreement & Deficit & Surplus & Actual & Predicted \\
\midrule
rico:RecordSet & 100.000000 & 0.000000 & 0.000000 & 2 & 2 \\
rico:Record & 100.000000 & 0.000000 & 0.000000 & 30 & 30 \\
addr:Event & 100.000000 & 0.000000 & 0.000000 & 1 & 1 \\
rico:Instantiation & 95.440000 & 4.560000 & 3.330000 & 395 & 390 \\
addr:AttributeVersion & 95.200000 & 4.800000 & 3.430000 & 1084 & 1050 \\
addr:Attribute & 95.200000 & 4.800000 & 3.430000 & 1084 & 1050 \\
addr:Landmark & 94.280000 & 5.720000 & 7.240000 & 367 & 373 \\
addr:Change & 93.910000 & 6.090000 & 3.480000 & 1446 & 1407 \\
cad:Taxpayer & 60.530000 & 39.470000 & 50.270000 & 152 & 185 \\
addr:LandmarkRelation & 47.390000 & 52.610000 & 51.540000 & 728 & 712 \\
\bottomrule
\end{tabular}



'\\begin{tabular}{lrrrrr}\n\\toprule\nType & Agreement & Deficit & Surplus & Actual & Predicted \\\\\n\\midrule\nrico:RecordSet & 100.000000 & 0.000000 & 0.000000 & 2 & 2 \\\\\nrico:Record & 100.000000 & 0.000000 & 0.000000 & 30 & 30 \\\\\naddr:Event & 100.000000 & 0.000000 & 0.000000 & 1 & 1 \\\\\nrico:Instantiation & 95.440000 & 4.560000 & 3.330000 & 395 & 390 \\\\\naddr:AttributeVersion & 95.200000 & 4.800000 & 3.430000 & 1084 & 1050 \\\\\naddr:Attribute & 95.200000 & 4.800000 & 3.430000 & 1084 & 1050 \\\\\naddr:Landmark & 94.280000 & 5.720000 & 7.240000 & 367 & 373 \\\\\naddr:Change & 93.910000 & 6.090000 & 3.480000 & 1446 & 1407 \\\\\ncad:Taxpayer & 60.530000 & 39.470000 & 50.270000 & 152 & 185 \\\\\naddr:LandmarkRelation & 47.390000 & 52.610000 & 51.540000 & 728 & 712 \\\\\n\\bottomrule\n\\end{tabular}\n'

### 3. Subtypes

Finally, for the relevant cases (landmark types, landmark relation types, source types and attribute types), we compute the same agreement metrics for each subtype value.

In [37]:
# Maps each typing property (or property path) to the subtype values evaluated for it.
subtypes_properties = {
    "addr:isLandmarkType": [
        "cad_ltype:Plot",
        "cad_ltype:Section",
        "cad_ltype:Commune",
        "ltype:District",
        "ltype:Undefined",
    ],
    "addr:isLandmarkRelationType": [
        "lrtype:Within",
        "lrtype:Undefined",
    ],
    "cad:isSourceType": [
        "srctype:PageDeRegistre",
        "srctype:LigneEtatDeSection",
        "srctype:Cadastre",
        "srctype:EtatsDeSections",
        "srctype:EtatsDeSections_Scp_Seine_1835",
    ],
    "addr:hasAttribute/addr:isAttributeType": [
        "cad_atype:PlotAddress",
        "cad_atype:PlotTaxpayer",
        "cad_atype:PlotNature",
    ],
}

In [38]:
# Query template selecting every ?entity linked to VALUE through SUBTYPE_PROP in <GRAPH_URI>.
# SUBTYPE_PROP and VALUE are textual placeholders filled in with str.replace by the loop that
# uses this template; ?type is bound to the constant label "SUBTYPE_PROP;VALUE" (after substitution).
# <GRAPH_URI> is presumably substituted by get_triples for each graph — confirm.
query_subtype = prefixes + """
select ?entity ?type where {
	GRAPH <GRAPH_URI> {
    ?entity SUBTYPE_PROP VALUE.
    BIND("SUBTYPE_PROP;VALUE" AS ?type)
}}"""

In [39]:
# For every (property, subtype value) pair: instantiate the query template, fetch the matching
# entities from both graphs, normalise the names and score gold against pred.
metrics_by_subproperties = []
for p, values in subtypes_properties.items():
    uquery = query_subtype.replace("SUBTYPE_PROP", p)
    print(values)
    for val in values:
        print(val)
        uquery_t = uquery.replace("VALUE", val)
        gold_subtypes, pred_subtypes = get_triples(
            uquery_t,
            "http://rdf.geohistoricaldata.org/gold",
            "http://rdf.geohistoricaldata.org/auto",
            GRAPHDB_HOST,
            GRAPHDB_REPO,
            proxies,
        )
        # Result frames with no rows are left as they are; only non-empty ones are rewritten.
        for frame in (gold_subtypes, pred_subtypes):
            if len(frame) == 0:
                continue
            for c in ("entity", "type"):
                for prefix, uri in prefixes_dict.items():
                    frame[c] = frame[c].str.replace(prefix, uri, regex=False)
        metrics = compute_metrics(gold_subtypes, pred_subtypes)
        metrics_by_subproperties.append(
            dict(
                Property=p,
                Subtype=val,
                Agreement=metrics["Agreement"],
                Deficit=metrics["Deficit"],
                Surplus=metrics["Surplus"],
            )
        )

['cad_ltype:Plot', 'cad_ltype:Section', 'cad_ltype:Commune', 'ltype:District', 'ltype:Undefined']
cad_ltype:Plot
cad_ltype:Section
cad_ltype:Commune
ltype:District
ltype:Undefined
['lrtype:Within', 'lrtype:Undefined']
lrtype:Within
lrtype:Undefined
['srctype:PageDeRegistre', 'srctype:LigneEtatDeSection', 'srctype:Cadastre', 'srctype:EtatsDeSections', 'srctype:EtatsDeSections_Scp_Seine_1835']
srctype:PageDeRegistre
srctype:LigneEtatDeSection
srctype:Cadastre
srctype:EtatsDeSections
srctype:EtatsDeSections_Scp_Seine_1835
['cad_atype:PlotAddress', 'cad_atype:PlotTaxpayer', 'cad_atype:PlotNature']
cad_atype:PlotAddress
cad_atype:PlotTaxpayer
cad_atype:PlotNature


In [40]:
# Tabulate the per-subtype rows and export them as a LaTeX table.
metrics_df_sub = pd.DataFrame(metrics_by_subproperties)
# df_to_latex already prints the table; the trailing semicolon keeps the notebook from also
# echoing the returned string as a long escaped one-line repr below it.
df_to_latex(metrics_df_sub);

\begin{tabular}{llrrr}
\toprule
Property & Subtype & Agreement & Deficit & Surplus \\
\midrule
addr:isLandmarkType & cad_ltype:Section & 100.000000 & 0.000000 & 0.000000 \\
addr:isLandmarkType & cad_ltype:Commune & 100.000000 & 0.000000 & 0.000000 \\
cad:isSourceType & srctype:PageDeRegistre & 100.000000 & 0.000000 & 0.000000 \\
cad:isSourceType & srctype:EtatsDeSections_Scp_Seine_1835 & 100.000000 & 0.000000 & 0.000000 \\
cad:isSourceType & srctype:EtatsDeSections & 100.000000 & 0.000000 & 0.000000 \\
cad:isSourceType & srctype:Cadastre & 100.000000 & 0.000000 & 0.000000 \\
addr:isLandmarkType & cad_ltype:Plot & 95.030000 & 4.970000 & 3.640000 \\
cad:isSourceType & srctype:LigneEtatDeSection & 95.030000 & 4.970000 & 3.640000 \\
addr:hasAttribute/addr:isAttributeType & cad_atype:PlotTaxpayer & 95.030000 & 4.970000 & 3.370000 \\
addr:hasAttribute/addr:isAttributeType & cad_atype:PlotNature & 95.000000 & 5.000000 & 3.390000 \\
addr:hasAttribute/addr:isAttributeType & cad_atype:PlotAddres

'\\begin{tabular}{llrrr}\n\\toprule\nProperty & Subtype & Agreement & Deficit & Surplus \\\\\n\\midrule\naddr:isLandmarkType & cad_ltype:Section & 100.000000 & 0.000000 & 0.000000 \\\\\naddr:isLandmarkType & cad_ltype:Commune & 100.000000 & 0.000000 & 0.000000 \\\\\ncad:isSourceType & srctype:PageDeRegistre & 100.000000 & 0.000000 & 0.000000 \\\\\ncad:isSourceType & srctype:EtatsDeSections_Scp_Seine_1835 & 100.000000 & 0.000000 & 0.000000 \\\\\ncad:isSourceType & srctype:EtatsDeSections & 100.000000 & 0.000000 & 0.000000 \\\\\ncad:isSourceType & srctype:Cadastre & 100.000000 & 0.000000 & 0.000000 \\\\\naddr:isLandmarkType & cad_ltype:Plot & 95.030000 & 4.970000 & 3.640000 \\\\\ncad:isSourceType & srctype:LigneEtatDeSection & 95.030000 & 4.970000 & 3.640000 \\\\\naddr:hasAttribute/addr:isAttributeType & cad_atype:PlotTaxpayer & 95.030000 & 4.970000 & 3.370000 \\\\\naddr:hasAttribute/addr:isAttributeType & cad_atype:PlotNature & 95.000000 & 5.000000 & 3.390000 \\\\\naddr:hasAttribute/add