## 30. Datatype property values evaluation

In [1]:
import pandas as pd
from graphdbfunctions import *
from graph_evaluation import *
from namespaces import *

In [2]:
GRAPHDB_HOST = "http://localhost:7200"
GRAPHDB_REPO = 'lhay' # Your repository ID/Name
proxies = {
 "http": None,
 "https": None,
}

### 1. Global metrics

In [3]:
query_all = """
PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
PREFIX addr: <http://rdf.geohistoricaldata.org/def/address#>
SELECT ?s ?p 
WHERE {
    {
    SELECT ?s ?p
    WHERE {GRAPH <GRAPH_URI> {
            ?s rico:hasCreationDate/addr:timeStamp ?o.
            BIND("rico:hasCreationDate/addr:timeStamp" AS ?p)
            FILTER (!STRSTARTS(STR(?s), "_:"))
 			FILTER (isLiteral(?o))
 			FILTER (!isLiteral(?s))
        }}
    } UNION {
        SELECT ?s ?p
    	WHERE {GRAPH <GRAPH_URI> {
            ?s addr:hasTime/addr:timeStamp ?o
            BIND("addr:hasTime/addr:timeStamp" AS ?p)
            FILTER (!STRSTARTS(STR(?s), "_:"))
 			FILTER (isLiteral(?o))
 			FILTER (!isLiteral(?s))
        }}
    } UNION {
        SELECT ?s ?p
    	WHERE {GRAPH <GRAPH_URI> {
            ?s ?p ?o
            FILTER (!STRSTARTS(STR(?s), "_:"))
 			FILTER (isLiteral(?o))
 			FILTER (!isLiteral(?s))
        }}
    }
}
"""

In [4]:
gold_triples, pred_triples = get_triples(query_all, "http://rdf.geohistoricaldata.org/gold","http://rdf.geohistoricaldata.org/auto",GRAPHDB_HOST,GRAPHDB_REPO,proxies)

In [5]:
print(f'Number of triples in gold : {len(gold_triples)}')
print(f'Number of triples in pred : {len(pred_triples)}')

Number of triples in gold : 2877
Number of triples in pred : 3019


In [6]:
gold_in_pred, gold_not_in_pred, pred_in_gold, pred_not_in_gold = compare_dfs(gold_triples, pred_triples)

In [7]:
metrics_all = compute_metrics(gold_triples,pred_triples)
metrics_all

{'Agreement': 87.35000000000001, 'Deficit': 12.65, 'Surplus': 16.66}

## 2. Plot identifiers values
For the *Landmark* of type *Plot* which have homologues in the predited ad gold graphs, compute the similirity between plot identifiers (similarity = 1). 

In [8]:
query_identifiers = prefixes + """
select ?s ?value
where {
    graph <GRAPH_URI> {
    ?s a addr:Landmark.
    ?s addr:isLandmarkType cad_ltype:Plot.
    ?s dcterms:identifier ?value. 
}}
"""

In [9]:
gold, pred = get_triples(query_identifiers, "http://rdf.geohistoricaldata.org/gold", "http://rdf.geohistoricaldata.org/auto",GRAPHDB_HOST,GRAPHDB_REPO,proxies)

In [10]:
for c in ["s"]:
    for prefix, uri in prefixes_dict.items():
        gold[c] = gold[c].str.replace(prefix, uri, regex=False)
        pred[c] = pred[c].str.replace(prefix, uri, regex=False)

In [11]:
print(f'Number of triples in gold : {len(gold)}')
print(f'Number of triples in pred : {len(pred)}')

Number of triples in gold : 362
Number of triples in pred : 357


In [12]:
gold_in_pred, gold_not_in_pred, pred_in_gold, pred_not_in_gold = compare_dfs(gold, pred)

In [13]:
metrics_identifiers = compute_metrics(gold,pred)
metrics_identifiers

{'Agreement': 92.54, 'Deficit': 7.46, 'Surplus': 6.16}

To go further in the analysis of the plot identifiers, we compute normalized levenshtein distance on erroneous identifiers.

In [14]:
#Join gold and pred df on s
gold = gold.rename(columns={"s":"s_gold"})
pred = pred.rename(columns={"s":"s_pred"})
gold = gold.rename(columns={"value":"value_gold"})
pred = pred.rename(columns={"value":"value_pred"})
merged = pd.merge(gold, pred, left_on="s_gold", right_on="s_pred", how="outer", indicator=True)
merged

Unnamed: 0,s_gold,value_gold,s_pred,value_pred,_merge
0,landmark:0058f229-3c7d-4157-9ea5-ae12e04c03f9_...,B-277,,,left_only
1,landmark:00876f72-28fc-4dce-a73f-0eebd7bad322_...,B-99,landmark:00876f72-28fc-4dce-a73f-0eebd7bad322_...,B-99,both
2,landmark:0198dde0-a49d-45f5-a7ed-426b199f7c56_...,B-236,landmark:0198dde0-a49d-45f5-a7ed-426b199f7c56_...,B-236,both
3,landmark:02aaa7bf-1e7b-4f77-a2e5-f6675dd030c9_...,B-185,landmark:02aaa7bf-1e7b-4f77-a2e5-f6675dd030c9_...,B-185,both
4,landmark:02cbcebf-e843-4f17-b58f-92903ed4e75a_...,B-289,landmark:02cbcebf-e843-4f17-b58f-92903ed4e75a_...,B-289,both
...,...,...,...,...,...
370,landmark:fd4fa06d-e9f6-4f13-b043-e1516a8239ef_...,B-276,,,left_only
371,landmark:fd65acf9-998a-4d60-b72e-aa958967f1ba_...,B-124,landmark:fd65acf9-998a-4d60-b72e-aa958967f1ba_...,B-124,both
372,landmark:fd7e03b0-d2f7-4188-9e33-42c1018684dc_...,B-59,landmark:fd7e03b0-d2f7-4188-9e33-42c1018684dc_...,B-59,both
373,landmark:ff729965-a9be-4a4c-b133-d9bc6b53eb1b_...,B-205,landmark:ff729965-a9be-4a4c-b133-d9bc6b53eb1b_...,B-205,both


In [15]:
merged_filtered_nan = merged[merged['value_gold'].notna() & merged['value_pred'].notna()]

In [16]:
#Compute normalized levenshtein distance beweend value_gold and value_pred
merged_filtered_nan['levenshtein'] = merged_filtered_nan.apply(lambda row: normalized_levenshtein(row['value_gold'], row['value_pred']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_filtered_nan['levenshtein'] = merged_filtered_nan.apply(lambda row: normalized_levenshtein(row['value_gold'], row['value_pred']), axis=1)


In [17]:
merged_filtered_nan

Unnamed: 0,s_gold,value_gold,s_pred,value_pred,_merge,levenshtein
1,landmark:00876f72-28fc-4dce-a73f-0eebd7bad322_...,B-99,landmark:00876f72-28fc-4dce-a73f-0eebd7bad322_...,B-99,both,1.0
2,landmark:0198dde0-a49d-45f5-a7ed-426b199f7c56_...,B-236,landmark:0198dde0-a49d-45f5-a7ed-426b199f7c56_...,B-236,both,1.0
3,landmark:02aaa7bf-1e7b-4f77-a2e5-f6675dd030c9_...,B-185,landmark:02aaa7bf-1e7b-4f77-a2e5-f6675dd030c9_...,B-185,both,1.0
4,landmark:02cbcebf-e843-4f17-b58f-92903ed4e75a_...,B-289,landmark:02cbcebf-e843-4f17-b58f-92903ed4e75a_...,B-289,both,1.0
5,landmark:0376160c-792b-4c7c-9f4f-b26cdb17c9cb_...,B-269,landmark:0376160c-792b-4c7c-9f4f-b26cdb17c9cb_...,B-269,both,1.0
...,...,...,...,...,...,...
369,landmark:fbe14ec1-4111-4360-9695-658b38bb2c41_...,B-22,landmark:fbe14ec1-4111-4360-9695-658b38bb2c41_...,B-22,both,1.0
371,landmark:fd65acf9-998a-4d60-b72e-aa958967f1ba_...,B-124,landmark:fd65acf9-998a-4d60-b72e-aa958967f1ba_...,B-124,both,1.0
372,landmark:fd7e03b0-d2f7-4188-9e33-42c1018684dc_...,B-59,landmark:fd7e03b0-d2f7-4188-9e33-42c1018684dc_...,B-59,both,1.0
373,landmark:ff729965-a9be-4a4c-b133-d9bc6b53eb1b_...,B-205,landmark:ff729965-a9be-4a4c-b133-d9bc6b53eb1b_...,B-205,both,1.0


In [18]:
#Print exemple where levenshtein is lower than 1.0
levenshtein_errors = []
print("Exemples where levenshtein is lower than 1.0")
for index, row in merged_filtered_nan.iterrows():
    if row['levenshtein'] < 1.0:
        print(f"gold: {row['value_gold']} pred: {row['value_pred']} levenshtein: {round(row['levenshtein'],2)}")
        levenshtein_errors.append((row['value_gold'], row['value_pred'], row['levenshtein']))

levenshtein_errors_df = pd.DataFrame(levenshtein_errors, columns=['gold', 'pred', 'levenshtein similarity'])
print(levenshtein_errors_df.to_latex())

Exemples where levenshtein is lower than 1.0
gold: B-353bis pred: B-353bT levenshtein: 0.75
gold: B-216bis pred: B-216 levenshtein: 0.62
gold: B-254 pred: B-2547 levenshtein: 0.83
gold: B-255 pred: B-2548 levenshtein: 0.67
gold: B-76bis pred: B-76 levenshtein: 0.57
gold: B-256 pred: B-2549 levenshtein: 0.67
gold: B-352 pred: B-UNKNOWN levenshtein: 0.22
gold: B-52 pred: B-UNKNOWN levenshtein: 0.22
gold: B-×206± pred: B-206 levenshtein: 0.71
\begin{tabular}{lllr}
\toprule
 & gold & pred & levenshtein similarity \\
\midrule
0 & B-353bis & B-353bT & 0.750000 \\
1 & B-216bis & B-216 & 0.625000 \\
2 & B-254 & B-2547 & 0.833333 \\
3 & B-255 & B-2548 & 0.666667 \\
4 & B-76bis & B-76 & 0.571429 \\
5 & B-256 & B-2549 & 0.666667 \\
6 & B-352 & B-UNKNOWN & 0.222222 \\
7 & B-52 & B-UNKNOWN & 0.222222 \\
8 & B-×206± & B-206 & 0.714286 \\
\bottomrule
\end{tabular}



Different type of not similar values : 
* noisy number recognition
* "bis" or "ter" not transcribed or noisy
* missed number