# 30 - Coreference detection and entities creation : example of the taxpayers 

In [1]:
import pandas as pd
from rdflib import Graph, URIRef, Literal, RDFS, Namespace
from rdflib.namespace import SKOS, RDF, RDFS
import glob
import json
import time
import datetime
import re

In [2]:
import sys
import os
# Access to the utils directory
current_dir = os.getcwd()
utils_dir = os.path.join(current_dir, '..', 'utils')
sys.path.append(utils_dir)

In [3]:
from linking_utils import PrepareQueriesForEL
from string_utils import NormalizeText

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/STual/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/STual/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Retrieve the files and values

In [4]:
ROOT = "/home/STual/DAN-cadastre/"
SAVE_FOLDER = "/home/STual/DAN-cadastre/data/LHAY"
JSONS = glob.glob(ROOT + 'inference/LHAY/*.json') #WHere JSONS produced in DAN format output are saved

In [5]:
taxpayers = PrepareQueriesForEL.retrieve_mentions(JSONS, ROOT + 'inference/LHAY/', 'Ⓒ', True)
print(f"Number of documents : {len(taxpayers)}")

Number of documents : 26


In [6]:
distinct_taxpayers, distinct_taxpayers_details = PrepareQueriesForEL.distinct_mentions_with_ne(taxpayers)
print(f"Number of distinct taxpayers mentions : {len(distinct_taxpayers)}")

Number of distinct taxpayers mentions : 289


In [7]:
PRINT_MENTIONS = True
if PRINT_MENTIONS:
    distinct_taxpayers

## 1. Data normalization

In [8]:
remove_chars_regex = '→()↑↓×±.,!?;:-@#$%^&*'
replacement_char = ''

taxpayers_mentions = []
for d in distinct_taxpayers:
    taxpayer_json = distinct_taxpayers_details[d]
    new_taxpayer_json = {}
    if len(taxpayer_json['name']) > 0:
        new_taxpayer_json["name"] = NormalizeText.remove_accents(NormalizeText.replace_characters(NormalizeText.replace_characters(taxpayer_json['name'].lower(), '→', ' '),remove_chars_regex,replacement_char))
    else:
        new_taxpayer_json["name"] = ""
    if len(taxpayer_json['firstnames']) > 0:
        new_taxpayer_json["firstnames"] = NormalizeText.remove_accents(NormalizeText.replace_characters(NormalizeText.replace_characters(taxpayer_json['firstnames'].lower(), '→', ' '),remove_chars_regex,replacement_char))
    else:
        new_taxpayer_json["firstnames"] = ""
    if len(taxpayer_json['familystatus']) > 0:
        status = " ".join(taxpayer_json['familystatus'])
        new_taxpayer_json["familystatus"] = NormalizeText.remove_accents(NormalizeText.replace_characters(NormalizeText.replace_characters(status.lower(), '→', ' '), remove_chars_regex, replacement_char))
    else:
        new_taxpayer_json["familystatus"] = ""

    activities = []
    if len(taxpayer_json['activity']) > 0:
        activities = activities + [NormalizeText.remove_accents(NormalizeText.replace_characters(NormalizeText.replace_characters(a.lower(), '→', ' '), remove_chars_regex, replacement_char)) for a in taxpayer_json['activity']]
        new_taxpayer_json["activity"] = activities
    else:
        new_taxpayer_json["activity"] = activities

    adresses = []
    if len(taxpayer_json['address']) > 0:
        adresses = adresses + [NormalizeText.remove_accents(NormalizeText.replace_characters(NormalizeText.replace_characters(a.lower(), '→', ' '), remove_chars_regex, replacement_char)) for a in taxpayer_json['address']]
        new_taxpayer_json["address"] = adresses
    else:
        new_taxpayer_json["address"] = adresses

    titles = []
    if len(taxpayer_json['title']) > 0:
        titles = titles + [NormalizeText.remove_accents(NormalizeText.replace_characters(NormalizeText.replace_characters(a.lower(), '→', ' '), remove_chars_regex, replacement_char)) for a in taxpayer_json['address']]
        new_taxpayer_json["title"] = titles
    else:
        new_taxpayer_json["title"] = titles
        
    new_taxpayer_json["index_number"] = taxpayer_json["index_num"]
    taxpayers_mentions.append(new_taxpayer_json)

In [9]:
taxpayers_mentions[0]

{'name': 'godefroy ',
 'firstnames': 'denis ',
 'familystatus': 've',
 'activity': [],
 'address': [],
 'title': [],
 'index_number': '129'}

## 2. Create the clusters

In [10]:
from linking_utils import MentionGrouper

# Create an instance of MentionGrouper
grouper = MentionGrouper(name_threshold=0.68,
    firstname_threshold=0.65,
    familystatus_threshold=0.65,
    mesure="normalizedlevenshtein"#embeddingcosinus normalizedlevenshtein)
)
# Group mentions
groups = grouper.group_mentions(
    taxpayers_mentions)

In [11]:
len(groups)

185

In [12]:
counter = 0
for li in groups:
    for i in li:
        counter += 1
counter

289

In [13]:
assert counter == len(distinct_taxpayers)

In [14]:
DISPLAY_GROUPS = True
if DISPLAY_GROUPS : 
    for l in groups:
        print("####################")
        for i in l:
            print(distinct_taxpayers[i], distinct_taxpayers_details[distinct_taxpayers[i]]["index_num"])

####################
Godefroy denis V↑e↓ 129
Godefroy denis ch↑es↓ V↑e↓ 129
Godefroy denis ch↑es↓ 129
####################
Godefroy claude joseph 131
####################
Godefroy antoine 130
####################
Godefroy proppice laude 131
####################
Godefroy claude→antoine per 131
####################
Gabillot claude 124
####################
mul 36
Nul 124
Mil 155
####################
Benoist (lanoslas 26
Benoist slanislac 20
Denoist slanislas 188
Denoist stanislas 26
Denoist Otanislas 26
####################
Benoist Venistay 26
####################
Chevalier louis lt↑re↓ 72
Chevalier louis Et↑e↓ 72
Chevalier louis C↑re↓ 72
Chevalier louis 72
####################
Chevalier claude fils→S↑tre↓ 74
Chevalier claude fils→Sutve 71
Chevallier Claudet↑e↓ 74
Chevalier claude fils→(denoist stanislas 74
####################
Chevallier louis taytor 35
####################
Chevalier f↑ois↓ claude 73
Chevalier f↑ois↓ Claude 72
####################
Chevalier angélique 71
#################

## 3. Create the entities

In [15]:
import Levenshtein as lev
from collections import Counter
import uuid

def normalized_levenshtein(str1, str2):
    """Returns a normalized Levenshtein distance (1 - distance)"""
    if len(str1) == 0 and len(str2) == 0:
        return 1.0  # Both empty, consider them identical
    if len(str1) == 0 or len(str2) == 0:
        return 0.0  # One is empty, completely dissimilar
    return 1 - lev.distance(str1, str2) / max(len(str1), len(str2))

# Helper function to calculate the representative name (minimal Levenshtein distance)
def get_representative_name(names):
    min_distance = float('inf')
    representative_name = None
    
    for name in names:
        avg_distance = sum(normalized_levenshtein(name, other_name) for other_name in names) / len(names)
        if avg_distance < min_distance:
            min_distance = avg_distance
            representative_name = name
    
    return representative_name

# Helper function to calculate the longest string
def get_longest_string(strings):
    return max(strings, key=len)

# Helper function to calculate the most appropriate family status
def get_representative_familystatus(familystatuses):
    # Filter out "id" and "idem" if they are not the only values
    filtered_statuses = [status for status in familystatuses if status.lower() not in ["id", "idem"]]
    
    if filtered_statuses:
        return get_longest_string(filtered_statuses)  # Choose the longest one among filtered values
    else:
        return get_longest_string(familystatuses)  # If "id" or "idem" are the only options, choose the longest one

# Function to create RDF entities based on the most representative values
def create_rdf_entities(groups, mentions):
    rdf_entities = []

    for group in groups:
        group_mentions = [mentions[i] for i in group]
        
        # Step 1: Choose the representative name
        names = [m['name'] for m in group_mentions]
        representative_name = get_representative_name(names)

        # Step 2: Choose the representative firstnames (longest string)
        firstnames = [m['firstnames'] for m in group_mentions]
        representative_firstname = get_longest_string(firstnames)

        # Step 3: Choose the representative familystatus
        familystatuses = [m.get('familystatus', '') for m in group_mentions]
        representative_familystatus = get_representative_familystatus(familystatuses)
        # Step 4: Retrieve and cluster the addresses, activities, and titles for the group
        addresses = [m['address'] for m in group_mentions]
        activities = [m['activity'] for m in group_mentions]
        titles = [m['title'] for m in group_mentions]
        index_num = [m['index_number'] for m in group_mentions]
        
        # Create RDF entity for this group
        rdf_entity = {
            'uuid': str(uuid.uuid4()),
            'name': representative_name.title(),
            'firstnames': representative_firstname.title(),
            'familystatus': representative_familystatus.title(),
            'address': list(set([j for i in addresses for j in i])),
            'activity': list(set([j for i in activities for j in i])),
            'title': list(set([j for i in titles for j in i])),
            'index_number' : list(set([i for i in index_num if i != 'MISSING'])),
            'mentions': group  # Link this entity to the list of mentions in the group
        }
        rdf_entities.append(rdf_entity)
    
    return rdf_entities

In [16]:
rdf_entities = create_rdf_entities(fusion_unique_groups, taxpayers_mentions)

NameError: name 'fusion_unique_groups' is not defined

In [None]:
len(rdf_entities)

In [None]:
rdf_entities[1]

In [None]:
from linking_utils import LinkingUtils

# Function to generate the RDF resource with additional properties (address, activity, title)
def jsonprop_to_rdfprop(prop):
    CAD = Namespace("http://rdf.geohistoricaldata.org/def/cadastre#")
    if prop =="address":
        return CAD.taxpayerAddress
    elif prop =="activity":
        return CAD.taxpayerActivity
    elif prop =="title":
        return CAD.taxpayerTitle
    elif prop =="familystatus":
        return CAD.taxpayerFamilyStatus

def normalize_whitespaces(label):
    label = re.sub('→',' ',label)
    label = re.sub('  ',' ',label)
    label = re.sub('[ ]+$','',label)
    return label

def generate_rdf_resource(rdf_entities, distinct_mentions, distinct_mentions_details):
    """
    Create a simple RDF resource for each group with additional properties (address, activity, title, familystatus).
    """
    g = Graph()
    ADDR = Namespace("http://rdf.geohistoricaldata.org/def/address#")
    CAD = Namespace("http://rdf.geohistoricaldata.org/def/cadastre#")
    TAXPAYER = Namespace("http://rdf.geohistoricaldata.org/id/taxpayer/")
    g.bind("addr", ADDR)
    g.bind("cad", CAD)
    g.bind("taxpayer", TAXPAYER)

    uris_dict = {}
    
    for rdf_entity in rdf_entities:

        uri = URIRef("http://rdf.geohistoricaldata.org/id/taxpayer/" + rdf_entity['uuid'])
        g.add((uri, RDF.type, CAD.Taxpayer))

        mentions = rdf_entity["mentions"]
        for m in mentions:
            key = distinct_mentions[m]
            uris_dict[key] = str(uri)
    
        # Name and firstnames
        if len(rdf_entity['name']) > 0 and len(rdf_entity['firstnames']) > 0 and len(rdf_entity['familystatus']) > 0:
            label = rdf_entity['name'] + ' ' + rdf_entity['firstnames'] + ' ' + rdf_entity['familystatus']
            g.add((uri, CAD.taxpayerLabel, Literal(rdf_entity['name'])))
            g.add((uri, CAD.taxpayerFirstnames, Literal(rdf_entity['firstnames'])))
            g.add((uri, CAD.taxpayerFamilyStatus, Literal(rdf_entity['familystatus'])))
        elif len(rdf_entity['name']) > 0 and len(rdf_entity['firstnames']) > 0 and len(rdf_entity['familystatus']) == 0:
            label = rdf_entity['name'] + ' ' + rdf_entity['firstnames']
            g.add((uri, CAD.taxpayerLabel, Literal(rdf_entity['name'])))
            g.add((uri, CAD.taxpayerFirstnames, Literal(rdf_entity['firstnames'])))
        elif len(rdf_entity['name']) > 0 and len(rdf_entity['firstnames']) == 0 and len(rdf_entity['familystatus']) > 0:
            label = rdf_entity['name'] + ' ' + rdf_entity['familystatus']
            g.add((uri, CAD.taxpayerLabel, Literal(rdf_entity['name'])))
            g.add((uri, CAD.taxpayerFamilyStatus, Literal(rdf_entity['familystatus'])))
        else:
            label = rdf_entity['name']
            g.add((uri, CAD.taxpayerLabel, Literal(rdf_entity['name'])))

        g.add((uri, RDFS.label, Literal(normalize_whitespaces(label))))
    
        # Adding address, activity, and title properties
        for property_name in ['address', 'activity', 'title']:
            if len(rdf_entity[property_name]) == 1:
                g.add((uri, jsonprop_to_rdfprop(property_name), Literal(rdf_entity[property_name][0].title())))
            elif len(rdf_entity[property_name]) > 1:
                # Cluster values by similarity
                values = rdf_entity[property_name]
                clusters = LinkingUtils.cluster_by_embeddings_similarity(values,0.85)
                # For each cluster, get the longest string (representative value)
                for cluster in clusters:
                    strs = []
                    for c in cluster:
                        strs.append(values[c])
                    representative_value = get_longest_string(strs)
                    
                    # Add the RDF triples for this property
                    g.add((uri, jsonprop_to_rdfprop(property_name), Literal(representative_value.title())))

        print(rdf_entity['index_number'])
        if len(rdf_entity['index_number']) == 1 and rdf_entity['index_number'][0] != "MISSING":
            g.add((uri, CAD.hasIndexNumber, Literal(rdf_entity['index_number'][0])))
        elif len(rdf_entity['index_number']) > 1 and rdf_entity['index_number'][0] != "MISSING":
            for elem in rdf_entity['index_number']:
                g.add((uri, CAD.hasIndexNumber, Literal(elem)))

    return g, uris_dict

In [None]:
# Generate RDF resource
graph, uris_dict = generate_rdf_resource(rdf_entities, distinct_taxpayers, distinct_taxpayers_details)

# Print the RDF graph in Turtle format
#graph.serialize(destination=f"{SAVE_FOLDER}/rdf/taxpayers.ttl", format="turtle")

## 4. Annotate the table