# 10 - Fine-grained annotation of land registry columns : dataset  creation
* Address
* Taxpayers

In [4]:
import pandas as pd
from ollama import chat
from pydantic import BaseModel
import time

In [5]:
ROOT = "/home/STual/DAN-cadastre"

In [6]:
#Import entities
ENTITIES_CSV = ROOT + "/data/NER/entities.csv"
df = pd.read_csv(ENTITIES_CSV)
display(df)

Unnamed: 0,element_uuid,address,plot_number,taxpayer,taxpayer_number,nature,former_plot_number,former_nature
0,33278fbe-9845-49a9-9fc0-00faf13c0484,Port à l'anglais,95,Charte Etienne,,terre,,
1,58dfd0f9-62e9-4f95-b17d-922f3ce150d2,§,§,Crispin désiré augustin,,§,,
2,c648b66a-1cd1-4a66-9b48-10429ea2f08d,§,§,chichérit françois auguste,,§,,
3,93b4fad5-d5dd-4763-a56d-a35fc089c11d,§,§,C↑ie↓ parisienne du gaz,,§,,
4,b3ad0224-1d5a-42b1-a41d-13531c23907b,§,§,Coquette léon et Coquette Louis→alfred,,§,,
...,...,...,...,...,...,...,...,...
3221,97b9d7aa-e38b-4032-b263-2ac2433bb850,§,307,idem,167,maison b↑t↓→et cour,220p,Ø
3222,0829167d-6ff5-4ecd-bfd5-a30b7c40b2aa,§,308,idem,107,maison,220p,Ø
3223,5698c83e-8192-48f4-8542-7613edd5d528,le petit parc,16↑bis↓,Pierlot banquier à Paris,,Gazon d'agt,,
3224,4b02c685-a8a2-4547-8002-09a8c6ac4fd4,le bois des champs,17,Delamare Mathurin Bourgeois,,Bois,,


## 1. Taxpayers
### Named entities
* Name (person or company)
* Firstnames
* Address
* Activity
* Title
* Family status
### Types of taxpayers
* Invidual
* Organization

In [17]:
taxpayers_df = df[df['taxpayer'] != '§']
taxpayers_df = taxpayers_df[taxpayers_df['taxpayer'] != 'id']
taxpayers_df = taxpayers_df[taxpayers_df['taxpayer'] != 'idem']

In [18]:
# Roles
USER = 'user' #you
ASSISTANT = 'assistant' #the LLM
MODEL = "llama3.1"

In [47]:
#Define Taxpayer class for data validation with Pydantic
class Taxpayer(BaseModel):
  name: str = ""
  firstnames: str = ""
  activity: list[str] = []
  address: list[str] = []
  title:list[str] = []
  familystatus:list[str] = []
  birthname: str = ""

class TaxpayersList(BaseModel):
    taxpayers: list[Taxpayer]

# Function to analyze a taxpayer transcription and detect named entities in it
# Uses the Ollama `chat` function to interact with a  model (`llama3.1`).
# Returns the results as an Taxpayer object.
def named_entities(role, prompt, model):
    # Call the `chat` function with the specified model, format, and parameters.
    response = chat(
        model=model,
        format=TaxpayersList.model_json_schema(),
        messages=[
            {
                'role': role,
                'content': prompt
            },
        ],
        options={"num_ctx": 4096}
    )
    # Validate and parse the response JSON into an AnimalList object.
    taxpayers_data = TaxpayersList.model_validate_json(response.message.content)
    return response

In [52]:
PROMPT_CONTEXT = """You are a senior researcher specializing in digital humanities, with a particular focus on the valorization of 19th-century French land registers that have been automatically transcribed. Your task is to structure short textual inputs describing taxpayers into a predefined set of entities.

### Input Description:
The input consists of short sentences containing information about a taxpayer, which may include their name, title, activity, address, and family status. These details need to be extracted and categorized according to the following entities:
- **"name"**: The last name of a person or the name of a company.
- **"firstnames"**: One or more first names of the individual.
- **"address"**: An address or any other spatial entity associated with the individual.
- **"activity"**: The profession, occupation, or work associated with the individual.
- **"title"**: Titles such as 'M', 'Mme', 'Mademoiselle', 'Monseigneur', 'General', or 'Prince'.
- **"family"**: Mentions of family or marital status such as 'Père', 'Veuve', 'Fille', 'Fils', or 'Héritier'.

### Expected Output:
The output should be in JSON format, with the extracted entities organized into a structured dictionary. If an entity is not present in the input, it should be represented with an empty string (`''`) for singular entities or an empty list (`[]`) for plural entities.

### Examples:
Here are examples of how inputs should be processed and structured into JSON format:
"""

In [56]:
EXAMPLES_LIST = [["Prudhomme",Taxpayer(name="Prudhomme")],
                ["Société anonyme du Comptoir Central de l'Est",Taxpayer(name="Société anonyme du Comptoir Central de l'Est")],
                ["Germay à Paris",Taxpayer(name="Germay", address=["Paris"])],
                ["Barbaroux quincailler à Paris",Taxpayer(name="Barbaroux",activity=["quincailler"],address=["Paris"])],
                ["Costy, Jn Bte Tailleur de Pierre à Villeneuve Leroy",Taxpayer(name="Costy", firstnames="Jn Bte",activity=["Tailleur de Pierre"],address=["Villeneuve Leroy"])],
                ["Besnet Joseph, Henri - 6/8 Rue Camille Desmoulins 19 Rue Guichard", Taxpayer(name="Besnet", firstnames="Joseph, Henri", address=["6/8 Rue Camille Desmoulins", "19 Rue Guichard"])],
                ["Tellier Catherine fille majeure à Ablon", Taxpayer(name="Tellier", firstnames="Catherine", familystatus=["fille majeure"], address=["Ablon"])],
                ["Pravel Louis Ve né Gerbuisson",Taxpayer(name="Pravel",firstnames="Louis",familystatus=["Ve"],birthname="Gerbuisson")],
                ["Commune d'Ablon",Taxpayer(name="Commune d'Ablon")]
]
PROMPT_EXAMPLES = ""
for elem in EXAMPLES_LIST:
    PROMPT_EXAMPLES += "\n**Input:** `" + elem[0] + "`"
    PROMPT_EXAMPLES += "\n**Output:** ```json\n" + str(elem[1].model_dump()) + "\n```\n"

In [57]:
TASKS = """### Task Instructions:
- Process each input sentence and produce a structured JSON output in the format demonstrated above.
- Don't change the text (keep same case, punctuation etc)
- The final response should be a list of JSON objects, enclosed within square brackets `[ ]`. Don't add any comment, return only the list of JSON.
"""

FULL_PROMPT = PROMPT_CONTEXT + PROMPT_EXAMPLES + TASKS
print(FULL_PROMPT)

You are a senior researcher specializing in digital humanities, with a particular focus on the valorization of 19th-century French land registers that have been automatically transcribed. Your task is to structure short textual inputs describing taxpayers into a predefined set of entities.

### Input Description:
The input consists of short sentences containing information about a taxpayer, which may include their name, title, activity, address, and family status. These details need to be extracted and categorized according to the following entities:
- **"name"**: The last name of a person or the name of a company.
- **"firstnames"**: One or more first names of the individual.
- **"address"**: An address or any other spatial entity associated with the individual.
- **"activity"**: The profession, occupation, or work associated with the individual.
- **"title"**: Titles such as 'M', 'Mme', 'Mademoiselle', 'Monseigneur', 'General', or 'Prince'.
- **"family"**: Mentions of family or marit

In [58]:
# Main block to execute the script.
results = []
if __name__ == "__main__":
    for i in range(0,10):
        # Path to the image to be analyzed.
        uuid = taxpayers_df.iloc[i]["element_uuid"]
        taxpayer_str = taxpayers_df.iloc[i]["taxpayer"]

        print("###########################################################")
        # Print an initial message before starting the analysis.
        print(f"\nAnalyzing a taxpayer : {taxpayer_str}")
    
        # Call the function to analyze the image and get the results.
        result = named_entities(ASSISTANT, PROMPT2 + "Please structure the following **Input:**" + taxpayer_str + " **Output:**" , MODEL)

        print(result.message.content)
        results.append([uuid,result.message.content])
        time.sleep(1)

###########################################################

Analyzing a taxpayer : Charte Etienne
{"taxpayers": [{"name": "Charte", "firstnames": "Etienne", "activity": [""], "address": [""]}]}

###########################################################

Analyzing a taxpayer : Crispin désiré augustin
{"taxpayers":[{"name":"Crispin","firstnames":"désiré augustin","activity":[],"address":[],"title":[],"familystatus":[]}]}
###########################################################

Analyzing a taxpayer : chichérit françois auguste
{"taxpayers": [{"name": "Crispin", "firstnames": "désiré augustin", "activity": [""], "address": [""], "title": [""], "familystatus": [""]}]}
###########################################################

Analyzing a taxpayer : C↑ie↓ parisienne du gaz
{"taxpayers":[{"name": "C↑ie↓ parisienne du gaz","firstnames": "","activity": [""],"address": [""],"title": [""],"familystatus": [""]}]}
###########################################################

Analyzing a tax

In [183]:
[{"name": "Charte", "firstnames": "Etienne", "activity": [""], "address": [""], "title": [""], "family": [""]}]
[{"name": "Crispin", 'firstnames': "désiré augustin", "activity": [""], "address": [""], 'title': [""], "family": [""]}]
[{"name": "chichérit", "firstnames": "françois auguste", "activity": [""], "address": [""], "title": [""], "family": [""]}]
[{"name": "C↑ie↓ parisienne du gaz","firstnames": "","activity": [""],"address": [""],"title": [""],"family": [""]}]
[{"name": "Coquette","firstnames": "léon","activity": [""],"address": [""],"title": [""],"family": [""]},{"name": "Coquette","firstnames": "Louis→alfred","activity": [""],"address": [""],"title": [""],"family": [""]}]
[{"name": "Coste", "firstnames": "sophie", "activity": [""], "address": ["veuve"], "title": [""], "family": [""]}]
[{'name': "chicot", 'firstnames': 'alfred clément', 'activity': [""], 'address': [""], 'title': [""], 'family': [""]}]
[{'name': "Cousin", 'firstnames': "", 'activity': [""], 'address': [""], 'title': [""], 'family': [""]}]

[{'name': 'Cousin',
  'firstnames': '',
  'activity': [''],
  'address': [''],
  'title': [''],
  'family': ['']}]

In [59]:
results[0][1]

'{"taxpayers": [{"name": "Charte", "firstnames": "Etienne", "activity": [""], "address": [""]}]}\n'