# 50. Create Plot and Documents RDF Resources
* Associate each table page with a cover page to retrieve section identifier
* Create the Landmark of type Plot
* Create the Record Resources of each line and page

In [1]:
import pandas as pd
import glob
import json
import uuid
import re
from rdflib import Graph, URIRef, Literal, RDFS, Namespace, BNode
from rdflib.namespace import SKOS, RDF, RDFS, DCTERMS, XSD

In [2]:
import sys
import os
# Access to the utils directory
current_dir = os.getcwd()
utils_dir = os.path.join(current_dir, '..', 'utils')
sys.path.append(utils_dir)

In [3]:
from string_utils import NormalizeText
from rdf_resources_utils import generate_rdf_resource_section_landmark, generate_rdf_resource_commune_landmark

[nltk_data] Downloading package punkt to /home/STual/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/STual/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
COMMUNE = "LHAY"
DEP = "94"

In [5]:
ROOT = f"/home/STual/DAN-cadastre/inference/{COMMUNE}"
DATA = f"/home/STual/DAN-cadastre/data/{COMMUNE}"
JSONS = glob.glob(ROOT + '/*.json')

In [30]:
print(len(JSONS))

26


In [6]:
if not os.path.exists(f'{DATA}/rdf'):
    os.makedirs(f'{DATA}/rdf')

### Retrieve metadata

In [7]:
metadata = pd.read_csv("/home/STual/DAN-cadastre/data/METADATA/details_archives_94_ETS.csv")
metadata = metadata[metadata["Nom_dossier"] == COMMUNE]
metadata

Unnamed: 0,Commune,Departement,Nom_dossier,Cote,Page_group_id,Nombre_de_fichiers,Volume_Mo,Ensemble,Date_infos,Date_doc,...,A_echantillonner,Tag_groupe_doc,Sections,Estimation nb couv,Structure,Remarques,Etat,Cotes liées,Ref_v1_en_ligne,P_debut_en_ligne
64,L'Hay-les-Roses,Seine,LHAY,FRAD094_3P_000275_01,65,337.0,"5 603,94",3P275,1842,1842,...,False,RECTIFICATION_1835,,,,,Propre,,2 MI 97/1,2.0
65,L'Hay-les-Roses,Seine,LHAY,FRAD094_3P_000275_01,66,337.0,"5 603,94",3P275,1842,1842,...,True,RECTIFICATION_1835,"A,B,C,D",,Un chapitre par section,,Propre,,2 MI 97/1,277.0
66,L'Hay-les-Roses,Seine,LHAY,FRAD094_3P_000275_01,67,337.0,"5 603,94",3P275,1842,1842,...,True,RECTIFICATION_1835,"A,B,C,D",4.0,Un chapitre par section,,Propre,,2 MI 97/1,68.0


## 1. Create landmarks of type Section and Commune 
* Section : Use covers information.
* Commune : Use archives metadata
### 1.1 Create landmarks of type Section
*TODO : currently, retrieve covers info is done manually. A data template and DAR model to perform IE from covers would be necessary to create a full registers treatment pipeline.*

In [8]:
df_couv = pd.read_csv(f'{DATA}/{COMMUNE}_couv.csv')

In [9]:
df_couv['id'] = df_couv.groupby('page_uuid').cumcount()
# Pivot the table to transform covers elements into attributs of the page covers
pivoted = df_couv.pivot(index='page_uuid', columns='element_type', values='transcription')
pivoted = pivoted.sort_values(by=["ets_couv_id"])
pivoted = pivoted.reset_index()

In [10]:
pivoted["cad_ltype"] = "Section"
pivoted["identifier"] = pivoted["ets_couv_id"].str[-1]
pivoted["label"] = pivoted["ets_couv_id"].str.title() + ' ' + pivoted["ets_couv_name"]
pivoted

element_type,page_uuid,ets_couv_arr,ets_couv_canton,ets_couv_commune,ets_couv_dep,ets_couv_id,ets_couv_modele,ets_couv_name,ets_couv_titre,cad_ltype,identifier,label
0,546500d4-a13d-47d4-8958-d5add3eca984,ARRONDISSEMENT→de Sceaux,CANTON→de Villejuif,COMMUNE→de L'Hay,DEPARTEMENT→de la Seine,SECTION B,"(N. 10, ART. 75",dite de la Plaine,"TABLEAU INDICATIF des propriétés foncières, de...",Section,B,Section B dite de la Plaine


The UUID of a Landmark of type section is the cover uuid + the section letter identifier (in case several sections are represented in the same page cover).

In [11]:
#Create a dictionnary for each section, used to create the RDF resources
landmarks_df = pivoted[["cad_ltype","identifier","label","page_uuid"]]
landmarks_dict = []
for _, row in landmarks_df.iterrows():
    rdf_entities = {"uuid":row["page_uuid"],"cad_ltype":row["cad_ltype"],"identifier":row["identifier"],"label":row["label"],"source_uuid":row["page_uuid"]}
    landmarks_dict.append(rdf_entities)
landmarks_dict[0]

{'uuid': '546500d4-a13d-47d4-8958-d5add3eca984',
 'cad_ltype': 'Section',
 'identifier': 'B',
 'label': 'Section B dite de la Plaine',
 'source_uuid': '546500d4-a13d-47d4-8958-d5add3eca984'}

In [12]:
g, uri_dict = generate_rdf_resource_section_landmark(landmarks_dict, DEP, COMMUNE)

### 1.2 Add landmark of type commune into the graph
This inclue the creation of the **Landmark** of type **Commune** and of the **LandmarkRelations** between commune and its sections (of LRTYPE Within).

In [13]:
commune_metadata = metadata[["Departement","Commune","Nom_dossier"]]
commune_metadata = commune_metadata.drop_duplicates()
commune_metadata

Unnamed: 0,Departement,Commune,Nom_dossier
64,Seine,L'Hay-les-Roses,LHAY


In [14]:
rdf_entity = {"label":commune_metadata.iloc[0]["Commune"],"departement_num":DEP,"commune_code":COMMUNE}

In [15]:
#Add the landmark commune to the graph
generate_rdf_resource_commune_landmark(g, rdf_entity)

In [16]:
# Print the RDF graph in Turtle format
g.serialize(destination=f"{DATA}/rdf/landmarks.ttl", format="turtle")

<Graph identifier=Ndb06f39828594f888f7e54cd29b1672d (<class 'rdflib.graph.Graph'>)>

## 2. Create RDF resources of sources

### 2.1 Registers

In [17]:
metadata_simple = metadata[["Commune","Cote","Nom_dossier","Date_doc","Date_infos","Tag_groupe_doc"]]
metadata_simple

Unnamed: 0,Commune,Cote,Nom_dossier,Date_doc,Date_infos,Tag_groupe_doc
64,L'Hay-les-Roses,FRAD094_3P_000275_01,LHAY,1842,1842,RECTIFICATION_1835
65,L'Hay-les-Roses,FRAD094_3P_000275_01,LHAY,1842,1842,RECTIFICATION_1835
66,L'Hay-les-Roses,FRAD094_3P_000275_01,LHAY,1842,1842,RECTIFICATION_1835


In [18]:
# Ouvrir le df des pages => prévoir mapping entre les uuid et les noms des cotes d'archives
df_pages = pd.read_csv(f'{DATA}/{COMMUNE}_pages.csv')
#df_pages = df_all_pages[df_all_pages["classe"] == "ets_tab_p1"]
df_pages[0:5]

Unnamed: 0,id,type,name,coords,image_id,image_name,commune,dossier_cote,image_cote,image_index,image_url,image_width,image_height,is_blank,classe
0,546500d4-a13d-47d4-8958-d5add3eca984,page_add,FRAD094_3P_000275_01_0132,"[[0, 0], [0, 5664], [3674, 5664], [3674, 0], [...",77c043e8-c680-4b65-9b5c-a8fc55cca1c1,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0132,132,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3674,5664,False,ets_couv
1,eb4fc8e4-abb6-4b61-b3cc-3b244611e66f,page_add_to_cor,FRAD094_3P_000275_01_0133,"[[0, 0], [0, 5656], [3565, 5656], [3565, 0], [...",89c4f1f9-b119-4229-861c-fb64d953099d,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0133,133,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3565,5656,False,ets_tab_p1
2,26d52fc2-aafc-4e58-91f8-68bd6851ad37,page_add_to_cor,FRAD094_3P_000275_01_0134,"[[0, 0], [0, 5664], [3624, 5664], [3624, 0], [...",44136ba4-ebf7-442a-b7ff-5d40c17ba080,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0134,134,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3624,5664,False,ets_tab_p1
3,f937f814-1950-48e0-a095-bb61ee89b2f3,page_add_to_cor,FRAD094_3P_000275_01_0135,"[[0, 0], [0, 5664], [3668, 5664], [3668, 0], [...",70c70abd-a0e5-49e2-ac06-52642f37ce48,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0135,135,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3668,5664,False,ets_tab_p1
4,b5e51a0b-23e8-4b63-a074-3ad00f40ecda,page_add_to_cor,FRAD094_3P_000275_01_0136,"[[0, 0], [0, 5663], [3610, 5663], [3610, 0], [...",dfec8720-063c-4a93-83a1-fe1e7414ff61,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0136,136,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3610,5663,False,ets_tab_p1


In [19]:
from string_utils import NormalizeText
from rdf_resources_utils import arkindex_class_to_skosconcept, ets_type

def cote_page_parser(cote_page):
    """
    Parser of numeric references of archival documents of Val-de-Marne (94)
    As FRAD094_3P_000387_01 or eventually FRAD094_3P_000387_01_0042_P1 (for par of double page)
    """
    cote_parts = cote_page.split("_")
    if len(cote_parts) == 6 and cote_page[-2] == 'P':
        archives, serie, item, num_folder, page, part_page = cote_parts
        archives_ = archives.replace('FRAD','')
        archives = NormalizeText.remove_leading_zeros(archives_)
        item = NormalizeText.remove_leading_zeros(item)
        num_folder = NormalizeText.remove_leading_zeros(num_folder)
        paper_cote = serie + item
        page = NormalizeText.remove_leading_zeros(page)
        part_page = part_page[-1]
        return {"page_numeric_cote" : cote_page, "paper_cote": paper_cote, "archives" : archives, "serie" : serie, "item_in_serie" : item, "digitize_folder_num" : num_folder, "page": page, "part_in_double_page" : part_page}
    elif len(cote_parts) == 5:
        archives, serie, item, num_folder, page = cote_parts
        archives_ = archives.replace('FRAD','')
        archives = NormalizeText.remove_leading_zeros(archives_)
        item = NormalizeText.remove_leading_zeros(item)
        paper_cote = serie + item
        page = NormalizeText.remove_leading_zeros(page)
        num_folder = NormalizeText.remove_leading_zeros(num_folder)
        return {"page_numeric_cote" : cote_page, "paper_cote": paper_cote, "archives" : archives, "serie" : serie, "item_in_serie" : item, "digitize_folder_num" : num_folder, "page": page, "part_in_double_page" : ""}

In [20]:
pages_jsons = []
last_cover = ""
for _, row in df_pages.iterrows():
    if row["classe"] == "ets_couv":
        last_cover = row["id"]
    json_ = cote_page_parser(row["name"])
    json_["num_folder"] = row["dossier_cote"]
    json_["uuid"] = row["id"]
    json_["coordinates"] = row["coords"]
    json_["iiif_url"] = row["image_url"]
    json_["commune_folder"] = row["commune"]
    json_["ml_classe"] = row["classe"]
    json_["associated_page_cover_uuid"] = last_cover

    for ix, row2 in metadata_simple.iterrows():
        if row2["Cote"] == json_["num_folder"] and row2['Tag_groupe_doc'] != "AV_1822_B":
            json_["commune"] = row2["Commune"]
            json_["date_registre"] = row2["Date_doc"]
            json_["date_infos"] = row2["Date_infos"]
            json_["type_registre"] = row2["Tag_groupe_doc"]
            continue
    
    pages_jsons.append(json_)

In [21]:
from rdf_resources_utils import generate_source_rdf_resource

g = generate_source_rdf_resource(pages_jsons)

In [22]:
g.serialize(destination=f"{DATA}/rdf/sources.ttl", format="turtle")

<Graph identifier=N62aa0610751544698979db1985c58316 (<class 'rdflib.graph.Graph'>)>

## 3. Create landmarks of type Plot, RecordPart for table lines and LandmarkRelations between Plots and Sections

### 3.1 Add page context infos to page JSON

In [23]:
for JSON in JSONS:
    page_uuid = JSON.replace(ROOT+'/',"").replace(".json","")
    print(page_uuid)
    with open(JSON) as f:
        page = json.load(f)

    page["context"] = {}
    page_json = next((item for item in pages_jsons if item.get("uuid") == page_uuid), None)
    page["context"]["section"] = "http://rdf.geohistoricaldata.org/id/landmark/" + page_json["associated_page_cover_uuid"]

    landmark_dict = next((item for item in landmarks_dict if item.get("uuid") == page_json["associated_page_cover_uuid"]), None)
    page["context"]["section_id"] = landmark_dict["identifier"]
    page["context"]["commune_uri"] = "http://rdf.geohistoricaldata.org/id/landmark/" + page_json["archives"] + '_' + page_json["commune_folder"]
    page["context"]["commune_folder"] = page_json["commune_folder"]
    page["context"]["commune"] = page_json["commune"]
    page["context"]["code_dept"] = page_json["archives"]
    page["context"]["iiif_url"] = page_json["iiif_url"][:page_json["iiif_url"].find('.jpg')+4] + '/info.json'
    page["context"]["page_numeric_cote"] = page_json["page_numeric_cote"]
    page["context"]["date"] = page_json["date_infos"]

    for line in page["entities"]:
        if "Ⓕ" in list(line.keys()) and 'interpreted_text' in list(line["Ⓕ"].keys()):
            line["Ⓕ"]['plot_id'] = landmark_dict["identifier"] + '-' + line["Ⓕ"]['postcorrected_text']
        elif "Ⓕ" not in list(line.keys()):
            line["Ⓕ"] = {}
            line["Ⓕ"]['plot_id'] = landmark_dict["identifier"] + '-' + str("UNKNOWN")

    with open(JSON,'w', encoding='utf-8') as f:
        json.dump(page, f, ensure_ascii=False, indent=4)

3435dedc-ddc0-4dfd-b834-d04148e44ddf
79ee0e05-59c0-4770-a20e-326574627577
683e7c3e-e537-44ba-8a02-3c4678d86b2f
7cc717f5-4612-45f0-8181-7e341e937880
9e52f7f9-0e66-4753-bcf3-d3cb0916678c
2c59c9c6-5ed5-4fc1-85cd-4d162c324239
8d60076b-985d-4374-acb8-c8f17db50d76
f10c1378-c307-4b24-95f8-1c016610d4af
92fb30f9-6891-46e0-adb0-ad396a3ad2dc
ba86e3f2-7bb6-49a8-a2c2-0773e0a79626
b9996140-18ce-4374-8a66-01c4acdcbb2f
d674fbbe-3b26-4e9d-8a6f-c8c8533d3123
c8299489-3750-499c-a924-7f3365401c75
f937f814-1950-48e0-a095-bb61ee89b2f3
9fde8f6c-f4bd-400c-9710-94f268917bf8
724852d2-464c-4364-a051-8a4505d6cbea
b38e1412-734c-423a-8abc-6758bff52e19
2e198618-c734-4d73-a67f-79fda49504f5
b5e51a0b-23e8-4b63-a074-3ad00f40ecda
9d6d69fc-cc55-4007-acbc-4ec375bec1b3
d7b0141b-7950-4d9b-a947-92728f613d8d
ad913b91-539f-4645-bcde-69dee0c09bb3
064cc4ee-9d5e-4922-a6dc-954aaf38fab2
26d52fc2-aafc-4e58-91f8-68bd6851ad37
eb4fc8e4-abb6-4b61-b3cc-3b244611e66f
acb5c0fb-1901-405f-862e-f0726e867e7d


### 3.2 Create the plot landmark and the table line

In [26]:
from rdf_resources_utils import generate_rdf_resource_plot_landmark, generate_rdf_resource_event

In [28]:
g = Graph()
events_ls = []

for JSON in JSONS:
    page_uuid = JSON.replace(ROOT+"/","").replace(".json","")
    with open(JSON) as f:
        page = json.load(f)

    #Plots and lines
    generate_rdf_resource_plot_landmark(g, page, page_uuid)
    
    #Event
    uri_event = "http://rdf.geohistoricaldata.org/id/event/" + "CADASTRE_LHAY_" + str(page["context"]["date"])
    if uri_event not in events_ls:
        generate_rdf_resource_event(g, page)
        events_ls.append(uri_event)

In [29]:
g.serialize(destination=f"{DATA}/rdf/initial-plots.ttl", format="turtle")

<Graph identifier=N08dc9b2206e84ad99a802a600559788e (<class 'rdflib.graph.Graph'>)>