# 00 - Retrieve elements and metadata on Arkindex

This notebook aims to retrieve data on Arkindex using Arkindex API. More precisly : 
1. retrieve the considered registers pages
2. retrieve the content of the page covers
3. retrieve the annotations of the tables and if needed create inference-like jsons

In [1]:
import pandas as pd
import glob
import json
from arkindex import ArkindexClient

In [2]:
import sys
import os
# Access to the utils directory
current_dir = os.getcwd()
utils_dir = os.path.join(current_dir, '..', 'utils')
sys.path.append(utils_dir)

### Parameters

In [10]:
NAME = "LHAY"
FOLDER = f"/home/STual/DAN-cadastre/inference/{NAME}"
SAVE_FOLDER = f'/home/STual/DAN-cadastre/data/{NAME}'

In [4]:
PAGE_ELEMENTS_SLUGS = ["page_add","page_add_to_cor"] #Marolles : ["page", "page_marolles"]
ARKINDEX_FOLDER = "2ec47da6-116b-4f53-bd93-da7aac699a4a" #marolles : "2a045b63-2866-4292-a168-87d342be800b"
PAGE_CLASSES = ["ets_couv","ets_tab_p1","ets_tab_p2","ets_recap_inter","ets_resume"]

In [5]:
cli = ArkindexClient('cdf9b285c9da6e82cddbf18aa070dc471a13facb')

## 1. Retrieve pages infos

In [6]:
elements = cli.request('ListElementChildren',
                   id=ARKINDEX_FOLDER,type=PAGE_ELEMENTS_SLUGS[0],
                   recursive=True,folder=False,page_size=500,page=1,with_classes=True,
                )
for slug in PAGE_ELEMENTS_SLUGS[1:]:
    elements2 = cli.request('ListElementChildren',
                           id=ARKINDEX_FOLDER,type=slug,
                           recursive=True,folder=False,page_size=500,page=1,with_classes=True,
                        )
    elements["results"].extend(elements2["results"])
print(f"{len(elements['results'])} elements of type {', '.join(PAGE_ELEMENTS_SLUGS)} have been retrieved")

32 elements of type page_add, page_add_to_cor have been retrieved


Now, we create a table with the informations of each element (pages) and save them to a CSV.

In [11]:
from arkindex_utils import listelements_withclass_json_to_table

df_commune = listelements_withclass_json_to_table(elements,PAGE_CLASSES,["ets_blank"],["is_blank"])
df_commune = df_commune[df_commune["classe"].isin(["ets_couv", "ets_tab_p1"])]
df_commune['image_index'] = df_commune['image_index'].astype('int')
df_commune.sort_values(by=['image_index'],inplace=True)
df_commune.to_csv(f'{SAVE_FOLDER}/{NAME}_pages.csv',index=False)

In [13]:
df_commune[0:4]

Unnamed: 0,id,type,name,coords,image_id,image_name,commune,dossier_cote,image_cote,image_index,image_url,image_width,image_height,is_blank,classe
0,546500d4-a13d-47d4-8958-d5add3eca984,page_add,FRAD094_3P_000275_01_0132,"[[0, 0], [0, 5664], [3674, 5664], [3674, 0], [...",77c043e8-c680-4b65-9b5c-a8fc55cca1c1,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0132,132,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3674,5664,False,ets_couv
3,eb4fc8e4-abb6-4b61-b3cc-3b244611e66f,page_add_to_cor,FRAD094_3P_000275_01_0133,"[[0, 0], [0, 5656], [3565, 5656], [3565, 0], [...",89c4f1f9-b119-4229-861c-fb64d953099d,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0133,133,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3565,5656,False,ets_tab_p1
4,26d52fc2-aafc-4e58-91f8-68bd6851ad37,page_add_to_cor,FRAD094_3P_000275_01_0134,"[[0, 0], [0, 5664], [3624, 5664], [3624, 0], [...",44136ba4-ebf7-442a-b7ff-5d40c17ba080,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0134,134,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3624,5664,False,ets_tab_p1
5,f937f814-1950-48e0-a095-bb61ee89b2f3,page_add_to_cor,FRAD094_3P_000275_01_0135,"[[0, 0], [0, 5664], [3668, 5664], [3668, 0], [...",70c70abd-a0e5-49e2-ac06-52642f37ce48,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0135,135,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3668,5664,False,ets_tab_p1


## 2. Create page covers info files

In [15]:
EXTRACT_COVERS_CONTENT = True

In [17]:
COVERS_CSV_COLUMNS = ["page_uuid","element_uuid","element_type","polygon","transcription_uuid","transcription"]

if EXTRACT_COVERS_CONTENT:
    df_commune_couv = df_commune[df_commune["classe"] == "ets_couv"]
    areas = []
    for _, row in df_commune_couv.iterrows():
        #List the areas
        results = cli.request('ListElementChildren',
                           id=row["id"],
                           with_transcriptions=True,page=1,page_size=500)
        for l in results["results"]:
            area_id = l["id"]
            type_ =  l["type"]
            polygon = l["zone"]["polygon"]
            transcription_id = l["transcriptions"][0]["id"]
            transcription = l["transcriptions"][0]["text"]
            areas.append([row["id"], area_id, type_, polygon, transcription_id, transcription])
            
    #Save data
    areas_df = pd.DataFrame(areas,columns=COVERS_CSV_COLUMNS)
    areas_df.to_csv(f'{SAVE_FOLDER}/{NAME}_couv.csv', index=False)
    display(areas_df[0:6])

Unnamed: 0,page_uuid,element_uuid,element_type,polygon,transcription_uuid,transcription
0,546500d4-a13d-47d4-8958-d5add3eca984,7f4c890b-1c6d-4877-aca0-e63ec65918ea,ets_couv_dep,"[[221, 696], [221, 857], [725, 857], [725, 696...",b7027a5a-c108-4c7b-84a3-cfa45b27ec45,DEPARTEMENT→de la Seine
1,546500d4-a13d-47d4-8958-d5add3eca984,44f6d3b8-0c3c-4fe0-81b3-6cef8fd09032,ets_couv_arr,"[[225, 886], [225, 1044], [738, 1044], [738, 8...",b82a8058-c701-41e6-b16f-dafff6a6fc04,ARRONDISSEMENT→de Sceaux
2,546500d4-a13d-47d4-8958-d5add3eca984,8be3d995-a12f-4cb5-a17d-0813bd7186c9,ets_couv_canton,"[[219, 1068], [219, 1227], [701, 1227], [701, ...",178bcbf4-87e3-4180-92b6-3274ff9d2965,CANTON→de Villejuif
3,546500d4-a13d-47d4-8958-d5add3eca984,0b58ce8e-5bed-4c4a-9b1d-8628eb984fd4,ets_couv_commune,"[[208, 1240], [208, 1421], [653, 1421], [653, ...",e844d0be-4625-43f4-989e-f707e1c5e9b0,COMMUNE→de L'Hay
4,546500d4-a13d-47d4-8958-d5add3eca984,aa87f6b5-5845-44de-a525-4d431b21ad18,ets_couv_id,"[[1347, 3122], [1347, 3396], [2572, 3396], [25...",c8f2dce9-dc40-4752-bae3-09f8920c6009,SECTION B
5,546500d4-a13d-47d4-8958-d5add3eca984,3acea58d-b30d-40a2-b0f2-0c0e018ef12c,ets_couv_name,"[[855, 3378], [855, 3596], [2452, 3596], [2452...",c1c3421f-beff-4f00-ad86-20f724c4822a,dite de la Plaine


## 3. Create inference-like data from Arkindex
### 3.1 Retrieve table lines transcriptions and entities

Create a CSV with the transcriptions of each table line of the pages listes before. The CSV has the following header : ["page_uuid","line_uuid","polygon","transcription_uuid","transcription","entities"]

In [None]:
DOWNLOAD_TRANSCRIPTIONS = False

In [20]:
if DOWNLOAD_TRANSCRIPTIONS:
    df_commune_tab = df_commune[df_commune["classe"] == "ets_tab_p1"]
    lines = []
    for _, row in df_commune_tab.iterrows():
        #List the lines
        results = cli.request('ListElementChildren',
                           id=row["id"],
                           with_transcriptions=True,page=1,page_size=500)
        
        for l in results["results"]:
            try:
                if "tab_header" not in l["type"]:
                    line_id = l["id"]
                    polygon = l["zone"]["polygon"]
                    transcription_id = l["transcriptions"][0]["id"] #Il ne retrouve pas les transcriptions, pourquoi ? (contraindre les types de lignes ?)
                    transcription = l["transcriptions"][0]["text"]
                    #List the entities
                    entities = cli.request("ListTranscriptionEntities",id=transcription_id)
                    ls_ent = []
                    for e in entities["results"]:
                        js = {"entity_uuid" : e["entity"]["id"], "entity_text" : e["entity"]["name"], "entity_type" : e["entity"]["type"]["name"], "offset" : e["offset"], "length": e["length"]}
                        ls_ent.append(js)
                    lines.append([row["id"],line_id,polygon,transcription_id,transcription,ls_ent])
            except:
                print(l) 

{'id': 'a8725c48-46cd-4277-b4cf-a9ecff57aef0', 'type': 'table_line_to_cor', 'name': '13', 'corpus': {'id': '59285827-bba3-42d4-99c7-e2436fe19f94', 'name': 'EPITA-IGN | Registre Napoléonien', 'public': False}, 'thumbnail_url': None, 'zone': {'id': 'a8725c48-46cd-4277-b4cf-a9ecff57aef0', 'polygon': [[114, 4579], [114, 4856], [2531, 4856], [2531, 4579], [114, 4579]], 'image': {'id': '89c4f1f9-b119-4229-861c-fb64d953099d', 'path': 'CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3P_000275_01%2FFRAD094_3P_000275_01_0133.jpg', 'width': 3565, 'height': 5656, 'url': 'https://iiif.geohistoricaldata.org/iiif/2/CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3P_000275_01%2FFRAD094_3P_000275_01_0133.jpg', 's3_url': None, 'status': 'checked', 'server': {'display_name': 'https://iiif.geohistoricaldata.org/iiif/2', 'url': 'https://iiif.geohistoricaldata.org/iiif/2', 'max_width': None, 'max_height': None}}, 'url': 'https://iiif.geohistoricaldata.org/iiif/2/CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3P_000

In [None]:
if DOWNLOAD_TRANSCRIPTIONS:
    lines_df = pd.DataFrame(lines,columns=["page_uuid","line_uuid","polygon","transcription_uuid","transcription","entities"])
    lines_df[0:4]

In [None]:
if DOWNLOAD_TRANSCRIPTIONS:
    lines_df.to_csv(f'{SAVE_FOLDER}/{NAME}_tab_lines.csv', index=False) 

### 3.2 Create json similar to DAN inference

In [14]:
from string_utils import TableValuesPostTreatment 

CREATE_JSONS = False

SyntaxError: invalid syntax (1307066500.py, line 2)

In [9]:
if CREATE_JSONS:
    df_pages = pd.read_csv(f'{SAVE_FOLDER}/{NAME}_pages.csv')
    df_pages = df[df["classe"] == "ets_tab_p1"]
    df_entities = pd.read_csv(f'{SAVE_FOLDER}/{NAME}_tab_lines.csv')

In [10]:
if CREATE_JSONS:

    for _, row in df_pages.iterrows():
        page_json = {}
        name = row["id"]
        page_json["text"] = ""
        page_json["confidences"] = {}
        page_json["confidences"]["total"] = 1.0
        page_json["language_model"] = {}
        page_json["objects"] = []
        page_json["attention_gif"] = ""
        page_json["entities"] = ""
        subdf = df_entities[df_entities["page_uuid"] == name]
        new_ents = []
        for __, row2 in subdf.iterrows():
            obj = {"confidence": 1.0, "polygon":row2["polygon"],"text": row2["transcription"],"text_confidence": 1.0}
            page_json["objects"].append(obj)
            ents = row2["entities"].replace("\'entity_text\'",'"entity_text"').replace("\'entity_uuid\'",'"entity_uuid"').replace("\'entity_type\'",'"entity_type"').replace("\'offset\'",'"offset"').replace("\'length\'",'"length"').replace(": '",': "').replace("',",'",')
            ents = json.loads(ents)
            line_en = {}
            for e in ents:
                t = ''
                if e["entity_type"] == "lieu-dit":
                    t = "Ⓓ"
                elif e["entity_type"] == "ancien_numero_parcelle":
                    t = "Ⓐ"
                elif e["entity_type"] == "ancienne_nature":
                    t = "Ⓑ"
                elif e["entity_type"] == "identite":
                    t = "Ⓒ"
                elif e["entity_type"] == "nature":
                    t = "Ⓔ"
                elif e["entity_type"] == "numero_parcelle":
                    t = "Ⓕ"
                elif e["entity_type"] == "numero_proprietaire":
                    t = "Ⓖ"
                line_en.update({t:{"text": e["entity_text"],
                    "offset": e["offset"],
                    "length": e["length"]}})
            new_ents.append(line_en)
        page_json["entities"] = new_ents
        #Deal with items and dittos
        page_json["entities"] = TableValuesPostTreatment.process_idem_ditto_replacements(page_json["entities"], SPECIAL_VALUE="MISSING", 
                                        idem_list=['§', 'Ø', 'id', 'idem', 'le meme', 'la meme', 'les memes'],
                                        ditto_list=['☼'])
        json_path = os.path.join(SAVE_FOLDER,"data","LHAY","ets_tab_p1",row['id'] + '.json')
        #Save the jsons
        with open(json_path,'w', encoding='utf-8') as f:
            json.dump(page_json, f, ensure_ascii=False, indent=4)

[nltk_data] Downloading package punkt to /home/STual/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/STual/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'df_pages' is not defined