# 40 - Inference

This notebook explains how to run the fine-tuned DAN model on new data. If you do not use Arkindex, you only need the command line given in part 2.

In [1]:
import sys
import os
import re
import pandas as pd
import glob
import json
import plotly.express as px
import json
from arkindex import ArkindexClient
from dan.datasets.download.images import ImageDownloader
from apistar.exceptions import ErrorResponse
import uuid

# Make the helper modules importable: they live in ../utils relative to the working directory
current_dir = os.getcwd()
utils_dir = os.path.join(current_dir, os.pardir, 'utils')
sys.path.append(utils_dir)

In [2]:
from arkindex_utils import listelementschildren_to_df
from download_utils import download_images_from_list
from viz import restore_polygon_to_original_size

In [3]:
# Project root and the directory that receives the downloaded page images.
# ROOT can be overridden with the DAN_CADASTRE_ROOT environment variable so that the
# notebook runs on another machine; when the variable is unset the original path is used.
ROOT = os.environ.get("DAN_CADASTRE_ROOT", "/home/STual/DAN-cadastre")
SAVE_DIR = ROOT + "/data/LHAY/ets_tab_p1"

## 1. Download IIIF images from Arkindex
If you have already downloaded the images, you can skip this step.

In [4]:
# Arkindex API client. The token is read from the ARKINDEX_API_TOKEN environment variable
# so that no credential is stored in the notebook; a missing variable raises KeyError here.
cli = ArkindexClient(os.environ["ARKINDEX_API_TOKEN"])

In [5]:
# L'hay section b folder : ccc0c979-d7d8-468f-bc1b-38d4dadbb80c
# Retrieve only the elements classified as ets_tab_p1 (51b9743b-3300-4a12-be69-e69395a2efbc)
# NOTE(review): the parent id queried below is not the folder id quoted above — confirm which one is intended.
children_query = dict(
    id="2ec47da6-116b-4f53-bd93-da7aac699a4a",
    class_id="51b9743b-3300-4a12-be69-e69395a2efbc",
    recursive=True,
    folder=False,
    page_size=500,
    page=1,
)

# The same query is issued for both element types; only the first result page is requested.
elements = cli.request('ListElementChildren', type="page", **children_query)
elements2 = cli.request('ListElementChildren', type="page_add_to_cor", **children_query)

In [6]:
# Turn each API response into a DataFrame, then stack both with a fresh 0..n-1 index
frames = [
    pd.DataFrame.from_dict(listelementschildren_to_df(response))
    for response in (elements, elements2)
]
dfelem1, dfelem2 = frames
df = pd.concat(frames).reset_index().drop(columns='index')
display(df)

Unnamed: 0,id,name,coords,image_id,image_name,commune,dossier_cote,image_cote,image_index,image_url,image_width,image_height
0,eb4fc8e4-abb6-4b61-b3cc-3b244611e66f,FRAD094_3P_000275_01_0133,"[[0, 0], [0, 5656], [3565, 5656], [3565, 0], [...",89c4f1f9-b119-4229-861c-fb64d953099d,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0133,133,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3565,5656
1,26d52fc2-aafc-4e58-91f8-68bd6851ad37,FRAD094_3P_000275_01_0134,"[[0, 0], [0, 5664], [3624, 5664], [3624, 0], [...",44136ba4-ebf7-442a-b7ff-5d40c17ba080,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0134,134,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3624,5664
2,f937f814-1950-48e0-a095-bb61ee89b2f3,FRAD094_3P_000275_01_0135,"[[0, 0], [0, 5664], [3668, 5664], [3668, 0], [...",70c70abd-a0e5-49e2-ac06-52642f37ce48,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0135,135,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3668,5664
3,b5e51a0b-23e8-4b63-a074-3ad00f40ecda,FRAD094_3P_000275_01_0136,"[[0, 0], [0, 5663], [3610, 5663], [3610, 0], [...",dfec8720-063c-4a93-83a1-fe1e7414ff61,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0136,136,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3610,5663
4,c8299489-3750-499c-a924-7f3365401c75,FRAD094_3P_000275_01_0137,"[[0, 0], [0, 5656], [3668, 5656], [3668, 0], [...",03996fd6-ccfc-4b0e-bcf5-f643dc4f55a5,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0137,137,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3668,5656
5,683e7c3e-e537-44ba-8a02-3c4678d86b2f,FRAD094_3P_000275_01_0138,"[[0, 0], [0, 5663], [3618, 5663], [3618, 0], [...",1a75a871-b8c9-4484-aea8-30d0c59def71,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0138,138,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3618,5663
6,d7b0141b-7950-4d9b-a947-92728f613d8d,FRAD094_3P_000275_01_0139,"[[0, 0], [0, 5664], [3668, 5664], [3668, 0], [...",9cec091a-b9dd-4d9d-a8bf-3f44b9a4b356,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0139,139,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3668,5664
7,8d60076b-985d-4374-acb8-c8f17db50d76,FRAD094_3P_000275_01_0140,"[[0, 0], [0, 5656], [3604, 5656], [3604, 0], [...",0e94578d-4bf2-4c40-9c13-bb7bac9a0d8e,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0140,140,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3604,5656
8,acb5c0fb-1901-405f-862e-f0726e867e7d,FRAD094_3P_000275_01_0141,"[[0, 0], [0, 5664], [3674, 5664], [3674, 0], [...",23411457-a033-427d-acd5-e299159734f7,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0141,141,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3674,5664
9,92fb30f9-6891-46e0-adb0-ad396a3ad2dc,FRAD094_3P_000275_01_0142,"[[0, 0], [0, 5656], [3604, 5656], [3604, 0], [...",afc98034-9e15-4f9e-8fe0-3cdb5121ec85,CADASTRE%2FETATS_DE_SECTION%2FLHAY%2FFRAD094_3...,LHAY,FRAD094_3P_000275_01,FRAD094_3P_000275_01_0142,142,https://iiif.geohistoricaldata.org/iiif/2/CADA...,3604,5656


In [7]:
# Set to True to download the page images listed in `df` into SAVE_DIR.
DOWNLOAD = False

if DOWNLOAD:
    downloader = ImageDownloader()  # used here only to build the IIIF URLs

    # IIIF server
    base_url = "https://iiif.geohistoricaldata.org/iiif/2/"  # adapt to your IIIF server

    # One URL and one element UUID per row of df, kept in the same order
    urls, elements_uuid = [], []
    for _, row in df.iterrows():
        elements_uuid.append(row["id"])
        urls.append(downloader.build_iiif_url(row["coords"], base_url + row["image_name"]))

    # Perform the actual download; without this call the cell only builds the URL list.
    download_images_from_list(SAVE_DIR, urls, elements_uuid)

## 2. Inference
This command should be run in the terminal, preferably in a persistent session (byobu).

## 3. Load results to Arkindex
You can skip this part if you don't use Arkindex.

In [8]:
# Directory holding the DAN inference results, and the Arkindex identifiers used for the import
PATH_RES = f"{ROOT}/inference/LHAY"
CORPUS_UUID = "59285827-bba3-42d4-99c7-e2436fe19f94"      # sent as "corpus" when creating elements/entities
TAB_LINE_CLASS = '1cf133ba-7640-4ff5-b518-a150228460b4'   # ML class applied to each imported page

In [9]:
from elements_utils import sort_by_centroid_y
from string_utils import delete_characters, split_text_at_characters, create_split_dictionary

[nltk_data] Downloading package punkt to /home/STual/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/STual/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Retrieve the JSON files corresponding to the automatic page transcriptions produced by DAN.

In [10]:
# All JSON result files found directly under PATH_RES, in sorted order
RES_FILES = glob.glob(f"{PATH_RES}/*.json")
RES_FILES.sort()

In [11]:
# Pair each result file with the page UUID encoded in its file name:
# "<PATH_RES>/<uuid>.json" -> [path, "<uuid>"].
# basename/splitext only strip the directory and the final extension, whereas chained
# str.replace calls would also remove matching substrings elsewhere in the path.
PAGES = [
    [json_file, os.path.splitext(os.path.basename(json_file))[0]]
    for json_file in RES_FILES
]

In [21]:
# Number of [json_path, page_uuid] pairs collected in PAGES
len(PAGES)

26

In [22]:
# Print the UUID of every page from position 35 onwards.
# NOTE(review): with the 26 entries reported for PAGES this slice is empty — confirm the start index.
for _json_path, page_id in PAGES[35:]:
    print(page_id)

### 3.1 Load inferred lines

In [23]:
# Open the file entities.json (link between the special tokens and Arkindex entities).
# The encoding is forced to UTF-8 so the token keys are decoded the same way whatever the
# locale of the machine running the notebook.
with open('entities.json', 'r', encoding='utf-8') as entities_file:
    entities = json.load(entities_file)

# The JSON keys are the characters at which a predicted text is split
chars_to_split = list(entities.keys())

In [78]:
# Set to True to push the DAN predictions to Arkindex. For every page this creates one
# element per predicted object, its transcription, and one entity per special token found.
IMPORT = False

# ML class attached to every created line (line_true by default)
LINE_TRUE_CLASS = "52aed5fb-791a-4c2a-93d8-f7050c5b5b46"

if IMPORT:
    for json_path, page_uuid in PAGES:
        print(page_uuid)
        # Open the JSON containing DAN results (UTF-8: the predicted text holds non-ASCII tokens)
        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Retrieve image infos
        image = cli.request('RetrieveElement', id=page_uuid)
        width = image["zone"]["image"]["width"]
        height = image["zone"]["image"]["height"]
        print(width, height)

        # Update page classification
        cli.request('CreateClassification', body={"element": page_uuid, "ml_class": TAB_LINE_CLASS})

        # Elements are named after their position in the result file
        for line_index, line in enumerate(data["objects"]):
            # NOTE(review): presumably maps the polygon predicted on the resized image
            # (at most 2000x3000) back to the original page size — confirm in viz.py
            original_polygon = restore_polygon_to_original_size(
                line["polygon"], width, height, max_width=2000, max_height=3000
            )

            # Create Element
            elem = {
                "type": "tab_line_add",
                "name": str(line_index),
                "corpus": CORPUS_UUID,
                "parent": page_uuid,
                "polygon": original_polygon,
                "confidence": line["confidence"],
            }
            elem_request = cli.request('CreateElement', body=elem)
            cli.request('CreateClassification',
                        body={"element": elem_request["id"], "ml_class": LINE_TRUE_CLASS})

            # CreateTranscription: the stored text is the prediction with the special tokens removed
            text = line["text"].replace('ⒼⒼ', 'Ⓖ')
            t = delete_characters(text, chars_to_split)
            transcription_body = {
                "text": t,
                "confidence": line["text_confidence"],
                "orientation": "horizontal-lr",
            }
            transcription_request = cli.request('CreateTranscription',
                                                id=elem_request["id"],
                                                body=transcription_body)

            # One entity per token found in the text; each entry provides "text", "offset" and "length"
            entities_dict = create_split_dictionary(text, chars_to_split)
            for ent in entities_dict:
                entity_body = {
                    "name": entities_dict[ent]["text"],
                    "type_id": entities[ent]["uuid"],
                    "corpus": CORPUS_UUID,
                }
                # CreateEntity
                entity_request = cli.request('CreateEntity', body=entity_body)
                transcriptionentity_body = {
                    "entity": entity_request["id"],
                    "type_id": entities[ent]["uuid"],
                    "offset": entities_dict[ent]["offset"],
                    "length": entities_dict[ent]["length"],
                }
                # CreateTranscriptionEntity
                cli.request('CreateTranscriptionEntity',
                            id=transcription_request["id"],
                            body=transcriptionentity_body)
            print(t, transcription_body)

d7b0141b-7950-4d9b-a947-92728f613d8d
3668 5664
La voie des→21 Barchoux denis 84 Terre 93 T {'text': 'La voie des→21 Barchoux denis 84 Terre 93 T', 'confidence': 0.95, 'orientation': 'horizontal-lr'}
§ 129 Godefroy denis V↑e↓ 85 T 94 T {'text': '§ 129 Godefroy denis V↑e↓ 85 T 94 T', 'confidence': 0.99, 'orientation': 'horizontal-lr'}
§ 212 Massuet germard 86 T 95 T {'text': '§ 212 Massuet germard 86 T 95 T', 'confidence': 0.99, 'orientation': 'horizontal-lr'}
§ 215 Mateu Jacques F↑ois↓ V↑e↓ 87 T 100 T {'text': '§ 215 Mateu Jacques F↑ois↓ V↑e↓ 87 T 100 T', 'confidence': 0.99, 'orientation': 'horizontal-lr'}
§ 147 hnard f↑ois↓ J↑n↓ 88 T 102→101 T {'text': '§ 147 hnard f↑ois↓ J↑n↓ 88 T 102→101 T', 'confidence': 0.99, 'orientation': 'horizontal-lr'}
§ 77 Chevreul michel 89 T 1026↑bis↓→102 T→T {'text': '§ 77 Chevreul michel 89 T 1026↑bis↓→102 T→T', 'confidence': 0.96, 'orientation': 'horizontal-lr'}
§ 77 idem 90 T.pl 1026↑bis↓ T {'text': '§ 77 idem 90 T.pl 1026↑bis↓ T', 'confidence': 0.97, '

In [82]:
# Set to True to change the Arkindex type of the imported lines and of their pages.
UPDATE_INFERENCE_INFO = False

if UPDATE_INFERENCE_INFO:
    # NOTE(review): the first entry of PAGES is skipped — presumably already processed; confirm.
    for json_path, page_uuid in PAGES[1:]:
        print(page_uuid)

        # Lines previously created on this page (first result page only, up to 500 elements)
        page_lines = cli.request('ListElementChildren',
                                 id=page_uuid,
                                 type="tab_line_add", recursive=True, folder=False,
                                 page_size=500, page=1)

        # Update tab line add type; each line keeps its current name
        for e in page_lines["results"]:
            cli.request('UpdateElement', id=e["id"],
                        body={"type": "table_line_to_cor", "name": e["name"]})

        # Update the page type. The name sent must be the page's own name: reusing the loop
        # variable would rename the page after its last line, or fail when the page has no line.
        page = cli.request('RetrieveElement', id=page_uuid)
        cli.request('UpdateElement', id=page_uuid,
                    body={"type": "page_add_to_cor", "name": page["name"]})

064cc4ee-9d5e-4922-a6dc-954aaf38fab2
0d2dbad9-68dd-4c66-a44d-87980720bc14
0dcdaeaf-a151-4496-9d68-9e30b63270f8
1f14549b-9b2e-4ee7-a3c4-c55b126db8af
26d52fc2-aafc-4e58-91f8-68bd6851ad37
2c59c9c6-5ed5-4fc1-85cd-4d162c324239
2e198618-c734-4d73-a67f-79fda49504f5
3435dedc-ddc0-4dfd-b834-d04148e44ddf
3ab13de2-f0b2-436f-9aa0-3b040e79975b
3e7683fb-6de4-4121-80b2-b1bae28bc950
484bd9b3-8084-4bd0-bec6-1bf0ab4907e5
5f985c84-34b8-4e56-b125-e2d454ade11f
683e7c3e-e537-44ba-8a02-3c4678d86b2f
6b08bf95-e3f5-423c-afa2-b088cc0ca11a
724852d2-464c-4364-a051-8a4505d6cbea
79ee0e05-59c0-4770-a20e-326574627577
7cc717f5-4612-45f0-8181-7e341e937880
8a6ef849-6266-415b-b9e7-b1604f3f41d4
8d60076b-985d-4374-acb8-c8f17db50d76
8de045eb-0069-4e8d-a519-df5f3c51e919
92fb30f9-6891-46e0-adb0-ad396a3ad2dc
9d6d69fc-cc55-4007-acbc-4ec375bec1b3
9e52f7f9-0e66-4753-bcf3-d3cb0916678c
9fde8f6c-f4bd-400c-9710-94f268917bf8
a7067b75-0b1a-42ad-a01f-5bfe6ce8161e
acb5c0fb-1901-405f-862e-f0726e867e7d
ad913b91-539f-4645-bcde-69dee0c09bb3
b