In [1]:
import isri_tools

In [2]:
# Two toy strings mixing accents, punctuation and non-BMP characters.
a = "👉 étagères𐌰"
b = "etag.µbleble"

# Align both strings, using 'x' as the filler character, and show the two
# aligned strings one under the other.
A, B = isri_tools.align(a, b, 'x')
print(A, B, sep="\n")

👉 étagèrxxes𐌰x
exxtag.µbleble


In [3]:
# One list of integer indices per input string; judging from the output,
# each entry is the position of the corresponding character in the aligned
# string (TODO confirm against the isri_tools documentation).
A, B = isri_tools.get_align_map(a, b)
print(A, B, sep="\n")

[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12]
[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


In [18]:
import regex
import numpy as np
from warnings import warn

# Names of the XML-like tags recognised in the tagged input strings.
tag_list = [
    "PER",
    "LOC",
    "ACT",
    "CARD",
    "FT",
    "TITRE",
]

def normalize(txt: str) -> str:
    """Return `txt` unchanged.

    Placeholder hook: text normalization applied to both inputs before they
    are aligned. It is currently the identity function.
    """
    return txt


def _lower_keep_length(text: str) -> str:
    """Lowercase `text` character by character without changing its length.

    ``str.lower()`` may turn one character into several code points (for
    example ``'İ'``). The alignment maps are indexed by character position,
    so any character whose lowercase form is not exactly one code point is
    kept as is.
    """
    folded = []
    for char in text:
        low = char.lower()
        folded.append(low if len(low) == 1 else char)
    return "".join(folded)


def add_tags_prediction(ner_xml: str, text_ocr: str, verbose: bool = True) -> str:
    """Project the tags found in `ner_xml` onto `text_ocr`.

    The tags listed in `tag_list` are stripped from `ner_xml`, the remaining
    text is aligned with `text_ocr` (case-insensitively), and every tag is
    re-inserted in `text_ocr` at the position matching its original location.

    Args:
        ner_xml: Reference text containing ``<TAG>...</TAG>`` markers.
        text_ocr: Untagged text that should receive the tags.
        verbose: When True (default), print the chunks, the aligned strings
            and one trace line per inserted tag.

    Returns:
        `text_ocr` (after `normalize`) with the tags of `ner_xml` inserted.

    Warns:
        UserWarning: if the opening and closing tag counts of `ner_xml`
            differ; the per-tag counts are printed as well.
    """
    # Sanity check: each tag should be opened as many times as it is closed.
    cbegin = {tag: ner_xml.count(f"<{tag}>") for tag in tag_list}
    cend = {tag: ner_xml.count(f"</{tag}>") for tag in tag_list}
    if cbegin != cend:
        warn(f"The string '{ner_xml}' has unbalanced tags.")
        print("Opening:", cbegin)
        print("Closing:", cend)

    ## 1. Chunking
    # Raw string: "\L" is not a valid Python escape sequence. The capturing
    # group makes split() return text chunks at even indices and the tags at
    # odd indices.
    chunks = regex.split(r"(</?\L<tag>>)", ner_xml, tag=tag_list)
    if verbose:
        print("chunks:", chunks)
    A_chunks = chunks[0::2]
    A_tags = chunks[1::2]

    # 2.1 Normalize
    A_chunks = [normalize(chunk) for chunk in A_chunks]
    a = "".join(A_chunks)
    b = normalize(text_ocr)

    # 2.2 Single char "normalizations"
    # tolerance to OCR single char substitutions (case insensitive, no accents, etc.)
    # which DO NOT CHANGE THE MATCHING: they must keep one output character
    # per input character so that the alignment maps stay valid for a and b.
    case_insensitive = True
    ai, bi = a, b
    if case_insensitive:
        ai = _lower_keep_length(a)
        bi = _lower_keep_length(b)

    # 3. Align
    if verbose:
        # The aligned strings are only needed for display.
        aligned_a, aligned_b = isri_tools.align(ai, bi, ' ')
        print(aligned_a)
        print(aligned_b)
    # map_a[i] / map_b[j]: presumably the index, in the common alignment
    # string, of character i of `a` / character j of `b` (TODO confirm
    # against the isri_tools documentation).
    map_a, map_b = isri_tools.get_align_map(ai, bi)
    n_a = len(map_a)

    # 4. Offset, in the untagged text `a`, at which each tag occurs
    pos_tags = np.cumsum([len(x) for x in A_chunks[:-1]], dtype=int).tolist()

    # 5. Reproject b on the alignment string
    # default=-1 keeps this working when one (or both) maps are empty.
    n = max(max(map_a, default=-1), max(map_b, default=-1)) + 1
    # One extra slot so that a closing tag located after the very last
    # aligned character still has a valid index.
    chr_list = [''] * (n + 1)
    for k, c in zip(map_b, b):
        chr_list[k] = c

    # 6. Add tags on the alignment string
    # Tags are inserted from last to first so that earlier insertion indices
    # are not shifted by later insertions.
    for p, tag in zip(reversed(pos_tags), reversed(A_tags)):
        if tag.startswith("</"):
            # A closing tag goes right after the last character it encloses.
            src = p - 1
            if src >= 0:
                idx = map_a[src] + 1
            else:
                # Closing tag at the very start of the text: avoid wrapping
                # around to the last character through a negative index.
                idx = map_a[0] if n_a else 0
        else:
            # An opening tag goes right before the first character it encloses.
            src = p
            if src < n_a:
                idx = map_a[src]
            else:
                # Opening tag at the very end of the text.
                idx = map_a[-1] + 1 if n_a else 0
        if verbose:
            src_char = a[src] if 0 <= src < len(a) else ''
            print(p, src_char, chr_list[idx])
        chr_list.insert(idx, tag)

    return "".join(chr_list)

In [19]:
# Example: the tagged reference starts directly with a tag, while the OCR
# text has an extra leading "Mme " and differs in accents, case and wording.
A = "<PER>Anthony</PER>, fab. du <ACT>pêche</ACT>, <LOC>Châtelet</LOC>"
B = "Mme Antoine, fab la pecheur du chatelet et du Faub. St Antoine"
# Bare last expression: the returned tagged string is shown as the cell output.
add_tags_prediction(A, B)

chunks: ['', '<PER>', 'Anthony', '</PER>', ', fab. du ', '<ACT>', 'pêche', '</ACT>', ', ', '<LOC>', 'Châtelet', '</LOC>', '']
    anthony, fab. du pêche,     châtelet                       
mme antoine, fab  la pecheur du chatelet et du faub. st antoine
32 t  
24 C c
22 e u
17 p p
7 y ,
0 A A


'Mme <PER>Antoine</PER>, fab la <ACT>peche</ACT>ur du <LOC>chatelet</LOC> et du Faub. St Antoine'

In [20]:
# Example: the tagged reference has untagged text ("Mme ") before its first
# tag; the OCR text differs in accents, case and wording.
A = "Mme <PER>Anthony</PER>, fab. du <ACT>pêche</ACT>, <LOC>Châtelet</LOC>"
B = "Mme Antoine, fab la pecheur du chatelet et du Faub. St Antoine"
# Bare last expression: the returned tagged string is shown as the cell output.
add_tags_prediction(A, B)

chunks: ['Mme ', '<PER>', 'Anthony', '</PER>', ', fab. du ', '<ACT>', 'pêche', '</ACT>', ', ', '<LOC>', 'Châtelet', '</LOC>', '']
mme anthony, fab. du pêche,     châtelet                       
mme antoine, fab  la pecheur du chatelet et du faub. st antoine
36 t  
28 C c
26 e u
21 p p
11 y ,
4 A A


'Mme <PER>Antoine</PER>, fab la <ACT>peche</ACT>ur du <LOC>chatelet</LOC> et du Faub. St Antoine'