Retrain seq2seq model, using samples where 
- CER > 0 (to avoid learning to predict exactly the same thing)
- ground-truth can be realigned with original OCR transcription (not a case of OCR failure, to avoid training on noisy data) => this is easy to check in our dataset thanks to the "has_valid_ner_xml" field which indicates that we were able to reproject XML tags used to label Named Entities from clean, human-corrected text to OCR predictions.

## Data source

In [1]:
from pathlib import Path

In [6]:
dataset_basedir = Path("/lrde/home2/jchazalo/datasets/french_trade_directories_19th_century/")

In [7]:
extracted_data = dataset_basedir / "labeled/31-all_entries_ner_align_pero.json"

human_annot_path = dataset_basedir / "labeled/10-all_entries_ocr_ner_human.json"

In [8]:
import pandas as pd

In [9]:
def json_file_to_df(path: str) -> pd.DataFrame:
    with open(path, "r") as json_file:
        return pd.read_json(json_file)


human_df = json_file_to_df(human_annot_path)
auto_df = json_file_to_df(extracted_data)

len(human_df), len(auto_df)

(8765, 8765)

In [10]:
current_path = !pwd
current_path = current_path[0] if len(current_path) > 0 else None
import sys
if current_path is not None and current_path not in sys.path:
    sys.path.insert(0, current_path)
sys.path

['/work-ssd/jchazalo/postocr',
 '/work-ssd/jchazalo/postocr/paper-ner-bench-das22-2.0.0-camera-ready/src/ocr',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/work-ssd/jchazalo/postocr/venv/lib/python3.10/site-packages']

In [11]:
sys.path

['/work-ssd/jchazalo/postocr',
 '/work-ssd/jchazalo/postocr/paper-ner-bench-das22-2.0.0-camera-ready/src/ocr',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/work-ssd/jchazalo/postocr/venv/lib/python3.10/site-packages']

In [12]:
#!pip install regex

In [13]:
from text_utils import xml_unescape

In [14]:
df = human_df.loc[:, ["book", "page", "text_ocr", "ner_xml"]]
df = df.rename(columns={"text_ocr": "Ground Truth", "ner_xml": "NER Ground Truth"})
df.loc[:, "Sample"] = auto_df.loc[:, "text_ocr"]
df.loc[:, "NER Sample"] = auto_df.loc[:, "ner_xml"]
df.loc[:, "has_valid_ner_xml"] = auto_df.loc[:, "has_valid_ner_xml"]
df.loc[:, "NER Ground Truth"] = df["NER Ground Truth"].apply(xml_unescape)
df

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml
0,Bottin1_1820,107,"Dufan et Clémendot, pharmaciens, r. de la\nCha...","<PER>Dufan et Clémendot</PER>, <ACT>pharmacien...","Dufau et Clémendot, pharmaciens, r. de la\nChä...","<PER>Dufau et Clémendot</PER>, <ACT>pharmacien...",False
1,Bottin1_1820,107,"Dufant (Victor), libraire, r. du Gros-Che-\nne...","<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞\n\nT\nDufant (Victor), libraire, r. du Gros-...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...",True
2,Bottin1_1820,107,"Dufay, essayeur du commerce, place Dau-\nphine...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","Dutay, essayeur du commerce, place Dau-\n\n-\n...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...",True
3,Bottin1_1820,107,"Dulay, chandronnier, r. du Pont- aux\nChoux, 1...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","Dulay, chandronnier, r. du Pont- aux-\nChuux, ...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",True
4,Bottin1_1820,107,"Dufay (V.e), grenetière, r. du Faub.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","Dufay (V.e), grenetière, r. du Fauh.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",True
...,...,...,...,...,...,...,...
8760,Notables_communaux_seine_1801,144,"Lamarche, géographe , rue du foin.","<PER>Lamarche</PER>, <ACT>géographe</ACT> , <L...","Lamarche, geographe , rue du foin.","<PER>Lamarche</PER>, <ACT>geographe</ACT> , <L...",True
8761,Notables_communaux_seine_1801,144,"Lamarck, membre de l'institut, quatrième munic...","<PER>Lamarck</PER>, <ACT>membre de l'institut<...","Lamarck, membre de l'institut, quatrieme munic...","<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",True
8762,Notables_communaux_seine_1801,144,"Lamare, notaire, rue du faubourg honoré.","<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>ru...","Lamare , notaire, rue du faubourg honore.","<PER>Lamare</PER> , <ACT>notaire</ACT>, <LOC>r...",True
8763,Notables_communaux_seine_1801,144,"Lamarre , carrier, rue mouffetard.","<PER>Lamarre</PER> , <ACT>carrier</ACT>, <LOC>...","Lamarre, carrier, rue mouffetard.","<PER>Lamarre</PER>, <ACT>carrier</ACT>, <LOC>r...",True


## Filter out entries for which OCR transcription and target transcription cannot be aligned

In [15]:
df_valid = df.loc[df["has_valid_ner_xml"], :]
df_valid.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml
1,Bottin1_1820,107,"Dufant (Victor), libraire, r. du Gros-Che-\nne...","<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞\n\nT\nDufant (Victor), libraire, r. du Gros-...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...",True
2,Bottin1_1820,107,"Dufay, essayeur du commerce, place Dau-\nphine...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","Dutay, essayeur du commerce, place Dau-\n\n-\n...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...",True
3,Bottin1_1820,107,"Dulay, chandronnier, r. du Pont- aux\nChoux, 1...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","Dulay, chandronnier, r. du Pont- aux-\nChuux, ...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",True
4,Bottin1_1820,107,"Dufay (V.e), grenetière, r. du Faub.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","Dufay (V.e), grenetière, r. du Fauh.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",True
5,Bottin1_1820,107,"Dufeu, charcutier, r. Montmartre, 89. 318","<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>...","Y\n☞\nDnten,charentier, 1. Montmartre, 89. 318","Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <L...",True


## Keep only entries with errors
With our dataset, this represents roughly half of the entries.

Our goal here is to select appropriate samples to learn a character-level seq2seq model, avoiding the risk of learning to predict the same sequence as the input.

In [16]:
#!pip install evaluate jiwer cer

In [17]:
import evaluate
cer = evaluate.load("cer")

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

In [18]:
import isri_tools

In [19]:
df_valid.iloc[0]["Ground Truth"], df_valid.iloc[0]["Sample"]

('Dufant (Victor), libraire, r. du Gros-Che-\nnet, 2. 392',
 '☞\n\nT\nDufant (Victor), libraire, r. du Gros-Che-\nnet. 2.\nJO')

In [20]:
stats0 = isri_tools.compute_accurary_stats(df_valid.iloc[0]["Ground Truth"], df_valid.iloc[0]["Sample"])
stats0  

UNLV-ISRI OCR Accuracy Report Version 5.1
-----------------------------------------
      54   Characters
      10   Errors
   81.48%  Accuracy

       0   Reject Characters
       0   Suspect Markers
       0   False Marks
    0.00%  Characters Marked
   81.48%  Accuracy After Correction

     Ins    Subst      Del   Errors
       0        0        0        0   Marked
       1        4        5       10   Unmarked
       1        4        5       10   Total

   Count   Missed   %Right
       8        1    87.50   ASCII Spacing Characters
       9        1    88.89   ASCII Special Symbols
       4        3    25.00   ASCII Digits
       4        0   100.00   ASCII Uppercase Letters
      29        0   100.00   ASCII Lowercase Letters
      54        5    90.74   Total

  Errors   Marked   Correct-Generated
       5        0   {}-{<261E><\n><\n>T<\n>}
       4        0   { 392}-{<\n>JO}
       1        0   {,}-{.}

   Count   Missed   %Right
       1        0   100.00   {<\n>}
       7 

In [21]:
stats0.characters, stats0.errors, stats0.false_marks, stats0.reject_characters, stats0.suspect_markers

(54, 10, 0, 0, 0)

We are missing ins/sub/del measures here!

In [22]:
stats1 = cer.compute(references=[df_valid.iloc[0]["Ground Truth"]], predictions=[df_valid.iloc[0]["Sample"]])
stats1

0.16666666666666666

Same here :(

So, either I add some bindings in the modified ISRI tools, or I parse the output…

Parsing will be super slow, I fixed our custom ISRI module instead.

In [23]:
lines = stats0.__repr__().split("\n")
lines[15]

'       1        4        5       10   Total'

In [24]:
stats0.total_ops

insertions: 1, substitutions: 4, deletions: 5

In [25]:
stats0.total_ops.insertions, stats0.total_ops.substitutions, stats0.total_ops.deletions, stats0.total_ops.errors

(1, 4, 5, 10)

We need to understand how substitutions are counted to be correct, and check it complies with the definition at <https://huggingface.co/spaces/evaluate-metric/cer>:
$$
CER = (S + D + I) / N = (S + D + I) / (S + D + C)
$$
where

S is the number of substitutions, D is the number of deletions, I is the number of insertions, C is the number of correct characters, N is the number of characters in the reference (N=S+D+C).

In [26]:
stats2 = isri_tools.compute_accurary_stats("abbbbc", "addddc")
stats2.total_ops

insertions: 0, substitutions: 4, deletions: 0

In [27]:
stats2 = isri_tools.compute_accurary_stats("aaaacccc", "aaaaccccdddd")
stats2.total_ops

insertions: 0, substitutions: 0, deletions: 4

This seems correct, let's move on…

In [28]:
cer_all = cer.compute(references=list(df_valid["Ground Truth"]), predictions=list(df_valid["Sample"]))
cer_all

0.032892845459141694

We cannot get enough details from the CER metric in the "evaluate" package. We need to define a custom evaluation function.

In [29]:
stats2.errors, stats2.characters

(4, 8)

In [30]:
def local_cer(ref: str, pred: str) -> float:
    stats = isri_tools.compute_accurary_stats(ref, pred)
    return stats.errors / stats.characters

In [31]:
# custom version
local_cer("aaaacccc", "aaaaccccdddd")

0.5

In [32]:
# reference implementation
cer.compute(references=["aaaacccc"], predictions=["aaaaccccdddd"])

0.5

In [33]:
def local_stats(ref: str, pred: str) -> tuple[int, int, int, int, int]:
    """Compute character-level stats

    Args:
        ref (str): Reference (target) string
        pred (str): Predicted (automatically transcripted) string

    Returns:
        tuple[int, int, int, int, int]: (ref len, total errors, insertions, substitutions, deletions)
    """
    stats = isri_tools.compute_accurary_stats(ref, pred)
    return stats.characters, stats.errors, stats.total_ops.insertions, stats.total_ops.substitutions, stats.total_ops.deletions

In [34]:
# Augment the dataframe with useful columns
df_valid.loc[:, ('ref_len', 'errors', 'insertions', 'substitutions', 'deletions')] = df_valid.apply(
        lambda line: pd.Series(
            local_stats(ref=line["Ground Truth"], pred=line["Sample"]),
            index=['ref_len', 'errors', 'insertions', 'substitutions', 'deletions']), 
        axis=1,
        result_type='expand')
df_valid.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions
1,Bottin1_1820,107,"Dufant (Victor), libraire, r. du Gros-Che-\nne...","<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞\n\nT\nDufant (Victor), libraire, r. du Gros-...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...",True,54,10,1,4,5
2,Bottin1_1820,107,"Dufay, essayeur du commerce, place Dau-\nphine...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","Dutay, essayeur du commerce, place Dau-\n\n-\n...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...",True,61,18,8,7,3
3,Bottin1_1820,107,"Dulay, chandronnier, r. du Pont- aux\nChoux, 1...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","Dulay, chandronnier, r. du Pont- aux-\nChuux, ...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",True,51,6,1,4,1
4,Bottin1_1820,107,"Dufay (V.e), grenetière, r. du Faub.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","Dufay (V.e), grenetière, r. du Fauh.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",True,54,2,0,2,0
5,Bottin1_1820,107,"Dufeu, charcutier, r. Montmartre, 89. 318","<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>...","Y\n☞\nDnten,charentier, 1. Montmartre, 89. 318","Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <L...",True,41,11,1,6,4


In [35]:
df_valid.loc[:, ("CER",)] = df_valid["errors"] / df_valid["ref_len"]
df_valid.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER
1,Bottin1_1820,107,"Dufant (Victor), libraire, r. du Gros-Che-\nne...","<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞\n\nT\nDufant (Victor), libraire, r. du Gros-...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...",True,54,10,1,4,5,0.185185
2,Bottin1_1820,107,"Dufay, essayeur du commerce, place Dau-\nphine...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","Dutay, essayeur du commerce, place Dau-\n\n-\n...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...",True,61,18,8,7,3,0.295082
3,Bottin1_1820,107,"Dulay, chandronnier, r. du Pont- aux\nChoux, 1...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","Dulay, chandronnier, r. du Pont- aux-\nChuux, ...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",True,51,6,1,4,1,0.117647
4,Bottin1_1820,107,"Dufay (V.e), grenetière, r. du Faub.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","Dufay (V.e), grenetière, r. du Fauh.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",True,54,2,0,2,0,0.037037
5,Bottin1_1820,107,"Dufeu, charcutier, r. Montmartre, 89. 318","<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>...","Y\n☞\nDnten,charentier, 1. Montmartre, 89. 318","Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <L...",True,41,11,1,6,4,0.268293


In [36]:
df_recoverable_errors = df_valid[df_valid["errors"] > 0]
df_recoverable_errors

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER
1,Bottin1_1820,107,"Dufant (Victor), libraire, r. du Gros-Che-\nne...","<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞\n\nT\nDufant (Victor), libraire, r. du Gros-...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...",True,54,10,1,4,5,0.185185
2,Bottin1_1820,107,"Dufay, essayeur du commerce, place Dau-\nphine...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","Dutay, essayeur du commerce, place Dau-\n\n-\n...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...",True,61,18,8,7,3,0.295082
3,Bottin1_1820,107,"Dulay, chandronnier, r. du Pont- aux\nChoux, 1...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","Dulay, chandronnier, r. du Pont- aux-\nChuux, ...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",True,51,6,1,4,1,0.117647
4,Bottin1_1820,107,"Dufay (V.e), grenetière, r. du Faub.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","Dufay (V.e), grenetière, r. du Fauh.-S.\nDenis...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",True,54,2,0,2,0,0.037037
5,Bottin1_1820,107,"Dufeu, charcutier, r. Montmartre, 89. 318","<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>...","Y\n☞\nDnten,charentier, 1. Montmartre, 89. 318","Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <L...",True,41,11,1,6,4,0.268293
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8759,Notables_communaux_seine_1801,144,"Lamarche , marchand de vin, rue lazare.","<PER>Lamarche</PER> , <ACT>marchand de vin</AC...","Lamarche , marchand de vin , rue lazare.","<PER>Lamarche</PER> , <ACT>marchand de vin</AC...",True,39,1,0,0,1,0.025641
8760,Notables_communaux_seine_1801,144,"Lamarche, géographe , rue du foin.","<PER>Lamarche</PER>, <ACT>géographe</ACT> , <L...","Lamarche, geographe , rue du foin.","<PER>Lamarche</PER>, <ACT>geographe</ACT> , <L...",True,34,1,0,1,0,0.029412
8761,Notables_communaux_seine_1801,144,"Lamarck, membre de l'institut, quatrième munic...","<PER>Lamarck</PER>, <ACT>membre de l'institut<...","Lamarck, membre de l'institut, quatrieme munic...","<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",True,54,2,0,2,0,0.037037
8762,Notables_communaux_seine_1801,144,"Lamare, notaire, rue du faubourg honoré.","<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>ru...","Lamare , notaire, rue du faubourg honore.","<PER>Lamare</PER> , <ACT>notaire</ACT>, <LOC>r...",True,40,2,0,1,1,0.050000


In [37]:
print("all entries:", len(df))
print("-"*20)
print("\tentries with unrecoverable errors", len(df)-len(df_valid))
print("\tentries with valid alignment OCR pred ↔ target text", len(df_valid))
print("-"*20)
print("\t\tentries with recoverable errors", len(df_recoverable_errors))
print("\t\tentries without errors", len(df_valid)-len(df_recoverable_errors))

all entries: 8765
--------------------
	entries with unrecoverable errors 373
	entries with valid alignment OCR pred ↔ target text 8392
--------------------
		entries with recoverable errors 4681
		entries without errors 3711


# Split train test

In [41]:
#!pip install scikit-learn

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
SEED = 0

In [44]:
train_df, test_df = train_test_split(df_recoverable_errors, test_size=0.1, shuffle=True, random_state=SEED)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_df.shape, test_df.shape

((4212, 13), (469, 13))

# Launch the training!

In [45]:
from src.NMT_corrector import NMTCorrector

In [47]:
s2smdl = NMTCorrector()

In [48]:
%%time
s2smdl.fit(X=train_df["Sample"], y=train_df["Ground Truth"])

Corpus train's weight should be given. We default it to 1 for you.
[2023-05-29 18:25:53,813 INFO] Counter vocab from 10000 samples.
[2023-05-29 18:25:53,813 INFO] Build vocab on 10000 transformed examples/corpus.
[2023-05-29 18:25:53,972 INFO] Counters src:128
[2023-05-29 18:25:53,972 INFO] Counters tgt:112
[2023-05-29 18:25:56,025 INFO] Missing transforms field for train data, set to default: [].
[2023-05-29 18:25:56,025 INFO] Missing transforms field for valid data, set to default: [].
[2023-05-29 18:25:56,025 INFO] Parsed 2 corpora from -data.
[2023-05-29 18:25:56,026 INFO] Get special vocabs from Transforms: {'src': [], 'tgt': []}.
[2023-05-29 18:25:56,027 INFO] Building model...
[2023-05-29 18:25:56,200 INFO] NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(136, 500, padding_idx=1)
        )
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (rnn): LSTM(500, 

[2023-05-29 18:32:52,959 INFO] Step 350/ 2000; acc: 41.8; ppl:   7.6; xent: 2.0; lr: 0.50000; sents:    3200; bsz: 2586/2658/64; 2372/2438 tok/s;    417 sec;
[2023-05-29 18:33:47,279 INFO] Step 400/ 2000; acc: 46.8; ppl:   6.3; xent: 1.8; lr: 0.50000; sents:    3200; bsz: 2474/2548/64; 2277/2346 tok/s;    471 sec;
[2023-05-29 18:33:47,282 INFO] Saving checkpoint models/nmt/model/checkpoint_step_400.pt
[2023-05-29 18:34:41,044 INFO] Step 450/ 2000; acc: 49.6; ppl:   5.8; xent: 1.8; lr: 0.50000; sents:    3200; bsz: 2538/2607/64; 2360/2425 tok/s;    525 sec;
[2023-05-29 18:35:36,575 INFO] Step 500/ 2000; acc: 53.5; ppl:   5.0; xent: 1.6; lr: 0.50000; sents:    3200; bsz: 2648/2719/64; 2384/2448 tok/s;    580 sec;
[2023-05-29 18:36:22,363 INFO] Step 550/ 2000; acc: 60.8; ppl:   3.9; xent: 1.4; lr: 0.50000; sents:    3200; bsz: 2449/2518/64; 2674/2750 tok/s;    626 sec;
[2023-05-29 18:37:06,423 INFO] Step 600/ 2000; acc: 78.7; ppl:   2.2; xent: 0.8; lr: 0.50000; sents:    3200; bsz: 2579/2

CPU times: user 423 ms, sys: 123 ms, total: 546 ms
Wall time: 38min 51s


In [49]:
%%time
s2smdl.score(X=train_df["Sample"], y=train_df["Ground Truth"])

[2023-05-29 19:08:05,549 INFO] PRED SCORE: -0.0160, PRED PPL: 1.02 NB SENTENCES: 4212


CPU times: user 719 ms, sys: 9.24 ms, total: 728 ms
Wall time: 3min 23s


0.17289230437007375

In [50]:
%%time
s2smdl.score(X=test_df["Sample"], y=test_df["Ground Truth"])

[2023-05-29 19:08:32,030 INFO] PRED SCORE: -0.0227, PRED PPL: 1.02 NB SENTENCES: 469


CPU times: user 249 ms, sys: 20.3 ms, total: 269 ms
Wall time: 26.1 s


0.1779204213471588

In [52]:
%%time
pred_test = s2smdl.predict(test_df["Sample"])
pred_test

[2023-05-29 21:37:38,472 INFO] PRED SCORE: -0.0227, PRED PPL: 1.02 NB SENTENCES: 469


CPU times: user 24.5 ms, sys: 1.58 ms, total: 26.1 ms
Wall time: 20.9 s


0                                                                                                                                                                                                              Collugyod Marshall, peausster, Mon-\ntorgueil, 82.
1                                                                                                                                                                                                                            Rennit , et Pignard, RiN. Eganterde.
2                                                                                                                                                                                                 Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13.
3                                                                                                                                                                                                                            Audin

In [54]:
results = test_df.copy()
results.loc[:, ("Predictions",)] = pred_test
results.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER,Predictions
0,Cambon_almgene_1841,141,"Collingwood Marshall, peaussier, Mon-\ntorgueil, 82.","<PER>Collingwood Marshall</PER>, <ACT>peaussier</ACT>, <LOC>Mon-\ntorgueil</LOC>, <CARDINAL>82</CARDINAL>.","Collugwood Marshall, peausster, Mon-\ntorgucil, 82.","<PER>Collugwood Marshall</PER>, <ACT>peausster</ACT>, <LOC>Mon- torgucil</LOC>, <CARDINAL>82</CARDINAL>.",True,51,4,1,3,0,0.078431,"Collugyod Marshall, peausster, Mon-\ntorgueil, 82."
1,Duverneuil_et_La_Tynna_1806,147,"Benoit et Pignard, R. N. Egalité, 64.","<PER>Benoit et Pignard</PER>, <LOC>R. N. Egalité</LOC>, <CARDINAL>64</CARDINAL>.","Rennit ""et Pignard, RiN. Eganterdí.","<PER>Rennit &quot;et Pignard</PER>, <LOC>RiN. Egante</LOC>rd<CARDINAL>í</CARDINAL>.",True,37,12,3,8,1,0.324324,"Rennit , et Pignard, RiN. Eganterde."
2,DidotBottin_1860a,280,"Grandemange, fab. de visières et brides, Si-\nmon-le-Franc, 13.","<PER>Grandemange</PER>, <ACT>fab. de visières et brides</ACT>, <LOC>Si-\nmon-le-Franc</LOC>, <CARDINAL>13</CARDINAL>.","Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13.","<PER>Grandemange</PER>, <ACT>fab. de visieres et brides</ACT>, <LOC>Si- mon-le-Franc</LOC>, <CARDINAL>13</CARDINAL>.",True,62,1,0,1,0,0.016129,"Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13."
3,Duverneuil_et_La_Tynna_1805,292,"Audinot (Mlle ), R. Pinon, 12.— M. B.","<PER>Audinot (Mlle )</PER>, <LOC>R. Pinon</LOC>, <CARDINAL>12</CARDINAL>.— <LOC>M. B</LOC>.","Audinot (Mlle ), R. Pinon, 12.—M, B.","<PER>Audinot (Mlle )</PER>, <LOC>R. Pinon</LOC>, <CARDINAL>12</CARDINAL>.—<LOC>M, B</LOC>.",True,37,2,1,1,0,0.054054,"Audinot (Mlle ), R. Pino , 12.—M. B."
4,Didot_1851a,226,"Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt; 219. *","<PER>Gautier et Fortier</PER>, <ACT>fab. de produits chimiques</ACT>,<LOC>\nSt-Maur-Popincourt</LOC>; <CARDINAL>219</CARDINAL>. *","Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt, 219.","<PER>Gautier et Fortier</PER>, <ACT>fab. de produits chimiques</ACT>,<LOC> St-Maur-Popincourt</LOC>, <CARDINAL>219</CARDINAL>.",True,74,3,2,1,0,0.040541,"Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt, 219."


## Compute post-correction CER

In [55]:
results.loc[:, ("CER post corr",)] = results.apply(
        lambda line: local_cer(ref=line["Ground Truth"], pred=line["Predictions"]),
        axis=1)
results.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER,Predictions,CER post corr
0,Cambon_almgene_1841,141,"Collingwood Marshall, peaussier, Mon-\ntorgueil, 82.","<PER>Collingwood Marshall</PER>, <ACT>peaussier</ACT>, <LOC>Mon-\ntorgueil</LOC>, <CARDINAL>82</CARDINAL>.","Collugwood Marshall, peausster, Mon-\ntorgucil, 82.","<PER>Collugwood Marshall</PER>, <ACT>peausster</ACT>, <LOC>Mon- torgucil</LOC>, <CARDINAL>82</CARDINAL>.",True,51,4,1,3,0,0.078431,"Collugyod Marshall, peausster, Mon-\ntorgueil, 82.",0.098039
1,Duverneuil_et_La_Tynna_1806,147,"Benoit et Pignard, R. N. Egalité, 64.","<PER>Benoit et Pignard</PER>, <LOC>R. N. Egalité</LOC>, <CARDINAL>64</CARDINAL>.","Rennit ""et Pignard, RiN. Eganterdí.","<PER>Rennit &quot;et Pignard</PER>, <LOC>RiN. Egante</LOC>rd<CARDINAL>í</CARDINAL>.",True,37,12,3,8,1,0.324324,"Rennit , et Pignard, RiN. Eganterde.",0.351351
2,DidotBottin_1860a,280,"Grandemange, fab. de visières et brides, Si-\nmon-le-Franc, 13.","<PER>Grandemange</PER>, <ACT>fab. de visières et brides</ACT>, <LOC>Si-\nmon-le-Franc</LOC>, <CARDINAL>13</CARDINAL>.","Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13.","<PER>Grandemange</PER>, <ACT>fab. de visieres et brides</ACT>, <LOC>Si- mon-le-Franc</LOC>, <CARDINAL>13</CARDINAL>.",True,62,1,0,1,0,0.016129,"Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13.",0.016129
3,Duverneuil_et_La_Tynna_1805,292,"Audinot (Mlle ), R. Pinon, 12.— M. B.","<PER>Audinot (Mlle )</PER>, <LOC>R. Pinon</LOC>, <CARDINAL>12</CARDINAL>.— <LOC>M. B</LOC>.","Audinot (Mlle ), R. Pinon, 12.—M, B.","<PER>Audinot (Mlle )</PER>, <LOC>R. Pinon</LOC>, <CARDINAL>12</CARDINAL>.—<LOC>M, B</LOC>.",True,37,2,1,1,0,0.054054,"Audinot (Mlle ), R. Pino , 12.—M. B.",0.054054
4,Didot_1851a,226,"Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt; 219. *","<PER>Gautier et Fortier</PER>, <ACT>fab. de produits chimiques</ACT>,<LOC>\nSt-Maur-Popincourt</LOC>; <CARDINAL>219</CARDINAL>. *","Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt, 219.","<PER>Gautier et Fortier</PER>, <ACT>fab. de produits chimiques</ACT>,<LOC> St-Maur-Popincourt</LOC>, <CARDINAL>219</CARDINAL>.",True,74,3,2,1,0,0.040541,"Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt, 219.",0.040541


In [56]:
results.loc[:, ("is_improved_or_same",)] = results["CER post corr"] <= results["CER"]
results.loc[:, ("is_improved_strictly",)] = results["CER post corr"] < results["CER"]
results.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER,Predictions,CER post corr,is_improved_or_same,is_improved_strictly
0,Cambon_almgene_1841,141,"Collingwood Marshall, peaussier, Mon-\ntorgueil, 82.","<PER>Collingwood Marshall</PER>, <ACT>peaussier</ACT>, <LOC>Mon-\ntorgueil</LOC>, <CARDINAL>82</CARDINAL>.","Collugwood Marshall, peausster, Mon-\ntorgucil, 82.","<PER>Collugwood Marshall</PER>, <ACT>peausster</ACT>, <LOC>Mon- torgucil</LOC>, <CARDINAL>82</CARDINAL>.",True,51,4,1,3,0,0.078431,"Collugyod Marshall, peausster, Mon-\ntorgueil, 82.",0.098039,False,False
1,Duverneuil_et_La_Tynna_1806,147,"Benoit et Pignard, R. N. Egalité, 64.","<PER>Benoit et Pignard</PER>, <LOC>R. N. Egalité</LOC>, <CARDINAL>64</CARDINAL>.","Rennit ""et Pignard, RiN. Eganterdí.","<PER>Rennit &quot;et Pignard</PER>, <LOC>RiN. Egante</LOC>rd<CARDINAL>í</CARDINAL>.",True,37,12,3,8,1,0.324324,"Rennit , et Pignard, RiN. Eganterde.",0.351351,False,False
2,DidotBottin_1860a,280,"Grandemange, fab. de visières et brides, Si-\nmon-le-Franc, 13.","<PER>Grandemange</PER>, <ACT>fab. de visières et brides</ACT>, <LOC>Si-\nmon-le-Franc</LOC>, <CARDINAL>13</CARDINAL>.","Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13.","<PER>Grandemange</PER>, <ACT>fab. de visieres et brides</ACT>, <LOC>Si- mon-le-Franc</LOC>, <CARDINAL>13</CARDINAL>.",True,62,1,0,1,0,0.016129,"Grandemange, fab. de visieres et brides, Si-\nmon-le-Franc, 13.",0.016129,True,False
3,Duverneuil_et_La_Tynna_1805,292,"Audinot (Mlle ), R. Pinon, 12.— M. B.","<PER>Audinot (Mlle )</PER>, <LOC>R. Pinon</LOC>, <CARDINAL>12</CARDINAL>.— <LOC>M. B</LOC>.","Audinot (Mlle ), R. Pinon, 12.—M, B.","<PER>Audinot (Mlle )</PER>, <LOC>R. Pinon</LOC>, <CARDINAL>12</CARDINAL>.—<LOC>M, B</LOC>.",True,37,2,1,1,0,0.054054,"Audinot (Mlle ), R. Pino , 12.—M. B.",0.054054,True,False
4,Didot_1851a,226,"Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt; 219. *","<PER>Gautier et Fortier</PER>, <ACT>fab. de produits chimiques</ACT>,<LOC>\nSt-Maur-Popincourt</LOC>; <CARDINAL>219</CARDINAL>. *","Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt, 219.","<PER>Gautier et Fortier</PER>, <ACT>fab. de produits chimiques</ACT>,<LOC> St-Maur-Popincourt</LOC>, <CARDINAL>219</CARDINAL>.",True,74,3,2,1,0,0.040541,"Gautier et Fortier, fab. de produits chimiques,\nSt-Maur-Popincourt, 219.",0.040541,True,False


In [60]:
len(results), results.loc[:, ("has_valid_ner_xml", "is_improved_or_same", "is_improved_strictly")].sum(axis=0)

(469,
 has_valid_ner_xml       469
 is_improved_or_same     322
 is_improved_strictly    196
 dtype: int64)

In [64]:
print(f"Same:   {(322-196)/469*100:.2f}%")
print(f"Better: {(196)/469*100:.2f}%")
print(f"Worse:  {(469-322)/469*100:.2f}%")

Same:   26.87%
Better: 41.79%
Worse:  31.34%


In [67]:
results[results["is_improved_strictly"]].sample(10).loc[:,("Sample", "Predictions", "Ground Truth", "CER", "CER post corr")]

Unnamed: 0,Sample,Predictions,Ground Truth,CER,CER post corr
121,"Borel, carrossier, Cadet,\n7.","Borel, carrossier, Cadet, 7.","Borel, carrossier, Cadet, 7.",0.035714,0.0
244,"Lavarde, R. Batave, 400. Tuileries.","Lavarde , R. Batave, 400. Tuileries.","Lavarde , R. Batave, 400. Tuileries.",0.027778,0.0
123,"Bridan\nA\nfab, le creux, r. du Puits.I\n—","Bridan (Ae), fab, le creux, r. du Puits. 19.","Bridan A.. fab. le creux. r du Puits, 12.",0.292683,0.219512
443,"Courty-Fontaine, cordons de montres, St-De-\nnIS.309.","Courty-Fontaine, cordons de montres, St-De-\nnis, 38.","Courty-Fontaine, cordons de montres, St- De-\nnis, 309.",0.092593,0.055556
389,"Legrand (Mile.), boulang. r. aux Ours. 30.","Legrand (Mlle.), boulang. r. aux Ours. 30.","Legrand (Mlle.), boulang., r. aux Ours, 30.",0.069767,0.046512
18,"DUPRE, r. des Canneltes, 15.","DUPRÉ, r. des Canneltes, 15.","DUPRÉ, r. des Cannettes, 15.",0.071429,0.035714
422,"Collinct, chén. Ferme des Mathurins, 28.","Collinet, chén. Ferme des Mathurins, 28.","Collinet, ébén. Ferme des Mathurins, 28.",0.075,0.05
218,"Grandet i2, conseiller maître à la cour des\ncomptes, Ferme. 26.","Grandet , conseiller maçtre à la cour des\ncomptes, Ferme. 26.","Grandet , conseiller maître à la cour des\ncomptes, Ferme, 26.",0.048387,0.032258
135,"Gachard et Comp.\n(fab.), R. du Faub. Denis, 36. Poisson.","Gachard et Comp. (fab.). R. du Faub. Denis, 36. Poisson.","Gachard et Comp. ( fab.). R. du Faub. Denis , 36. Poisson.",0.068966,0.034483
147,"Carpentier, cloitre-Honoré, n. 15, - de la Halle.","Carpentier, cloître-Honoré, n. 15, - de la Halle.","Carpentier, cloître-Honoré, n. 15, - de la Halle.",0.020408,0.0


# TODO
- [ ] analyze for each kind of named entity
- [ ] perform some simple space normalization before any correction? (no, avoid hand-written rules)
- [ ] (later) self-supervised pre-training
- [ ] check whether we need some early stopping

# Prepare train set for the verifier

In [85]:
%%time
pred_train = s2smdl.predict(train_df["Sample"])
pred_train

[2023-05-29 22:28:19,429 INFO] PRED SCORE: -0.0160, PRED PPL: 1.02 NB SENTENCES: 4212


CPU times: user 153 ms, sys: 36.1 ms, total: 189 ms
Wall time: 2min 40s


0                                     Petit, nég. f. St Martin, 13.
1                        Balizeaux (Mad.), lingère, S.-Honuré, 169.
2                       Cave générale des hospices, Ne-N. -Dame, 2.
3                           Moreau (H.), avoc. cour imp., Alger, 9.
4               Laurent (J. A.) (miniature) , rue Nicaise, no. 487.
                                   ...                             
4207                         Dufour fils, corroyeur, Châtillon, 14.
4208       Lépine (J.-B. ), architecte, Marais-Saint-Ger-\nmain. 3.
4209                          PAULMIR, r. de la Ferronnerie, 4. 472
4210                             Audouard, O. , médecin, Cadet, 6.
4211    Gauthier-la-Chapelle, avocat à la cour d'appel,\nOdeon, 31.
Length: 4212, dtype: object

In [86]:
train_verif = train_df.copy()
train_verif.loc[:, ("Predictions",)] = pred_train
train_verif.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER,Predictions
0,Cambon_almgene_1841,330,"Petit, nég. f. St Martin, 13.","<PER>Petit</PER>, <ACT>nég.</ACT> <LOC>f. St Martin</LOC>, <CARDINAL>13</CARDINAL>.","Felit, neg. f. St Martin, 13.","<PER>Felit</PER>, <ACT>neg.</ACT> <LOC>f. St Martin</LOC>, <CARDINAL>13</CARDINAL>.",True,29,3,0,3,0,0.103448,"Petit, nég. f. St Martin, 13."
1,Bottin1_1827,37,"Balizeaux (Mad.), lingère, S.-Honuré, 169.","<PER>Balizeaux (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>S.-Honuré</LOC>, <CARDINAL>169</CARDINAL>.","Balizeaux (Mad.), lingère, S.-Honoré, 169.","<PER>Balizeaux (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>S.-Honoré</LOC>, <CARDINAL>169</CARDINAL>.",True,42,1,0,1,0,0.02381,"Balizeaux (Mad.), lingère, S.-Honuré, 169."
2,Bottin1_1837,80,"Cave générale des hospices, Ne-N. -Dame, 2.","<PER>Cave générale des hospices</PER>, <LOC>Ne-N. -Dame</LOC>, <CARDINAL>2</CARDINAL>.","Cave générale des hospices, Ne-N. Dame, 2.","<PER>Cave générale des hospices</PER>, <LOC>Ne-N. Dame</LOC>, <CARDINAL>2</CARDINAL>.",True,43,1,1,0,0,0.023256,"Cave générale des hospices, Ne-N. -Dame, 2."
3,DidotBottin_1861a,424,"Moreau (H. ), avoc. cour imp., Alger, 9.","<PER>Moreau (H. )</PER>, <ACT>avoc. cour imp.</ACT>, <LOC>Alger</LOC>, <CARDINAL>9</CARDINAL>.","Moreau (H.), avoc. cour imp., Alger, 9.","<PER>Moreau (H.)</PER>, <ACT>avoc. cour imp.</ACT>, <LOC>Alger</LOC>, <CARDINAL>9</CARDINAL>.",True,40,1,1,0,0,0.025,"Moreau (H.), avoc. cour imp., Alger, 9."
4,Favre_et_Duchesne_1798,700,"Jaurent (J. A.) (miniature) , rue Nicaise, no. 487.","<PER>Jaurent (J. A.)</PER> <ACT>(miniature)</ACT> , <LOC>rue Nicaise</LOC>, no. <CARDINAL>487</CARDINAL>.","Laurent (J. A.) (miniature)\n, rue Nicaise, no. 487.","<PER>Laurent (J. A.)</PER> <ACT>(miniature)</ACT> , <LOC>rue Nicaise</LOC>, no. <CARDINAL>487</CARDINAL>.",True,51,2,0,2,0,0.039216,"Laurent (J. A.) (miniature) , rue Nicaise, no. 487."


In [87]:
train_verif.loc[:, ("CER post corr",)] = train_verif.apply(
        lambda line: local_cer(ref=line["Ground Truth"], pred=line["Predictions"]),
        axis=1)
train_verif.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER,Predictions,CER post corr
0,Cambon_almgene_1841,330,"Petit, nég. f. St Martin, 13.","<PER>Petit</PER>, <ACT>nég.</ACT> <LOC>f. St Martin</LOC>, <CARDINAL>13</CARDINAL>.","Felit, neg. f. St Martin, 13.","<PER>Felit</PER>, <ACT>neg.</ACT> <LOC>f. St Martin</LOC>, <CARDINAL>13</CARDINAL>.",True,29,3,0,3,0,0.103448,"Petit, nég. f. St Martin, 13.",0.0
1,Bottin1_1827,37,"Balizeaux (Mad.), lingère, S.-Honuré, 169.","<PER>Balizeaux (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>S.-Honuré</LOC>, <CARDINAL>169</CARDINAL>.","Balizeaux (Mad.), lingère, S.-Honoré, 169.","<PER>Balizeaux (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>S.-Honoré</LOC>, <CARDINAL>169</CARDINAL>.",True,42,1,0,1,0,0.02381,"Balizeaux (Mad.), lingère, S.-Honuré, 169.",0.0
2,Bottin1_1837,80,"Cave générale des hospices, Ne-N. -Dame, 2.","<PER>Cave générale des hospices</PER>, <LOC>Ne-N. -Dame</LOC>, <CARDINAL>2</CARDINAL>.","Cave générale des hospices, Ne-N. Dame, 2.","<PER>Cave générale des hospices</PER>, <LOC>Ne-N. Dame</LOC>, <CARDINAL>2</CARDINAL>.",True,43,1,1,0,0,0.023256,"Cave générale des hospices, Ne-N. -Dame, 2.",0.0
3,DidotBottin_1861a,424,"Moreau (H. ), avoc. cour imp., Alger, 9.","<PER>Moreau (H. )</PER>, <ACT>avoc. cour imp.</ACT>, <LOC>Alger</LOC>, <CARDINAL>9</CARDINAL>.","Moreau (H.), avoc. cour imp., Alger, 9.","<PER>Moreau (H.)</PER>, <ACT>avoc. cour imp.</ACT>, <LOC>Alger</LOC>, <CARDINAL>9</CARDINAL>.",True,40,1,1,0,0,0.025,"Moreau (H.), avoc. cour imp., Alger, 9.",0.025
4,Favre_et_Duchesne_1798,700,"Jaurent (J. A.) (miniature) , rue Nicaise, no. 487.","<PER>Jaurent (J. A.)</PER> <ACT>(miniature)</ACT> , <LOC>rue Nicaise</LOC>, no. <CARDINAL>487</CARDINAL>.","Laurent (J. A.) (miniature)\n, rue Nicaise, no. 487.","<PER>Laurent (J. A.)</PER> <ACT>(miniature)</ACT> , <LOC>rue Nicaise</LOC>, no. <CARDINAL>487</CARDINAL>.",True,51,2,0,2,0,0.039216,"Laurent (J. A.) (miniature) , rue Nicaise, no. 487.",0.019608


In [88]:
import numpy as np

In [89]:
train_verif.loc[:, ("same",)] = pd.Series(np.isclose(train_verif["CER post corr"], train_verif["CER"]))
train_verif.loc[:, ("better",)] = train_verif["CER post corr"] < train_verif["CER"]
train_verif.loc[:, ("worse",)] = train_verif["CER post corr"] > train_verif["CER"]
train_verif.head()

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,substitutions,deletions,CER,Predictions,CER post corr,same,better,worse
0,Cambon_almgene_1841,330,"Petit, nég. f. St Martin, 13.","<PER>Petit</PER>, <ACT>nég.</ACT> <LOC>f. St Martin</LOC>, <CARDINAL>13</CARDINAL>.","Felit, neg. f. St Martin, 13.","<PER>Felit</PER>, <ACT>neg.</ACT> <LOC>f. St Martin</LOC>, <CARDINAL>13</CARDINAL>.",True,29,3,0,3,0,0.103448,"Petit, nég. f. St Martin, 13.",0.0,False,True,False
1,Bottin1_1827,37,"Balizeaux (Mad.), lingère, S.-Honuré, 169.","<PER>Balizeaux (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>S.-Honuré</LOC>, <CARDINAL>169</CARDINAL>.","Balizeaux (Mad.), lingère, S.-Honoré, 169.","<PER>Balizeaux (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>S.-Honoré</LOC>, <CARDINAL>169</CARDINAL>.",True,42,1,0,1,0,0.02381,"Balizeaux (Mad.), lingère, S.-Honuré, 169.",0.0,False,True,False
2,Bottin1_1837,80,"Cave générale des hospices, Ne-N. -Dame, 2.","<PER>Cave générale des hospices</PER>, <LOC>Ne-N. -Dame</LOC>, <CARDINAL>2</CARDINAL>.","Cave générale des hospices, Ne-N. Dame, 2.","<PER>Cave générale des hospices</PER>, <LOC>Ne-N. Dame</LOC>, <CARDINAL>2</CARDINAL>.",True,43,1,1,0,0,0.023256,"Cave générale des hospices, Ne-N. -Dame, 2.",0.0,False,True,False
3,DidotBottin_1861a,424,"Moreau (H. ), avoc. cour imp., Alger, 9.","<PER>Moreau (H. )</PER>, <ACT>avoc. cour imp.</ACT>, <LOC>Alger</LOC>, <CARDINAL>9</CARDINAL>.","Moreau (H.), avoc. cour imp., Alger, 9.","<PER>Moreau (H.)</PER>, <ACT>avoc. cour imp.</ACT>, <LOC>Alger</LOC>, <CARDINAL>9</CARDINAL>.",True,40,1,1,0,0,0.025,"Moreau (H.), avoc. cour imp., Alger, 9.",0.025,True,False,False
4,Favre_et_Duchesne_1798,700,"Jaurent (J. A.) (miniature) , rue Nicaise, no. 487.","<PER>Jaurent (J. A.)</PER> <ACT>(miniature)</ACT> , <LOC>rue Nicaise</LOC>, no. <CARDINAL>487</CARDINAL>.","Laurent (J. A.) (miniature)\n, rue Nicaise, no. 487.","<PER>Laurent (J. A.)</PER> <ACT>(miniature)</ACT> , <LOC>rue Nicaise</LOC>, no. <CARDINAL>487</CARDINAL>.",True,51,2,0,2,0,0.039216,"Laurent (J. A.) (miniature) , rue Nicaise, no. 487.",0.019608,False,True,False


In [90]:
train_verif.loc[:, ("has_valid_ner_xml", "same", "better", "worse")].sum(axis=0)

has_valid_ner_xml    4212
same                  578
better               2828
worse                 806
dtype: int64

In [97]:
(train_verif[train_verif["worse"]]
 .loc[:,("Sample", "Predictions", "Ground Truth", "CER", "CER post corr")]
 .sort_values(by="CER post corr", ascending=False)
 .head(10))

Unnamed: 0,Sample,Predictions,Ground Truth,CER,CER post corr
3493,n\nBouly R. Charentou\n—,"Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf, Bouly R. Charentouf","Bouly, R. Charenton, 103",0.375,9.541667
3159,"Chaumin, tailleur,\nVictoire, 28.","Chaumin, tailleur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeur, Villeu","Chaumin, tailleur, Victoire, 28.",0.03125,7.0
1066,"Hug, sellerie, r. de l Echiquier. 38.","Hug, sellerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Echiquier, 38lerie, r. de l'Ech","Hug, sellerie, r. de l'Echiquier, 38",0.083333,5.944444
3605,"—17\nDelasalle, jeune,\nLhumpg, FF.\nK.\n\nE. des: Petis","Delasalle, jeune, Lhumpg, FF.\nR. des Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Petis Pe","Delasalle, jeune , R. Gr. des Petits\nChamps, 11",0.702128,4.574468
1491,"—\nBourguignon freres\nA-T-ELDITES, i\n—\n\nR. Notre-Dan\nS","Bourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguignon freres), R. Nourguigno","Bourguignon frères, R. Notre-Dame-\ndes-Victoires, 34.",0.641509,4.0
232,"Durbec (F.), épicerie. r. Traînée, passage des","Durbec (F.), épicerie. r. Trainée, passage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des Cassage des","Durbec (F.), épicerie. r. Trainée, passage des\nChartreux.",0.210526,3.526316
1287,"Brahy , place du Palais-Egalité, - de la Butte-des-Moulins,","Brahy, place du Palais-Egalité, - de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2 de la Butte-des-Moulins, 2","Brahy, place du Palais-Egalité, - de la Butte-des-Moulins,",0.017241,3.310345
2628,"—\n\nDuflas, rouennerie, 1. Bourg-l'Abbé, pass.\n412\nde lAncre, 34.\n—","Duflas, rouennerie, r. Bourg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l'Abbé, pass. Coufg-l","Duflos, rouennerie, r. Bourg-l'Abbé, pass.\nde l'Ancre, 34. 412",0.225806,3.241935
3668,"Dupont , à la Croix Rouge , au coin de la","Dupont, a la Croix Rouge, au coin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin de la Croin","Dupont, à la Croix Rouge , au coin de la\nR. du Vieux Colombier.",0.380952,3.174603
3310,"Hlélène, salaisons,fromages, fruits secs,etc.\nrue Richelieu, 72.","Hlélène, salaisons, fruits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits secs, truits se","Hélène, salaisons,fromages, fruits secs,etc.\nrue Richelieu, 72.",0.015873,3.174603


In [92]:
# Augment with stats to be ready to train verifier
train_verif.loc[:, ('sample_len', 'corr_errors', 'corr_insertions', 'corr_substitutions', 'corr_deletions')] = train_verif.apply(
        lambda line: pd.Series(
            local_stats(ref=line["Sample"], pred=line["Predictions"]),
            index=['sample_len', 'corr_errors', 'corr_insertions', 'corr_substitutions', 'corr_deletions']), 
        axis=1,
        result_type='expand')
train_verif.sample(4)

Unnamed: 0,book,page,Ground Truth,NER Ground Truth,Sample,NER Sample,has_valid_ner_xml,ref_len,errors,insertions,...,Predictions,CER post corr,same,better,worse,sample_len,corr_errors,corr_insertions,corr_substitutions,corr_deletions
4113,Bottin1_1827,117,"Desvaux, nouveautés, S.-Honoré, 25.","<PER>Desvaux</PER>, <ACT>nouveautés</ACT>, <LOC>S.-Honoré</LOC>, <CARDINAL>25</CARDINAL>.","Desvaux, noureautes, S.-Honoré, 27.","<PER>Desvaux</PER>, <ACT>noureautes</ACT>, <LOC>S.-Honoré</LOC>, <CARDINAL>27</CARDINAL>.",True,35,3,0,...,"Desvaux, noureautes, S.-Honoré, 27.",0.085714,True,False,False,35,0,0,0,0
876,Cambon_almgene_1841,330,"Petiteau, coif. Croix des Pet. Champs, 9.","<PER>Petiteau</PER>, <ACT>coif.</ACT> <LOC>Croix des Pet. Champs</LOC>, <CARDINAL>9</CARDINAL>.","Petiteau, coIf. Croix des Fet. Champs, 9..","<PER>Petiteau</PER>, <ACT>coIf.</ACT> <LOC>Croix des Fet. Champs</LOC>, <CARDINAL>9</CARDINAL>..",True,41,3,0,...,"Petiteau, coif. Croix des Pet. Champs, 9.",0.0,False,True,False,42,3,1,2,0
3494,La_Tynna_1813,377,"Thouin (le chevalier), membre de l'in-\nstitut , rue de Seine-St.-Victor, 35.","<PER>Thouin (le chevalier)</PER>, <ACT>membre de l'in-\nstitut</ACT> , <LOC>rue de Seine-St.-Victor</LOC>, <CARDINAL>35</CARDINAL>.","Thouin (le chevalier), membre de lin-\nstitut .rue de Seine-St.-Victor, 55.","<PER>Thouin (le chevalier)</PER>, <ACT>membre de lin- stitut</ACT> .<LOC>rue de Seine-St.-Victor</LOC>, <CARDINAL>55</CARDINAL>.",True,76,4,2,...,"Thouin (le chevalier), mémbre de lin-\nstitut . rue de Seine-St.-Victor, 55.",0.052632,True,False,False,74,2,0,1,1
3137,Duverneuil_et_La_Tynna_1801,260,"AUDEBERT , Camp. à gauche par le boulevard, 143. Observatoire.","<PER>AUDEBERT</PER> , <LOC>Camp. à gauche par le boulevard</LOC>, <CARDINAL>143</CARDINAL>. <LOC>Observatoire</LOC>.","AUDEBERT, Camp. à gauche par le boulevard, 143. Observatoire.","<PER>AUDEBERT</PER>, <LOC>Camp. à gauche par le boulevard</LOC>, <CARDINAL>143</CARDINAL>. <LOC>Observatoire</LOC>.",True,62,1,1,...,"AUDEBERT, camp. à gauche pardire.",0.483871,False,False,True,61,29,28,1,0


In [93]:
train_verif.to_pickle("train_verif.pkl")

In [94]:
!ls -lh train_verif.pkl

-rw-r--r-- 1 jchazalo lrde 2,1M 29 mai   22:28 train_verif.pkl
