# 50 - Experiment 2: figures and tables

In [1]:
from pathlib import Path

# PATHS
# Directory holding the experiment-2 metrics JSON files read by the cells below.
# The relative name is resolved to an absolute path at this point, so it is
# interpreted relative to the working directory of the kernel.
METRICS_DIR = Path("43-experiment_2_metrics").resolve() # Or set the path to the metrics extracted from the release archive
# Echo the resolved path as the cell output so the reader can check it.
METRICS_DIR

PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/43-experiment_2_metrics')

In [2]:
import os, re

# Names of the metrics files in METRICS_DIR that contain "camembert_pretrained".
# os.listdir returns entries in arbitrary order; the ordering is fixed later on.
metrics_camembert_pretrained = [f for f in os.listdir(METRICS_DIR) if "camembert_pretrained" in f]
# Dataset tag of each file = the "<tag>" in the trailing "_<tag>.json".
# The pattern is a raw string: "\w" and "\." are regex escapes, not valid Python
# string escapes, and a plain literal triggers an invalid-escape-sequence warning.
metrics_ds = [re.match(r".+_(\w+)\.json", f).group(1) for f in metrics_camembert_pretrained]
# Display both lists side by side to check that files and tags line up.
metrics_camembert_pretrained, metrics_ds

(['camembert_pretrained_ref.json',
  'camembert_pretrained_pero.json',
  'camembert_pretrained_tess.json'],
 ['ref', 'pero', 'tess'])

In [3]:
import pandas as pd
import functools

# Read every "camembert_pretrained" metrics file as a Series and wrap each one
# in a one-row DataFrame so the rows can be stacked.
data_frames = [
    pd.DataFrame([pd.read_json(METRICS_DIR / fname, typ='series')])
    for fname in metrics_camembert_pretrained
]
# Derive the row labels from the very file list that was just loaded rather than
# from the shared `metrics_ds` variable: `metrics_ds` is rebound by the
# "camembert_simple" cell (os.listdir order is arbitrary, so possibly in a
# different order), and re-running this cell afterwards would otherwise attach
# the wrong dataset label to each row.
dataset_labels = [
    re.match(r".+_(\w+)\.json", fname).group(1)
    for fname in metrics_camembert_pretrained
]
camembert_pretrained = (
    pd.concat(data_frames, keys=dataset_labels)
    .droplevel(1)                  # drop the inner per-frame row index added by concat
    .loc[["ref", "pero", "tess"]]  # Force index order to be ref - pero - tess
)
camembert_pretrained

Unnamed: 0,test_loss,test_precision,test_recall,test_f1,test_accuracy,test_runtime,test_samples_per_second,test_steps_per_second
ref,0.169713,0.965261,0.965261,0.965261,0.952012,51.1994,32.598,4.082
pero,0.259704,0.932983,0.925616,0.929285,0.934323,51.2087,32.592,4.081
tess,0.306922,0.902732,0.91314,0.907906,0.924237,52.3309,31.893,3.994


In [4]:
# Names of the metrics files in METRICS_DIR that contain "camembert_simple".
# os.listdir returns entries in arbitrary order; the ordering is fixed later on.
metrics_camembert = [f for f in os.listdir(METRICS_DIR) if "camembert_simple" in f]
# Dataset tag of each file = the "<tag>" in the trailing "_<tag>.json".
# The pattern is a raw string: "\w" and "\." are regex escapes, not valid Python
# string escapes, and a plain literal triggers an invalid-escape-sequence warning.
metrics_ds = [re.match(r".+_(\w+)\.json", f).group(1) for f in metrics_camembert]
# Display both lists side by side to check that files and tags line up.
metrics_camembert, metrics_ds

(['camembert_simple_pero.json',
  'camembert_simple_tess.json',
  'camembert_simple_ref.json'],
 ['pero', 'tess', 'ref'])

In [5]:
# Read every "camembert_simple" metrics file as a Series and wrap each one in a
# one-row DataFrame so the rows can be stacked.
data_frames = [
    pd.DataFrame([pd.read_json(METRICS_DIR / fname, typ='series')])
    for fname in metrics_camembert
]
# Derive the row labels from the very file list that was just loaded rather than
# from the shared `metrics_ds` variable: `metrics_ds` is also assigned by the
# "camembert_pretrained" cell (os.listdir order is arbitrary, so possibly in a
# different order), and running this cell after that one would otherwise attach
# the wrong dataset label to each row.
dataset_labels = [
    re.match(r".+_(\w+)\.json", fname).group(1)
    for fname in metrics_camembert
]
camembert = (
    pd.concat(data_frames, keys=dataset_labels)
    .droplevel(1)                  # drop the inner per-frame row index added by concat
    .loc[["ref", "pero", "tess"]]  # Force index order to be ref - pero - tess
)
camembert

Unnamed: 0,test_loss,test_precision,test_recall,test_f1,test_accuracy,test_runtime,test_samples_per_second,test_steps_per_second
ref,0.165194,0.961339,0.965893,0.963611,0.951439,50.434,33.093,4.144
pero,0.238639,0.924242,0.924826,0.924534,0.933691,50.843,32.827,4.111
tess,0.274461,0.897312,0.912192,0.904691,0.923882,52.1782,31.987,4.006


In [6]:
# Pair each per-model table with the label used for the outer index level,
# then stack them into one frame indexed by (model, dataset).
labelled_frames = (
    ("camembert_pretrained", camembert_pretrained),
    ("camembert", camembert),
)
full_dataframe = pd.concat(
    [frame for _, frame in labelled_frames],
    keys=[label for label, _ in labelled_frames],
)
full_dataframe

Unnamed: 0,Unnamed: 1,test_loss,test_precision,test_recall,test_f1,test_accuracy,test_runtime,test_samples_per_second,test_steps_per_second
camembert_pretrained,ref,0.169713,0.965261,0.965261,0.965261,0.952012,51.1994,32.598,4.082
camembert_pretrained,pero,0.259704,0.932983,0.925616,0.929285,0.934323,51.2087,32.592,4.081
camembert_pretrained,tess,0.306922,0.902732,0.91314,0.907906,0.924237,52.3309,31.893,3.994
camembert,ref,0.165194,0.961339,0.965893,0.963611,0.951439,50.434,33.093,4.144
camembert,pero,0.238639,0.924242,0.924826,0.924534,0.933691,50.843,32.827,4.111
camembert,tess,0.274461,0.897312,0.912192,0.904691,0.923882,52.1782,31.987,4.006


In [7]:
# Keep only precision / recall / F1, put the metrics on the rows, relabel the
# model and dataset column levels for the paper, and express the scores as
# percentages.
latex_table = (
    full_dataframe[["test_precision", "test_recall", "test_f1"]]
    .T
    .rename(columns={"camembert_pretrained": "Camembert.pretrained", "camembert": "Camembert"})
    .rename(columns={"tess": "tesseract", "pero": "pero-ocr"})
    * 100.
)
latex_table

Unnamed: 0_level_0,Camembert.pretrained,Camembert.pretrained,Camembert.pretrained,Camembert,Camembert,Camembert
Unnamed: 0_level_1,ref,pero-ocr,tesseract,ref,pero-ocr,tesseract
test_precision,96.526133,93.298313,90.273224,96.133899,92.424242,89.731241
test_recall,96.526133,92.561592,91.313961,96.589294,92.482628,91.219204
test_f1,96.526133,92.928492,90.79061,96.361059,92.453426,90.469105


In [12]:
# Render the table as LaTeX source (one decimal place) and print it as plain
# text so it can be copied from the cell output.
latex_source = latex_table.to_latex(float_format="%.1f", multirow=True, caption="")
print(latex_source)

\begin{tabular}{lrrrrrr}
\toprule
{} & \multicolumn{3}{l}{Camembert.pretrained} & \multicolumn{3}{l}{Camembert} \\
{} &                  ref & pero-ocr & tesseract &       ref & pero-ocr & tesseract \\
\midrule
test\_precision &                 96.5 &     93.3 &      90.3 &      96.1 &     92.4 &      89.7 \\
test\_recall    &                 96.5 &     92.6 &      91.3 &      96.6 &     92.5 &      91.2 \\
test\_f1        &                 96.5 &     92.9 &      90.8 &      96.4 &     92.5 &      90.5 \\
\bottomrule
\end{tabular}

