In [35]:
import pandas as pd
import numpy as np
from scipy import stats

In [36]:
# Load the CSV files
type = 'pdf_images' #pdf_images or pdf
df_pdf = pd.read_csv(f'results/extractor_comparison_{type}.csv')

print("Loaded PDF extraction results:")
print(f"  - PDF files: {len(df_pdf)} documents")
print()


Loaded PDF extraction results:
  - PDF files: 20 documents



In [37]:
df_pdf

Unnamed: 0,pdf_file,reference_chars,tika_time,tika_chars,tika_chrf,tika_chrfPlusPlus,tika_levenshteinRatio,tika_charDiff,tika_charDiffPct,docling_time,...,docling_levenshteinRatio,docling_charDiff,docling_charDiffPct,pypdfloader_time,pypdfloader_chars,pypdfloader_chrf,pypdfloader_chrfPlusPlus,pypdfloader_levenshteinRatio,pypdfloader_charDiff,pypdfloader_charDiffPct
0,0_umělá_inteligence.pdf,30536,0.1643,88,0.0694,0.0595,0.0056,-30448,-99.7118,130.7653,...,0.7897,-4504,-14.7498,0.0638,0,0.0,0.0,0.0,0,0.0
1,10_neuronová_síť.pdf,27210,0.2257,77,0.0654,0.0561,0.0056,-27133,-99.717,101.6508,...,0.7405,-6203,-22.7968,0.0099,0,0.0,0.0,0.0,0,0.0
2,11_skrytý_markovův_model.pdf,6878,0.054,63,0.3413,0.2925,0.0173,-6815,-99.084,31.7969,...,0.7718,-1105,-16.0657,0.0048,0,0.0,0.0,0.0,0,0.0
3,12_získávání_informací.pdf,12629,0.1058,71,0.1944,0.1666,0.0109,-12558,-99.4378,62.852,...,0.8055,-1262,-9.9929,0.0056,0,0.0,0.0,0.0,0,0.0
4,13_dolování_z_textů.pdf,3575,0.0476,56,0.5543,0.4751,0.0286,-3519,-98.4336,21.445,...,0.7265,-408,-11.4126,0.0021,0,0.0,0.0,0.0,0,0.0
5,14_viterbiho_algoritmus.pdf,18815,0.1007,76,0.1447,0.124,0.008,-18739,-99.5961,70.1015,...,0.6959,-5241,-27.8554,0.0073,0,0.0,0.0,0.0,0,0.0
6,15_učení_bez_učitele.pdf,3234,0.0813,57,0.6566,0.5628,0.0316,-3177,-98.2375,19.3297,...,0.8254,50,1.5461,0.0023,0,0.0,0.0,0.0,0,0.0
7,16_shluková_analýza.pdf,12399,0.0697,66,0.167,0.1432,0.0104,-12333,-99.4677,52.1538,...,0.7589,-2019,-16.2836,0.0057,0,0.0,0.0,0.0,0,0.0
8,17_lemmatizace.pdf,4755,0.0362,53,0.2784,0.2386,0.0216,-4702,-98.8854,24.6314,...,0.7857,-341,-7.1714,0.0026,0,0.0,0.0,0.0,0,0.0
9,18_jazykový_korpus.pdf,11733,0.0603,63,0.1597,0.1369,0.0103,-11670,-99.4631,48.4602,...,0.756,-2255,-19.2193,0.0043,0,0.0,0.0,0.0,0,0.0


In [38]:
list(df_pdf.columns)[2:]

['tika_time',
 'tika_chars',
 'tika_chrf',
 'tika_chrfPlusPlus',
 'tika_levenshteinRatio',
 'tika_charDiff',
 'tika_charDiffPct',
 'docling_time',
 'docling_chars',
 'docling_chrf',
 'docling_chrfPlusPlus',
 'docling_levenshteinRatio',
 'docling_charDiff',
 'docling_charDiffPct',
 'pypdfloader_time',
 'pypdfloader_chars',
 'pypdfloader_chrf',
 'pypdfloader_chrfPlusPlus',
 'pypdfloader_levenshteinRatio',
 'pypdfloader_charDiff',
 'pypdfloader_charDiffPct']

In [39]:
metric_cols = list(df_pdf.columns)[2:]

df = df_pdf[metric_cols]

# ---------- 2. MEAN ----------
mean_vals = df.mean()

# ---------- 3. 95% CONFIDENCE INTERVAL ----------
ci = {}
n = len(df)

for col in df.columns:
    std = df[col].std(ddof=1)
    se = std / np.sqrt(n)
    h = stats.t.ppf(0.975, n - 1) * se
    ci[col] = h

ci = pd.Series(ci)

# ---------- 4. FORMAT: MEAN ± CI ----------
summary = pd.DataFrame({
    "mean": mean_vals,
    "ci_95": ci
})

summary["formatted"] = summary["mean"].round(4).astype(str) + " ± " + summary["ci_95"].round(4).astype(str)

# ---------- 5. SPLIT TOOL / METRIC ----------
summary = summary.reset_index()
summary.columns = ["full_name", "mean", "ci_95", "formatted"]

summary[["tool", "metric"]] = summary["full_name"].str.rsplit("_", n=1, expand=True)

# ---------- 6. PIVOT TO FINAL TABLE ----------
final_table = summary.pivot(index="tool", columns="metric", values="formatted")
final_table.sort_values(by=["chrfPlusPlus"], ascending=False, inplace=True)
display(final_table[['chrfPlusPlus','chrf', 'levenshteinRatio', 'time', 'charDiff']])

metric,chrfPlusPlus,chrf,levenshteinRatio,time,charDiff
tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
docling,57.1072 ± 3.0306,62.5071 ± 3.0678,0.7683 ± 0.0223,53.365 ± 15.472,-1701.2 ± 968.7355
tika,0.2635 ± 0.1224,0.3075 ± 0.1428,0.0192 ± 0.0066,0.0729 ± 0.0228,-10502.9 ± 3800.356
pypdfloader,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0077 ± 0.0063,0.0 ± 0.0


In [40]:
final_table

metric,charDiff,charDiffPct,chars,chrf,chrfPlusPlus,levenshteinRatio,time
tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
docling,-1701.2 ± 968.7355,-10.3683 ± 6.0494,8863.4 ± 2992.2122,62.5071 ± 3.0678,57.1072 ± 3.0306,0.7683 ± 0.0223,53.365 ± 15.472
tika,-10502.9 ± 3800.356,-98.9822 ± 0.3618,61.7 ± 5.0065,0.3075 ± 0.1428,0.2635 ± 0.1224,0.0192 ± 0.0066,0.0729 ± 0.0228
pypdfloader,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0077 ± 0.0063


In [34]:
latex_table = final_table[['chrfPlusPlus', 'chrf', 'levenshteinRatio', 'time', 'charDiff']] \
    .to_latex(
        index=True,
        float_format="%.4f",
        caption=f"Evaluation metrics and runtime statistics on {type}",
        label=f"tab:evaluation_metrics_{type}"
    )

print(latex_table)


\begin{table}
\caption{Evaluation metrics and runtime statistics on pdf_images}
\label{tab:evaluation_metrics_pdf_images}
\begin{tabular}{llllll}
\toprule
metric & chrfPlusPlus & chrf & levenshteinRatio & time & charDiff \\
tool &  &  &  &  &  \\
\midrule
docling & 57.1072 ± 3.0306 & 62.5071 ± 3.0678 & 0.7683 ± 0.0223 & 53.365 ± 15.472 & -1701.2 ± 968.7355 \\
tika & 0.2635 ± 0.1224 & 0.3075 ± 0.1428 & 0.0192 ± 0.0066 & 0.0729 ± 0.0228 & -10502.9 ± 3800.356 \\
pypdfloader & 0.0 ± 0.0 & 0.0 ± 0.0 & 0.0 ± 0.0 & 0.0077 ± 0.0063 & 0.0 ± 0.0 \\
\bottomrule
\end{tabular}
\end{table}

