In [None]:
from json import load
from typing import List, Dict
import re
import unicodedata
from sklearn.metrics import precision_score, recall_score, f1_score
from strsimpy.levenshtein import Levenshtein
from pathlib import Path
from glob import glob
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Folder containing the ground-truth metadata JSON files.
ground_truth_dir = "../../data/metadata_extraction_data/metadata"

# Prediction folder of every evaluated system, keyed by the model label that
# is later used as a column name / plot hue.
hyps = dict(
    grobid_dl="../../data/metadata_extraction_data/grobid_dl_metadata",
    grobid_crf="../../data/metadata_extraction_data/grobid_crf_metadata",
    gpt_oss="../../data/metadata_extraction_data/gpt_oss_metadata",
    phi4_mini="../../data/metadata_extraction_data/phi4mini_metadata",
    qwen3b="../../data/metadata_extraction_data/qwen3b_metadata",
    qwen4b="../../data/metadata_extraction_data/qwen4b_metadata",
    llama3b="../../data/metadata_extraction_data/llama3b_metadata",
)

In [None]:
def _strip_controls(text: str) -> str:
    """Drop all Unicode control characters (category C*)."""
    return "".join(c for c in text if unicodedata.category(c)[0] != "C")

def _fix_accents(text: str) -> str:
    """
    Remove diacritics (accents) but keep other characters unchanged.
    """
    return "".join(
        c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn"
    )

def _clean_item(text: str) -> str:
    """
    Normalise a string for comparison.

    Lower-cases the text, strips accents, removes control characters and
    punctuation, collapses every run of whitespace into a single space and
    trims leading/trailing whitespace.
    """
    text = _fix_accents(text.lower())
    # Tabs and line breaks belong to Unicode category Cc, so `_strip_controls`
    # would delete them and glue the neighbouring words together
    # ("a\nb" -> "ab"). Turn every whitespace run into a plain space first so
    # word boundaries survive the control-character removal.
    text = re.sub(r"\s+", " ", text)
    text = _strip_controls(text)
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r"[^\w\s]", "", text)
    # Removing punctuation can leave double spaces behind; collapse again.
    return re.sub(r"\s+", " ", text).strip()

def levenshtein_distance(s0, s1):
    """Return the distance between `s0` and `s1` as computed by strsimpy's `Levenshtein`."""
    metric = Levenshtein()
    return metric.distance(s0, s1)

def cosine_sim(a: str, b: str) -> float:
    """
    TF-IDF cosine similarity between two texts, rounded to two decimals.

    Edge cases:
    * both texts empty          -> 1.0 (nothing expected, nothing predicted)
    * exactly one text empty    -> 0.0 (same convention as `f1_on_sets`)
    * no usable vocabulary      -> 1.0 if the texts are identical, else 0.0
    """
    if not a and not b:
        return 1.0
    if not a or not b:
        # A missing prediction (or a spurious one) must not score as perfect.
        return 0.0
    try:
        tfidf = TfidfVectorizer(stop_words="english").fit_transform([a, b])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when no token
        # is left in either text, e.g. only stop words or one-letter tokens.
        return float(a == b)
    return round(float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0, 0]), 2)

def f1_on_sets(true_set, pred_set):
    """
    Precision, recall and F1 of `pred_set` measured against `true_set`.

    Both arguments are treated as sets of hashable elements (any iterable is
    accepted; None counts as empty). For a single pair of sets this is what
    the sample-averaged multi-label metrics reduce to:

        precision = |true & pred| / |pred|
        recall    = |true & pred| / |true|
        f1        = 2 * |true & pred| / (|true| + |pred|)

    Returns
    -------
    tuple of float
        (precision, recall, f1), each rounded to two decimals.
        (1.0, 1.0, 1.0) when both sets are empty, (0.0, 0.0, 0.0) when
        exactly one of them is empty.
    """
    true_items = set(true_set or ())
    pred_items = set(pred_set or ())

    if not true_items and not pred_items:
        return 1.0, 1.0, 1.0
    if not true_items or not pred_items:
        return 0.0, 0.0, 0.0

    hits = len(true_items & pred_items)
    precision = hits / len(pred_items)
    recall = hits / len(true_items)
    # Equivalent to 2PR / (P + R) but well defined when hits == 0.
    f1 = 2 * hits / (len(true_items) + len(pred_items))

    return round(precision, 2), round(recall, 2), round(f1, 2)

In [None]:
def _field_text(record: Dict, field: str) -> str:
    """Normalised text of a scalar field; a missing, None or empty value gives ""."""
    value = record.get(field)
    return _clean_item(value) if value else ""


def _field_tokens(record: Dict, field: str) -> set:
    """Unique normalised whitespace-separated tokens over all items of a list field."""
    items = record.get(field) or []
    if isinstance(items, str):
        # A bare string is one item, not a sequence of characters.
        items = [items]
    return {token for item in items if item for token in _clean_item(item).split()}


def evaluate_record(
    y_true: Dict, y_predicted: Dict, is_grobid: bool = False
) -> List[Dict[str, object]]:
    """
    Score one predicted metadata record against its ground truth.

    Parameters
    ----------
    y_true, y_predicted:
        Metadata dicts. Fields that are missing or None are treated as empty.
    is_grobid:
        When True the "email_ids" row always gets score 0, whatever the
        overlap between ground truth and prediction.

    Returns
    -------
    One row (dict with keys field, metric, score, ground_truth, predicted)
    per field:
    * title, doi, publication_date, publisher -> "levenshtein_distance"
    * abstract                                -> "cosine_similarity"
    * authors, affiliations, keywords, email_ids -> "F1_set", computed on the
      sets of normalised tokens of all list items.
    """
    rows: List[Dict[str, object]] = []

    def add_row(field, metric, score, ground_truth, predicted):
        rows.append(
            dict(
                field=field,
                metric=metric,
                score=score,
                ground_truth=ground_truth,
                predicted=predicted,
            )
        )

    for field in ("title", "doi", "publication_date", "publisher"):
        gt = _field_text(y_true, field)
        pd_ = _field_text(y_predicted, field)
        add_row(field, "levenshtein_distance", float(levenshtein_distance(gt, pd_)), gt, pd_)

    for field in ("abstract",):
        gt = _field_text(y_true, field)
        pd_ = _field_text(y_predicted, field)
        add_row(field, "cosine_similarity", cosine_sim(gt, pd_), gt, pd_)

    for field in ("authors", "affiliations", "keywords", "email_ids"):
        true_set = _field_tokens(y_true, field)
        pred_set = _field_tokens(y_predicted, field)
        if is_grobid and field == "email_ids":
            score = 0
        else:
            _, _, score = f1_on_sets(true_set, pred_set)
        # Sort before joining: set iteration order is not reproducible
        # between runs, the reported strings should be.
        add_row(
            field,
            "F1_set",
            score,
            "; ".join(sorted(true_set)),
            "; ".join(sorted(pred_set)),
        )

    return rows


In [None]:
# Ground-truth files. glob returns matches in arbitrary order, so sort them to
# get the same record order on every run and every machine.
files = sorted(glob(str(Path(ground_truth_dir) / "*.json")))

# Parse every ground-truth file once, instead of once per evaluated model.
ground_truths = {}
for ground_truth_file in files:
    with open(ground_truth_file, encoding="utf-8") as f:
        ground_truths[Path(ground_truth_file).stem] = load(f)

# model label -> list of per-record results (each a list of row dicts).
all_models = {}
for model, pred_dir in hyps.items():
    is_grobid = model.startswith("grobid")
    all_metrics = []
    for name, y in ground_truths.items():
        # The prediction file carries the same file name as its ground truth.
        pred_file = Path(pred_dir) / f"{name}.json"
        with open(pred_file, encoding="utf-8") as f:
            y_pred = load(f)
        all_metrics.append(evaluate_record(y, y_pred, is_grobid))
    all_models[model] = all_metrics

In [None]:
import pandas as pd
from statistics import mean

# Summary table: one row per metadata field, one score column per model.
big_df = pd.DataFrame(
    columns=[
        "fields", "metric",
        "grobid_dl", "grobid_crf", "gpt_oss", "phi4_mini", "qwen3b", "qwen4b", "llama3b",
    ]
)
big_df["fields"] = [
    "title", "doi", "publication_date", "publisher",
    "abstract",
    "authors", "affiliations", "keywords", "email_ids",
]
big_df["metric"] = [
    "Levenshtein Dist", "Levenshtein Dist", "Levenshtein Dist", "Levenshtein Dist",
    "Cosine Sim",
    "F1 Score", "F1 Score", "F1 Score", "F1 Score",
]

for model, all_metrics in all_models.items():
    # Collect every per-record score under the name of its field.
    scores_by_field = {}
    for record in all_metrics:
        for entry in record:
            scores_by_field.setdefault(entry["field"], []).append(entry["score"])

    # Mean score per field for this model.
    avg_scores = {}
    for field, vals in scores_by_field.items():
        avg_scores[field] = mean(vals)

    # Write the means into this model's column; a field without scores gets 0.
    for field in big_df["fields"]:
        row_mask = big_df["fields"] == field
        big_df.loc[row_mask, model] = avg_scores.get(field, 0)

    # Per-model view of the same averages, sorted by field name.
    avg_df = (
        pd.DataFrame(
            {
                "field": list(avg_scores.keys()),
                "average_score": list(avg_scores.values()),
            }
        )
        .sort_values("field")
        .reset_index(drop=True)
    )

print(big_df.to_markdown(index=False))

In [None]:
def plot_again(results):
    """
    Save three figures showing the distribution of per-record scores.

    `results` maps a model name to a list of records; every record is a list
    of dicts that provide at least the keys "field" and "score".

    Files written under ../../data/outputs:
    * group1_boxplot.png - box plot of title/doi/publication_date/publisher
      scores (y axis limited to 0..50)
    * group2_boxplot.png - KDE of the abstract scores, one curve per model
    * group3_boxplot.png - box plot of authors/affiliations/keywords scores
    """
    levenshtein_fields = ["title", "doi", "publication_date", "publisher"]
    cosine_fields = ["abstract"]
    f1_fields = ["authors", "affiliations", "keywords"]

    # Flatten model -> records -> rows into one long table.
    df = pd.DataFrame(
        [
            {"model": model_name, "field": item["field"], "score": item["score"]}
            for model_name, model_data in results.items()
            for entry in model_data
            for item in entry
        ]
    )

    plt.style.use('seaborn-v0_8')

    # Figure 1: box plot of the Levenshtein-scored fields.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(
        x="field", y="score", hue="model",
        data=df[df["field"].isin(levenshtein_fields)], ax=ax,
    )
    ax.set_title("Distribution of Levenshtein Distance for different Models")
    ax.set_xlabel("")
    ax.set_ylabel("Levenshtein Distance")
    ax.set_ylim(0, 50)
    fig.tight_layout()
    fig.savefig("../../data/outputs/group1_boxplot.png")
    plt.close(fig)

    # Figure 2: one KDE curve per model for the abstract scores.
    fig, ax = plt.subplots(figsize=(10, 6))
    for model_name in results:
        is_selected = (df["model"] == model_name) & (df["field"].isin(cosine_fields))
        sns.kdeplot(df[is_selected]["score"], label=model_name, fill=True, ax=ax)
    ax.set_title("Distribution of Cosine Similarity for different Models")
    ax.legend()
    fig.tight_layout()
    ax.set_xlabel("Cosine Similarity of Abstract")
    fig.savefig("../../data/outputs/group2_boxplot.png")
    plt.close(fig)

    # Figure 3: box plot of the F1-scored fields.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(
        x="field", y="score", hue="model",
        data=df[df["field"].isin(f1_fields)], ax=ax,
    )
    ax.set_title("Distribution of F1 Score for different Models")
    fig.tight_layout()
    ax.set_xlabel("")
    ax.set_ylabel("F1 Score")
    fig.savefig("../../data/outputs/group3_boxplot.png")
    plt.close(fig)

    print("Plots generated:")


In [None]:
# Leave grobid_crf and qwen4b out of the distribution plots. `pop` with a
# default (instead of `del`) keeps this cell re-runnable: once the entries are
# gone a second execution no longer raises KeyError.
for excluded_model in ("grobid_crf", "qwen4b"):
    all_models.pop(excluded_model, None)
plot_again(all_models)

In [None]:
# Alias for the summary table.
df = big_df

# Model labels (grobid_crf and qwen4b are not part of this list).
models = [
    "grobid_dl",
    "gpt_oss",
    "phi4_mini",
    "qwen3b",
    "llama3b",
]

def plot_cosine_and_f1(df, models=None):
    """
    Save a grouped bar chart of the average cosine-similarity and F1 scores.

    Parameters
    ----------
    df:
        Table with a "fields" column, a "metric" column and one score column
        per model. Only rows whose metric is "F1 Score" or "Cosine Sim" are
        plotted.
    models:
        Names of the score columns to plot. Defaults to
        grobid_dl, gpt_oss, phi4_mini, qwen3b and llama3b.

    The figure is written to ../../data/outputs/f1_bar.png (300 dpi).
    """
    if models is None:
        models = ["grobid_dl", "gpt_oss", "phi4_mini", "qwen3b", "llama3b"]
    else:
        models = list(models)

    # Long format: one row per (field, model) pair.
    subset = (
        df[df["metric"].isin(["F1 Score", "Cosine Sim"])]
        .melt(
            id_vars=["fields", "metric"],
            value_vars=models,
            var_name="model",
            value_name="score",
        )
        # The score columns can be of object dtype (e.g. when the table was
        # filled cell by cell); make them numeric before plotting.
        .assign(score=lambda frame: pd.to_numeric(frame["score"]))
    )

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(
        data=subset,
        x="fields",
        y="score",
        hue="model"
    )
    ax.set_title("Average Score (Cosine Similarity and F1 Score) for different Models")
    ax.set_ylabel("Score (higher is better)")
    ax.set_xlabel("")
    plt.legend(title="Models")
    plt.tight_layout()
    plt.savefig("../../data/outputs/f1_bar.png", dpi=300)
    plt.close()

def plot_levenshtein(df, models=None):
    """
    Save a grouped bar chart of the average Levenshtein distances.

    Parameters
    ----------
    df:
        Table with a "fields" column, a "metric" column and one score column
        per model. Only rows whose metric is "Levenshtein Dist" are plotted.
    models:
        Names of the score columns to plot. Defaults to
        grobid_dl, gpt_oss, phi4_mini, qwen3b and llama3b.

    The figure is written to ../../data/outputs/levenshtein_bar.png (300 dpi).
    """
    if models is None:
        models = ["grobid_dl", "gpt_oss", "phi4_mini", "qwen3b", "llama3b"]
    else:
        models = list(models)

    # Long format: one row per (field, model) pair.
    melted = (
        df[df["metric"] == "Levenshtein Dist"]
        .melt(
            id_vars=["fields", "metric"],
            value_vars=models,
            var_name="model",
            value_name="score",
        )
        # The score columns can be of object dtype (e.g. when the table was
        # filled cell by cell); make them numeric before plotting.
        .assign(score=lambda frame: pd.to_numeric(frame["score"]))
    )

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=melted,
        x="fields",
        y="score",
        hue="model"
    )
    ax.set_title("Average Score (Levenshtein Distance) for different Models")
    ax.set_ylabel("Score (lower is better)")
    ax.set_xlabel("")
    plt.legend(title="Models")
    plt.tight_layout()
    plt.savefig("../../data/outputs/levenshtein_bar.png", dpi=300)
    plt.close()

# Render both summary bar charts from the aggregated table, Levenshtein first.
for make_chart in (plot_levenshtein, plot_cosine_and_f1):
    make_chart(big_df)