In [1]:
import json
import os
from typing import List

# ====== CONFIG: change these paths as needed ======
# INPUT_FILE is read by main(); OUTPUT_FILE is where the cleaned result is written.
INPUT_FILE  = "../predictions/subtask_a_dev_final_SM_cleaned.json"
OUTPUT_FILE = "../predictions/subtask_a_dev_final_SM_submission.json"

def load_json(path: str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_json(obj, path: str):
    """Write ``obj`` to ``path`` as indented UTF-8 JSON and print a confirmation.

    The parent directory of ``path`` is created if it does not exist; a path
    with no directory component is written relative to the current directory.
    Non-ASCII characters are written as-is rather than escaped.
    """
    target_dir = os.path.dirname(path)
    # dirname() is "" for a bare filename; makedirs("") would raise.
    os.makedirs(target_dir if target_dir else ".", exist_ok=True)
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(obj, handle, indent=2, ensure_ascii=False)
    print(f"✓ Saved cleaned (no-nested) predictions to {path}")


def contains_as_subspan(longer: str, shorter: str) -> bool:
    """Tell whether ``shorter`` is nested inside ``longer`` at token level.

    Both strings are split on whitespace. ``shorter`` counts as nested when it
    has strictly fewer tokens than ``longer`` and its tokens occur as one
    contiguous run inside ``longer``'s tokens. Matching is exact per token
    (case-sensitive, no partial-word matches).

    Note: a ``shorter`` with no tokens is reported as nested in any
    non-empty ``longer``, because the empty run matches at every position.
    """
    haystack = longer.split()
    needle = shorter.split()
    width = len(needle)

    # A candidate with as many (or more) tokens can never be strictly nested.
    if len(haystack) <= width:
        return False

    last_start = len(haystack) - width
    return any(
        haystack[start:start + width] == needle
        for start in range(last_start + 1)
    )


def remove_nested_terms_in_sentence(terms: List[str]) -> List[str]:
    """Return ``terms`` without the entries that are nested in another entry.

    A term is dropped when ``contains_as_subspan(other, term)`` is true for
    at least one other element of the list (compared by position, so a term
    is never tested against itself). Surviving terms keep their original
    order; every occurrence of a dropped string is removed.
    """
    nested = {
        candidate
        for idx, candidate in enumerate(terms)
        # any() stops at the first containing term, like an early break.
        if any(
            contains_as_subspan(other, candidate)
            for other_idx, other in enumerate(terms)
            if other_idx != idx
        )
    }
    return [term for term in terms if term not in nested]


def _count_nested_pairs(rows) -> int:
    """Count ordered term pairs, per entry, where one term is nested in another.

    For every entry in ``rows`` the ``"term_list"`` value (treated as empty
    when missing or falsy) is scanned, and each ordered pair of distinct
    positions ``(i, j)`` with ``terms[i]`` nested in ``terms[j]`` adds one.
    """
    total = 0
    for entry in rows:
        terms = entry.get("term_list", []) or []
        total += sum(
            1
            for i, shorter in enumerate(terms)
            for j, longer in enumerate(terms)
            if i != j and contains_as_subspan(longer, shorter)
        )
    return total


def main():
    """Strip nested terms from every entry of INPUT_FILE and write OUTPUT_FILE.

    The input may be either a list of entries or a dict holding that list
    under the ``"data"`` key; the output keeps the same outer shape. The
    number of nested pairs is printed before and after cleaning as a check.
    """
    data = load_json(INPUT_FILE)
    # Accept both a bare list of entries and a {"data": [...]} wrapper.
    rows = data["data"] if isinstance(data, dict) and "data" in data else data

    print(f"Nested pairs BEFORE cleaning (approx): {_count_nested_pairs(rows)}")

    # Entries are updated in place, so ``data`` (which is ``rows`` itself or
    # the dict that holds it) already reflects the cleaned lists when saved.
    for entry in rows:
        terms = entry.get("term_list", []) or []
        entry["term_list"] = remove_nested_terms_in_sentence(terms)

    save_json(data, OUTPUT_FILE)

    print(f"Nested pairs AFTER cleaning (approx): {_count_nested_pairs(rows)}")


# Run the cleaning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Nested pairs BEFORE cleaning (approx): 1
✓ Saved cleaned (no-nested) predictions to ../predictions/subtask_a_dev_final_SM_submission.json
Nested pairs AFTER cleaning (approx): 0
