In [31]:
import json
import os
from typing import List, Dict, Tuple
from collections import Counter

# Path of the predictions JSON file that check_submission() is run on when
# this file is executed as a script.
PRED_FILE = "predictions/subtask_a_test_ensemble_bert_spacy_dictfilter_submission.json"

def load_json(path: str):
    """Return the object decoded from the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [32]:
def load_json(path: str):
    """Open ``path`` as UTF-8 text and return its parsed JSON content."""
    with open(path, "r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed



def is_lowercase(term: str) -> bool:
    """Return True when lowercasing ``term`` leaves it unchanged.

    Strings without cased characters (digits, punctuation, the empty
    string) therefore count as lowercase.
    """
    lowered = term.lower()
    return lowered == term


def has_duplicates(terms: List[str]) -> bool:
    """Return True when at least one term occurs more than once in ``terms``.

    Comparison is exact: terms differing only in case or whitespace are
    treated as distinct.
    """
    already_seen = set()
    for term in terms:
        if term in already_seen:
            return True
        already_seen.add(term)
    return False


def contains_as_subspan(longer: str, shorter: str) -> bool:
    """
    Tell whether ``shorter`` is nested inside ``longer`` at token level.

    Both strings are split on whitespace; ``shorter`` is nested when its
    tokens appear as a contiguous run inside the tokens of ``longer``.
    A term is never nested in one with the same or a smaller number of
    tokens, so identical terms return False.
    Example:
      longer = "impianto di trattamento rifiuti"
      shorter = "trattamento rifiuti"  --> True
    """
    outer_tokens = longer.split()
    inner_tokens = shorter.split()
    width = len(inner_tokens)
    # Strictly fewer tokens are required for nesting.
    if len(outer_tokens) <= width:
        return False
    last_start = len(outer_tokens) - width
    return any(
        outer_tokens[start:start + width] == inner_tokens
        for start in range(last_start + 1)
    )


def find_nested_pairs(terms: List[str]) -> List[Tuple[str, str]]:
    """
    Return the list of (shorter, longer) pairs from ``terms`` in which
    ``shorter`` is nested inside ``longer`` (per contains_as_subspan),
    within the same sentence.

    Every ordered pair of distinct positions is examined, so a term that
    appears twice in ``terms`` can contribute repeated pairs.
    """
    return [
        (candidate_inner, candidate_outer)
        for inner_pos, candidate_inner in enumerate(terms)
        for outer_pos, candidate_outer in enumerate(terms)
        # Skip comparing a position with itself; first element is the
        # potential "shorter" term, second the potential "longer" one.
        if inner_pos != outer_pos
        and contains_as_subspan(candidate_outer, candidate_inner)
    ]



def _violation_example(idx: int, entry: Dict, field: str, value) -> Dict:
    """Build one report record locating a violating sentence, with its offending items under ``field``."""
    return {
        "index": idx,
        "doc": entry["document_id"],
        "paragraph_id": entry["paragraph_id"],
        "sentence_id": entry["sentence_id"],
        field: value,
    }


def check_submission(pred_path: str, max_examples: int = 5):
    """
    Validate a predictions file against the output constraints and print a report.

    Three checks are run on the ``term_list`` of every entry:
      1. every term is lowercase;
      2. no term is repeated within the same entry;
      3. no term is nested (as a contiguous token sub-span) in another
         term of the same entry.

    Args:
        pred_path: Path to the predictions JSON file. The loaded content is
            either a list of entries or a dict holding that list under "data".
        max_examples: Maximum number of example violations kept and printed
            for each check (defaults to 5).

    Returns:
        None. The result is written to stdout only.

    Raises:
        KeyError: If an entry selected as an example lacks "document_id",
            "paragraph_id" or "sentence_id".
    """
    print(f"Loading predictions from: {pred_path}")
    pred = load_json(pred_path)
    rows = pred["data"] if isinstance(pred, dict) and "data" in pred else pred

    stats = Counter()
    lower_violations_examples = []
    dup_violations_examples = []
    nested_violations_examples = []

    for idx, entry in enumerate(rows):
        # A missing or null term_list is treated as an empty list.
        terms: List[str] = entry.get("term_list", []) or []

        # 1) check lowercase
        not_lower = [t for t in terms if not is_lowercase(t)]
        if not_lower:
            stats["sentences_not_lower"] += 1
            stats["terms_not_lower"] += len(not_lower)
            if len(lower_violations_examples) < max_examples:
                lower_violations_examples.append(
                    _violation_example(idx, entry, "bad_terms", not_lower)
                )

        # 2) check duplicates
        if has_duplicates(terms):
            # Collect each repeated term once, in order of its first repetition.
            seen = set()
            dups = []
            for t in terms:
                if t in seen and t not in dups:
                    dups.append(t)
                seen.add(t)
            # This per-sentence counter feeds both the report line and the
            # final summary; without it duplicates were never flagged.
            stats["sentences_with_duplicates"] += 1
            stats["duplicate_terms_total"] += len(dups)
            if len(dup_violations_examples) < max_examples:
                dup_violations_examples.append(
                    _violation_example(idx, entry, "duplicate_terms", dups)
                )

        # 3) check nested terms
        nested_pairs = find_nested_pairs(terms)
        if nested_pairs:
            stats["sentences_with_nested"] += 1
            stats["nested_pairs_total"] += len(nested_pairs)
            if len(nested_violations_examples) < max_examples:
                nested_violations_examples.append(
                    _violation_example(idx, entry, "nested_pairs", nested_pairs)
                )

        stats["total_sentences"] += 1
        stats["total_terms"] += len(terms)

    # ========= REPORT =========
    # Counter returns 0 for keys that were never incremented, so every
    # statistic can be read directly even when no violation occurred.
    print("\n=== SUBMISSION CONSTRAINTS CHECK ===")
    print(f"Total sentences: {stats['total_sentences']}")
    print(f"Total terms:     {stats['total_terms']}")

    print("\n1) LOWERCASE CHECK")
    print(f"  Sentences with non-lowercase terms: {stats['sentences_not_lower']}")
    print(f"  Total non-lowercase terms:          {stats['terms_not_lower']}")
    if lower_violations_examples:
        print("  Examples:")
        for ex in lower_violations_examples:
            print(f"    - idx={ex['index']} ({ex['doc']}, p{ex['paragraph_id']}, s{ex['sentence_id']}): {ex['bad_terms']}")

    print("\n2) DUPLICATES CHECK")
    print(f"  Sentences with duplicate terms: {stats['sentences_with_duplicates']}")
    print(f"  Total duplicate term types:     {stats['duplicate_terms_total']}")
    if dup_violations_examples:
        print("  Examples:")
        for ex in dup_violations_examples:
            print(f"    - idx={ex['index']} ({ex['doc']}, p{ex['paragraph_id']}, s{ex['sentence_id']}): {ex['duplicate_terms']}")

    print("\n3) NESTED TERMS CHECK")
    print(f"  Sentences with nested pairs: {stats['sentences_with_nested']}")
    print(f"  Total nested pairs:          {stats['nested_pairs_total']}")
    if nested_violations_examples:
        print("  Examples:")
        for ex in nested_violations_examples:
            print(f"    - idx={ex['index']} ({ex['doc']}, p{ex['paragraph_id']}, s{ex['sentence_id']}):")
            for shorter, longer in ex["nested_pairs"]:
                print(f"        '{shorter}' ⊂ '{longer}'")

    print("\n=== SUMMARY ===")
    if stats["sentences_not_lower"] == 0 and stats["sentences_with_duplicates"] == 0 and stats["sentences_with_nested"] == 0:
        print("The file satisfies all output constraints.")
    else:
        print("There are violations of the constraints above. Fix before submission.")


# ========= RUN =========

# When executed as a script (not imported), validate the default predictions file.
if __name__ == "__main__":
    check_submission(PRED_FILE)


Loading predictions from: predictions/subtask_a_test_ensemble_bert_spacy_dictfilter_submission.json

=== SUBMISSION CONSTRAINTS CHECK ===
Total sentences: 1142
Total terms:     630

1) LOWERCASE CHECK
  Sentences with non-lowercase terms: 0
  Total non-lowercase terms:          0

2) DUPLICATES CHECK
  Sentences with duplicate terms: 0
  Total duplicate term types:     0

3) NESTED TERMS CHECK
  Sentences with nested pairs: 0
  Total nested pairs:          0

=== SUMMARY ===
The file satisfies all output constraints.
