In [None]:
import os

# Root directory that holds the WSD corpus as `.jsonl` files.
# The location is machine-specific, so it can be overridden without editing the
# notebook by exporting WSD_DATASET_PATH before starting the kernel; the
# original hard-coded location remains the default.
wsd_dataset_path = os.environ.get("WSD_DATASET_PATH", "/mnt/data2/data/datasets/wsd")

In [None]:
import json
import os
from pathlib import Path
from typing import Iterator, Dict, List


class JsonlFileReader:
    """
    Iterator over a single JSON Lines file.

    Parameters
    ----------
    file_path : str or Path
        Path to the `.jsonl` file to read.

    Usage
    -----
    >>> reader = JsonlFileReader("example.jsonl")
    >>> for record in reader:
    ...     print(record)
    """

    def __init__(self, file_path: str | os.PathLike):
        self.file_path = Path(file_path)
        if not self.file_path.is_file():
            raise FileNotFoundError(f"File not found: {self.file_path}")

    def __iter__(self) -> Iterator[Dict]:
        """Yield one parsed JSON dict per line."""
        with self.file_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # skip empty lines
                    try:
                        yield json.loads(line)
                    except json.JSONDecodeError as exc:
                        raise ValueError(
                            f"Invalid JSON in {self.file_path} on line {f.tell()}"
                        ) from exc


def list_jsonl_files(directory: str | os.PathLike) -> List[Path]:
    """Return a list of all `.jsonl` files in the given directory."""
    p = Path(directory)
    if not p.is_dir():
        raise NotADirectoryError(f"Directory not found: {p}")
    return list(p.glob("*.jsonl"))

In [52]:
# ---------------------------------------------------------------------------
# Updated Text class – defensive against missing keys in the JSONL record
# --------------------------------------------------------------------------- #
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List


@dataclass
class AnnotationLayer:
    """
    List-like wrapper around the entries of one annotation layer.

    `items` holds the raw dictionaries; the instance supports iteration and
    `len()`, and has a compact repr that shows only the entry count.
    """

    items: List[Dict[str, Any]] = field(default_factory=list)

    def __iter__(self) -> Iterable[Dict[str, Any]]:
        """Iterate over the entries in stored order."""
        return iter(self.items)

    def __len__(self) -> int:
        """Return the number of entries in the layer."""
        return len(self.items)

    def __repr__(self) -> str:
        """Return `<ClassName>(<n> items)` instead of dumping every entry."""
        return f"{self.__class__.__name__}({len(self.items)} items)"


# NOTE: each subclass is declared with `repr=False`. A plain `@dataclass`
# generates a fresh `__repr__` whenever the decorated class does not define
# one in its own body, which would shadow the compact repr inherited from
# AnnotationLayer and dump every item instead.
@dataclass(repr=False)
class TokensLayer(AnnotationLayer):
    """Annotation layer holding token entries."""


@dataclass(repr=False)
class WsdLayer(AnnotationLayer):
    """Annotation layer holding word-sense-disambiguation entries."""


@dataclass(repr=False)
class PhrasesLayer(AnnotationLayer):
    """Annotation layer holding phrase entries."""


@dataclass
class Text:
    """
    One text record together with its annotation layers.

    Attributes
    ----------
    context_file : str
        Value of the record's "context_file" key ("unknown" when absent).
    raw_text : str
        Value of the record's "text" key.
    tokens, wsd, phrases : AnnotationLayer subclasses
        Entries of the record's "tokens", "wsd" and "phrases" keys.

    Token and WSD entries are matched through their "index" key, phrase
    entries through their "head" key. Entries that lack the relevant key
    never match a lookup.
    """

    context_file: str
    raw_text: str
    tokens: TokensLayer = field(default_factory=TokensLayer)
    wsd: WsdLayer = field(default_factory=WsdLayer)
    phrases: PhrasesLayer = field(default_factory=PhrasesLayer)

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "Text":
        """
        Build a Text instance from a dictionary produced by a JSONL line.

        All keys are fetched with .get() to avoid KeyError if a field is
        missing in some records. A key that is present but holds `None`
        (JSON `null`) is treated like a missing text / empty layer, so that
        `len()` and iteration keep working on the result.
        """
        return cls(
            context_file=d.get("context_file", "unknown"),
            raw_text=d.get("text") or "",
            tokens=TokensLayer(d.get("tokens") or []),
            wsd=WsdLayer(d.get("wsd") or []),
            phrases=PhrasesLayer(d.get("phrases") or []),
        )

    def __repr__(self) -> str:
        return (
            f"Text(file={self.context_file!r}, len={len(self.raw_text)} "
            f"tokens={len(self.tokens)} wsd={len(self.wsd)} phrases={len(self.phrases)})"
        )

    @staticmethod
    def _first_match(
        entries: Iterable[Dict[str, Any]], key: str, value: Any
    ) -> "Dict[str, Any] | None":
        """Return the first entry whose `key` equals `value`, else None."""
        return next((e for e in entries if key in e and e[key] == value), None)

    def _context_at(self, center_pos: int, window: int) -> Dict[str, Any]:
        """Build the context dictionary around the token at list position `center_pos`."""
        # A negative window would make the right-hand slice end at a negative
        # offset and wrap around from the end of the list; treat it as 0.
        window = max(0, window)
        items = self.tokens.items
        center_token = items[center_pos]
        left_tokens = items[max(0, center_pos - window) : center_pos]
        right_tokens = items[center_pos + 1 : center_pos + 1 + window]
        context_text = " ".join(
            tok["orth"] for tok in [*left_tokens, center_token, *right_tokens]
        )
        return {
            "center": center_token,
            "left": left_tokens,
            "right": right_tokens,
            "text": context_text,
        }

    def get_annotation(self, index: int) -> Dict[str, Any]:
        """
        Return the token with the given "index" plus its WSD and phrase entries.

        Returns
        -------
        dict
            {"token": ..., "wsd": ..., "phrase": ...}; "wsd" and "phrase" are
            None when no entry refers to `index`.

        Raises
        ------
        ValueError
            If no token has the given index.
        """
        token_entry = self._first_match(self.tokens, "index", index)
        if token_entry is None:
            raise ValueError(f"No token with index {index}")

        return {
            "token": token_entry,
            "wsd": self._first_match(self.wsd, "index", index),
            "phrase": self._first_match(self.phrases, "head", index),
        }

    def get_context(self, index: int, window: int = 5) -> Dict[str, Any]:
        """
        Return up to `window` tokens on each side of the token with `index`.

        Returns
        -------
        dict
            {"center": token, "left": [...], "right": [...], "text": str},
            where "text" is the "orth" values of left + center + right joined
            by single spaces. A negative `window` is treated as 0.

        Raises
        ------
        ValueError
            If no token has the given index.
        """
        # Locate the list position in the same pass as the match, instead of
        # a second scan that compares whole token dictionaries.
        center_pos = next(
            (
                pos
                for pos, tok in enumerate(self.tokens)
                if "index" in tok and tok["index"] == index
            ),
            None,
        )
        if center_pos is None:
            raise ValueError(f"No token with index {index}")
        return self._context_at(center_pos, window)

    def annotate_tokens_with_context(self, window: int = 5) -> List[Dict[str, Any]]:
        """
        Return, for every token, its annotation merged with its context.

        Each element has the keys of `get_annotation` followed by the keys of
        `get_context` for that token's "index". The lookup tables are built
        once, so the whole pass is linear in the number of entries rather than
        rescanning every layer for every token. This relies on the "index" /
        "head" values being hashable (JSON scalars are).

        Raises
        ------
        KeyError
            If a token has no "index" key.
        """
        # setdefault keeps the FIRST entry per key, matching the first-match
        # semantics of get_annotation / get_context.
        token_pos: Dict[Any, int] = {}
        for pos, tok in enumerate(self.tokens):
            if "index" in tok:
                token_pos.setdefault(tok["index"], pos)

        wsd_by_index: Dict[Any, Dict[str, Any]] = {}
        for entry in self.wsd:
            if "index" in entry:
                wsd_by_index.setdefault(entry["index"], entry)

        phrase_by_head: Dict[Any, Dict[str, Any]] = {}
        for entry in self.phrases:
            if "head" in entry:
                phrase_by_head.setdefault(entry["head"], entry)

        annotated = []
        for tok in self.tokens:
            idx = tok["index"]
            pos = token_pos[idx]
            annotated.append(
                {
                    "token": self.tokens.items[pos],
                    "wsd": wsd_by_index.get(idx),
                    "phrase": phrase_by_head.get(idx),
                }
                | self._context_at(pos, window)
            )
        return annotated

In [None]:
# Demo: list all jsonl files in the dataset directory
jsonl_files = list_jsonl_files(wsd_dataset_path)
print(f"Found {len(jsonl_files)} JSONL files in {wsd_dataset_path}:")
for i, fp in enumerate(jsonl_files, 1):
    print(f"{i}. {fp.name}")

# If there is at least one file, open one and preview its first record(s)
if jsonl_files:
    # Prefer the fourth file, but fall back to the last one when the
    # directory holds fewer than four files (avoids an IndexError).
    sample_file = jsonl_files[min(3, len(jsonl_files) - 1)]
    preview_count = 1  # number of records to show; keeps the output short
    print(f"\nShowing first {preview_count} record(s) from {sample_file.name}:")
    reader = JsonlFileReader(sample_file)
    for idx, record in enumerate(reader):
        if idx >= preview_count:
            break
        print(list(record.keys()))
        print(record)

In [None]:
# Example – build a Text from the first record of the first JSONL file
if jsonl_files:
    # Get an iterator over the file
    reader_iter = iter(JsonlFileReader(jsonl_files[0]))
    # Use a default so an empty file yields a message instead of StopIteration
    first_record = next(reader_iter, None)
    if first_record is None:
        print("No records found in", jsonl_files[0].name)
    else:
        sample_text = Text.from_dict(first_record)
        print(sample_text)
        print("First 3 tokens:", list(sample_text.tokens)[:3])
        print("First 3 WSD entries:", list(sample_text.wsd)[:3])
        print("First 3 phrases:", list(sample_text.phrases)[:3])
else:
    print("No JSONL files found in", wsd_dataset_path)

In [48]:
# This demo reads the SECOND file, so at least two files must be present.
if len(jsonl_files) > 1:
    reader_iter = iter(JsonlFileReader(jsonl_files[1]))
    # Use a default so an empty file yields a message instead of StopIteration
    first_record = next(reader_iter, None)
    if first_record is None:
        print("No records found in", jsonl_files[1].name)
    else:
        sample_text = Text.from_dict(first_record)

        # 1. Annotation for one token + surrounding context
        demo_index = 50  # token "index" value used for the single-token demo
        try:
            ann_50 = sample_text.get_annotation(demo_index)
            ctx_50 = sample_text.get_context(demo_index, window=5)
        except ValueError as exc:
            # Raised when the record has no token with that index
            print(f"Skipping single-token demo: {exc}")
        else:
            print(f"Token {demo_index} annotation:")
            print(ann_50)
            print(f"\nContext around token {demo_index}:")
            print(ctx_50["text"])
            print("\nLeft tokens:", [t["orth"] for t in ctx_50["left"]])
            print("Right tokens:", [t["orth"] for t in ctx_50["right"]])

        # 2. All tokens annotated with context
        full_ann = sample_text.annotate_tokens_with_context(window=3)
        if full_ann:
            print("\nFirst annotated token with context:")
            print(full_ann[0])
        else:
            print("\nThe record contains no tokens to annotate.")
else:
    print("Need at least two JSONL files in", wsd_dataset_path)

Token 50 annotation:
{'token': {'index': 50, 'position': [257, 263], 'orth': 'jednym', 'lemma': 'jeden', 'pos': 'adj', 'ctag': 'adj:loc:pos:n:sg'}, 'wsd': None, 'phrase': None}

Context around token 50:
Chciał by m tylko o jednym powiedzieć , że o ile

Left tokens: ['Chciał', 'by', 'm', 'tylko', 'o']
Right tokens: ['powiedzieć', ',', 'że', 'o', 'ile']

First annotated token with context:
{'token': {'index': 0, 'position': [0, 5], 'orth': 'Poseł', 'lemma': 'poseł', 'pos': 'noun', 'ctag': 'subst:nom:m1:sg'}, 'wsd': {'index': 0, 'pl_sense': 'poseł.2.n', 'plWN_syn_id': 'f3bcc549-aac4-11ed-aae5-0242ac130002', 'plWN_lex_id': 'd49dde9d-aac4-11ed-aae5-0242ac130002', 'plWN_syn_legacy_id': '6360', 'plWN_lex_legacy_id': '6084', 'PWN_syn_id': '10522035-n', 'bn_syn_id': 'bn:00067199n', 'mapping_relation': 'hypernymy'}, 'phrase': None, 'center': {'index': 0, 'position': [0, 5], 'orth': 'Poseł', 'lemma': 'poseł', 'pos': 'noun', 'ctag': 'subst:nom:m1:sg'}, 'left': [], 'right': [{'index': 1, 'position'

In [53]:
from collections import Counter
import pandas as pd


# Helper that collects WSD statistics from a single Text instance
def wsd_stats_from_text(text_obj: Text) -> Dict[str, Any]:
    """
    Summarise the "pl_sense" values found in a text's WSD layer.

    WSD entries without a "pl_sense" key are ignored.

    Returns
    -------
    dict
        "total_wsd": number of entries that carry a sense,
        "unique_senses": number of distinct senses,
        "most_common_sense": the most frequent sense, or None when no entry
        carries a sense.
    """
    sense_counts = Counter(
        item["pl_sense"] for item in text_obj.wsd if "pl_sense" in item
    )

    if sense_counts:
        top_sense = sense_counts.most_common(1)[0][0]
    else:
        top_sense = None

    summary = {}
    summary["total_wsd"] = sum(sense_counts.values())
    summary["unique_senses"] = len(sense_counts)
    summary["most_common_sense"] = top_sense
    return summary


# --- Walk through all files and aggregate statistics ---

# Column order of the summary table; passing it to the DataFrame constructor
# guarantees the columns exist even when no files were found.
stats_columns = ["file", "total_wsd", "unique_senses", "most_common_sense"]

file_stats = []

for jsonl_path in jsonl_files:
    file_name = os.path.basename(jsonl_path)
    # One counter per file (a file holds multiple records); the total, the
    # number of distinct senses and the most common sense all derive from it.
    file_wsd_counter = Counter()

    # Iterate over every record in the file
    for record in JsonlFileReader(jsonl_path):
        txt = Text.from_dict(record)
        file_wsd_counter.update(e["pl_sense"] for e in txt.wsd if "pl_sense" in e)

    file_total_wsd = sum(file_wsd_counter.values())
    file_unique_senses = set(file_wsd_counter)
    most_common = file_wsd_counter.most_common(1)[0][0] if file_wsd_counter else None

    file_stats.append(
        {
            "file": file_name,
            "total_wsd": file_total_wsd,
            "unique_senses": len(file_unique_senses),
            "most_common_sense": most_common,
        }
    )

# Convert to DataFrame for nicer printing. Without explicit columns an empty
# `file_stats` would give a column-less frame and sort_values would raise
# KeyError on "total_wsd".
df_stats = pd.DataFrame(file_stats, columns=stats_columns)
df_stats = df_stats.sort_values("total_wsd", ascending=False).reset_index(drop=True)

print("WSD statistics per file:")
print(df_stats.to_string(index=False))

WSD statistics per file:
                     file  total_wsd  unique_senses most_common_sense
       walenty_text.jsonl      46345          10932        bardzo.1.r
     skladnica_text.jsonl      40815          13959           być.3.v
       emoglex_text.jsonl      39781          12818           być.9.v
      wikiglex_text.jsonl      21079           8506          być.10.v
          kpwr_text.jsonl      14387           3128           pan.4.n
      kpwr-100_text.jsonl      13397           5678           być.9.v
      sherlock_text.jsonl       3769           2049           być.9.v
 sherlock_sentences.jsonl          0              0              None
  emoglex_sentences.jsonl          0              0              None
  walenty_sentences.jsonl          0              0              None
 wikiglex_sentences.jsonl          0              0              None
 kpwr-100_sentences.jsonl          0              0              None
skladnica_sentences.jsonl          0              0              