In [6]:
# ============================================================
# SUPER ROBUST (FIXED):
# - extract JSON objects by brace scanning
# - fixes invalid backslash escapes inside strings
# - skips broken objects (logs counts)
# - computes tokenization metrics
# ============================================================

!pip -q install -U transformers tqdm

import json
import math
from collections import Counter
from tqdm.auto import tqdm
from transformers import AutoTokenizer

# Identifier handed to AutoTokenizer.from_pretrained() in the RUN section.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Input files; both are read through load_records_salvage().
CLEAN_PATH = "baseline_15000.json"
SEG_PATH   = "kazakh_segmented_15000.json"
# Record keys whose values are joined into the text that gets tokenized.
TEXT_FIELDS = ("question",)   # can also be set to ("question","answer")


def repair_invalid_escapes_in_json_text(s: str) -> str:
    """
    Repair invalid backslash escapes that occur inside JSON string literals.

    A backslash inside a string is kept as-is only when it starts a legal
    JSON escape:
      - one of  "  \\  /  b f n r t
      - ``u`` followed by exactly four hexadecimal digits
    Any other backslash (including a ``\\u`` that is NOT followed by four hex
    digits, e.g. a Windows path such as ``C:\\users``) is doubled so that it
    becomes a literal backslash and the text can be parsed by ``json.loads``.

    Text outside of string literals is copied through unchanged. Already
    valid JSON is returned unmodified.

    Args:
        s: Raw JSON text (typically a single ``{...}`` object).

    Returns:
        The text with every invalid in-string escape neutralised.
    """
    simple_escapes = frozenset('"\\/bfnrt')
    hex_digits = frozenset("0123456789abcdefABCDEF")

    out = []
    in_str = False
    i = 0
    n = len(s)

    while i < n:
        ch = s[i]

        if not in_str:
            if ch == '"':
                in_str = True
            out.append(ch)
            i += 1
            continue

        # Inside a string. Escape sequences are consumed two characters at a
        # time by the backslash branch below, so a quote that reaches this
        # point is never escaped and always terminates the string.
        if ch == '"':
            in_str = False
            out.append(ch)
            i += 1
            continue

        if ch == "\\":
            if i + 1 >= n:
                # Dangling backslash at end of input: make it a literal one.
                out.append("\\\\")
                i += 1
                continue

            nxt = s[i + 1]
            if nxt in simple_escapes:
                is_valid = True
            elif nxt == "u":
                # \u is only legal when followed by exactly four hex digits.
                digits = s[i + 2:i + 6]
                is_valid = len(digits) == 4 and all(d in hex_digits for d in digits)
            else:
                is_valid = False

            out.append("\\" if is_valid else "\\\\")
            out.append(nxt)
            i += 2
            continue

        out.append(ch)
        i += 1

    return "".join(out)


def extract_json_objects(raw: str):
    """
    Collect the text of every top-level ``{...}`` span found in ``raw``.

    Braces are counted to find matching pairs; braces that appear inside
    double-quoted strings are ignored, and a backslash inside a string
    protects the character that follows it. A stray ``}`` with no open
    object is skipped, and an object that is never closed is dropped.

    Returns: list[str] -- the spans in order of appearance.
    """
    found = []
    nesting = 0
    begin = None
    inside_string = False
    skip_next = False  # previous char was a backslash inside a string

    for pos, char in enumerate(raw):
        if inside_string:
            if skip_next:
                skip_next = False
            elif char == "\\":
                skip_next = True
            elif char == '"':
                inside_string = False
            continue

        if char == '"':
            inside_string = True
            continue

        if char == "{":
            if nesting == 0:
                begin = pos
            nesting += 1
            continue

        # Anything other than a closing brace of an open object is ignored.
        if char != "}" or nesting == 0:
            continue

        nesting -= 1
        if nesting == 0 and begin is not None:
            found.append(raw[begin:pos + 1])
            begin = None

    return found


def load_records_salvage(path, max_bad_ratio=0.2):
    """
    Load records from a JSON file, salvaging what is parseable if it is broken.

    Strategy:
      1) Parse the whole file as strict JSON. A list is returned as-is; a dict
         yields the first list found under "data", "items", "records" or
         "examples", otherwise the dict itself wrapped in a one-element list.
      2) If that fails (or yields neither list nor dict), scan for {...}
         candidates, repair invalid escapes in each, and keep those that parse.

    Returns: (records:list, info:dict) where info has the keys
             "mode", "ok", "bad" and "candidates".
    Raises:  ValueError if the share of unparseable candidates exceeds
             ``max_bad_ratio`` or if nothing could be salvaged.
    """
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read().strip()

    # 1) strict JSON
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, list):
            total = len(parsed)
            return parsed, {"mode": "strict_json", "ok": total, "bad": 0, "candidates": total}
        if isinstance(parsed, dict):
            for key in ("data", "items", "records", "examples"):
                inner = parsed.get(key)
                if isinstance(inner, list):
                    total = len(inner)
                    return inner, {"mode": "strict_json_dict", "ok": total, "bad": 0, "candidates": total}
            return [parsed], {"mode": "strict_single_dict", "ok": 1, "bad": 0, "candidates": 1}

    # 2) salvage by brace scan
    candidates = extract_json_objects(raw)
    records = []
    bad = 0
    for chunk in tqdm(candidates, desc=f"Salvaging: {path}"):
        repaired = repair_invalid_escapes_in_json_text(chunk)
        try:
            obj = json.loads(repaired)
        except json.JSONDecodeError:
            bad += 1
        else:
            records.append(obj)

    cand_n = len(candidates)
    ok = len(records)
    info = {"mode": "salvage_brace_scan", "ok": ok, "bad": bad, "candidates": cand_n}

    # sanity check: refuse to continue when too much of the file is unreadable
    if cand_n > 0 and (bad / cand_n) > max_bad_ratio:
        raise ValueError(
            f"{path}: too many broken objects while salvaging: bad={bad}, candidates={cand_n}. "
            f"File may be severely corrupted."
        )
    if ok == 0:
        raise ValueError(f"{path}: no valid JSON objects could be salvaged.")

    return records, info


def record_to_text(rec, fields):
    """
    Turn one record into a single stripped text string.

    - A string record is returned stripped.
    - Any other non-dict record is converted with ``str()`` and stripped.
    - For a dict, the values of ``fields`` are converted with ``str()``
      (missing or ``None`` values become ""), joined with single spaces,
      and the result is stripped.
    """
    if isinstance(rec, str):
        return rec.strip()

    if isinstance(rec, dict):
        values = (rec.get(name) for name in fields)
        joined = " ".join("" if value is None else str(value) for value in values)
        return joined.strip()

    return str(rec).strip()


def tokenization_eval(records, tokenizer, fields=("question",), batch_size=256):
    """
    Compute tokenization statistics for a collection of records.

    Each record is converted to text via ``record_to_text(rec, fields)``;
    empty texts are dropped. The remaining texts are passed to ``tokenizer``
    in batches of ``batch_size`` (without special tokens or truncation) and
    the ids under ``"input_ids"`` of each result are counted.

    Returns a dict with:
      - distinct token ids seen, total tokens, total characters
      - characters per token
      - "Theoretical Bits/token": log2 of the number of distinct ids
      - "Actual Bits/token": Shannon entropy of the observed id frequencies
      - both of the above multiplied by the token count, in Mbit
    Ratio/entropy values are NaN when no tokens were produced.
    """
    corpus = [
        text
        for text in (record_to_text(rec, fields) for rec in records)
        if text
    ]
    char_total = sum(map(len, corpus))

    id_freq = Counter()
    token_total = 0
    for offset in tqdm(range(0, len(corpus), batch_size), desc="Tokenizing"):
        encoded = tokenizer(
            corpus[offset:offset + batch_size],
            add_special_tokens=False,
            truncation=False,
            return_attention_mask=False,
            return_token_type_ids=False,
        )
        for id_seq in encoded["input_ids"]:
            id_freq.update(id_seq)
            token_total += len(id_seq)

    nan = float("nan")
    distinct_ids = len(id_freq)
    bits_uniform = math.log2(distinct_ids) if distinct_ids else nan

    if token_total:
        chars_per_token = char_total / token_total
        # Accumulated term by term so the floating-point result does not
        # depend on how a summation builtin rounds.
        entropy = 0.0
        for count in id_freq.values():
            prob = count / token_total
            entropy -= prob * math.log2(prob)
        info_uniform = (token_total * bits_uniform) / 1e6
        info_entropy = (token_total * entropy) / 1e6
    else:
        chars_per_token = entropy = info_uniform = info_entropy = nan

    return {
        "Tokenizer": "MiniLM (default)",
        "Actual Vocabulary Size": distinct_ids,
        "Number of Tokens": token_total,
        "Number of Characters": char_total,
        "Compression (characters/token)": chars_per_token,
        "Theoretical Bits/token": bits_uniform,
        "Actual Bits/token": entropy,
        "Theoretical Information (Mbit)": info_uniform,
        "Actual Information (Mbit)": info_entropy,
    }


def print_table(title, row, info=None):
    """
    Print a titled key/value table to stdout.

    The title is preceded by a blank line and underlined with dashes of the
    same length. If ``info`` is given (and non-empty), a one-line loading
    summary built from its "mode", "ok", "bad" and "candidates" entries is
    printed, followed by another dashed rule. Each entry of ``row`` is then
    printed with its key left-aligned in 35 columns; float values are shown
    with four decimals, everything else with its default formatting.
    """
    rule = "-" * len(title)

    print("\n" + title)
    print(rule)

    if info:
        print(f"Loaded mode: {info['mode']} | ok={info['ok']} | bad={info['bad']} | candidates={info['candidates']}")
        print(rule)

    for label, value in row.items():
        shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
        print(f"{label:35s}: {shown}")


# -----------------------------
# RUN
# -----------------------------
# Load the tokenizer named by MODEL_NAME (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Read both corpora; each call returns the records plus a loading summary
# (mode / ok / bad / candidates) that is shown above the metrics table.
clean_records, clean_info = load_records_salvage(CLEAN_PATH)
seg_records, seg_info     = load_records_salvage(SEG_PATH)

# Compute the tokenization metrics over the TEXT_FIELDS of each corpus.
clean_row = tokenization_eval(clean_records, tokenizer, fields=TEXT_FIELDS)
seg_row   = tokenization_eval(seg_records, tokenizer, fields=TEXT_FIELDS)

# Print one table per corpus.
print_table("Table 2. Tokenization Evaluation Results for the Original (Clean) Corpus", clean_row, clean_info)
print_table("Table 3. Tokenization Evaluation Results for the Segmented Corpus", seg_row, seg_info)

Salvaging: baseline_15000.json:   0%|          | 0/14999 [00:00<?, ?it/s]

Salvaging: kazakh_segmented_15000.json:   0%|          | 0/15000 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/59 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/59 [00:00<?, ?it/s]


Table 2. Tokenization Evaluation Results for the Original (Clean) Corpus
------------------------------------------------------------------------
Loaded mode: salvage_brace_scan | ok=14993 | bad=6 | candidates=14999
------------------------------------------------------------------------
Tokenizer                          : MiniLM (default)
Actual Vocabulary Size             : 4376
Number of Tokens                   : 242194
Number of Characters               : 922125
Compression (characters/token)     : 3.8074
Theoretical Bits/token             : 12.0954
Actual Bits/token                  : 8.9980
Theoretical Information (Mbit)     : 2.9294
Actual Information (Mbit)          : 2.1793

Table 3. Tokenization Evaluation Results for the Segmented Corpus
-----------------------------------------------------------------
Loaded mode: salvage_brace_scan | ok=14998 | bad=2 | candidates=15000
-----------------------------------------------------------------
Tokenizer                          :