In [None]:
!pip install bert-score



intfloat/multilingual-e5-large-instruct baseline

In [None]:
# @title
# ============================================================
# BASE_ONLY (Expert / Reviewer-proof single-dataset run)
# - Loads ONLY baseline (clean) dataset
# - One deterministic split (seed)
# - Index questions = CLEAN(question)
# - Gold answers = CLEAN(answer)
# - Interactive QA + evaluation metrics
#
# Metrics:
#   Exact@1, TokenF1@1, MeanCos@1(QSim), Semantic@1(ans_cos>=thr), BERTScore(optional)
#
# Also exports per-test details to CSV for expert manual checking.
# ============================================================

# !pip -q install -U sentence-transformers scikit-learn
# Optional:
# !pip -q install -U bert-score

import json, re, time, glob, hashlib, random, csv
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Tuple, Optional

import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

try:
    from bert_score import score as bert_score
except Exception:
    bert_score = None


# ---------------------------
# CONFIG
# ---------------------------
# File name (or path) of the QA dataset; main() resolves it via find_data_path().
DATA_PATH   = "baseline_15000.json"

# Model identifier handed to SentenceTransformer(...) in main().
MODEL_NAME  = "intfloat/multilingual-e5-large-instruct"
# Seeds `random`, `numpy.random` and the train/test split (random_state) in main().
SEED        = 42
# Passed as `test_size` to train_test_split in main().
TEST_SIZE   = 0.10
# eval_run() counts a semantic hit when the predicted/gold answer cosine is >= this value.
SEM_THR     = 0.85
# Passed as `device=` to SentenceTransformer(...) in main().
DEVICE      = None  # None -> auto

# Export for expert checking
# When True, main() writes the per-test details returned by eval_run() to CSV_PATH.
EXPORT_CSV  = True
CSV_PATH    = "base_only_test_details.csv"

# ---------------------------
# Utility: file auto-find
# ---------------------------
def find_data_path(p: str) -> str:
    """Resolve the dataset location, trying a few well-known places.

    Lookup order:
      1. ``p`` exactly as given,
      2. ``/content/<p>`` and ``/content/drive/MyDrive/<p>``,
      3. a recursive search below the current directory for the bare file name.

    Args:
        p: File name or path of the dataset.

    Returns:
        The first location that exists (returned as a string).

    Raises:
        FileNotFoundError: Nothing matched. The message lists the working
            directory and up to 30 ``.json`` files found below it.
    """
    for candidate in (p, f"/content/{p}", f"/content/drive/MyDrive/{p}"):
        if Path(candidate).exists():
            return candidate

    # Fall back to searching by file name only, anywhere below the CWD.
    basename = Path(p).name
    matches = glob.glob(f"**/{basename}", recursive=True)
    if matches:
        return matches[0]

    # Nothing found: show nearby JSON files to help the user fix the path.
    nearby_json = glob.glob("**/*.json", recursive=True)
    listing = "\n".join(nearby_json[:30])
    raise FileNotFoundError(
        f"❌ File not found: {p}\nPWD: {Path.cwd()}\n"
        f"Found .json (first 30):\n" + listing
    )


# ---------------------------
# Robust loader (JSON array / JSONL / trailing commas / brace-scan)
# ---------------------------
def load_qa_records(path: str) -> List[Dict[str, str]]:
    """Load question/answer records from a loosely formatted JSON file.

    Three strategies are tried in order; each falls through to the next one
    when it cannot produce records:
      1. the whole file is a JSON array,
      2. JSON Lines (one object per line, a trailing comma per line is tolerated),
      3. a best-effort scan that extracts every balanced top-level ``{...}``
         span and parses it on its own.

    Args:
        path: Location of the file to read.

    Returns:
        The records as returned by ``_normalize_records``.

    Raises:
        ValueError: The file is empty, no object could be parsed, or
            ``_normalize_records`` rejected the parsed data.
    """
    # "utf-8-sig" drops a leading BOM; with plain "utf-8" a BOM-prefixed file
    # fails both fast paths below and ends up in the slow character scan.
    text = Path(path).read_text(encoding="utf-8-sig", errors="ignore").strip()
    if not text:
        raise ValueError(f"Файл бос: {path}")

    # Only parse/validation failures mean "try the next format"; anything else
    # is a genuine bug and must not be hidden.
    recoverable = (ValueError, RecursionError)

    # Strategy 1: JSON array
    if text[0] == "[":
        try:
            return _normalize_records(json.loads(text))
        except recoverable:
            pass

    # Strategy 2: JSONL
    lines = [ln.strip().rstrip(",") for ln in text.splitlines() if ln.strip()]
    if lines and lines[0].startswith("{"):
        recs = []
        for ln in lines:
            try:
                recs.append(json.loads(ln))
            except recoverable:
                break
        else:
            # Reached only when every line parsed.
            if recs:
                return _normalize_records(recs)

    # Strategy 3: brace scan (best-effort)
    objs = []
    skipped = 0
    buf, depth = [], 0
    in_str, esc, started = False, False, False

    for ch in text:
        if not started:
            # Ignore everything until the next object opens.
            if ch == "{":
                started = True
                depth = 1
                buf = ["{"]
            continue

        buf.append(ch)

        if in_str:
            # Inside a string literal braces do not count; track escapes so
            # that an escaped quote does not end the string.
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
        else:
            if ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    obj_txt = "".join(buf)
                    buf = []
                    started = False
                    try:
                        objs.append(json.loads(obj_txt))
                    except recoverable:
                        # Still best-effort, but no longer silent.
                        skipped += 1

    if skipped:
        print(f"[load_qa_records] Warning: skipped {skipped} unparsable object(s) in {path}")
    if not objs:
        raise ValueError(f"JSON оқу мүмкін болмады. Файл форматын тексеріңіз: {path}")
    return _normalize_records(objs)


def _normalize_records(data: Any) -> List[Dict[str, str]]:
    if not isinstance(data, list):
        raise ValueError("Дерек list болуы керек.")
    out = []
    for x in data:
        if not isinstance(x, dict):
            continue
        q = x.get("question") or x.get("instruction") or ""
        a = x.get("answer") or x.get("response") or ""
        q = str(q).strip()
        a = str(a).strip()
        if q and a:
            out.append({"question": q, "answer": a})
    if not out:
        raise ValueError("question/answer табылмады немесе бос.")
    return out


# ---------------------------
# Text normalization
# ---------------------------
# Whitespace sitting in front of closing punctuation / closing brackets.
_punct_space_left  = re.compile(r"\s+([.,!?;:%)\]\}])")
# Whitespace sitting right after an opening bracket.
_punct_space_right = re.compile(r"([(\[\{])\s+")
# Any run of whitespace.
_multi_space       = re.compile(r"\s+")

def clean_view(text: str) -> str:
    """Return the "clean" display form of ``text``.

    Steps, in order: remove ``@@ `` / ``@@`` markers, turn `` - `` into ``-``,
    drop whitespace before closing punctuation and after opening brackets,
    collapse whitespace runs to a single space and strip both ends.
    ``None`` yields an empty string; other non-string input is stringified.
    """
    if text is None:
        return ""
    cleaned = str(text)

    # Literal replacements; "@@ " must go before the bare "@@".
    for marker, replacement in (("@@ ", ""), ("@@", ""), (" - ", "-")):
        cleaned = cleaned.replace(marker, replacement)

    # Both patterns keep only their captured punctuation character.
    for pattern in (_punct_space_left, _punct_space_right):
        cleaned = pattern.sub(r"\1", cleaned)

    return _multi_space.sub(" ", cleaned).strip()

def norm_for_exact(text: str) -> str:
    """Normalize ``text`` for the exact-match comparison.

    Applies ``clean_view``, lower-cases the result, collapses whitespace runs
    to one space and strips the ends.
    """
    lowered = clean_view(text).lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    return collapsed.strip()

# Word characters: Latin letters, Cyrillic а-я/А-Я, "ё"/"Ё" (U+0451/U+0401 lie
# outside the а-я range and were previously treated as separators, splitting
# words that contain them), the Kazakh-specific letters, and digits.
_TOKEN_RE = re.compile(r"[a-zA-Zа-яА-ЯёЁәғқңөұүһіӘҒҚҢӨҰҮҺІ0-9]+")


def tokens(text: str) -> List[str]:
    """Split ``text`` into lower-cased word tokens.

    The text is passed through ``clean_view`` and lower-cased; every maximal
    run of characters matched by ``_TOKEN_RE`` becomes one token. All other
    characters act as separators.

    Args:
        text: Raw text (anything ``clean_view`` accepts).

    Returns:
        Tokens in order of appearance; empty list when nothing matches.
    """
    return _TOKEN_RE.findall(clean_view(text).lower())

def token_f1(pred: str, gold: str) -> float:
    """Token-level F1 between a predicted and a gold answer.

    Both texts are tokenized with ``tokens``; the overlap is the multiset
    intersection of the two token lists.

    Returns:
        1.0 when both texts have no tokens, 0.0 when exactly one has none or
        when nothing overlaps, otherwise the harmonic mean of precision and
        recall (with a 1e-12 term in the denominator).
    """
    from collections import Counter

    pred_toks = tokens(pred)
    gold_toks = tokens(gold)

    if not (pred_toks or gold_toks):
        return 1.0
    if not (pred_toks and gold_toks):
        return 0.0

    # Counter & Counter keeps the minimum count per token.
    overlap = sum((Counter(pred_toks) & Counter(gold_toks)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / max(1, len(pred_toks))
    recall = overlap / max(1, len(gold_toks))
    return (2 * precision * recall) / (precision + recall + 1e-12)


# ---------------------------
# Retrieval index
# ---------------------------
@dataclass
class QAIndex:
    """Retrieval index over the training question/answer pairs.

    As filled by ``build_index``, all four fields are derived from the same
    training rows in the same order, so position ``i`` in each refers to the
    same row.
    """
    # Cleaned question texts (clean_view of each row's "question").
    q_text: List[str]
    # Embeddings of q_text; retrieve_top1 scores a query vector against these via np.dot.
    q_emb: np.ndarray
    # Cleaned answer texts (clean_view of each row's "answer"); returned as predictions.
    ans_clean: List[str]
    # Embeddings of ans_clean; eval_run compares them with gold-answer embeddings.
    a_emb: np.ndarray

def build_index(model: SentenceTransformer, train_rows: List[Dict[str,str]]) -> QAIndex:
    """Build the retrieval index from the training rows.

    Questions and answers are cleaned with ``clean_view`` and then encoded
    with ``model.encode`` (numpy output, normalized embeddings, no progress bar).

    Args:
        model: Encoder used for both questions and answers.
        train_rows: Rows with ``"question"`` and ``"answer"`` keys.

    Returns:
        A ``QAIndex`` holding the cleaned texts and their embeddings.
    """
    encode_opts = dict(convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)

    questions, answers = [], []
    for row in train_rows:
        questions.append(clean_view(row["question"]))
        answers.append(clean_view(row["answer"]))

    # Questions are encoded first, then answers.
    return QAIndex(
        q_text=questions,
        q_emb=model.encode(questions, **encode_opts),
        ans_clean=answers,
        a_emb=model.encode(answers, **encode_opts),
    )

def retrieve_top1(index: QAIndex, q_vec: np.ndarray) -> Tuple[int, float]:
    """Find the indexed question that scores highest against ``q_vec``.

    Args:
        index: Index whose ``q_emb`` rows are scored.
        q_vec: Query embedding.

    Returns:
        ``(position, score)`` of the highest dot product between ``q_vec``
        and the rows of ``index.q_emb``.
    """
    similarities = np.dot(index.q_emb, q_vec)
    best = int(similarities.argmax())
    return best, float(similarities[best])


# ---------------------------
# BERTScore helper
# ---------------------------
def _bert_lang_try(preds: List[str], golds: List[str]) -> Optional[float]:
    """Compute the mean BERTScore F1, trying several language codes.

    The codes ``"kk"``, ``"tr"`` and ``"en"`` are tried in that order; the
    first call that does not raise wins.

    Args:
        preds: Predicted answers.
        golds: Gold answers, aligned with ``preds``.

    Returns:
        Mean F1 as a float, or ``None`` when ``bert_score`` is unavailable or
        every language attempt raised.
    """
    if bert_score is not None:
        for lang_code in ("kk", "tr", "en"):
            try:
                _, _, f1 = bert_score(preds, golds, lang=lang_code, rescale_with_baseline=True)
                # Use the object's own numpy conversion when it offers one.
                values = f1.numpy() if hasattr(f1, "numpy") else np.array(f1)
                mean_f1 = float(np.mean(values))
            except Exception:
                # Any failure for this language: move on to the next code.
                continue
            return mean_f1
    return None


# ---------------------------
# Evaluation
# ---------------------------
def eval_run(model: SentenceTransformer, train_rows, test_rows) -> Tuple[Dict[str,Any], List[Dict[str,Any]]]:
    """Evaluate top-1 retrieval on the test rows.

    An index is built from ``train_rows``. For each test row the cleaned
    question is encoded, the best-scoring training question is retrieved, and
    that row's answer is taken as the prediction.

    Per-test values recorded:
      * ``QSim``    - retrieval score of the matched training question,
      * ``Exact``   - 1.0 if ``norm_for_exact`` of prediction and gold are equal,
      * ``TokenF1`` - ``token_f1`` of prediction vs. gold,
      * ``AnsCos``  - dot product of predicted-answer and gold-answer embeddings,
      * ``SemHit``  - 1.0 if ``AnsCos >= SEM_THR``.

    Args:
        model: Encoder used for the index, test questions and gold answers.
        train_rows: Rows to index.
        test_rows: Rows to evaluate.

    Returns:
        ``(summary, details)``: ``summary`` maps metric names to their mean
        over the test rows (plus ``"BERTScoreF1@1"`` when BERTScore could be
        computed); ``details`` has one dict per test row.
    """
    index = build_index(model, train_rows)
    encode_opts = dict(convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)

    questions = [clean_view(row["question"]) for row in test_rows]
    question_vecs = model.encode(questions, **encode_opts)

    gold_answers = [clean_view(row["answer"]) for row in test_rows]
    gold_vecs = model.encode(gold_answers, **encode_opts)

    details = []
    for question, q_vec, gold_answer, gold_vec in zip(questions, question_vecs, gold_answers, gold_vecs):
        hit, q_sim = retrieve_top1(index, q_vec)
        predicted = index.ans_clean[hit]
        answer_cos = float(np.dot(index.a_emb[hit], gold_vec))

        details.append({
            "test_question": question,
            "gold_answer": gold_answer,
            "pred_answer": predicted,
            "QSim": float(q_sim),
            "Exact": 1.0 if norm_for_exact(predicted) == norm_for_exact(gold_answer) else 0.0,
            "TokenF1": token_f1(predicted, gold_answer),
            "AnsCos": answer_cos,
            "SemHit": 1.0 if answer_cos >= SEM_THR else 0.0,
        })

    def column_mean(key):
        # Mean of one per-test column, in test order.
        return float(np.mean([row[key] for row in details]))

    summary = {
        "Dataset": "BASE_ONLY",
        "Exact@1": column_mean("Exact"),
        "TokenF1@1": column_mean("TokenF1"),
        "MeanCos@1(QSim)": column_mean("QSim"),
        f"Semantic@1(ans_cos≥{SEM_THR})": column_mean("SemHit"),
    }

    # BERTScore is optional: only attempted when the package was imported
    # and there is at least one prediction to score.
    if bert_score is not None and details:
        bert_f1 = _bert_lang_try(
            [row["pred_answer"] for row in details],
            [row["gold_answer"] for row in details],
        )
        if bert_f1 is not None:
            summary["BERTScoreF1@1"] = float(bert_f1)

    return summary, details


# ---------------------------
# Interactive QA
# ---------------------------
def interactive(model: SentenceTransformer, train_rows: List[Dict[str,str]]):
    """Run a console question-answering loop over the training rows.

    An index is built from ``train_rows``; each question typed by the user is
    cleaned, encoded, and answered with the top-1 retrieved answer together
    with its retrieval score. Blank input is ignored. The loop ends on
    ``exit``, ``quit`` or ``q`` (case-insensitive), or when standard input
    reaches end-of-file.

    Args:
        model: Encoder used for the index and for the typed questions.
        train_rows: Rows with ``"question"`` and ``"answer"`` keys.
    """
    idx = build_index(model, train_rows)
    print("\n==================== INTERACTIVE QA (BASE_ONLY) ====================")
    print("Input: CLEAN question | Output: CLEAN answer (Top1).")
    print("Шығу үшін: exit немесе quit.\n")
    while True:
        try:
            q = input("Сұрақ (таза): ").strip()
        except EOFError:
            # stdin is closed or non-interactive (piped run, batch job).
            # Leave the loop instead of crashing, so the caller can carry on
            # with whatever follows the interactive session.
            break
        if not q:
            continue
        if q.lower() in {"exit","quit","q"}:
            break
        qv = model.encode([clean_view(q)], convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)[0]
        j, sim = retrieve_top1(idx, qv)
        print(f"\nЖауап (Top1 QSim={sim:.4f}):\n{idx.ans_clean[j]}\n")


# ---------------------------
# Pretty print + export
# ---------------------------
def print_result(res: Dict[str,Any]):
    """Print the metric summary as right-aligned ``name: value`` lines.

    Float values are shown with six decimals; everything else is printed
    with its default formatting.

    Args:
        res: Mapping of metric name to value.
    """
    print("\n==================== RESULTS (BASE_ONLY) ====================")
    for name, value in res.items():
        shown = f"{value:.6f}" if isinstance(value, float) else f"{value}"
        print(f"{name:>28}: {shown}")

def export_csv(details: List[Dict[str,Any]], path: str):
    """Write the per-test detail rows to a UTF-8 CSV file.

    The column order is taken from the keys of the first row. Nothing is
    written (and nothing is printed) when ``details`` is empty.

    Args:
        details: One dict per test item.
        path: Destination file; overwritten if it exists.
    """
    if details:
        header = list(details[0])
        with open(path, "w", encoding="utf-8", newline="") as handle:
            writer = csv.DictWriter(handle, fieldnames=header)
            writer.writeheader()
            writer.writerows(details)
        print(f"\n✅ CSV exported: {path}  (rows={len(details)})")


# ---------------------------
# MAIN
# ---------------------------
def main():
    """Run the whole BASE_ONLY pipeline.

    Order of operations: seed the RNGs, locate and load the dataset, make one
    seeded train/test split, load the encoder, run the interactive QA
    session, evaluate on the test split, print the metrics and timing, and
    optionally export the per-test details to CSV.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(SEED)

    data_path = find_data_path(DATA_PATH)
    records = load_qa_records(data_path)
    total = len(records)

    print(f"[BASE_ONLY] Loaded: {total} | {data_path}")

    train_rows, test_rows = train_test_split(
        records,
        test_size=TEST_SIZE,
        random_state=SEED,
        shuffle=True,
    )
    print("\n==================== ONE SPLIT ====================")
    print(f"Total={total} | Train={len(train_rows)} | Test={len(test_rows)} | seed={SEED} | test_size={TEST_SIZE}")

    model = SentenceTransformer(MODEL_NAME, device=DEVICE)
    print(f"\nModel: {MODEL_NAME}")

    # Manual checking comes first; evaluation starts once the session ends.
    interactive(model, train_rows)

    # Only the evaluation itself is timed.
    started = time.time()
    metrics, per_test = eval_run(model, train_rows, test_rows)
    elapsed = time.time() - started

    print_result(metrics)
    print(f"\nTime: {elapsed:.2f}s")
    if bert_score is None:
        print("Note: BERTScore орнатылмаған (pip install bert-score).")

    if EXPORT_CSV:
        export_csv(per_test, CSV_PATH)

    print("\n✅ BASE_ONLY done. (Single dataset, deterministic split, clean view)")


if __name__ == "__main__":
    main()


[BASE_ONLY] Loaded: 14991 | baseline_15000.json

Total=14991 | Train=13491 | Test=1500 | seed=42 | test_size=0.1


Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]


Model: intfloat/multilingual-e5-large-instruct

Input: CLEAN question | Output: CLEAN answer (Top1).
Шығу үшін: exit немесе quit.

Сұрақ (таза): Python тілінің негізгі артықшылықтарын атаңыз.

Жауап (Top1 QSim=0.9395):
Python — оқуға жеңіл және интуитивті синтаксиске ие тіл. ```python сан = 5 if сан > 3: print("Үлкен сан") ```

Сұрақ (таза): Python-дағы REPL ортасының практикалық пайдасы неде?

Жауап (Top1 QSim=0.9609):
REPL — Read-Eval-Print-Loop. Бұл Python командаларын бірден орындап, нәтижесін көрсететін режим.

Сұрақ (таза): CPython, PyPy сияқты интерпретаторлардың айырмашылығы қандай?

Жауап (Top1 QSim=0.9453):
PyPy (JIT), Jython (Java), IronPython (.NET) — Python интерпретаторларының баламалары. ```python # PyPy — жылдам жұмыс істейді ```

Сұрақ (таза): Python скриптін терминалдан параметрмен қалай іске қосамыз?

Жауап (Top1 QSim=0.9688):
Терминалға файлдың атын енгіземіз: ``` python my_script.py ```

Сұрақ (таза): Python-да синтаксистік шегініс (indentation) қате болса қандай 

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-multilingual-cased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



                     Dataset: BASE_ONLY
                     Exact@1: 0.024000
                   TokenF1@1: 0.445845
             MeanCos@1(QSim): 0.967743
    Semantic@1(ans_cos≥0.85): 0.992000
               BERTScoreF1@1: 0.831667

Time: 197.09s

✅ CSV exported: base_only_test_details.csv  (rows=1500)

✅ BASE_ONLY done. (Single dataset, deterministic split, clean view)


