Environment & Installs

In [9]:
# Safe to rerun
import sys, subprocess

def _pip(*args):
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", *args], check=True)

_pip(
    "datasets", "evaluate", "pandas",
    "ragas>=0.1.9",          # optional; not required for local ID-overlap metrics
    "transformers", "sentence-transformers",
    "faiss-cpu", "nbconvert",
    "pymilvus"               # not used here but handy if upstream notebooks used Milvus
)

import os, re, importlib, runpy, types
import numpy as np
import pandas as pd
from pathlib import Path
from datasets import load_dataset, Dataset

In [12]:
# Mount (Colab) or use local CWD
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")
    IN_COLAB = True
except Exception:
    IN_COLAB = False

from pathlib import Path
import os

# Try both styles Colab has used over time
CANDIDATE_ROOTS = [Path("/content/drive/MyDrive"), Path("/content/drive/My Drive")]
if not IN_COLAB:
    CANDIDATE_ROOTS = [Path.cwd()]  # local dev

TARGET_FILENAMES = {
    "step2": "NAIVE_RAG(Steps_2_and_3).ipynb",
    "step5": "AdvancedRAG(Step5).ipynb",
}

def find_notebook(filename: str) -> Path | None:
    for root in CANDIDATE_ROOTS:
        if not root.exists():
            continue
        # First, check your stated folder
        nb_dir = root / "NLX_LLM_Project_2"
        candidate = nb_dir / filename
        if candidate.exists():
            return candidate
        # If not there, search the whole Drive (can take a few seconds)
        for p in root.rglob("*.ipynb"):
            if p.name == filename:
                return p
    return None

NB_STEP2 = find_notebook(TARGET_FILENAMES["step2"])
NB_STEP5 = find_notebook(TARGET_FILENAMES["step5"])

print("Resolved Step2 path:", NB_STEP2)
print("Resolved Step5 path:", NB_STEP5)

# Where to write converted .py and outputs
if IN_COLAB and (NB_STEP2 or NB_STEP5):
    # Prefer placing artifacts beside the notebooks if possible
    ARTIFACTS = (NB_STEP2 or NB_STEP5).parent / "artifacts"
else:
    ARTIFACTS = Path.cwd() / "artifacts"

ARTIFACTS.mkdir(parents=True, exist_ok=True)
print("Artifacts dir:", ARTIFACTS)

# Quick sanity: list the target folder if a path is missing
if NB_STEP2 is None or NB_STEP5 is None:
    for root in CANDIDATE_ROOTS:
        test_dir = root / "NLX_LLM_Project_2"
        if test_dir.exists():
            print("Contents of", test_dir, ":\n", [p.name for p in test_dir.iterdir()])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Resolved Step2 path: /content/drive/MyDrive/Colab Notebooks/NLX_LLM_Project_2/NAIVE_RAG(Steps_2_and_3).ipynb
Resolved Step5 path: /content/drive/MyDrive/Colab Notebooks/AdvancedRAG(Step5).ipynb
Artifacts dir: /content/drive/MyDrive/Colab Notebooks/NLX_LLM_Project_2/artifacts


In [13]:
from nbconvert import PythonExporter
import nbformat

def convert_notebook(nb_path: Path, out_path: Path):
    assert nb_path is not None, "Notebook path is None (not found)."
    assert nb_path.exists(), f"Missing notebook: {nb_path}"
    nb = nbformat.read(nb_path, as_version=4)
    code, _ = PythonExporter().from_notebook_node(nb)
    out_path.write_text(code, encoding="utf-8")
    return out_path

STEP2_PY = ARTIFACTS / "step2_module.py"
STEP5_PY = ARTIFACTS / "step5_module.py"

convert_notebook(NB_STEP2, STEP2_PY)
convert_notebook(NB_STEP5, STEP5_PY)

print("Converted to:", STEP2_PY, "and", STEP5_PY)

Converted to: /content/drive/MyDrive/Colab Notebooks/NLX_LLM_Project_2/artifacts/step2_module.py and /content/drive/MyDrive/Colab Notebooks/NLX_LLM_Project_2/artifacts/step5_module.py


In [17]:
# Cell 4 — Bind from your notebooks (adapts `chunks` + `content` → canonical `chunk_store`)

import importlib.util, types, runpy
from pathlib import Path
import pandas as pd
from typing import Any, Dict, List

def load_module_from_path(name: str, path: Path) -> types.ModuleType:
    spec = importlib.util.spec_from_file_location(name, str(path))
    mod = importlib.util.module_from_spec(spec)  # type: ignore
    assert spec and spec.loader, f"Cannot load {name} from {path}"
    spec.loader.exec_module(mod)  # type: ignore
    return mod

mod_step2 = load_module_from_path("step2_module", STEP2_PY)
mod_step5 = load_module_from_path("step5_module", STEP5_PY)

def to_chunk_store(raw) -> List[Dict[str, str]]:
    out = []
    if isinstance(raw, list):
        for i, x in enumerate(raw):
            if isinstance(x, dict):
                txt = x.get("text") or x.get("body") or x.get("content") or x.get("chunk")
                if isinstance(txt, str) and txt.strip():
                    out.append({"id": str(x.get("id", i)), "text": txt})
            elif isinstance(x, str) and x.strip():
                out.append({"id": str(i), "text": x})
    elif isinstance(raw, pd.DataFrame):
        col = next((c for c in ("text","body","content","chunk") if c in raw.columns), None)
        if col:
            ser = raw[col].astype(str).fillna("")
            out = [{"id": str(i), "text": t} for i, t in ser.items() if t.strip()]
    return out

# Prefer Step 5 exports, then Step 2
raw_docs = (
    getattr(mod_step5, "docs", None) or
    getattr(mod_step5, "chunks", None) or
    getattr(mod_step2, "docs", None) or
    getattr(mod_step2, "chunks", None)
)

# If import didn’t execute main blocks, run once with runpy as a fallback
if not raw_docs:
    ns2 = runpy.run_path(str(STEP2_PY))
    ns5 = runpy.run_path(str(STEP5_PY))
    raw_docs = (
        ns5.get("docs") or ns5.get("chunks") or
        ns2.get("docs") or ns2.get("chunks")
    )

chunk_store = to_chunk_store(raw_docs)
assert chunk_store, "Couldn’t normalize any docs/chunks into {id,text}."

# Build embedder / FAISS index if not exported by Step 5
from sentence_transformers import SentenceTransformer
import faiss
embed_model = getattr(mod_step5, "embed_model", None)
faiss_index = getattr(mod_step5, "index", None)

if embed_model is None:
    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

if faiss_index is None:
    vecs = embed_model.encode([d["text"] for d in chunk_store], normalize_embeddings=True).astype("float32")
    faiss_index = faiss.IndexFlatIP(vecs.shape[1])
    faiss_index.add(vecs)

print(f"[OK] chunks={len(chunk_store)} | embedder={type(embed_model).__name__} | faiss_ntotal={faiss_index.ntotal}")
print("Sample:", chunk_store[0])

Loading Wikipedia mini dataset...

Dataset size: (3200, 1)
Null values: {'passage': 0}
After cleanup: (3200, 1)

Sample passage:
 Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area. ...
Total Q&A pairs: 918
Example Question: Was Abraham Lincoln the sixteenth President of the United States?
Example Answer: yes
Total chunks created: 1289
Example chunk:
 Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area. ...
Generating embeddings...


Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Embedding shape: (1289, 384)
Setting up Milvus collection...
Inserted entities: 2578
Index ready.


Device set to use cpu


FLAN-T5 ready ✅
🔍 Query: What are the three sections of a beetle?
ID: 1281 | Score: 0.3709
Text: s as generally assumed, which would necessitate splitting the traditional Pelecaniformes in three. ...

ID: 1274 | Score: 0.3135
Text: The Megadyptes - Eudyptes clade occurs at similar latitudes (though not as far north as the Galapagos Penguin), has its highest diversity in the New Z ...

Q: Was Abraham Lincoln the sixteenth President of the United States?


Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== Generated Answer ===
Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination.

=== Retrieved Chunks ===
[1] ID: 339 | Score: 0.7095
Young Abraham Lincoln ...

[2] ID: 320 | Score: 0.6434
Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from Ma ...

[3] ID: 381 | Score: 0.5896
On November 6, 1860, Lincoln was elected as the 16th President of the United States, beating Democrat Stephen A. Douglas ...

QA evaluation size: 918
Sample QA: {'question': 'Was Abraham Lincoln the sixteenth President of the United States?', 'answer': 'yes'}


Device set to use cpu


✅ Using local Transformers: google/flan-t5-base
instruction (20 samples): {'exact_match': 30.0, 'f1': 44.55011655011655}
cot (20 samples): {'exact_match': 15.0, 'f1': 27.979597562930895}
persona (20 samples): {'exact_match': 30.0, 'f1': 43.641025641025635}

→ Evaluating instruction on 100 samples (GPU)

→ Evaluating cot on 100 samples (GPU)

→ Evaluating persona on 100 samples (GPU)

=== Results ===
instruction {'exact_match': 17.0, 'f1': 20.7685332211648}
cot {'exact_match': 7.0, 'f1': 13.376291029729087}
persona {'exact_match': 20.0, 'f1': 23.634334086965666}
OK: datasets
OK: sentence_transformers
OK: transformers
OK: faiss
OK: numpy
OK: pandas
OK: evaluate
Active configuration: {'encoder_model': 'sentence-transformers/all-MiniLM-L6-v2', 'chunk_size_chars': 600, 'embed_batch': 64, 'retrieval_candidates': 20, 'n_query_vectors': 3, 'rerank_top_k': 5, 'context_limit': 2000, 'use_openai_api': False}
Corpus loaded: DatasetDict({
    passages: Dataset({
        features: ['passage', 'id'],

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

✅ Index ready | dimension=384 | vectors=1289


Device set to use cpu


❓ Question: Was Abraham Lincoln the sixteenth President of the United States?


Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors



💬 Answer:
 Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination.

📚 Citations: [{'id': np.str_('278-0'), 'score': 10.308937072753906}, {'id': np.str_('319-0'), 'score': 8.12199592590332}, {'id': np.str_('198-0'), 'score': -0.4494704604148865}, {'id': np.str_('383-0'), 'score': -0.7347864508628845}, {'id': np.str_('281-0'), 'score': -0.9723390936851501}]
[OK] chunks=1289 | embedder=SentenceTransformer | faiss_ntotal=1289
Sample: {'id': '0', 'text': 'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.'}


In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Local generator (Flan-T5)
_GEN_TOK = AutoTokenizer.from_pretrained("google/flan-t5-base")
_GEN_MDL = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
GEN = pipeline("text2text-generation", model=_GEN_MDL, tokenizer=_GEN_TOK, device_map="auto")

# Cross-encoder reranker
from sentence_transformers import CrossEncoder
_CE_ID  = "cross-encoder/ms-marco-MiniLM-L-6-v2"
_CE_TOK = AutoTokenizer.from_pretrained(_CE_ID)
RERANKER = CrossEncoder(_CE_ID, max_length=512)

def _trim_tokens(text: str, tokenizer, max_len: int) -> str:
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) > max_len:
        ids = ids[:max_len]
    return tokenizer.decode(ids, skip_special_tokens=True)

Device set to use cpu


In [19]:
from typing import List, Tuple

def faiss_search_topk(query: str, k: int = 5) -> List[Tuple[int, str, float]]:
    qv = embed_model.encode([query], normalize_embeddings=True).astype("float32")
    D, I = faiss_index.search(qv, k)
    out = []
    for i in range(k):
        idx = int(I[0][i])
        score = float(D[0][i])
        out.append((idx, chunk_store[idx]["text"], score))
    return out

In [20]:
def basic_respond(question: str):
    """Top-1 retrieval + constrained generation (offline)."""
    hits = faiss_search_topk(question, k=1)
    contexts = [hits[0][1]] if hits else [""]
    prompt = (
        "Answer using ONLY the context. If insufficient, say 'I don't know.'\n\n"
        f"Context:\n{contexts[0]}\n\nQuestion: {question}\nAnswer:"
    )
    out = GEN(prompt, max_new_tokens=128)[0]["generated_text"].strip()
    return out, contexts

def ranked_respond(question: str, k_final: int = 5, ctx_char_budget: int = 2000):
    """
    Advanced: gather candidates (via your Step5 if present), rerank with CrossEncoder, generate with Flan-T5.
    """
    # 1) Candidate gather: use your Step5.gather_candidates if available, else FAISS top-20
    if hasattr(mod_step5, "gather_candidates"):
        cands, _rewrites = mod_step5.gather_candidates(question)
        # expected format: [(idx, text, score_like), ...]
    else:
        base = faiss_search_topk(question, k=20)
        cands = [(i, t, s) for (i, t, s) in base]

    if not cands:
        return "I don't know.", [""]

    # 2) Rerank with cross-encoder
    q_trim = _trim_tokens(question, _CE_TOK, max_len=32)
    pairs, idxs = [], []
    for idx, _t, _s in cands:
        p_trim = _trim_tokens(chunk_store[idx]["text"], _CE_TOK, max_len=480)
        pairs.append((q_trim, p_trim))
        idxs.append(idx)
    scores = RERANKER.predict(pairs, convert_to_numpy=True, show_progress_bar=False, batch_size=64)
    order = np.argsort(-scores)[:k_final]
    chosen = [int(idxs[i]) for i in order]

    # 3) Context assembly (use Step5.build_context_with_citations if available)
    if hasattr(mod_step5, "build_context_with_citations"):
        ctx_text, _cites = mod_step5.build_context_with_citations([(i, float(scores[j])) for j,i in enumerate(chosen)], budget=ctx_char_budget)
    else:
        ctx_text = ""
        for i in chosen:
            frag = chunk_store[i]["text"]
            if len(ctx_text) + len(frag) + 2 <= ctx_char_budget:
                ctx_text += ("\n\n" + frag) if ctx_text else frag

    # 4) Generate answer
    def _persona(ctx: str, q: str) -> str:
        q_t = _trim_tokens(q, _GEN_TOK, 48)
        c_t = _trim_tokens(ctx, _GEN_TOK, 460)
        return (
            "You are a concise encyclopedia editor. Use ONLY the context; "
            "if insufficient, reply 'I don't know.'\n\n"
            f"Context:\n{c_t}\n\nQuestion: {q_t}\nAnswer:"
        )

    ans = GEN(_persona(ctx_text, question), max_new_tokens=256)[0]["generated_text"].strip()
    ctx_list = [chunk_store[i]["text"] for i in chosen]
    return ans, ctx_list

In [21]:
def basic_respond(question: str):
    """Top-1 retrieval + constrained generation (offline)."""
    hits = faiss_search_topk(question, k=1)
    contexts = [hits[0][1]] if hits else [""]
    prompt = (
        "Answer using ONLY the context. If insufficient, say 'I don't know.'\n\n"
        f"Context:\n{contexts[0]}\n\nQuestion: {question}\nAnswer:"
    )
    out = GEN(prompt, max_new_tokens=128)[0]["generated_text"].strip()
    return out, contexts

def ranked_respond(question: str, k_final: int = 5, ctx_char_budget: int = 2000):
    """
    Advanced: gather candidates (via your Step5 if present), rerank with CrossEncoder, generate with Flan-T5.
    """
    # 1) Candidate gather: use your Step5.gather_candidates if available, else FAISS top-20
    if hasattr(mod_step5, "gather_candidates"):
        cands, _rewrites = mod_step5.gather_candidates(question)
        # expected format: [(idx, text, score_like), ...]
    else:
        base = faiss_search_topk(question, k=20)
        cands = [(i, t, s) for (i, t, s) in base]

    if not cands:
        return "I don't know.", [""]

    # 2) Rerank with cross-encoder
    q_trim = _trim_tokens(question, _CE_TOK, max_len=32)
    pairs, idxs = [], []
    for idx, _t, _s in cands:
        p_trim = _trim_tokens(chunk_store[idx]["text"], _CE_TOK, max_len=480)
        pairs.append((q_trim, p_trim))
        idxs.append(idx)
    scores = RERANKER.predict(pairs, convert_to_numpy=True, show_progress_bar=False, batch_size=64)
    order = np.argsort(-scores)[:k_final]
    chosen = [int(idxs[i]) for i in order]

    # 3) Context assembly (use Step5.build_context_with_citations if available)
    if hasattr(mod_step5, "build_context_with_citations"):
        ctx_text, _cites = mod_step5.build_context_with_citations([(i, float(scores[j])) for j,i in enumerate(chosen)], budget=ctx_char_budget)
    else:
        ctx_text = ""
        for i in chosen:
            frag = chunk_store[i]["text"]
            if len(ctx_text) + len(frag) + 2 <= ctx_char_budget:
                ctx_text += ("\n\n" + frag) if ctx_text else frag

    # 4) Generate answer
    def _persona(ctx: str, q: str) -> str:
        q_t = _trim_tokens(q, _GEN_TOK, 48)
        c_t = _trim_tokens(ctx, _GEN_TOK, 460)
        return (
            "You are a concise encyclopedia editor. Use ONLY the context; "
            "if insufficient, reply 'I don't know.'\n\n"
            f"Context:\n{c_t}\n\nQuestion: {q_t}\nAnswer:"
        )

    ans = GEN(_persona(ctx_text, question), max_new_tokens=256)[0]["generated_text"].strip()
    ctx_list = [chunk_store[i]["text"] for i in chosen]
    return ans, ctx_list

In [22]:
qa_ds = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")["test"]
M = min(SAMPLE_COUNT, len(qa_ds))
print("Scoring with samples:", M)

def _best_reference_snippet(question: str, gold: str) -> str:
    # Try exact match; else nearest via FAISS
    g = (gold or "").strip()
    if g:
        pat = re.escape(g)
        for d in chunk_store:
            if re.search(pat, d["text"], flags=re.IGNORECASE):
                return d["text"]
    qv = embed_model.encode([question], normalize_embeddings=True).astype("float32")
    D, I = faiss_index.search(qv, 1)
    return chunk_store[int(I[0][0])]["text"]

def make_eval_dataset(which: str = "basic") -> Dataset:
    rows, skipped = [], 0
    for i in range(M):
        q = qa_ds[i]["question"]
        gt = qa_ds[i]["answer"] if "answer" in qa_ds[i] else qa_ds[i]["answers"]
        gt_text = gt if isinstance(gt, str) else gt[0]

        try:
            if which == "basic":
                a, ctxs = basic_respond(q)
            else:
                a, ctxs = ranked_respond(q, k_final=5, ctx_char_budget=2000)

            ctxs = ctxs if isinstance(ctxs, list) else [str(ctxs)]
            ref = _best_reference_snippet(q, gt_text)
            row = {
                "question": q,
                "answer": str(a),
                "contexts": [str(c) for c in ctxs] if ctxs else [""],
                "ground_truths": [gt_text],
                "reference": str(ref),
            }
            rows.append(row)
        except Exception:
            skipped += 1
            continue

    if not rows:
        raise RuntimeError("No samples built—check upstream notebooks.")
    print(f"{which}: built {len(rows)} rows (skipped {skipped})")
    return Dataset.from_list(rows)

ds_basic  = make_eval_dataset("basic")
ds_ranked = make_eval_dataset("ranked")
print("Sample row:", ds_basic[0])

Scoring with samples: 50
basic: built 50 rows (skipped 0)


Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors


ranked: built 50 rows (skipped 0)
Sample row: {'question': 'Was Abraham Lincoln the sixteenth President of the United States?', 'answer': "I don't know", 'contexts': ['Young Abraham Lincoln'], 'ground_truths': ['yes'], 'reference': "At the age of twenty, in 1812, at the end of his apprenticeship, Faraday attended lectures by the eminent English chemist and physicist Humphry Davy of the Royal Institution and Royal Society, and John Tatum, founder of the City Philosophical Society. Many tickets for these lectures were given to Faraday by William Dance (one of the founders of the Royal Philharmonic Society). Afterwards, Faraday sent Davy a three hundred page book based on notes taken during the lectures. Davy's reply was immediate, kind, and favorable. When Davy damaged his eyesight in an accident with nitrogen trichloride, "}


In [23]:
# Map chunk text → synthetic id for overlap evaluation
_text_to_id = {d["text"]: str(d.get("id", i)) for i, d in enumerate(chunk_store)}

def _text_to_nearest_id(txt: str) -> str:
    if not txt:
        return ""
    if txt in _text_to_id:
        return _text_to_id[txt]
    snippet = txt[:1200]
    vec = embed_model.encode([snippet], normalize_embeddings=True).astype("float32")
    D, I = faiss_index.search(vec, 1)
    return str(int(I[0][0]))

def _rows_to_pairs(ds: Dataset):
    rows = []
    for r in ds:
        ctx_ids = [_text_to_nearest_id(t) for t in (r.get("contexts") or []) if t]
        ref_id  = _text_to_nearest_id(r.get("reference",""))
        if ctx_ids and ref_id:
            rows.append({"retrieved": ctx_ids, "reference": [ref_id]})
    if not rows:
        raise RuntimeError("No valid rows for scoring.")
    return rows

pairs_basic  = _rows_to_pairs(ds_basic)
pairs_ranked = _rows_to_pairs(ds_ranked)

def score_overlap(pairs):
    precisions, recalls = [], []
    inter_total = ret_total = ref_total = 0
    for p in pairs:
        R = set(p["retrieved"])
        G = set(p["reference"])
        inter = len(R & G)
        precisions.append(inter / len(R) if R else 0.0)
        recalls.append(inter / len(G) if G else 0.0)
        inter_total += inter
        ret_total   += len(R)
        ref_total   += len(G)
    return {
        "context_precision_macro": float(np.mean(precisions)) if precisions else 0.0,
        "context_recall_macro": float(np.mean(recalls)) if recalls else 0.0,
        "context_precision_micro": float(inter_total / ret_total) if ret_total else 0.0,
        "context_recall_micro": float(inter_total / ref_total) if ref_total else 0.0,
        "n_samples_scored": len(pairs),
    }

scores_basic  = score_overlap(pairs_basic)
scores_ranked = score_overlap(pairs_ranked)

df_out = pd.DataFrame(
    [{"system":"basic", **scores_basic}, {"system":"ranked", **scores_ranked}]
).set_index("system")
display(df_out)

df_out.reset_index().to_csv(CSV_OUT, index=False)
print("Saved metrics to:", CSV_OUT)

Unnamed: 0_level_0,context_precision_macro,context_recall_macro,context_precision_micro,context_recall_micro,n_samples_scored
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
basic,0.16,0.16,0.16,0.16,50
ranked,0.048,0.24,0.048,0.24,50


Saved metrics to: /content/drive/MyDrive/NLX_LLM_Project_2/artifacts/step6_offline_overlap_metrics.csv


In [24]:
# OPTIONAL — run only if you want ragas metrics
from ragas import evaluate as ragas_evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

# Minimal HF adapter (ragas may call an LLM; we point it to Flan-T5)
class SimpleHFLLM:
    name = "local-hf-flan-t5"
    def __init__(self, gen_pipe, default_max_new_tokens: int = 64):
        self.pipe = gen_pipe
        self._max = default_max_new_tokens
    def set_run_config(self, run_config):  # ragas compatibility
        pass
    def _to_text(self, p):
        try:
            if hasattr(p, "to_string"): return p.to_string()
            if hasattr(p, "text"):      return str(p.text)
        except Exception:
            pass
        return str(p)
    def generate(self, prompts, **kwargs):
        mx = int(kwargs.get("max_new_tokens", self._max))
        outs = []
        for p in prompts:
            txt = self._to_text(p)
            outs.append(self.pipe(txt, max_new_tokens=mx)[0]["generated_text"])
        return outs
    async def agenerate(self, prompts, **kwargs):
        return self.generate(prompts, **kwargs)
    def generate_prompt(self, prompts, **kwargs):
        return self.generate(prompts, **kwargs)
    async def agenerate_prompt(self, prompts, **kwargs):
        return self.agenerate(prompts, **kwargs)

local_llm = SimpleHFLLM(GEN)

# Choose which dataset to evaluate: ds_basic or ds_ranked
ragas_result = ragas_evaluate(
    ds_ranked,  # or ds_basic
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=local_llm
)
print(ragas_result)


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable