Installs

In [None]:
# !pip install -U transformers accelerate peft datasets bitsandbytes sentence-transformers faiss-cpu \
#    nltk rouge-score bert-score pandas tqdm

Imports & config

In [None]:
import os, pandas as pd, torch
from src.rag_config import Paths, Models, RetrieverCfg, GenCfg, Flags, Experiment
from src.seed_utils import set_all_seeds
from src.io_utils import ensure_dir, atomic_csv_save, jsonl_write
from src.models import load_backbone, merge_lora
from src.prompt import build_messages
from src.retriever import (build_corpus, make_chunks, build_or_load_embeddings,
                           build_ip_index, retrieve_rows)
from src.generator import safe_generate
from src.eval_text import evaluate_text_metrics
from src.keyword_match import covered_ratio
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer

# Configuration objects shared by every later cell. Their fields are defined
# in src.rag_config; only the attributes used in this notebook are relied on.
paths, models = Paths(), Models()
rcfg, gencfg, flags = RetrieverCfg(), GenCfg(), Flags()

# `selected_model` picks the entry of model_dict used for generation; a key
# ending in "_rag" turns retrieval on in the QA-prep cell.
exp = Experiment(selected_model="base_rag", seed=42)

# Seed first, then make sure the output directory exists before anything is written.
set_all_seeds(exp.seed)
ensure_dir(paths.output_dir)

# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

Corpus → Chunks → Embeddings → Index

In [None]:
# Read the raw corpus and normalise it in a single expression so that
# `df_corpus` is never bound to the un-normalised frame.
df_corpus = build_corpus(pd.read_csv(paths.corpus_csv))  # ensures Authors/Year/DOI exist

# Encoder and tokenizer are loaded from the same checkpoint id; the tokenizer
# is handed to make_chunks together with the max_tok/stride settings.
sbert = SentenceTransformer(models.retriever_sbert, device=device)
hf_tok = AutoTokenizer.from_pretrained(models.retriever_sbert)

df_chunks = make_chunks(
    df_corpus,
    hf_tok,
    max_tok=rcfg.chunk_max_tok,
    stride=rcfg.chunk_stride,
)

# `from_cache` is the second value returned by build_or_load_embeddings and is
# only reported below.
embeddings, from_cache = build_or_load_embeddings(df_chunks, sbert, paths.emb_cache_npy)
index = build_ip_index(embeddings)
print("chunks:", len(df_chunks), "embeddings from cache:", from_cache)

Load LLMs and cross-encoder

In [None]:
# Both backbones are loaded with the same arguments. The second copy is the
# one handed to merge_lora, so the plain base model is kept as a separate object.
backbone_args = (models.base_model_id, flags.load_in_4bit, flags.use_bf16_if_available)
base_model, base_tok = load_backbone(*backbone_args)
ft_base_for_merge, ft_tok = load_backbone(*backbone_args)
ft_model = merge_lora(ft_base_for_merge, models.lora_adapter_path)

# Cross-encoder passed to retrieve_rows by retriever_fn.
cross = CrossEncoder(models.cross_encoder, device=device)

# The "*_rag" keys reuse the same model/tokenizer objects as their non-RAG
# counterparts; the suffix only controls whether retrieval is enabled.
model_dict = {"base": base_model, "ft": ft_model}
model_dict.update(base_rag=base_model, ft_rag=ft_model)
tokenizer_dict = {"base": base_tok, "ft": ft_tok}
tokenizer_dict.update(base_rag=base_tok, ft_rag=ft_tok)
print("loaded keys:", list(model_dict.keys()))

QA prep and generation function

In [None]:
# Load the QA set; every row must provide a question to generate an answer for.
qa_df = pd.read_csv(paths.qa_csv)
if "Question" not in qa_df.columns:
    raise KeyError("qa_set CSV must include a 'Question' column.")

# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to replace and
# the model would be asked the question "nan".
questions = qa_df["Question"].fillna("").astype(str).tolist()

# Pick the model/tokenizer pair for this run; keys ending in "_rag" enable retrieval.
model = model_dict[exp.selected_model]
tokenizer = tokenizer_dict[exp.selected_model]
use_rag = exp.selected_model.endswith("_rag")

def retriever_fn(query, k_final=rcfg.k_final, max_ctx_tokens=rcfg.max_ctx_tokens):
    """Run `retrieve_rows` for `query` against the notebook-level index.

    The index, embeddings, chunk table, encoder, tokenizer and cross-encoder
    are taken from the notebook globals built in earlier cells.

    Args:
        query: Text to retrieve context for.
        k_final: Forwarded as `k_final`; the default is captured from
            `rcfg.k_final` when this cell is executed.
        max_ctx_tokens: Forwarded as `max_ctx_tokens`; the default is captured
            from `rcfg.max_ctx_tokens` when this cell is executed.

    Returns:
        Whatever `retrieve_rows` returns for these arguments.
    """
    # All remaining settings are read from rcfg at call time, not at definition time.
    search_kwargs = dict(
        query=query,
        faiss_index=index,
        embeddings=embeddings,
        df_chunks=df_chunks,
        sbert=sbert,
        hf_tok=hf_tok,
        k_initial=rcfg.k_initial,
        k_final=k_final,
        mmr_lambda=rcfg.mmr_lambda,
        cross_encoder=cross,
        ce_batch_size=rcfg.ce_batch_size,
        max_ctx_tokens=max_ctx_tokens,
        title_max_chars=rcfg.title_max_chars,
        excerpt_max_chars=rcfg.excerpt_max_chars,
    )
    return retrieve_rows(**search_kwargs)

Generation loop & save

In [None]:
from tqdm import tqdm

# Three artefacts per run, all named after the selected model key:
# plain-text responses, the QA table with a "Generated" column, and a JSONL log.
gen_txt, gen_csv, gen_log = (
    os.path.join(paths.output_dir, fname)
    for fname in (
        f"generated_{exp.selected_model}_responses.txt",
        f"generated_{exp.selected_model}_responses.csv",
        f"generated_{exp.selected_model}_genlog.jsonl",
    )
)

outputs = []
records = []
with open(gen_txt, "w", encoding="utf-8") as txt_file:
    progress = tqdm(questions, desc=f"Generating ({exp.selected_model})")
    for idx, question in enumerate(progress):
        resp, meta = safe_generate(question, model, tokenizer, retriever_fn, use_rag, gencfg)
        # A None response is stored as an empty string so row counts stay aligned.
        outputs.append("" if resp is None else resp)
        txt_file.write((resp or "") + "\n")
        # Tag the per-question metadata so log lines can be matched to questions.
        meta.update(
            idx=idx,
            model_key=exp.selected_model,
            question_preview=question[:160],
        )
        records.append(meta)

# Attach the generations to the matching QA rows and persist table and log.
qa_save = qa_df.iloc[:len(outputs)].assign(Generated=outputs)
atomic_csv_save(qa_save, gen_csv)
jsonl_write(records, gen_log)

print("saved:", gen_txt, gen_csv, gen_log)

Evaluation metrics

In [None]:
# Evaluation (CosineSimilarity, BERTScore, optional Perplexity)

# These names are already imported in the imports cell; repeated here as in the original.
import os
import pandas as pd
from src.eval_text import evaluate_text_metrics
from src.io_utils import atomic_csv_save

# Input: the CSV written by the generation cell.
eval_input = pd.read_csv(gen_csv)

# Failed generations are saved as "" by the generation cell, but read_csv
# parses empty cells back as NaN (a float). Normalise the column to real
# strings so the text metrics never receive a float in place of text.
eval_input["Generated"] = eval_input["Generated"].fillna("").astype(str)

# Run evaluation
# - To skip Perplexity, set ppl_model=None and ppl_tokenizer=None.
evaluated = evaluate_text_metrics(
    eval_input,
    models.eval_sbert,
    ppl_model=model,                 # optional: causal LM for perplexity
    ppl_tokenizer=tokenizer,         # optional: tokenizer for perplexity
    ppl_column="Generated",          # "Generated" or "Answer"
    ppl_max_ctx=gencfg.model_max_ctx # optional; None = auto
)

# Save results
eval_csv = os.path.join(paths.output_dir, f"metrics_{exp.selected_model}.csv")
atomic_csv_save(evaluated, eval_csv)

print("metrics saved:", eval_csv, "| rows:", len(evaluated))


Keyword matching

In [None]:
from sentence_transformers import SentenceTransformer


def _cell_text(value):
    """Return a CSV cell as text, mapping missing values (None/NaN) to "".

    `str(value or "")` alone is not enough: NaN is truthy, so an empty CSV cell
    would become the literal string "nan" and be scored as a real keyword/answer.
    """
    return "" if pd.isna(value) else str(value or "")


dfm = pd.read_csv(eval_csv)
kw_col = "Keywords" if "Keywords" in dfm.columns else None

if kw_col:
    km_sbert = SentenceTransformer(models.eval_sbert, device=device)
    scores = []
    for _, row in tqdm(dfm.iterrows(), total=len(dfm), desc=f"Keyword Matching ({exp.selected_model})"):
        raw_kw = _cell_text(row.get(kw_col, ""))
        gen = _cell_text(row.get("Generated", ""))
        # Keywords are stored as one comma-separated string; drop blank entries.
        keywords = [k.strip() for k in raw_kw.split(",") if k.strip()]
        score = covered_ratio(keywords, gen, km_sbert)
        scores.append(score)
    dfm["Keyword_Match_Score"] = scores
    km_csv = os.path.join(paths.output_dir, f"metrics_{exp.selected_model}_with_keyword.csv")
    atomic_csv_save(dfm, km_csv)
    print("keyword metrics saved:", km_csv)
else:
    print("no 'Keywords' column found; skipping keyword matching.")

Cleanup

In [None]:
import gc

# `model` is only one alias of the loaded LLM: base_model / ft_model /
# ft_base_for_merge and the values of model_dict reference the same objects,
# so deleting `model` alone releases nothing. Drop every handle created by the
# model-loading and QA-prep cells. Using pop() instead of `del` keeps the cell
# safe to re-run (a second `del model` would raise NameError).
_ns = globals()
_registry = _ns.get("model_dict")
if isinstance(_registry, dict):
    _registry.clear()
for _name in ("model", "base_model", "ft_model", "ft_base_for_merge"):
    _ns.pop(_name, None)

gc.collect()
if torch.cuda.is_available():
    # Return cached, now-unreferenced blocks held by torch's CUDA allocator.
    torch.cuda.empty_cache()
print("cleanup done.")