## Step 1: Mounting Google Drive and Importing Dependencies

In [None]:
# Mount Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

# Navigate to the repo folder
%cd /content/drive/MyDrive/llm-finetuning-project/llm-finetuning-summarizer

# List repo contents
!ls

In [2]:
!pip install -q transformers accelerate datasets openai sentence-transformers faiss-cpu bert-score

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Core libraries
import os, json, time
import numpy as np
from getpass import getpass
from typing import List
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# HF / model-specific imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss

# BERTScore + OpenAI Eval
from bert_score import score as bertscore
from openai import OpenAI
import openai

## Step 2: Loading the Validation Set for Evaluation

In [4]:
eval_path = "./data/eval_with_context.jsonl"

eval_pairs = []
with open(eval_path, "r") as f:
    for line in f:
        eval_pairs.append(json.loads(line.strip()))

print(f"Loaded {len(eval_pairs)} QA pairs for RAG evaluation.")

Loaded 30 QA pairs for RAG evaluation.


## Step 3: Load the Fine-Tuned RAG Model and FAISS Index

In [5]:
EMBED_MODEL       = "BAAI/bge-base-en-v1.5"
MODEL_PATH        = "./models/merged-finetuned-mistral"
FAISS_INDEX_PATH  = "./data/rag_corpus/faiss_index.bin"
METADATA_PATH     = "./data/rag_corpus/chunk_metadata.json"

DEVICE            = "cuda" if torch.cuda.is_available() else "cpu"
CTX_TOKEN_LIMIT   = 2048
MAX_NEW_TOKENS    = 256
TOP_K             = 5

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)

In [7]:
tokenizer.pad_token = tokenizer.eos_token # to avoid padding error

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    device_map="auto" if DEVICE == "cuda" else None,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
model.eval()

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-0

In [10]:
# Load FAISS + Metadata
index = faiss.read_index(str(FAISS_INDEX_PATH))
with open(METADATA_PATH) as f:
    chunk_metadata = json.load(f)

In [None]:
embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

In [12]:
def retrieve_chunks(query: str, k: int = TOP_K) -> List[dict]:
    """Return top-k chunks (dicts with 'title' & 'text' fields)."""
    q_emb = embedder.encode([query], normalize_embeddings=True)
    _, idxs = index.search(q_emb, k)
    return [chunk_metadata[int(i)] for i in idxs[0]]

In [13]:
def build_prompt_rag(question: str,
                     k: int = TOP_K,
                     ctx_limit: int = CTX_TOKEN_LIMIT) -> str:
    """
    Compose prompt using ONLY RAG-retrieved chunks.
    Stops adding chunks when token budget (`ctx_limit`) would be exceeded.
    """
    blocks, n_tokens = [], 0
    for ch in retrieve_chunks(question, k):
        blk = f"[{ch['title']}]\n{ch['text']}\n"
        t   = len(tokenizer.tokenize(blk))
        if n_tokens + t <= ctx_limit:
            blocks.append(blk)
            n_tokens += t
        else:
            break

    context = "\n\n".join(blocks)
    prompt  = (
        "You are an expert scientific assistant. Use the excerpts to answer.\n\n"
        f"Excerpts:\n{context}\n\n"
        f"Question: {question}\nAnswer:"
    )
    return prompt

In [14]:
@torch.inference_mode()
def generate_answer_rag(question: str) -> str:
    """Generate answer using *only* RAG context (closed-book)."""
    prompt = build_prompt_rag(question)
    inputs = tokenizer(prompt,
                       return_tensors="pt",
                       padding=True,
                       truncation=True,
                       max_length=CTX_TOKEN_LIMIT).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False
    )

    # keep only the newly-generated tokens
    gen_ids   = outputs[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
    return answer

## Step 4: Generate predictions using RAG

In [None]:
results = []

for i, item in enumerate(eval_pairs):
    question  = item["question"]
    reference = item["answer"]

    prediction = generate_answer_rag(question)

    results.append({
        "question": question,
        "reference": reference,
        "prediction": prediction,
    })

    if i % 10 == 0:
        print(f"[{i}/{len(eval_pairs)}] Question: {question}\n→ {prediction}\n")

In [18]:
# Save for evaluation
with open("eval_predictions_closed_book_rag.json", "w") as f:
    json.dump(results, f, indent=2)

In [16]:
output_path = "./data/evaluation/eval_predictions_closed_book_rag.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    json.dump(results, f, indent=2)

print(f"Saved predictions to: {output_path}")

Saved predictions to: ./data/evaluation/eval_predictions_closed_book_rag.json


## Step 5: BLEU Score Evaluation

In this section, we evaluate our fine-tuned model using the **BLEU (Bilingual Evaluation Understudy)** score, a standard metric for evaluating the quality of generated text by comparing it to a reference answer.

### What is BLEU?
BLEU measures *n-gram overlap* between the model's prediction and the reference answer:
- **BLEU-1**: unigram overlap (word-level similarity)
- **BLEU-2**: bigram overlap (2-word chunks)
- **BLEU-3**: trigram overlap
- **BLEU-4**: 4-gram overlap (more stringent)

### Components of the Code:
- `weights=(1, 0, 0, 0)`: Measures unigram overlap only (BLEU-1).
- `smoothing_function=method1`: Prevents the BLEU score from dropping to 0 when there are no exact n-gram matches. This is useful for short or paraphrased responses.
- We iterate over our evaluation dataset and compute BLEU-1 through BLEU-4 for each response.

### Limitations:
BLEU is a **surface-level** metric:
- It penalizes paraphrasing.
- It doesn't understand meaning—only *form*.
- It is useful for rough comparison, but **not sufficient alone** to assess model quality.

Hence, we will also perform **qualitative evaluation** using *LLM-as-a-Judge* in the next step.

In [19]:
# Load predictions with context
with open("eval_predictions_closed_book_rag.json", "r") as f:
    eval_results = json.load(f)

In [20]:
# Initialize smoothing function and score containers
smooth = SmoothingFunction().method1
bleu_scores = {f"BLEU-{n}": [] for n in range(1, 5)}

In [21]:
# Iterate over predictions and compute BLEU-1 to BLEU-4
for item in eval_results:
    reference = item["reference"].split()
    prediction = item["prediction"].split()

    bleu_scores["BLEU-1"].append(
        sentence_bleu([reference], prediction, weights=(1, 0, 0, 0), smoothing_function=smooth)
    )
    bleu_scores["BLEU-2"].append(
        sentence_bleu([reference], prediction, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
    )
    bleu_scores["BLEU-3"].append(
        sentence_bleu([reference], prediction, weights=(1/3, 1/3, 1/3, 0), smoothing_function=smooth)
    )
    bleu_scores["BLEU-4"].append(
        sentence_bleu([reference], prediction, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
    )

# Compute and display average scores
avg_bleu_scores = {metric: round(sum(scores)/len(scores), 4) for metric, scores in bleu_scores.items()}
print("Average BLEU Scores:", avg_bleu_scores)

Average BLEU Scores: {'BLEU-1': 0.2327, 'BLEU-2': 0.1221, 'BLEU-3': 0.0804, 'BLEU-4': 0.0595}


## Step 6: Using GPT-4o as LLM-as-a-Judge (OpenAI Evaluation)

In this section, we use **GPT-4o**—a state-of-the-art model from OpenAI—as a neutral third-party judge to evaluate the quality of our model’s predictions against ground truth answers. This is part of the **LLM-as-a-Judge** evaluation methodology, which is growing in popularity as a way to assess open-ended outputs where metrics like BLEU or ROUGE may fall short.

**What this section does:**

- Loads model predictions from `eval_openbook_predictions.json`
- Uses a GPT-4o prompt that provides:
  - The question
  - The model's generated answer
  - The reference (ground-truth) answer
- Asks GPT-4o to score the generated answer on a **scale from 1 to 5**, considering relevance, correctness, completeness, and style
- Stores all outputs in `gpt4o_judgments_openbook.json` for analysis

**Key Functions:**

- `ask_gpt_judge()` → Sends a prompt to GPT-4o via the OpenAI API and returns a numeric score
- `judged_results` → A list of evaluation records including the question, reference, model prediction, and GPT-4o's score
- `np.mean()` → Used at the end to compute the **average evaluation score** across all QA pairs

**Why use GPT-4o?**

Because LLMs are best judged by **other LLMs** capable of contextual understanding. GPT-4o has been shown to be highly consistent and reliable in comparative evaluations.

This evaluation complements our BLEU score by offering a **semantic and qualitative assessment**, helping us better understand the strengths and weaknesses of our fine-tuned model.

---

In [22]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key:")

Enter your OpenAI API key:··········


In [23]:
openai.api_key = os.environ["OPENAI_API_KEY"]

In [24]:
# Load the API key from environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [25]:
def ask_gpt_judge(question, reference, prediction):
    prompt = f"""
You are an expert model evaluator. Given a question, a reference answer, and a model-generated answer that was generated with access to a relevant excerpt from a scientific paper, judge how good the model’s answer is on a scale of 1 to 5. Use the following rubric:

1 – Completely irrelevant or hallucinated.
2 – Partially related but mostly inaccurate.
3 – Mostly accurate but missing key details.
4 – Accurate and mostly complete.
5 – Nearly identical in meaning to the reference.

Be strict but fair. Output ONLY the number.

Question: {question}
Reference Answer: {reference}
Model Prediction: {prediction}

Score:"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("Error during evaluation:\n")
        print(e)
        return None

In [26]:
with open("eval_predictions_closed_book_rag.json") as f:
    eval_results = json.load(f)

In [None]:
judged_results = []

for i, item in enumerate(eval_results):
    print(f"Evaluating {i+1}/{len(eval_results)}")
    score = ask_gpt_judge(item["question"], item["reference"], item["prediction"])
    if score:
        judged_results.append({
            "question": item["question"],
            "reference": item["reference"],
            "prediction": item["prediction"],
            "gpt4o_score": score
        })
    time.sleep(1.2)

In [34]:
with open("gpt4o_judgments_closedbook_rag.json", "w") as f:
    json.dump(judged_results, f, indent=2)

In [35]:
for sample in judged_results:
    print(" Question:", sample["question"])
    print(" Reference Answer:", sample["reference"])
    print(" Model Prediction:", sample["prediction"])
    print(" GPT-4o Evaluation:", sample["gpt4o_score"])
    print("-" * 80)

 Question: What is the primary innovation introduced by the LoRI method for parameter-efficient fine-tuning?
 Reference Answer: LoRI introduces a novel approach that freezes the projection matrices A as random projections and sparsifies the matrices B using task-specific masks, thereby significantly reducing trainable parameters while minimizing cross-task interference.
 Model Prediction: LoRI introduces a novel approach to parameter-efficient fine-tuning by leveraging low-rank approximations of the pre-trained weight matrix, updating only the low-rank components while keeping the majority of pre-trained parameters frozen.
 GPT-4o Evaluation: 2
--------------------------------------------------------------------------------
 Question: How does LoRI reduce the number of trainable parameters compared to traditional LoRA?
 Reference Answer: LoRI reduces the number of trainable parameters by keeping matrix A fixed as a random projection and sparsifying matrix B using task-specific masks, e

In [33]:
# Calculating the average score
scores = [int(res["gpt4o_score"]) for res in judged_results if res["gpt4o_score"].isdigit()]
average_score = np.mean(scores)
print(f"Average GPT-4o Evaluation Score: {average_score:.2f} out of 5")

Average GPT-4o Evaluation Score: 2.60 out of 5


In [36]:
# Saving the results

output_path = "./data/evaluation/eval_gpt4o_judgments_closed_book_rag.json"

os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    json.dump(judged_results, f, indent=2)

print(f"Judged results saved to {output_path}")

Judged results saved to ./data/evaluation/eval_gpt4o_judgments_closed_book_rag.json


## Step 7: Evaluating with BERTScore (Semantic Similarity Metric)

In this section, we evaluate the semantic similarity between the model’s predictions and the ground truth answers using **BERTScore**, a metric that leverages contextual embeddings from large pretrained models (like BERT) to assess the *meaning* of the outputs.

Unlike BLEU, which only considers surface-level n-gram overlap, BERTScore measures how semantically close the answers are—even when the phrasing differs.

### Interpretation:
- **BERTScore F1** reflects the degree of **semantic overlap** between model output and human-labeled answer.
- A score closer to **1.0** indicates stronger alignment of meaning.
- This metric is especially useful in open-ended QA or summarization settings where **exact matching isn't expected**.

In [37]:
# Replace `results` with `judged_results` if needed
predictions = [item["prediction"] for item in results]
references = [item["reference"] for item in results]

In [None]:
P, R, F1 = bertscore(predictions, references, lang="en", rescale_with_baseline=True)

In [39]:
print(f"Average Precision: {P.mean().item():.4f}")

Average Precision: 0.2910


In [40]:
print(f"Average Recall: {R.mean().item():.4f}")

Average Recall: 0.3422


In [41]:
print(f"Average Recall: {R.mean().item():.4f}")

Average Recall: 0.3422


## Step 8: Fixing Metadata

In [1]:
pip install nbformat --quiet

In [2]:
from google.colab import drive, files
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive
