## Step 1: Mounting Google Drive and Importing Dependencies

In [1]:
# Mount Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

# Navigate to the repo folder
%cd /content/drive/MyDrive/llm-finetuning-project/llm-finetuning-summarizer

# List repo contents
!ls

Mounted at /content/drive
/content/drive/MyDrive/llm-finetuning-project/llm-finetuning-summarizer
data				LICENSE		 qa_pairs   wandb
deployment			models		 README.md
eval_predictions_baseline.json	notebooks	 results
gpt4o_judgments_baseline.json	project_plan.md  scripts


In [None]:
!pip install -q transformers accelerate datasets openai sentence-transformers faiss-cpu bert-score

In [46]:
# Core libraries
import os, json, time
import numpy as np
from getpass import getpass
from typing import List
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# HF / model-specific imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss

# BERTScore + OpenAI Eval
from bert_score import score as bertscore
from openai import OpenAI
import openai

## Step 2: Loading the Validation Set for Evaluation

In [4]:
eval_path = "./data/eval_with_context.jsonl"

eval_pairs = []
with open(eval_path, "r") as f:
    for line in f:
        eval_pairs.append(json.loads(line.strip()))

print(f"Loaded {len(eval_pairs)} QA pairs for RAG evaluation.")

Loaded 30 QA pairs for RAG evaluation.


## Step 3: Load the Fine-Tuned RAG Model and FAISS Index

In this step, we load the LoRA-fine-tuned Mistral-7B model along with the FAISS index and chunk metadata used for retrieval.

Components loaded:
- **Tokenizer**: For tokenizing prompts and decoding model output.
- **Fine-tuned model**: Our LoRA-adapted Mistral model, loaded with 4-bit quantization if available.
- **FAISS index**: A pre-computed vector store of document chunks, used to retrieve the most relevant contexts for a given question.
- **Chunk metadata**: Contains the actual content and titles of the document chunks associated with each FAISS vector.

This setup allows the model to perform Retrieval-Augmented Generation (RAG) by grounding answers in semantically retrieved chunks at inference time. The model is now ready to generate context-informed answers for evaluation.

In [8]:
# Configuration
EMBED_MODEL = "BAAI/bge-base-en-v1.5"
MODEL_PATH = "./models/merged-finetuned-mistral"
FAISS_INDEX_PATH = "./data/rag_corpus/faiss_index.bin"
METADATA_PATH = "./data/rag_corpus/chunk_metadata.json"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
CTX_TOKEN_LIMIT = 2048
MAX_NEW_TOKENS = 256
TOP_K = 5

In [6]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [22]:
tokenizer.pad_token = tokenizer.eos_token

In [7]:
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Load FAISS index and metadata
index = faiss.read_index(FAISS_INDEX_PATH)
with open(METADATA_PATH) as f:
    chunk_metadata = json.load(f)

In [None]:
# Load embedder
embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

In [28]:
def retrieve_chunks(query: str, k: int = TOP_K) -> List[dict]:
    """Return top-k chunks (dicts with 'title' & 'text')."""
    q_emb = embedder.encode([query], normalize_embeddings=True)
    _, idxs = index.search(q_emb, k)
    return [chunk_metadata[int(i)] for i in idxs[0]]

In [29]:
def build_prompt(question: str,
                 gt_context: str,
                 k: int = TOP_K,
                 ctx_limit: int = CTX_TOKEN_LIMIT) -> str:
    """
    1. Keep the full ground-truth context.
    2. Add RAG chunks until we approach `ctx_limit` tokens.
    """
    # ----- ground-truth block -------------------------------------------------
    gt_block = f"[Ground Truth]\n{gt_context.strip()}\n"
    gt_tokens = len(tokenizer.tokenize(gt_block))

    # ----- RAG retrieval ------------------------------------------------------
    rag_blocks, rag_tokens = [], 0
    for ch in retrieve_chunks(question, k=k):
        blk = f"[{ch['title']}]\n{ch['text']}\n"
        t   = len(tokenizer.tokenize(blk))
        # will we still fit?
        if gt_tokens + rag_tokens + t <= ctx_limit:
            rag_blocks.append(blk)
            rag_tokens += t
        else:
            break                            # stop when limit reached

    # concatenate (GT first, then RAG)
    context = gt_block + "\n".join(rag_blocks)

    # final prompt
    prompt = (
        "You are an expert scientific assistant. Use the excerpts to answer.\n\n"
        f"Excerpts:\n{context}\n\n"
        f"Question: {question}\nAnswer:"
    )
    return prompt

In [30]:
@torch.inference_mode()
def generate_answer_rag_plus_gt(question: str, gt_context: str) -> str:
    prompt  = build_prompt(question, gt_context)
    inputs  = tokenizer(prompt, return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=CTX_TOKEN_LIMIT).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False
    )

    # strip the prompt part → keep only newly generated tokens
    gen_ids   = outputs[0][inputs["input_ids"].shape[-1]:]
    prediction = tokenizer.decode(gen_ids,
                                  skip_special_tokens=True).strip()
    return prediction

## Step 4: Generate predictions using RAG

In [None]:
results = []

for i, item in enumerate(eval_pairs):
    question  = item["question"]
    reference = item["answer"]
    context   = item["context"]  # ground-truth context

    prediction = generate_answer_rag_plus_gt(question, context)

    results.append({
        "question": question,
        "reference": reference,
        "context": context,
        "prediction": prediction
    })

    if i % 10 == 0:
        print(f"[{i}/{len(eval_pairs)}] Question: {question}\n→ {prediction}\n")

In [33]:
results[2]

{'question': 'Why is sparsity in matrix B important in LoRI?',
 'reference': 'Sparsity in matrix B enables LoRI to retain only the most critical elements necessary for adaptation, reducing parameter count and mitigating cross-task interference during adapter merging and continual learning.',
 'context': 'Abstract:\n\nLow-Rank Adaptation (LoRA) has emerged as a popular parameter- efficient fine-tuning (PEFT) method for Large Language Models (LLMs), yet it still incurs notable overhead and suffers from parameter interference in multi-task scenarios. We propose LoRA with Reduced Interference (LoRI), a simple yet effective approach that freezes the projection matrices A as random projections and sparsifies the matrices B using task-specific masks. This design substantially reduces the number of trainable parameters while maintaining strong task performance. Moreover, LoRI minimizes cross-task interference in adapter merging by leveraging the orthogonality between adapter subspaces, and sup

In [37]:
# Save for evaluation
with open("eval_predictions_openbook_rag_plus_gt.json", "w") as f:
    json.dump(results, f, indent=2)

In [38]:
# Save results
output_path = "./data/evaluation/eval_predictions_openbook_rag_plus_gt.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

In [41]:
with open(output_path, "w") as f:
    json.dump(results, f, indent=2)

print(f"Predictions saved to {output_path}")

Predictions saved to ./data/evaluation/eval_predictions_openbook_rag_plus_gt.json


## Step 5: BLEU Score Evaluation

In this section, we evaluate our fine-tuned model using the **BLEU (Bilingual Evaluation Understudy)** score, a standard metric for evaluating the quality of generated text by comparing it to a reference answer.

### What is BLEU?
BLEU measures *n-gram overlap* between the model's prediction and the reference answer:
- **BLEU-1**: unigram overlap (word-level similarity)
- **BLEU-2**: bigram overlap (2-word chunks)
- **BLEU-3**: trigram overlap
- **BLEU-4**: 4-gram overlap (more stringent)

### Components of the Code:
- `weights=(1, 0, 0, 0)`: Measures unigram overlap only (BLEU-1).
- `smoothing_function=method1`: Prevents the BLEU score from dropping to 0 when there are no exact n-gram matches. This is useful for short or paraphrased responses.
- We iterate over our evaluation dataset and compute BLEU-1 through BLEU-4 for each response.

### Limitations:
BLEU is a **surface-level** metric:
- It penalizes paraphrasing.
- It doesn't understand meaning—only *form*.
- It is useful for rough comparison, but **not sufficient alone** to assess model quality.

Hence, we will also perform **qualitative evaluation** using *LLM-as-a-Judge* in the next step.

In [40]:
# Load predictions with context
with open("eval_predictions_openbook_rag_plus_gt.json", "r") as f:
    eval_results = json.load(f)

In [42]:
# Initialize smoothing function and score containers
smooth = SmoothingFunction().method1
bleu_scores = {f"BLEU-{n}": [] for n in range(1, 5)}

In [43]:
# Iterate over predictions and compute BLEU-1 to BLEU-4
for item in eval_results:
    reference = item["reference"].split()
    prediction = item["prediction"].split()

    bleu_scores["BLEU-1"].append(
        sentence_bleu([reference], prediction, weights=(1, 0, 0, 0), smoothing_function=smooth)
    )
    bleu_scores["BLEU-2"].append(
        sentence_bleu([reference], prediction, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth)
    )
    bleu_scores["BLEU-3"].append(
        sentence_bleu([reference], prediction, weights=(1/3, 1/3, 1/3, 0), smoothing_function=smooth)
    )
    bleu_scores["BLEU-4"].append(
        sentence_bleu([reference], prediction, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth)
    )

# Compute and display average scores
avg_bleu_scores = {metric: round(sum(scores)/len(scores), 4) for metric, scores in bleu_scores.items()}
print("Average BLEU Scores:", avg_bleu_scores)

Average BLEU Scores: {'BLEU-1': 0.2571, 'BLEU-2': 0.1757, 'BLEU-3': 0.1345, 'BLEU-4': 0.108}


## Step 6: Using GPT-4o as LLM-as-a-Judge (OpenAI Evaluation)

In this section, we use **GPT-4o**—a state-of-the-art model from OpenAI—as a neutral third-party judge to evaluate the quality of our model’s predictions against ground truth answers. This is part of the **LLM-as-a-Judge** evaluation methodology, which is growing in popularity as a way to assess open-ended outputs where metrics like BLEU or ROUGE may fall short.

**What this section does:**

- Loads model predictions from `eval_openbook_predictions.json`
- Uses a GPT-4o prompt that provides:
  - The question
  - The model's generated answer
  - The reference (ground-truth) answer
- Asks GPT-4o to score the generated answer on a **scale from 1 to 5**, considering relevance, correctness, completeness, and style
- Stores all outputs in `gpt4o_judgments_openbook.json` for analysis

**Key Functions:**

- `ask_gpt_judge()` → Sends a prompt to GPT-4o via the OpenAI API and returns a numeric score
- `judged_results` → A list of evaluation records including the question, reference, model prediction, and GPT-4o's score
- `np.mean()` → Used at the end to compute the **average evaluation score** across all QA pairs

**Why use GPT-4o?**

Because LLMs are best judged by **other LLMs** capable of contextual understanding. GPT-4o has been shown to be highly consistent and reliable in comparative evaluations.

This evaluation complements our BLEU score by offering a **semantic and qualitative assessment**, helping us better understand the strengths and weaknesses of our fine-tuned model.

---

In [44]:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key:")

Enter your OpenAI API key:··········


In [47]:
openai.api_key = os.environ["OPENAI_API_KEY"]

In [48]:
# Load the API key from environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [49]:
def ask_gpt_judge(question, reference, prediction):
    prompt = f"""
You are an expert model evaluator. Given a question, a reference answer, and a model-generated answer that was generated with access to a relevant excerpt from a scientific paper, judge how good the model’s answer is on a scale of 1 to 5. Use the following rubric:

1 – Completely irrelevant or hallucinated.
2 – Partially related but mostly inaccurate.
3 – Mostly accurate but missing key details.
4 – Accurate and mostly complete.
5 – Nearly identical in meaning to the reference.

Be strict but fair. Output ONLY the number.

Question: {question}
Reference Answer: {reference}
Model Prediction: {prediction}

Score:"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("Error during evaluation:\n")
        print(e)
        return None

In [50]:
with open("eval_predictions_openbook_rag_plus_gt.json") as f:
    eval_results = json.load(f)

In [None]:
judged_results = []

for i, item in enumerate(eval_results):
    print(f"Evaluating {i+1}/{len(eval_results)}")
    score = ask_gpt_judge(item["question"], item["reference"], item["prediction"])
    if score:
        judged_results.append({
            "question": item["question"],
            "reference": item["reference"],
            "prediction": item["prediction"],
            "gpt4o_score": score
        })
    time.sleep(1.2)

In [52]:
with open("gpt4o_judgments_openbook_rag_plus_gt.json", "w") as f:
    json.dump(judged_results, f, indent=2)

In [53]:
for sample in judged_results:
    print(" Question:", sample["question"])
    print(" Reference Answer:", sample["reference"])
    print(" Model Prediction:", sample["prediction"])
    print(" GPT-4o Evaluation:", sample["gpt4o_score"])
    print("-" * 80)

 Question: What is the primary innovation introduced by the LoRI method for parameter-efficient fine-tuning?
 Reference Answer: LoRI introduces a novel approach that freezes the projection matrices A as random projections and sparsifies the matrices B using task-specific masks, thereby significantly reducing trainable parameters while minimizing cross-task interference.
 Model Prediction: The primary innovation of LoRI is the use of fixed, randomly initialized projection matrices A and sparse, task-specific matrices B to reduce trainable parameters while minimizing cross-task interference. This design enables effective single-task performance, training-free adapter merging, and continual learning with minimal catastrophic forgetting.
 GPT-4o Evaluation: 4
--------------------------------------------------------------------------------
 Question: How does LoRI reduce the number of trainable parameters compared to traditional LoRA?
 Reference Answer: LoRI reduces the number of trainable 

In [54]:
# Calculating the average score
scores = [int(res["gpt4o_score"]) for res in judged_results if res["gpt4o_score"].isdigit()]
average_score = np.mean(scores)
print(f"Average GPT-4o Evaluation Score: {average_score:.2f} out of 5")

Average GPT-4o Evaluation Score: 3.20 out of 5


In [55]:
# Saving the results

output_path = "./data/evaluation/eval_gpt4o_judgments_open_book_rag_plus_gt.json"

os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    json.dump(judged_results, f, indent=2)

print(f"Judged results saved to {output_path}")

Judged results saved to ./data/evaluation/eval_gpt4o_judgments_open_book_rag_plus_gt.json


## Step 7: Evaluating with BERTScore (Semantic Similarity Metric)

In this section, we evaluate the semantic similarity between the model’s predictions and the ground truth answers using **BERTScore**, a metric that leverages contextual embeddings from large pretrained models (like BERT) to assess the *meaning* of the outputs.

Unlike BLEU, which only considers surface-level n-gram overlap, BERTScore measures how semantically close the answers are—even when the phrasing differs.

### Interpretation:
- **BERTScore F1** reflects the degree of **semantic overlap** between model output and human-labeled answer.
- A score closer to **1.0** indicates stronger alignment of meaning.
- This metric is especially useful in open-ended QA or summarization settings where **exact matching isn't expected**.

In [56]:
# Replace `results` with `judged_results` if needed
predictions = [item["prediction"] for item in results]
references = [item["reference"] for item in results]

In [57]:
P, R, F1 = bertscore(predictions, references, lang="en", rescale_with_baseline=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
print(f"Average Precision: {P.mean().item():.4f}")

Average Precision: 0.2728


In [59]:
print(f"Average Recall: {R.mean().item():.4f}")

Average Recall: 0.3940


In [60]:
print(f"Average BERTScore (F1): {F1.mean().item():.4f}")

Average BERTScore (F1): 0.3318


## Step 8: Fixing Metadata

In [1]:
pip install nbformat --quiet

In [2]:
from google.colab import drive, files
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import os
# List the notebook directory to confirm the file exists
os.listdir("/content/drive/MyDrive/llm-finetuning-project/llm-finetuning-summarizer/notebooks")

['.keep',
 '00_colab_setup.ipynb',
 '01_arxiv_scraper.ipynb',
 '02_pdf_downloader.ipynb',
 '04_prepare_finetuning_corpus.ipynb',
 '05_tokenization.ipynb',
 '03_qa_curation.ipynb',
 '07_eval_qa_curation.ipynb',
 '08_evaluation_closed_book.ipynb',
 '06_finetuning.ipynb',
 '10_evaluation_baseline_open_book.ipynb',
 '11_evaluation_baseline_closed_book.ipynb',
 '12_pdf_downloader_for_rag.ipynb',
 '14_rag_retrieval_and_inference.ipynb',
 '13_chunk_and_embed.ipynb',
 '09_evaluation_open_book.ipynb',
 '15_evaluation_rag_open_book.ipynb']

In [None]:
import nbformat

notebook_path = "/content/drive/MyDrive/llm-finetuning-project/llm-finetuning-summarizer/notebooks/09_evaluation_open_book.ipynb"

with open(notebook_path, "r") as f:
    nb = nbformat.read(f, as_version=4)

if "widgets" in nb.metadata:
    del nb.metadata["widgets"]

with open(notebook_path, "w") as f:
    nbformat.write(nb, f)

print("Notebook fixed and saved successfully!")