In [1]:
!pip install datasets
!pip uninstall -y faiss faiss-cpu faiss-gpu
!pip install faiss-cpu
!pip install wandb

[0mFound existing installation: faiss-cpu 1.9.0.post1
Uninstalling faiss-cpu-1.9.0.post1:
  Successfully uninstalled faiss-cpu-1.9.0.post1
[0mCollecting faiss-cpu
  Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [2]:
import os
import sys

!rm -r /content/goodRAG
os.system('git clone https://github.com/shannn1/goodRAG.git')
sys.path.append('/content/goodRAG')

In [3]:
import torch
from torch.utils.data import DataLoader
import faiss
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import wandb
from modeling_rag import RagRetriever, RagSequenceForGeneration, RagConfig
from tokenization_rag import RagTokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DPRQuestionEncoder,
    DPRQuestionEncoderTokenizer,
    AdamW,
    get_scheduler
)
import json
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast

In [4]:
knowledge_base = load_dataset("Shannnh/knowledge_base_genai")
knowledge_base = knowledge_base['train']
knowledge_base = knowledge_base.rename_column("document", "text")
output_path = "./knowledge_base"
knowledge_base.save_to_disk(output_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saving the dataset (0/8 shards):   0%|          | 0/78529 [00:00<?, ? examples/s]

In [5]:
generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
generator_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

indexPath = "./faiss_index"
passagesPath = "./knowledge_base"

rag_config = RagConfig.from_question_encoder_generator_configs(
    question_encoder.config,
    generator.config,
    index_name="custom",
    n_docs=5,
    index_path=indexPath,
    passages_path=passagesPath,
)

retriever = RagRetriever(
    config=rag_config,
    question_encoder_tokenizer=question_encoder_tokenizer,
    generator_tokenizer=generator_tokenizer,
)

model = RagSequenceForGeneration(
    config=rag_config,
    question_encoder=question_encoder,
    generator=generator,
    retriever=retriever,
)

retriever.config.index_path = rag_config.index_path
retriever.config.passages_path = rag_config.passages_path
retriever.config.generator = rag_config.generator
retriever.config.question_encoder = rag_config.question_encoder

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f"Trainable Parameters: {trainable_params}")

Trainable Parameters: 515182080


In [7]:
for name, param in model.named_parameters():
    if "decoder.layers.6" not in name and "decoder.layers.7" not in name and "decoder.layers.8" not in name and "decoder.layers.9" not in name and "decoder.layers.10" not in name and "decoder.layers.11" not in name:  # train only last 2 layers
        param.requires_grad = False

trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print("Trainable Parameters:", trainable_params)

Trainable Parameters: ['rag.generator.model.decoder.layers.6.self_attn.k_proj.weight', 'rag.generator.model.decoder.layers.6.self_attn.k_proj.bias', 'rag.generator.model.decoder.layers.6.self_attn.v_proj.weight', 'rag.generator.model.decoder.layers.6.self_attn.v_proj.bias', 'rag.generator.model.decoder.layers.6.self_attn.q_proj.weight', 'rag.generator.model.decoder.layers.6.self_attn.q_proj.bias', 'rag.generator.model.decoder.layers.6.self_attn.out_proj.weight', 'rag.generator.model.decoder.layers.6.self_attn.out_proj.bias', 'rag.generator.model.decoder.layers.6.self_attn_layer_norm.weight', 'rag.generator.model.decoder.layers.6.self_attn_layer_norm.bias', 'rag.generator.model.decoder.layers.6.encoder_attn.k_proj.weight', 'rag.generator.model.decoder.layers.6.encoder_attn.k_proj.bias', 'rag.generator.model.decoder.layers.6.encoder_attn.v_proj.weight', 'rag.generator.model.decoder.layers.6.encoder_attn.v_proj.bias', 'rag.generator.model.decoder.layers.6.encoder_attn.q_proj.weight', 'rag

In [8]:
assert model.config.index_path == retriever.config.index_path, "Index path mismatch!"
assert model.config.passages_path == retriever.config.passages_path, "Passages path mismatch!"
assert model.config.generator == retriever.config.generator, "Generator config mismatch!"
assert model.config.question_encoder == retriever.config.question_encoder, "Question encoder config mismatch!"
print("All configurations are consistent!")

All configurations are consistent!


In [9]:
# Initialize wandb
wandb.init(project="rag_finetuning2", name="rag2")

datasets = load_dataset("lighteval/natural_questions_clean")

# Split train and validation
full_train_data = datasets['train']
train_val_split = full_train_data.train_test_split(test_size=0.1, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

# Rename columns
train_data = train_data.rename_column("short_answers", "answer")
validation_data = validation_data.rename_column("short_answers", "answer")

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'

model.to(device)

def preprocess_function(batch):
    question = [q if isinstance(q, str) else "" for q in batch['question']]
    answer = [a[0] if (isinstance(a, list) and len(a) > 0 and isinstance(a[0], str)) else "" for a in batch['answer']]
    inputs = question_encoder_tokenizer(question, padding="max_length", max_length = 512, truncation=True, return_tensors="pt")
    targets = generator_tokenizer(answer, padding="max_length", max_length = 512, truncation=True, return_tensors="pt")

    return {
        "input_ids": inputs["input_ids"].tolist(),
        "attention_mask": inputs["attention_mask"].tolist(),
        "labels": targets["input_ids"].tolist(),
    }

train_dataset = train_data.map(preprocess_function, batched=True)
validation_dataset = validation_data.map(preprocess_function, batched=True)

def collate_fn(batch):
    return {
        "input_ids": torch.tensor([item["input_ids"] for item in batch]),
        "attention_mask": torch.tensor([item["attention_mask"] for item in batch]),
        "labels": torch.tensor([item["labels"] for item in batch]),
        "question": [item["question"] for item in batch],
        "original_answer": [item["answer"] for item in batch],
    }

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(validation_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

epochs = 8
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

scaler = GradScaler()

model.train()

for epoch in range(epochs):
    # Training loop
    train_loss_total = 0.0
    train_steps = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            if loss.dim() > 0:
                loss = loss.mean()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        lr_scheduler.step()

        train_loss_total += loss.item()
        train_steps += 1

    # Compute average training loss for the epoch
    avg_train_loss = train_loss_total / train_steps if train_steps > 0 else 0.0

    # Validation loop
    model.eval()
    val_loss = 0
    val_steps = 0
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.mean().item()
                val_steps += 1

    avg_val_loss = val_loss / val_steps if val_steps > 0 else 0.0

    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    # Log epoch losses to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss_epoch": avg_train_loss,
        "val_loss_epoch": avg_val_loss
    })

    model.train()

model.save_pretrained("finetuned_rag")

wandb.finish()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlzheng2[0m. Use [1m`wandb login --relogin`[0m to force relogin


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1, Train Loss: 52.0580891167363, Validation Loss: 36.87710940645864
Epoch 2, Train Loss: 38.07242177340907, Validation Loss: 30.8282016633203
Epoch 3, Train Loss: 29.36421213846742, Validation Loss: 21.80650766636154
Epoch 4, Train Loss: 20.00167305558406, Validation Loss: 20.966648106324303
Epoch 5, Train Loss: 17.889120399074923, Validation Loss: 18.133521347543034
Epoch 6, Train Loss: 16.68456099557555, Validation Loss: 17.370237055864155
Epoch 7, Train Loss: 15.61680763535338, Validation Loss: 16.86936557345434
Epoch 8, Train Loss: 15.039319696803531, Validation Loss: 17.90341077461493


Non-default generation parameters: {'forced_eos_token_id': 2}


0,1
epoch,▁▂▃▄▅▆▇█
train_loss_epoch,█▅▄▂▂▁▁▁
val_loss_epoch,█▆▃▂▁▁▁▁

0,1
epoch,8.0
train_loss_epoch,15.03932
val_loss_epoch,17.90341


In [16]:
# Move model to the appropriate device and set to evaluation mode

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Prepare the test dataset
test_data = datasets['validation']
test_data = test_data.rename_column("short_answers", "answer")
test_dataset = test_data.map(preprocess_function, batched=True)

test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
results = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        # Move input data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Generate predictions
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,
            num_beams=5
        )

        # Decode predictions
        predicted_answers = generator_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Store results
        for question, original_answer, predicted_answer in zip(batch["question"], batch["original_answer"], predicted_answers):
            results.append({
                "question": question,
                "original_answer": original_answer,
                "predicted_answer": predicted_answer
            })

# Save results to a file
results_file = "results.json"
with open(results_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"Test results saved to {results_file}")


Testing: 100%|██████████| 4289/4289 [38:45<00:00,  1.84it/s]

Test results saved to results.json





In [17]:
def calculate_f1(predicted, ground_truths):
    if not isinstance(ground_truths, list):
        ground_truths = [ground_truths]
    max_f1 = 0.0
    for ground_truth in ground_truths:
        pred_tokens = predicted.split()
        gt_tokens = ground_truth.split()
        common = set(pred_tokens) & set(gt_tokens)
        if len(common) == 0:
            continue
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(gt_tokens)
        f1 = 2 * precision * recall / (precision + recall)
        max_f1 = max(max_f1, f1)
    return max_f1

f1_scores = [
    calculate_f1(item["predicted_answer"], item["original_answer"])
    for item in results
]
average_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {average_f1:.4f}")

Average F1 Score: 0.1874


In [18]:
def calculate_exact_match(predicted, ground_truths):
    if not isinstance(ground_truths, list):
        ground_truths = [ground_truths]
    for ground_truth in ground_truths:
        if predicted.strip() == ground_truth.strip():
            return 1
    return 0

em_scores = [
    calculate_exact_match(item["predicted_answer"], item["original_answer"])
    for item in results
]
average_em = sum(em_scores) / len(em_scores)
print(f"Exact Match Score: {average_em:.4f}")

Exact Match Score: 0.0208


In [19]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def calculate_q_bleu(predicted, ground_truths, question):
    if isinstance(ground_truths, list):
        ground_truths = " ".join(ground_truths)

    # Tokenize inputs
    ref_tokens = ground_truths.split()
    hyp_tokens = predicted.split()
    question_tokens = question.split()

    # Calculate BLEU score
    reference = [ref_tokens]
    hypothesis = hyp_tokens
    bleu_score = sentence_bleu(reference, hypothesis)

    # Remove stopwords
    important_ref_tokens = [token for token in ref_tokens if token.lower() not in stop_words]
    important_hyp_tokens = [token for token in hyp_tokens if token.lower() not in stop_words]

    # Key match ratio
    key_match = len(set(important_ref_tokens) & set(important_hyp_tokens)) / max(len(set(important_ref_tokens)), 1)

    # Question match ratio
    question_match = len(set(question_tokens) & set(hyp_tokens)) / max(len(set(question_tokens)), 1)

    # Weighted Q-BLEU score
    q_bleu = 0.7 * bleu_score + 0.2 * key_match + 0.1 * question_match

    return q_bleu

q_bleu_scores = [
    calculate_q_bleu(item["predicted_answer"], item["original_answer"], item["question"])
    for item in results
]
average_q_bleu = sum(q_bleu_scores) / len(q_bleu_scores)
print(f"Average Q-BLEU Score: {average_q_bleu:.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average Q-BLEU Score: 0.1363
