In [1]:
!pip install datasets
!pip uninstall -y faiss faiss-cpu faiss-gpu
!pip install faiss-cpu
!pip install wandb

[0mFound existing installation: faiss-cpu 1.9.0.post1
Uninstalling faiss-cpu-1.9.0.post1:
  Successfully uninstalled faiss-cpu-1.9.0.post1
[0mCollecting faiss-cpu
  Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [2]:
import torch
from torch.utils.data import DataLoader
import faiss
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import wandb
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    RagConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    AdamW,
)

In [3]:
knowledge_base = load_dataset("Shannnh/knowledge_base_genai")
knowledge_base = knowledge_base['train']
knowledge_base = knowledge_base.rename_column("document", "text")
output_path = "./knowledge_base"  # 保存路径
knowledge_base.save_to_disk(output_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saving the dataset (0/8 shards):   0%|          | 0/78529 [00:00<?, ? examples/s]

In [13]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DPRQuestionEncoder,
    RagRetriever,
    RagSequenceForGeneration,
    RagConfig,
    AutoTokenizer,
    DPRQuestionEncoderTokenizer,
)

generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
generator_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

indexPath = "./faiss_index"
passagesPath = "./knowledge_base"

rag_config = RagConfig.from_question_encoder_generator_configs(
    question_encoder.config,
    generator.config,
    index_name="custom",
    n_docs=5,
    index_path=indexPath,
    passages_path=passagesPath,
)

retriever = RagRetriever(
    config=rag_config,
    question_encoder_tokenizer=question_encoder_tokenizer,
    generator_tokenizer=generator_tokenizer,
)

model = RagSequenceForGeneration(
    config=rag_config,
    question_encoder=question_encoder,
    generator=generator,
    retriever=retriever,
)

retriever.config.index_path = rag_config.index_path
retriever.config.passages_path = rag_config.passages_path
retriever.config.generator = rag_config.generator
retriever.config.question_encoder = rag_config.question_encoder

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
assert model.config.index_path == retriever.config.index_path, "Index path mismatch!"
assert model.config.passages_path == retriever.config.passages_path, "Passages path mismatch!"
assert model.config.generator == retriever.config.generator, "Generator config mismatch!"
assert model.config.question_encoder == retriever.config.question_encoder, "Question encoder config mismatch!"
print("All configurations are consistent!")

All configurations are consistent!


In [6]:
from torch.cuda.amp import GradScaler, autocast
from transformers import get_scheduler

# Initialize wandb
wandb.init(project="rag_finetuning", name="rag")

# Load tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
datasets = load_dataset("lighteval/natural_questions_clean")

# Split train and validation
full_train_data = datasets['train'].select(range(4000))
train_val_split = full_train_data.train_test_split(test_size=0.1, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

# Rename columns
train_data = train_data.rename_column("short_answers", "answer")
validation_data = validation_data.rename_column("short_answers", "answer")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Freeze question encoder parameters
for param in model.rag.question_encoder.parameters():
    param.requires_grad = False

model.to(device)

def preprocess_function(batch):
    question = [q if isinstance(q, str) else "" for q in batch['question']]
    answer = [a if isinstance(a, str) else "" for a in batch['answer']]
    inputs = tokenizer(question, padding="max_length", truncation=True, return_tensors="pt")
    targets = tokenizer(answer, padding="max_length", truncation=True, return_tensors="pt")
    return {
        "input_ids": inputs["input_ids"].tolist(),
        "attention_mask": inputs["attention_mask"].tolist(),
        "labels": targets["input_ids"].tolist(),
    }

train_dataset = train_data.map(preprocess_function, batched=True)
validation_dataset = validation_data.map(preprocess_function, batched=True)

def collate_fn(batch):
    return {
        "input_ids": torch.tensor([item["input_ids"] for item in batch]),
        "attention_mask": torch.tensor([item["attention_mask"] for item in batch]),
        "labels": torch.tensor([item["labels"] for item in batch]),
        "question": [item["question"] for item in batch],
        "original_answer": [item["answer"] for item in batch],
    }

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(validation_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

scaler = GradScaler()

model.train()

for epoch in range(epochs):
    # Training loop
    train_loss_total = 0.0
    train_steps = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            if loss.dim() > 0:
                loss = loss.mean()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        lr_scheduler.step()

        train_loss_total += loss.item()
        train_steps += 1

        wandb.log({"train_loss_step": loss.item()})

    # Compute average training loss for the epoch
    avg_train_loss = train_loss_total / train_steps if train_steps > 0 else 0.0

    # Validation loop
    model.eval()
    val_loss = 0
    val_steps = 0
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.mean().item()
                val_steps += 1

            wandb.log({"val_loss_step": outputs.loss.mean().item()})

    avg_val_loss = val_loss / val_steps if val_steps > 0 else 0.0

    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    # Log epoch losses to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss_epoch": avg_train_loss,
        "val_loss_epoch": avg_val_loss
    })

    model.train()

model.save_pretrained("finetuned_rag")
tokenizer.save_pretrained("finetuned_rag")

wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlzheng2[0m. Use [1m`wandb login --relogin`[0m to force relogin


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

  scaler = GradScaler()
  with autocast():
  with autocast():


Epoch 1, Train Loss: 28.31622967258141, Validation Loss: 274.90614219665525


Non-default generation parameters: {'forced_eos_token_id': 2}


0,1
epoch,▁
train_loss_epoch,▁
train_loss_step,▄▄█▄▅▁▂▂▁▃▂▂▁▁▁▃▂▃▇▂▂▂▂▂▁▁▂▂▂▂▃▃▇▂▂▂▂▂▁▂
val_loss_epoch,▁
val_loss_step,▄▂▃▄▆▂▄▂▄▂▁▂▃█▃▄█▃▅▄▄▅▅▅▄▅▄▂▃▁▅▅█▃▁▂▂█▃▂

0,1
epoch,1.0
train_loss_epoch,28.31623
train_loss_step,2.31632
val_loss_epoch,274.90614
val_loss_step,284.12057


In [7]:
import json
from tqdm import tqdm
from torch.utils.data import DataLoader

# Ensure the model is in evaluation mode
model.eval()

# Prepare the test dataset
test_data = datasets['validation']
test_data = test_data.rename_column("short_answers", "answer")
test_dataset = test_data.map(preprocess_function, batched=True, remove_columns=[])

test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
results = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Generate predictions
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,
            num_beams=5
        )

        # Decode predictions and original answers
        predicted_answers = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Store results
        for question, original_answer, predicted_answer in zip(batch["question"], batch["original_answer"], predicted_answers):
            results.append({
                "question": question,
                "original_answer": original_answer,
                "predicted_answer": predicted_answer
            })

# Save results to a file
results_file = "results.json"
with open(results_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"Test results saved to {results_file}")

Map:   0%|          | 0/4289 [00:00<?, ? examples/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Testing:   0%|          | 6/4289 [00:27<5:31:48,  4.65s/it]


KeyboardInterrupt: 

In [14]:
import json
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the appropriate device and set to evaluation mode
model.to(device)
model.eval()

# Prepare the test dataset
test_data = datasets['validation'].select(range(20))
test_data = test_data.rename_column("short_answers", "answer")
test_dataset = test_data.map(preprocess_function, batched=True, remove_columns=[])

test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
results = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        # Move input data to the appropriate device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Generate predictions
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,
            num_beams=5
        )

        # Decode predictions
        predicted_answers = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Store results
        for question, original_answer, predicted_answer in zip(batch["question"], batch["original_answer"], predicted_answers):
            results.append({
                "question": question,
                "original_answer": original_answer,
                "predicted_answer": predicted_answer
            })

# Save results to a file
results_file = "results.json"
with open(results_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"Test results saved to {results_file}")


Testing: 100%|██████████| 20/20 [01:59<00:00,  5.97s/it]

Test results saved to results.json





In [15]:
def calculate_f1(predicted, ground_truths):
    if not isinstance(ground_truths, list):
        ground_truths = [ground_truths]
    max_f1 = 0.0
    for ground_truth in ground_truths:
        pred_tokens = predicted.split()
        gt_tokens = ground_truth.split()
        common = set(pred_tokens) & set(gt_tokens)
        if len(common) == 0:
            continue
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(gt_tokens)
        f1 = 2 * precision * recall / (precision + recall)
        max_f1 = max(max_f1, f1)
    return max_f1

f1_scores = [
    calculate_f1(item["predicted_answer"], item["original_answer"])
    for item in results
]
average_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {average_f1:.4f}")

Average F1 Score: 0.0191


In [16]:
def calculate_exact_match(predicted, ground_truths):
    if not isinstance(ground_truths, list):
        ground_truths = [ground_truths]
    for ground_truth in ground_truths:
        if predicted.strip() == ground_truth.strip():
            return 1
    return 0

em_scores = [
    calculate_exact_match(item["predicted_answer"], item["original_answer"])
    for item in results
]
average_em = sum(em_scores) / len(em_scores)
print(f"Exact Match Score: {average_em:.4f}")

Exact Match Score: 0.0000


In [17]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
import nltk

# 下载所需的 NLTK 资源
nltk.download('stopwords')

# 预加载停用词
stop_words = set(stopwords.words('english'))

def calculate_q_bleu(predicted, ground_truths, question):
    # 如果 ground_truth 是列表，将其合并为单个字符串
    if isinstance(ground_truths, list):
        ground_truths = " ".join(ground_truths)

    # Tokenize inputs
    ref_tokens = ground_truths.split()
    hyp_tokens = predicted.split()
    question_tokens = question.split()

    # Calculate BLEU score
    reference = [ref_tokens]
    hypothesis = hyp_tokens
    bleu_score = sentence_bleu(reference, hypothesis)

    # Remove stopwords
    important_ref_tokens = [token for token in ref_tokens if token.lower() not in stop_words]
    important_hyp_tokens = [token for token in hyp_tokens if token.lower() not in stop_words]

    # Key match ratio
    key_match = len(set(important_ref_tokens) & set(important_hyp_tokens)) / max(len(set(important_ref_tokens)), 1)

    # Question match ratio
    question_match = len(set(question_tokens) & set(hyp_tokens)) / max(len(set(question_tokens)), 1)

    # Weighted Q-BLEU score
    q_bleu = 0.7 * bleu_score + 0.2 * key_match + 0.1 * question_match

    return q_bleu

q_bleu_scores = [
    calculate_q_bleu(item["predicted_answer"], item["original_answer"], item["question"])
    for item in results
]
average_q_bleu = sum(q_bleu_scores) / len(q_bleu_scores)
print(f"Average Q-BLEU Score: {average_q_bleu:.4f}")

Average Q-BLEU Score: 0.0126


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
