In [None]:
!pip install datasets
!pip uninstall -y faiss faiss-cpu faiss-gpu
!pip install faiss-cpu
!pip install wandb

[0mFound existing installation: faiss-cpu 1.9.0.post1
Uninstalling faiss-cpu-1.9.0.post1:
  Successfully uninstalled faiss-cpu-1.9.0.post1
[0mCollecting faiss-cpu
  Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
import torch
from torch.utils.data import DataLoader
import faiss
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import wandb
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    RagConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    AdamW,
)

In [None]:
# Download the knowledge-base corpus, keep its train split, rename the
# "document" column to "text", and write the result to local disk.
output_path = "./knowledge_base"  # save path
knowledge_base = (
    load_dataset("Shannnh/knowledge_base_genai")["train"]
    .rename_column("document", "text")
)
knowledge_base.save_to_disk(output_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saving the dataset (0/8 shards):   0%|          | 0/78529 [00:00<?, ? examples/s]

In [None]:
# Build the retriever and the RAG model from the same pretrained checkpoint.

model_name = "facebook/rag-sequence-nq"

# The retriever reads passages from the dataset saved at ./knowledge_base and a
# FAISS index from ./faiss_index.
# NOTE(review): nothing visible in this notebook creates ./faiss_index - confirm
# that the index file exists before running this cell.
retriever = RagRetriever.from_pretrained(
    model_name,
    index_name="custom",
    passages_path="./knowledge_base",
    index_path="./faiss_index",
    repo_type='knowledge_base',
)

model = RagSequenceForGeneration.from_pretrained(model_name, retriever=retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [None]:
# Open the wandb run that records this fine-tuning job
wandb.init(project="rag_finetuning", name="rag_finetune_run")

# Tokenizer and raw question-answering data
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
datasets = load_dataset("lighteval/natural_questions_clean")

# Hold out 10% of the train split for validation; the fixed seed keeps the
# split identical between runs
full_train_data = datasets['train']
train_val_split = full_train_data.train_test_split(test_size=0.1, seed=42)

# Rename "short_answers" to "answer" in both halves of the split
train_data, validation_data = (
    train_val_split[split_name].rename_column("short_answers", "answer")
    for split_name in ('train', 'test')
)

device = 'cpu'

# Keep the question encoder's weights fixed during fine-tuning
for encoder_param in model.rag.question_encoder.parameters():
    encoder_param.requires_grad = False

model.to(device)

def preprocess_function(batch):
    """Tokenize a batch of question/answer pairs for RAG fine-tuning.

    Args:
        batch: Mapping with a 'question' column and an 'answer' column, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        dict with 'input_ids' and 'attention_mask' (tokenized questions) and
        'labels' (tokenized answers), each as a list of lists of ints.
    """
    def _answer_text(value):
        # An answer may be a plain string or a list of alternative strings;
        # use the first non-empty alternative instead of discarding the list.
        # NOTE(review): assumes list-valued answers hold strings - confirm
        # against the dataset schema.
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)):
            return next((item for item in value if isinstance(item, str) and item), "")
        return ""

    questions = [q if isinstance(q, str) else "" for q in batch['question']]
    answers = [_answer_text(a) for a in batch['answer']]

    inputs = tokenizer(questions, padding="max_length", truncation=True, return_tensors="pt")

    # Calling a RagTokenizer directly uses its question-encoder tokenizer, so
    # labels must be built with the generator tokenizer to get ids in the
    # generator's vocabulary. Fall back to `tokenizer` itself when it has no
    # separate generator tokenizer.
    target_tokenizer = getattr(tokenizer, "generator", tokenizer)
    # NOTE(review): padding="max_length" pads to the tokenizer's own maximum;
    # consider passing an explicit max_length for short answers.
    targets = target_tokenizer(answers, padding="max_length", truncation=True, return_tensors="pt")

    return {
        "input_ids": inputs["input_ids"].tolist(),
        "attention_mask": inputs["attention_mask"].tolist(),
        "labels": targets["input_ids"].tolist(),
    }

# Tokenize both splits in batches; the training split is processed first.
train_dataset, validation_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, validation_data)
)

def collate_fn(batch):
    """Stack per-example token lists into one tensor per field.

    Args:
        batch: Sequence of examples, each a mapping that contains
            "input_ids", "attention_mask" and "labels".

    Returns:
        dict with those three keys, each a tensor built from the
        corresponding values of every example in `batch`.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.tensor([example[field] for example in batch])
        for field in fields
    }

# Batch the tokenized splits; only the training loader shuffles.
# NOTE(review): batch_size=512 is large for this model on CPU - confirm it fits in memory.
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(validation_dataset, batch_size=512, shuffle=False, collate_fn=collate_fn)

# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set to that class's defaults so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

epochs = 1
model.train()

for epoch in range(epochs):
    # Training loop
    train_loss_total = 0.0
    train_steps = 0

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # The RAG loss is returned per example unless `reduce_loss` is enabled;
        # backward() and .item() need a scalar, so average over the batch.
        # (.mean() leaves an already-scalar loss unchanged.)
        loss = outputs.loss.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        train_loss_total += step_loss
        train_steps += 1

        # Log step loss to wandb
        wandb.log({"train_loss_step": step_loss})

    # Compute average training loss for the epoch
    avg_train_loss = train_loss_total / train_steps if train_steps > 0 else 0.0

    # Validation loop (no gradient tracking, model in eval mode)
    model.eval()
    val_loss = 0
    val_steps = 0
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Same scalar reduction as in the training loop.
            val_loss += outputs.loss.mean().item()
            val_steps += 1

    avg_val_loss = val_loss / val_steps if val_steps > 0 else 0.0

    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    # Log epoch losses to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss_epoch": avg_train_loss,
        "val_loss_epoch": avg_val_loss
    })

    # Switch back to training mode for the next epoch
    model.train()

# Persist the fine-tuned weights and the tokenizer side by side
model.save_pretrained("finetuned_rag")
tokenizer.save_pretrained("finetuned_rag")

wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlzheng2[0m. Use [1m`wandb login --relogin`[0m to force relogin


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [None]:
# test

# Reload the fine-tuned weights and re-attach the retriever built earlier in
# this notebook. Without a retriever, generate() has no retrieved context to
# index into and fails with "'NoneType' object is not subscriptable".
model = RagSequenceForGeneration.from_pretrained("finetuned_rag", retriever=retriever)
# The tokenizer created in the fine-tuning cell is reused for inference.

# Load the original validation dataset
datasets = load_dataset("lighteval/natural_questions_clean")
val_data = datasets['validation']

# Rename column for consistency
val_data = val_data.rename_column("short_answers", "answer")

# Preprocessing function for inference
def preprocess_function(batch):
    """Tokenize the questions of a batch; inference needs no labels.

    Args:
        batch: Mapping with a 'question' column.

    Returns:
        dict with 'input_ids' and 'attention_mask' as lists of lists.
    """
    cleaned_questions = []
    for question in batch['question']:
        # Anything that is not a string is replaced by an empty question.
        cleaned_questions.append(question if isinstance(question, str) else "")

    encoded = tokenizer(cleaned_questions, padding="max_length", truncation=True, return_tensors="pt")
    return {
        field: encoded[field].tolist()
        for field in ("input_ids", "attention_mask")
    }

# Tokenize the validation questions in batches
val_processed = val_data.map(
    function=preprocess_function,
    batched=True,
)

# Collate function that turns a list of examples into batch tensors
def collate_fn(batch):
    """Stack the question encodings of `batch` into tensors.

    Args:
        batch: Sequence of examples, each a mapping that contains
            "input_ids" and "attention_mask".

    Returns:
        dict with those two keys, each a tensor built from the
        corresponding values of every example in `batch`.
    """
    collated = {}
    for field in ("input_ids", "attention_mask"):
        collated[field] = torch.tensor([example[field] for example in batch])
    return collated

# One question per batch, in dataset order, so batch i lines up with val_data[i]
val_dataloader = DataLoader(val_processed, batch_size=1, shuffle=False, collate_fn=collate_fn)

model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

predictions, ids, questions, original_answers = [], [], [], []

# Inference loop
for i, val_batch in enumerate(val_dataloader):
    generation_inputs = {
        field: val_batch[field].to(device)
        for field in ("input_ids", "attention_mask")
    }

    # Generate the answer and decode the first (only) sequence of the batch.
    # Adjust max_length as needed.
    output_ids = model.generate(**generation_inputs, max_length=50)
    predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Look up the untokenized row that produced this batch
    original_item = val_data[i]

    predictions.append(predicted_answer)
    ids.append(original_item["id"])
    questions.append(original_item["question"])
    original_answers.append(original_item["answer"])

# Collect the per-question results into a dataset and publish it
result_columns = {
    "id": ids,
    "question": questions,
    "original_answer": original_answers,
    "predicted_answer": predictions,
}
results = Dataset.from_dict(result_columns)

results.push_to_hub("username/my_finetuned_rag_predictions")

Map:   0%|          | 0/4289 [00:00<?, ? examples/s]

TypeError: 'NoneType' object is not subscriptable

In [None]:
def calculate_f1(predicted, ground_truth):
    """Token-level F1 between a prediction and its reference answer(s).

    Tokens are whitespace-separated and compared as multisets, so repeated
    tokens count once per occurrence and identical strings score 1.0.

    Args:
        predicted: Predicted answer string.
        ground_truth: Reference answer string, or a list/tuple of reference
            strings; with several references the best score is returned.
            ``None`` or an empty list is treated as one empty reference.

    Returns:
        float in [0.0, 1.0]; 0.0 when no token is shared.
    """
    from collections import Counter

    if isinstance(ground_truth, str):
        references = [ground_truth]
    else:
        references = [ref for ref in (ground_truth or []) if isinstance(ref, str)]
    if not references:
        references = [""]

    pred_counts = Counter(predicted.split())
    pred_total = sum(pred_counts.values())

    best = 0.0
    for reference in references:
        ref_counts = Counter(reference.split())
        # Multiset intersection: each shared token counts min(pred, ref) times.
        overlap = sum((pred_counts & ref_counts).values())
        if overlap == 0:
            continue
        precision = overlap / pred_total
        recall = overlap / sum(ref_counts.values())
        best = max(best, 2 * precision * recall / (precision + recall))
    return best

# Score every prediction against its reference, then report the mean F1
f1_scores = list(map(calculate_f1, results["predicted_answer"], results["original_answer"]))
average_f1 = sum(f1_scores) / len(f1_scores)
print(f"Average F1 Score: {average_f1:.4f}")

In [None]:
def calculate_exact_match(predicted, ground_truth):
    """Return 1 if the prediction equals a reference answer, else 0.

    Surrounding whitespace is ignored on both sides; the comparison is
    otherwise exact (case-sensitive).

    Args:
        predicted: Predicted answer string.
        ground_truth: Reference answer string, or a list/tuple of reference
            strings, any of which may match. ``None`` or an empty list is
            treated as one empty reference.

    Returns:
        int: 1 on a match, 0 otherwise.
    """
    if isinstance(ground_truth, str):
        references = [ground_truth]
    else:
        references = [ref for ref in (ground_truth or []) if isinstance(ref, str)]
    if not references:
        references = [""]

    candidate = predicted.strip()
    return int(any(candidate == reference.strip() for reference in references))

# Score every prediction against its reference, then report the mean exact match
em_scores = list(map(calculate_exact_match, results["predicted_answer"], results["original_answer"]))
average_em = sum(em_scores) / len(em_scores)
print(f"Exact Match Score: {average_em:.4f}")

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK stop-word corpus that calculate_q_bleu reads via stopwords.words('english').
nltk.download('stopwords')

# Cache for the English stop-word set so it is built once instead of on every call.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them on first use."""
    if "english" not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE["english"] = set(stopwords.words('english'))
    return _STOP_WORD_CACHE["english"]


def calculate_q_bleu(predicted, ground_truth, question):
    """Weighted answer-quality score combining BLEU with two overlap ratios.

    The score is ``0.7 * bleu + 0.2 * key_match + 0.1 * question_match``:
      * bleu: ``sentence_bleu`` of the prediction against the reference,
      * key_match: share of the reference's distinct non-stop-word tokens
        that also appear in the prediction,
      * question_match: share of the question's distinct tokens that appear
        in the prediction.
    All tokens are whitespace-separated.

    Args:
        predicted: Predicted answer string.
        ground_truth: Reference answer string, or a list/tuple of reference
            strings; with several references the best score is returned.
            ``None`` or an empty list is treated as one empty reference.
        question: The question text.

    Returns:
        float: the combined score.
    """
    if isinstance(ground_truth, str):
        references = [ground_truth]
    else:
        references = [ref for ref in (ground_truth or []) if isinstance(ref, str)]
    if not references:
        references = [""]

    stop_words = _english_stop_words()

    # Everything that depends only on the prediction and the question is
    # computed once, outside the per-reference loop.
    hyp_tokens = predicted.split()
    important_hyp_tokens = {token for token in hyp_tokens if token.lower() not in stop_words}
    question_vocab = set(question.split())
    question_match = len(question_vocab & set(hyp_tokens)) / max(len(question_vocab), 1)

    best = None
    for reference in references:
        ref_tokens = reference.split()
        bleu_score = sentence_bleu([ref_tokens], hyp_tokens)

        important_ref_tokens = {token for token in ref_tokens if token.lower() not in stop_words}
        key_match = len(important_ref_tokens & important_hyp_tokens) / max(len(important_ref_tokens), 1)

        q_bleu = 0.7 * bleu_score + 0.2 * key_match + 0.1 * question_match
        if best is None or q_bleu > best:
            best = q_bleu

    return best

# Score every (prediction, reference, question) triple, then report the mean Q-BLEU
q_bleu_scores = list(
    map(
        calculate_q_bleu,
        results["predicted_answer"],
        results["original_answer"],
        results["question"],
    )
)
average_q_bleu = sum(q_bleu_scores) / len(q_bleu_scores)
print(f"Average Q-BLEU Score: {average_q_bleu:.4f}")