In [2]:
!pip install datasets
!pip uninstall -y faiss faiss-cpu faiss-gpu
!pip install faiss-cpu

[0mFound existing installation: faiss-cpu 1.9.0.post1
Uninstalling faiss-cpu-1.9.0.post1:
  Successfully uninstalled faiss-cpu-1.9.0.post1
[0mCollecting faiss-cpu
  Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Using cached faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [3]:
# Build the FAISS index
import faiss
from datasets import load_dataset, Dataset
import numpy as np

# Load the train split of the knowledge base and turn its "embeddings" column
# into a float32 NumPy array.
dataset = load_dataset("Shannnh/knowledge_base_genai2", split="train")
embeddings = np.asarray(dataset['embeddings'], dtype='float32')

# Flat index using the L2 distance metric, sized from the second axis of the array.
# NOTE(review): RAG retrieval is typically inner-product based — confirm that L2
# is the intended metric for how these embeddings were produced.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)  # add every embedding to the index

# Write the index to the file that the retriever cell passes as index_path.
faiss.write_index(index, "faiss_index")

# Rename the "document" column to "text", then save the dataset to disk under
# the directory that the retriever cell passes as passages_path.
output_path = "./small_dataset_test"  # save location
dataset = dataset.rename_column("document", "text")
dataset.save_to_disk(output_path)

# Build the RAG model and retriever
from transformers import (
    AutoModelForSeq2SeqLM,
    RagTokenizer,
    RagRetriever,
    RagSequenceForGeneration,
    RagConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saving the dataset (0/1 shards):   0%|          | 0/40 [00:00<?, ? examples/s]

In [4]:
from transformers import DPRQuestionEncoder
# Hub checkpoints for the two sub-models that the RAG model is assembled from.
QUESTION_ENCODER_CHECKPOINT = "facebook/dpr-question_encoder-single-nq-base"
GENERATOR_CHECKPOINT = "facebook/bart-base"

# Load the DPR question encoder and the seq2seq generator from those checkpoints.
question_encoder = DPRQuestionEncoder.from_pretrained(QUESTION_ENCODER_CHECKPOINT)
generator = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_CHECKPOINT)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Retriever backed by the custom knowledge base: the dataset directory and the
# FAISS index file written by the index-building cell.
# NOTE(review): `use_dummy_prompt` and `embed_title` do not look like RagConfig
# fields — confirm they have any effect here.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    index_name="custom",
    passages_path="./small_dataset_test",  # knowledge-base dataset directory
    index_path="faiss_index",              # FAISS index file
    use_dummy_dataset=False,
    use_dummy_prompt=False,
    embed_title=True
)

# Start from the published RAG configuration and adapt it to this setup.
rag_config = RagConfig.from_pretrained("facebook/rag-sequence-nq")
rag_config.index_name = "custom"
rag_config.n_docs = 5  # number of documents to retrieve; tunable
# The loaded config describes the sub-models of the "facebook/rag-sequence-nq"
# checkpoint, but the model below is assembled from separately loaded
# sub-models. Point the sub-configs at the models actually used so that the
# saved config matches the saved weights.
rag_config.question_encoder = question_encoder.config
rag_config.generator = generator.config

# Assemble the RAG sequence model from the pre-loaded encoder, generator and retriever.
model = RagSequenceForGeneration(
    config=rag_config,
    question_encoder=question_encoder,
    generator=generator,
    retriever=retriever
)
model.set_retriever(retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [6]:
# Tokenizer for the RAG checkpoint; later code uses its `question_encoder`
# and `generator` members.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

# Data preprocessing: take the train split of the QA dataset and rename the
# "short_answers" column to "answer".
# NOTE: this rebinds `dataset`, which the index-building cell used for the
# knowledge base.
dataset = load_dataset("lighteval/natural_questions_clean", split="train").rename_column(
    "short_answers", "answer"
)

def _first_text(value):
    """Reduce `value` to a single string, or "" when it holds no usable text.

    A string is returned unchanged. For a list or tuple, the first non-empty
    string element is returned. Any other value (for example None) gives "".
    """
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        for item in value:
            if isinstance(item, str) and item:
                return item
    return ""


def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for training.

    Args:
        examples: Batch mapping with a 'question' and an 'answer' entry, each
            a sequence with one item per example.

    Returns:
        The encoding produced by `tokenizer.question_encoder` for the
        questions, with an added "labels" entry holding the "input_ids" that
        `tokenizer.generator` produced for the answers.

    Both sides are truncated and padded to 512 tokens. Relies on the
    module-level `tokenizer`.
    """
    questions = [_first_text(q) for q in examples['question']]
    # A plain isinstance(a, str) check maps every non-string answer to "".
    # Presumably the renamed short_answers column holds a list of strings per
    # example (TODO confirm); take the first non-empty one so labels are not
    # silently blanked.
    answers = [_first_text(a) for a in examples['answer']]
    encoded_questions = tokenizer.question_encoder(
        questions,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors="pt"
    )
    encoded_answers = tokenizer.generator(
        answers,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors="pt"
    )
    # The answer token ids serve as the training labels.
    encoded_questions["labels"] = encoded_answers["input_ids"]
    return encoded_questions

# Tokenize the dataset in batches, then drop the raw text columns that the
# preprocessing consumed.
tokenized_train = dataset.map(preprocess_function, batched=True).remove_columns(
    ['question', 'answer']
)
# Return rows as torch tensors.
tokenized_train.set_format(type='torch')

# Batch collator built on the generator-side tokenizer and the RAG model.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer.generator, model=model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [None]:
# Training configuration: one epoch, batch size 4 per device, step-based evaluation.
# NOTE(review): the recorded run paused at an interactive W&B API-key prompt;
# consider passing report_to="none" if experiment tracking is not wanted.
training_args = Seq2SeqTrainingArguments(
    output_dir="./rag_output",
    evaluation_strategy="steps",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    save_steps=10,
    # A checkpoint is written every 10 steps; without a cap every one of them
    # stays on disk, so keep only the two most recent.
    save_total_limit=2,
    logging_dir="./logs",
    predict_with_generate=True,
)

# NOTE(review): eval_dataset is the training set itself, so the evaluation
# metrics do not measure generalization — consider a held-out split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=collator,
)

# Run training, then write the final model to its own directory.
trainer.train()
trainer.save_model("./rag_trained_model")

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
