In [1]:
from models.external_database import ExternalDatabase
from models.model_rag import RAG
from datasets import Features, Value, Sequence

# Autoreload: load IPython's autoreload extension and select mode 2, so that
# edits made to imported modules are picked up before each cell is executed.
%load_ext autoreload
%autoreload 2

## Dataset for RAG

In [2]:
ext_db = ExternalDatabase()

# ============= Create a dataset =============
# Column schema applied by the embedding step: the two string columns plus a
# float32 sequence column holding the embeddings.
new_features = Features(
    {
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32")),
    }
)

# CSV file -> chunked documents -> chunk embeddings, expressed as one pipeline.
dataset = (
    ext_db.create_dataset("documents/my_dataset.csv")
    .map(ext_db.split_documents, batched=True, num_proc=None)
    .map(
        ext_db.compute_embeddings,
        batched=True,
        batch_size=16,
        features=new_features,
    )
)

# Save the embedded dataset (done before the index is built, as in the original order).
ext_db.save_dataset(dataset, "datasets/rag/rag_dataset")

# ============= Create the FAISS index =============
dataset = ext_db.index_dataset(
    dataset,
    path_index='datasets/rag/rag_dataset_index.faiss',
    embedding_dim=768,
    nb_links=128,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

## RAG Model

In [3]:
# Directory used both as the model to load and as the `rag_save_dir` argument.
rag_save_dir = "checkpoints/rag_model/"
pretrained_model = rag_save_dir

# Dataset and FAISS index locations (the same paths the dataset cell writes to).
path_dataset = "datasets/rag/rag_dataset"
path_index = "datasets/rag/rag_dataset_index.faiss"

# Checkpoint identifiers forwarded to the RAG wrapper.
rag_name_or_path = "facebook/rag-sequence-base"
question_encoder_name_or_path = "facebook/dpr-question_encoder-single-nq-base"
generator_name_or_path = "google/flan-t5-small"

# Keyword arguments for RAG(...), gathered in one mapping.
kwargs = dict(
    rag_name_or_path=rag_name_or_path,
    question_encoder_name_or_path=question_encoder_name_or_path,
    generator_name_or_path=generator_name_or_path,
    dataset_path=path_dataset,
    index_path=path_index,
    rag_save_dir=rag_save_dir,
)

rag = RAG(pretrained_model, **kwargs)

# Two sample questions; the cell's last expression displays the predictions.
input_questions = [
    "What is Linear Regression ?",
    "What is the recipe of pizza ?",
]
rag.prediction_step(input_questions, max_new_tokens=1000)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


['Predictions of the model for the samples from the evaluation_set will be saved under the path specified by the predictions_path parameter. If this path already exists, the script will use saved predictions to calculate metrics. Add --recalculate parameter to force the script to perform inference from scratch. An example e2e evaluation run could look as follows:',
 'a pizza']

In [4]:
import logging

import yaml
from transformers import AutoTokenizer

from models.model_dpo import AutoDPOModelForSeq2SeqLM

# The error branch below logs through `logger`; define it here so the cell
# does not fail with a NameError on a fresh kernel.
logger = logging.getLogger(__name__)

# Parameters for evaluation
with open("main_config.yaml") as f:
    try:
        # An empty YAML document parses to None; normalise it to an empty dict.
        main_config = yaml.safe_load(f) or {}
    except Exception as e:
        logger.error(f"Error loading main_config.yaml: {e}! Please check the file format.")
        # Stop here: continuing with an empty config would only fail later
        # with a less informative error on the required key lookup.
        raise

model_class = AutoDPOModelForSeq2SeqLM
# Required entry: a missing key raises KeyError naming it.
rag_policy_model_path = main_config["rag_policy_model_path"]
# Optional entry: an absent key or an explicit null both mean "no extra arguments".
rag_model_args = main_config.get("rag_model_args") or {}
print(rag_model_args, end="\n\n")

# Load model (through `model_class`, so the class is chosen in one place)
model = model_class.from_pretrained(rag_policy_model_path, **rag_model_args)
tokenizer = AutoTokenizer.from_pretrained(rag_policy_model_path)

# Test generation: two placeholder multiple-choice samples
input_questions = [
    {
        "question": "What is Linear Regression ?",
        "choices": ["A", "B", "C", "D"],
        "answer": "Blablabla",
    },
    {
        "question": "What is FFT ?",
        "choices": ["A", "B", "C", "D"],
        "answer": "Blablabla",
    },
]

model.prediction_step_mcqa(input_questions, tokenizer)

{'rag_name_or_path': 'facebook/rag-token-nq', 'generator_name_or_path': 'google/flan-t5-small', 'question_encoder_name_or_path': 'facebook/dpr-question_encoder-single-nq-base', 'dataset_path': 'datasets/rag/rag_dataset', 'index_path': 'datasets/rag/rag_dataset_index.faiss', 'rag_save_dir': 'checkpoints/rag_model/'}



[{'question': 'What is Linear Regression ?',
  'choices': ['A', 'B', 'C', 'D'],
  'answer': 'Blablabla',
  'preds': 'a'},
 {'question': 'What is FFT ?',
  'choices': ['A', 'B', 'C', 'D'],
  'answer': 'Blablabla',
  'preds': 'a'}]