In [None]:
%%capture
import requests
import pandas as pd
from typing import List
from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.generator.transformers import RAGenerator
from haystack.retriever.dense import DensePassageRetriever

In [None]:
# Download sample dataset
temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
# Write through a context manager so the file handle is flushed and closed
# before pandas reads the file back.
with open('small_generator_dataset.csv', 'wb') as csv_file:
    csv_file.write(temp.content)

# Create dataframe with columns "title" and "text";
# minimal cleaning: replace missing values with empty strings
df = pd.read_csv("small_generator_dataset.csv", sep=',').fillna(value="")

print(df.head())

               title                                                                             text
0  "Albert Einstein"  to Einstein in 1922. Footnotes Citations Albert Einstein Albert Einstein (; ...
1  "Albert Einstein"  Albert Einstein Albert Einstein (; ; 14 March 1879 – 18 April 1955) was a Ge...
2  "Albert Einstein"  observations were published in the international media, making Einstein worl...
3  "Albert Einstein"  model for depictions of mad scientists and absent-minded professors; his exp...
4     "Alfred Nobel"  was adopted as the standard technology for mining in the "Age of Engineering...


In [None]:
# Wrap every (title, text) row of the dataframe in a Document;
# the title is stored under the "name" key of the document's meta.
titles = list(df["title"].values)
texts = list(df["text"].values)
documents: List[Document] = [
    Document(text=text, meta={"name": title or ""})
    for title, text in zip(titles, texts)
]

In [None]:
%%capture
# FAISS document store.
# `return_embedding=True` is set so the generator doesn't have to perform re-embedding.
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)

# DPR retriever: encodes documents, encodes questions and queries the document store.
dpr_settings = dict(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)
retriever = DensePassageRetriever(document_store=document_store, **dpr_settings)

# RAG generator.
rag_settings = dict(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)
generator = RAGenerator(**rag_settings)

In [None]:
# Start from an empty store so re-running this cell does not pile up documents
document_store.delete_all_documents()

# Store the documents, then add their embeddings to the index
# (the retriever is handed over to compute them)
document_store.write_documents(documents)
document_store.update_embeddings(retriever=retriever)

  self.session.query(DocumentORM).filter_by(index=index).delete(synchronize_session=False)
  self.session.query(DocumentORM).filter_by(index=index).delete(synchronize_session=False)
Updating Embedding:   0%|          | 0/68 [00:00<?, ? docs/s]

HBox(children=(FloatProgress(value=0.0, description='Create embeddings', max=80.0, style=ProgressStyle(descrip…

Documents Processed: 10000 docs [00:02, 4181.71 docs/s]


In [None]:
# Sample questions; the generation loop below runs each one through the
# retriever and then the generator.
QUESTIONS = [
    "who got the first nobel prize in physics",
    "when is the next deadpool movie being released",
    "which mode is used for short wave broadcast service",
    "who is the owner of reading football club",
    "when is the next scandal episode coming out",
    "when is the last time the philadelphia won the superbowl",
    "what is the most current adobe flash player version",
    "how many episodes are there in dragon ball z",
    "what is the first step in the evolution of the eye",
    "where is gall bladder situated in human body",
    "what is the main mineral in lithium batteries",
    "who is the president of usa right now",
    "where do the greasers live in the outsiders",
    "panda is a national animal of which country",
    "what is the name of manchester united stadium",
]

In [None]:
# Answer every question: retrieve supporting documents, then generate from them
for question in QUESTIONS:
    # Documents related to the question, as returned by the retriever
    retriever_results = retriever.retrieve(query=question)

    # Generate from the question plus the retrieved documents
    predicted_result = generator.predict(query=question, documents=retriever_results, top_k=1)

    # Print the first generated answer
    answers = predicted_result["answers"]
    answer = answers[0]["answer"]
    print(f"Generated answer is '{answer}' for the question = '{question}'")

Generated answer is ' albert einstein' for the question = 'who got the first nobel prize in physics'
Generated answer is ' september 22, 2017' for the question = 'when is the next deadpool movie being released'
Generated answer is ' amplitude modulation' for the question = 'which mode is used for short wave broadcast service'
Generated answer is ' stefan persson' for the question = 'who is the owner of reading football club'
Generated answer is ' april 20, 2018' for the question = 'when is the next scandal episode coming out'
Generated answer is ' the 1970s' for the question = 'when is the last time the philadelphia won the superbowl'
Generated answer is ' 7.1. 2' for the question = 'what is the most current adobe flash player version'
Generated answer is ' 13' for the question = 'how many episodes are there in dragon ball z'
Generated answer is ' step by step' for the question = 'what is the first step in the evolution of the eye'
Generated answer is ' stomach' for the question = 'whe

In [None]:
# Ask a new question, this time through the combined retriever + generator pipeline
from haystack.pipeline import GenerativeQAPipeline

NEW_QUESTIONS = ["who is the best president USA"]

pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
for question in NEW_QUESTIONS:
    # 50 documents are requested from the retriever, one answer from the generator
    res = pipe.run(
        query=question,
        top_k_generator=1,
        top_k_retriever=50,
    )
    print(res)

{'query': 'who is the best president USA', 'answers': [{'query': 'who is the best president USA', 'answer': ' trump', 'meta': {'doc_ids': ['5e30f241d1c389b17b3e0d894589bb6d', '677b399a549c301443097e99e579b504', '4bc744a58c35759e701f569c3afbec12', '756703c5a38a72a1445026e10187e6d1', '2aedb0c40e3a20c805d5b404a39543a', 'd34ecd3aeacc8cbd33555e07f9a98c73', 'ad7b335b2aa0618a83a484712609f21f', 'a9f5a2bc2b302574c4813daa71b52b09', '69c8b13a944a0878822e417f028aae49', '7fe7251032cc4b8b249a06fe8c975213', '179c700282b1784abd793bb989ba453c', 'e86ca2c67ea5220c1bad5de0868fad3f', '9dfc548a82f643420f5a3fa219088773', '326bf35becf4b0fbb1983d0939b192e', '6e4109839970876be7e585365a5d8c02', '4a347e0fc7c1b7d06fa7d7a2aa580555', '87d904c0a6eb166df1e2d7d2414ff8f5', '23c07d7b51ea44a1db9af1265ac6c276', '1ecd0e520a520197fa2f74549ccad50', '792312bbd1b1e8c8c4365b5966dee573', '7b6800d75331a3619db6cd07bfde9f4e', '549e4d922dd841db8244599f1d9ac30c', 'bd3e02e3146be9bc4081cc806bbee7af', '2df48a633cba33fa64736240303513cf', 

## Fine-tune RAG via Hugging Face

In [None]:
!pip install transformers
!pip install datasets
!conda install -c pytorch faiss-gpu

In [None]:
# Imports
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split as tts
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

In [None]:
# GPU ids to train on. Edit the tuple to use other GPUs; ids that are not
# present on this machine are dropped, so the list may be shorter or empty.
d_ids = [gpu_id for gpu_id in (0, 1) if gpu_id < torch.cuda.device_count()]
# Batches are pushed to the first selected GPU; fall back to the CPU when
# no GPU is visible.
device = f"cuda:{d_ids[0]}" if d_ids else "cpu"

In [None]:
# Humor detection data, read straight from GitHub
# Paper: https://arxiv.org/abs/2004.12765
HUMOR_DATA_URL = "https://raw.githubusercontent.com/Moradnejad/ColBERT-Using-BERT-Sentence-Embedding-for-Humor-Detection/master/Data/dataset.csv"
data = pd.read_csv(HUMOR_DATA_URL)
print(f"\nThere are {len(data)} sentences")
data.head()


There are 200000 sentences


Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [None]:
# Hold out 10% of the rows for testing, then take a validation set with the
# same number of rows as the test set out of the remaining training rows.
labelled = data[["text", "humor"]]
train, test = tts(labelled, test_size=0.1, random_state=42)
train, val = tts(train, test_size=len(test), random_state=42)

In [None]:
# Use the pre-trained RAG sequence model
# Define the tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
# Define the retriever
# index_name picks which index of the "wiki_dpr" dataset to use; here the "exact" one
# NOTE(review): this rebinds `retriever`, which earlier cells use for the haystack retriever
retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact")
# Initialize the RAG model
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever, n_docs=1)
# DataParallel expects the wrapped module's parameters on device_ids[0],
# so move the model to `device` before wrapping it
model.to(device)
# Run the model in parallel (`nn` is not imported in this notebook, so use `torch.nn`)
model = torch.nn.DataParallel(model, device_ids=d_ids)

In [None]:
# Longest training sentence, measured in tokenizer input ids
token_counts = (len(tokenizer(sentence)["input_ids"]) for sentence in train.text.to_list())
max_len = max(token_counts)
print("The maximum sentence length in training based on BERT BPEs is", max_len)

In [None]:
# Encode input sequences
def _encode_inputs(sentences):
    """Tokenize `sentences` into PyTorch tensors with padding, truncation at
    `max_len` and attention masks."""
    return tokenizer(
        sentences,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )


x_train = _encode_inputs(train.text.to_list())
x_val = _encode_inputs(val.text.to_list())

In [None]:
# Encode target sequences
# The `humor` column holds booleans, but the tokenizer only accepts strings,
# so the labels are converted to the texts "True" / "False" first.
# NOTE(review): max_length=1 may leave no room for the label token once the
# tokenizer adds its special tokens — confirm that the encoded targets for
# "True" and "False" actually differ.
target_settings = dict(
    max_length=1,
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)
with tokenizer.as_target_tokenizer():
    y_train = tokenizer(train.humor.astype(str).to_list(), **target_settings)
    y_val = tokenizer(val.humor.astype(str).to_list(), **target_settings)

In [None]:
batch_size = 4

# Create a dataloader for each set
print("Creating dataloaders...")


def _as_dataset(inputs, targets):
    """Bundle input ids, attention mask and target ids into one TensorDataset."""
    return TensorDataset(inputs['input_ids'], inputs['attention_mask'], targets["input_ids"])


train_dataset = _as_dataset(x_train, y_train)
val_dataset = _as_dataset(x_val, y_val)

# Only the training batches are shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Training method
def training():
    """Run one pass over `train_dataloader` and return the mean batch loss.

    Uses the notebook-level `model`, `optimizer`, `train_dataloader` and
    `device`. The model is put in train mode and updated in place.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    # Set to train mode
    model.train()
    total_loss = 0.0
    # Iterate through the training batches
    for batch in tqdm(train_dataloader, desc="Iteration"):
        # Push the batch to the training device
        sent_id, mask, labels = [r.to(device) for r in batch]
        # Clear gradients
        model.zero_grad()
        # Get model outputs
        outputs = model(sent_id, attention_mask=mask, labels=labels)
        # Collapse the returned loss to a single scalar
        loss = torch.mean(outputs.loss)
        # Backward pass to calculate the gradients
        loss.backward()
        # Update parameters
        optimizer.step()
        # Accumulate a plain Python float: summing the loss tensor itself
        # would keep autograd history and device memory alive for the epoch
        total_loss += loss.item()
    # Compute the training loss of the epoch
    epoch_loss = total_loss / len(train_dataloader)

    return epoch_loss

In [None]:
# Evaluation method
def evaluate():
    """Compute the mean batch loss over `val_dataloader` without updating the model.

    Uses the notebook-level `model`, `val_dataloader` and `device`. The model
    is put in eval mode and left in that mode.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\nEvaluating...")
    # Set to eval mode
    model.eval()
    total_loss = 0.0
    # Iterate through the validation batches
    for batch in val_dataloader:
        # Push the batch to the evaluation device
        sent_id, mask, labels = [t.to(device) for t in batch]
        # Deactivate autograd
        with torch.no_grad():
            # Get model outputs
            outputs = model(sent_id, attention_mask=mask, labels=labels)
            # Collapse the returned loss to a scalar and accumulate it as a
            # plain float, matching training()
            total_loss += torch.mean(outputs.loss).item()

    # Compute the validation loss of the epoch
    epoch_loss = total_loss / len(val_dataloader)

    return epoch_loss

In [None]:
# Optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)

# Bookkeeping for model selection and early stopping
best_val_loss, best_epoch = float('inf'), -1
train_losses, val_losses = [], []
epochs = 100
# Number of epochs without a new best validation loss before stopping
patience = 3

print("Starting training...")
for epoch in range(epochs):
    print(f'\n Epoch {epoch + 1} / {epochs}')
    train_loss = training()
    val_loss = evaluate()

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print("\nTraining Loss:", train_loss)
    print("Validation Loss:", val_loss)

    # Checkpoint whenever the validation loss reaches a new minimum
    if val_loss < best_val_loss:
        best_val_loss, best_epoch = val_loss, epoch

        # Save the underlying model rather than a wrapper exposing `.module`
        model_to_save = getattr(model, 'module', model)
        torch.save(model_to_save.state_dict(), "rag.bin")

    # Early stopping: give up once `patience` epochs passed since the best one
    if epoch - best_epoch >= patience:
        print("No improvement in", patience, "epochs. Stopped training.")
        break