This notebook answers questions using long-form question answering (LFQA) with both a pretrained and a domain-adapted retriever, and compares the search results of the two.

The domain-adapted retriever model outperforms the base model for most of the questions.

Note: This notebook does not include the code for domain adaptation using GPL.


In [None]:
# Mount Google Drive at /content/drive; later cells read the model archive and
# the data folder from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Necessary Imports

In [None]:
# Install the notebook's dependencies.
# pinecone-client, farm-haystack and lxml are pinned to exact versions;
# opendatasets, sentence_transformers and datasets are NOT pinned, so a fresh
# runtime may resolve different versions of those three packages.
!pip install -U pinecone-client==2.0.10
!pip install opendatasets
!pip install sentence_transformers datasets
!pip install -U 'farm-haystack[pinecone]'==1.3.0
!pip install lxml==4.9.0


In [None]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import tqdm
import random
import pandas as pd
from datasets import load_dataset
import pinecone
import opendatasets as od
import torch
from haystack import Document
from haystack.generator.transformers import Seq2SeqGenerator
from haystack.utils import print_answers

# Model Selection

Unzip the domain-adapted model archive stored in Google Drive.

In [1]:
# Extract the domain-adapted model archive stored in Drive.
# NOTE(review): `unzip` without `-d` extracts into the current working directory,
# while MODEL_PATH below points into Drive - confirm both refer to the same model files.
!unzip /content/drive/MyDrive/model.zip
# Directory later passed to SentenceTransformer to load the domain-adapted model.
MODEL_PATH = '/content/drive/MyDrive/biencoder-arxiv/'

We load two retriever models:

- `old_model`: the pretrained model (`msmarco-distilbert-base-tas-b`), a state-of-the-art model trained on MS MARCO.

- `new_model`: the domain-adapted model (`biencoder-arxiv`).

In [None]:
from sentence_transformers import SentenceTransformer
# NOTE(review): max_seq_length is assigned here but is not applied to either
# model in the visible cells - confirm whether it was meant to be set on them.
max_seq_length = 256
# Baseline retriever, loaded by its model name.
old_model_name = "msmarco-distilbert-base-tas-b"
old_model = SentenceTransformer(old_model_name)

# Domain-adapted retriever, loaded from the local directory in MODEL_PATH.
new_model_name = MODEL_PATH
new_model = SentenceTransformer(new_model_name)


# PineCone Operations

Log in to Pinecone and create the index `negative-mines` if it does not already exist.

Uploading current documents to Pinecone in batch with a namespace to perform faster search

Function to query the top-k (default 3) most similar embeddings from Pinecone: we take the query and compute its embedding with the provided model. For each match, we read the ID and the text from the metadata. Each match is then converted to a result dictionary that can be used to build the Haystack documents.

A Pinecone wrapper class that performs the operations we need: upload, query and delete.

In [None]:
class PineconeWrapper():
    """Helper around the Pinecone index used by this notebook.

    Bundles the three operations the notebook needs: uploading document
    embeddings (with their text as metadata), deleting vectors, and querying
    the most similar vectors for a question.
    """

    # Name of the Pinecone index every instance connects to.
    INDEX_NAME = 'negative-mines'

    # Half-open range of vector ids removed by delete_vectors() when no
    # namespace is given.
    # NOTE(review): looks like a leftover from a one-off clean-up - confirm it is still wanted.
    _DEFAULT_DELETE_RANGE = (12263, 12294)

    def __init__(self, model, top_k=3, dimensions=768):
        """Connect to Pinecone, creating the index when it does not exist.

        Args:
            model: Encoder exposing ``encode(...)``; used for both documents and queries.
            top_k: Number of matches returned by ``query_pinecone``.
            dimensions: Embedding size used when the index has to be created.
        """
        import os  # local import: only needed to look up the API key

        # Prefer a key from the environment so the secret does not have to live
        # in the notebook; fall back to the original placeholder when unset.
        pinecone.init(
            api_key=os.environ.get('PINECONE_API_KEY', '###'),
            environment='us-west1-gcp',
        )
        # create new mining index if does not exist
        if self.INDEX_NAME not in pinecone.list_indexes():
            pinecone.create_index(
                self.INDEX_NAME, dimension=dimensions,
                metric='dotproduct', pods=1, pod_type='p1'  # limit of pods=1 for free plan (more pods == faster mining)
            )
        # connect
        self.index = pinecone.Index(self.INDEX_NAME)
        print("Index Stats: ", self.index)

        self.batch_size = 16
        self.dimension = dimensions
        self.model = model
        self.top_k = top_k

    def _log_index_stats(self, label):
        """Fetch the index statistics, print them with ``label`` and return them."""
        index_data = self.index.describe_index_stats()
        print(f"Index Data {label} : {index_data}")
        return index_data

    def upload_pinecone(self, haystack_docs, namespace):
        """Embed ``haystack_docs`` and upsert them into ``namespace`` in batches.

        Args:
            haystack_docs: Sequence of dicts with a ``'content'`` string and a
                ``'meta'`` dict containing ``'doc_dir'``.
            namespace: Pinecone namespace the vectors are written to.

        Each vector id is the document position offset by the vector count the
        index reported before the upload; the text and ``doc_dir`` are stored
        as metadata.
        """
        if not haystack_docs:
            # Nothing to embed: skip the encoder and the index entirely.
            print("\nUPLOAD PINECONE SKIPPED: no documents to upload.")
            return

        docs = [doc['content'] for doc in haystack_docs]

        print("\nUPLOAD PINECONE START!*******!")
        docs_emb = self.model.encode(docs, convert_to_tensor=True, show_progress_bar=True)
        print(f"Document Embeddings Shape : {docs_emb.shape}")

        index_data = self._log_index_stats("before Adding")
        totalVectorCount = int(index_data['totalVectorCount'])

        for i in tqdm.tqdm(range(0, len(docs_emb), self.batch_size)):
            i_end = min(i + self.batch_size, len(docs_emb))
            batch_emb = docs_emb[i:i_end, :].tolist()
            batch_metadata = [{"text": haystack_docs[j]['content'],
                               "doc_dir": haystack_docs[j]['meta']['doc_dir']}
                              for j in range(i, i_end)]
            batch_ids = [str(x + totalVectorCount) for x in range(i, i_end)]

            # upload to index
            upload_vectors = list(zip(batch_ids, batch_emb, batch_metadata))
            # Log a summary only: printing the raw embeddings floods the cell output.
            print(f"\nBatch Upload Vectors : {len(upload_vectors)} "
                  f"(ids {batch_ids[0]} to {batch_ids[-1]})\n")
            self.index.upsert(vectors=upload_vectors, namespace=namespace)

        self._log_index_stats("after Adding")
        print("\nUPLOAD PINECONE END!*******!")

    def delete_vectors(self, namespace=None):
        """Delete vectors from the index, printing the stats before and after.

        With a ``namespace``, every vector in that namespace is deleted.
        Without one, the fixed id range in ``_DEFAULT_DELETE_RANGE`` is deleted.
        """
        self._log_index_stats("before deleting")

        if namespace:
            self.index.delete(delete_all=True, namespace=namespace)
        else:
            to_delete = [str(i) for i in range(*self._DEFAULT_DELETE_RANGE)]
            print(f"Index to Delete : {to_delete}")
            self.index.delete(ids=to_delete)

        self._log_index_stats("after deleting")

    def query_pinecone(self, query, namespace):
        """Return the ``top_k`` closest stored documents for ``query``.

        Args:
            query: Question text; embedded with ``self.model``.
            namespace: Namespace to search; a falsy value omits the namespace
                argument from the Pinecone query.

        Returns:
            A list of dicts with ``'content'``, ``'context-type'`` and a
            ``'meta'`` dict holding ``'id'``, ``'score'`` and ``'doc_dir'``.
        """
        query_emb = self.model.encode(query).tolist()

        # Always ask for metadata: the text of each match is read from it below.
        query_kwargs = {"top_k": self.top_k, "include_metadata": True}
        if namespace:
            query_kwargs["namespace"] = namespace
        res = self.index.query([query_emb], **query_kwargs)

        answers = []
        for match in res['results'][0]['matches']:
            vector_id = int(match.id)
            score = match.score
            try:
                metadata = match['metadata']
                text = metadata['text']
                doc_dir = metadata['doc_dir']
            except (KeyError, AttributeError, TypeError):
                # Match has no usable metadata. Fall back to a notebook-level
                # `abstracts` sequence when one exists (it is not defined in
                # the visible cells); otherwise return empty text instead of
                # failing with a NameError.
                corpus = globals().get('abstracts')
                text = corpus[vector_id] if corpus is not None else ""
                doc_dir = "NA"

            result_dict = {
                "content" : text,
                "context-type" : "text",
                "meta":{
                    "id" : vector_id,
                    "score": score,
                    "doc_dir" : doc_dir
                }
            }
            answers.append(result_dict)

        return answers

# Integrating LFQA and Haystack

# Get Answers from LFQA

Custom function that reads every file in the input directory and splits its text into passages of roughly 300 characters.

In [None]:
from pathlib import Path
import os

def get_doc(all_doc_dir, threshold=300):
    """Read every file in ``all_doc_dir`` and split it into short passages.

    Each file's text is flattened to one line (newlines become spaces, double
    quotes are removed) and split on ``'. '``. Consecutive sentences are merged
    into one passage while the new sentence plus the current passage stay
    under ``threshold`` characters.

    Args:
        all_doc_dir: Directory containing the text files to process.
        threshold: Length limit used when merging sentences into a passage.

    Returns:
        A list of dicts, one per passage, with ``'content'``,
        ``'context-type'`` and a ``'meta'`` dict whose ``'doc_dir'`` is the
        name of the file the passage came from.
    """
    doc_results = []

    # Sort so passages (and the vector ids derived from their order) are
    # reproducible across runs; os.listdir order is arbitrary.
    file_names = sorted(os.listdir(all_doc_dir))
    print(f"Documents Found : {file_names}")

    for document in file_names:
        doc_path = os.path.join(all_doc_dir, document)
        if not os.path.isfile(doc_path):
            # Sub-directories cannot be read as text; skip them instead of crashing.
            print(f"Skipping Non-File Entry : {doc_path}")
            continue

        print(f"Processing Document : {doc_path}")
        text = Path(doc_path).read_text(encoding="utf-8").replace("\n", " ")
        text = text.replace('"', "")

        passages = []
        for chunk in text.split('. '):
            chunk = chunk.strip()
            if not chunk:
                # Empty files or repeated separators must not yield "." passages.
                continue
            # Splitting removed the period from every sentence except the last;
            # restore it only where terminal punctuation is actually missing.
            sentence = chunk if chunk.endswith(('.', '!', '?')) else chunk + '.'
            if passages and len(chunk) + len(passages[-1]) < threshold:
                passages[-1] += ' ' + sentence
            else:
                passages.append(sentence)

        for passage in passages:
            doc_results.append({
                "content" : passage,
                "context-type" : "text",
                "meta":{
                    "id" : None,
                    "score": 0,
                    "doc_dir" : document
                }
            })

    return doc_results

Specify the query and the document directory to use. We first extract the list of text passages from the documents and upload them to Pinecone under the given namespace. We then pass the model and the query to the `query_pinecone` function to retrieve the top-k (here, top 3) most similar passages. Each retrieved passage is wrapped in a Haystack `Document`, and this list of documents is passed to the LFQA generator to obtain the answer.

In [None]:
def get_answers_wrapper(query, doc_dir, model, namespace, top_k):
    """Answer ``query`` with LFQA over the documents found in ``doc_dir``.

    The documents are chunked with ``get_doc``, uploaded to Pinecone under
    ``namespace``, and the ``top_k`` passages closest to the query are passed
    to the ``vblagoje/bart_lfqa`` generator.

    Args:
        query: Question to answer.
        doc_dir: Directory of source documents handed to ``get_doc``.
        model: Encoder used by ``PineconeWrapper`` for documents and query.
        namespace: Pinecone namespace used for both upload and query.
        top_k: Number of passages retrieved for the generator.

    Returns:
        A dict with ``'query'``, per-passage ``'top_<i>_content'``,
        ``'top_<i>_dotscore'`` and ``'top_<i>_document'`` entries, and the
        generated ``'answer'`` with its ``'score'``. The last two are ``None``
        when the generator returns no answers.
    """
    doc_results = get_doc(doc_dir)

    pineconeWrapper = PineconeWrapper(model, top_k=top_k, dimensions=768)
    pineconeWrapper.upload_pinecone(doc_results, namespace)
    biencoder_results = pineconeWrapper.query_pinecone(query, namespace)

    data = {'query': query}
    document_store = []
    for idx, doc in enumerate(biencoder_results):
        prefix = "top_" + str(idx)
        data[prefix + "_content"] = doc['content']
        data[prefix + "_dotscore"] = doc['meta']['score']
        data[prefix + "_document"] = doc['meta']['doc_dir']
        document_store.append(Document(doc['content']))

    print(f"All Documents : {document_store}")

    generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

    result = generator.predict(
        query=query,
        documents=document_store,
        top_k=1
    )

    print_answers(result, details="minimum")

    # Only the first answer is reported. Default to None so an empty answer
    # list yields an empty row instead of an UnboundLocalError.
    best_answer = next(iter(result['answers']), None)
    data['answer'] = best_answer.answer if best_answer is not None else None
    data['score'] = best_answer.score if best_answer is not None else None
    return data

In [None]:
def run_inference_refactored(query, doc_dir, model, top_k=5):
    """Clear the 'Space_Search' namespace, then answer ``query`` from ``doc_dir``.

    Returns the result dict produced by ``get_answers_wrapper``.
    """
    target_namespace = 'Space_Search'

    # Remove every vector stored in the namespace before the documents are
    # uploaded again by get_answers_wrapper.
    PineconeWrapper(model, top_k=top_k).delete_vectors(namespace=target_namespace)

    return get_answers_wrapper(
        query,
        doc_dir,
        model,
        namespace=target_namespace,
        top_k=top_k,
    )

# CSV Export

In [None]:
def get_csv(queries, all_doc_dir):
    """Run every query with both retrievers and tabulate the results.

    For each query the domain-adapted model is run first, then the pretrained
    one; each result dict is tagged with a ``'model_type'`` entry.

    Returns:
        A pandas DataFrame with one row per (query, model) pair.
    """
    rows = []
    for question in queries:
        # Same order as the output rows: domain-adapted first, pretrained second.
        for label, retriever in (('domain_adapted', new_model), ('pretrained', old_model)):
            row = run_inference_refactored(question, all_doc_dir, retriever, top_k=3)
            row['model_type'] = label
            rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Questions that are answered with both retrievers in the comparison.
queries=["What were the possible reasons for Hydrogen leak?",
    "What are the next steps taken after the launch was called off?",
    "Why is orion not planned to carry crews in the initial mission?"
    ]

In [None]:
# Folder in Drive whose files get_doc reads and splits into passages.
all_docs_dir='/content/drive/MyDrive/data/'
# Run every query with both retrievers; one row per (query, model) pair.
df = get_csv(queries, all_docs_dir)
# NOTE(review): index=None appears to be used as a falsy stand-in for
# index=False (no index column in the CSV) - confirm against the pandas docs.
df.to_csv("Result.csv", index=None)