#An efficient approach to adapt existing dense retrieval models to new domains & data without the need for labelled data

# The GPL training step itself is not shown here. This notebook is a retriever-generator pipeline that compares a pretrained retriever with a GPL-trained retriever for information retrieval over multiple text documents. Pinecone is used to store the vectors and LFQA is used as the generator.


# Next steps involve context-based chunking to further improve retrieval performance


In [None]:
# Mount Google Drive at /content/drive so later cells can read paths under it
# (the model checkpoint path used below lives under /content/drive/MyDrive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Necessary Imports

In [None]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import tqdm
import random
#from datasets import load_dataset
import pinecone
import opendatasets as od

# Model Selection

In [None]:
# Load the two retrievers that are compared in this notebook:
#   old_model - the pretrained baseline
#   new_model - the GPL-trained checkpoint stored on Google Drive
from sentence_transformers import SentenceTransformer
# Path of the GPL-trained checkpoint.
# NOTE(review): the original comment said "GPL trained on aviation docs", but the
# path points at Astrophysics/...spacesciencemodel... - confirm which corpus it was trained on.
MODEL_PATH = '/content/drive/MyDrive/Astrophysics/biencoder-spacesciencemodel-gtebase'

# Pretrained baseline, loaded by model name.
old_model_name = "msmarco-distilbert-base-tas-b"
old_model = SentenceTransformer(old_model_name)

# GPL-trained model, loaded from the Drive path above.
new_model_name = MODEL_PATH
new_model = SentenceTransformer(new_model_name)


We load the TAS-B model, a state-of-the-art model trained on MS MARCO

In [None]:
# Load a second copy of the TAS-B checkpoint (same model name as old_model_name
# above) and set its max_seq_length attribute to 256.
# NOTE(review): org_model is not referenced by any other cell visible in this
# notebook - confirm it is still needed.
max_seq_length = 256
model_name = "msmarco-distilbert-base-tas-b"

org_model = SentenceTransformer(model_name)
org_model.max_seq_length = max_seq_length

# PineCone Operations

Logging into Pinecone and creating index negative-mines-petal if not present

Uploading current documents to Pinecone in batch with a namespace to perform faster search

Function to query the top-k most similar embeddings from Pinecone (k is 3 in the runs below): we take the query and compute its embedding with the model provided. For each match, we get the ID and the TEXT from the metadata. This is then converted to a result dictionary to be used for Haystack using the document store.

PineCone Wrapper to perform operations such as upload, query and delete for our needs.

In [None]:
class PineconeWrapper():
    """Helper around a single Pinecone index: upload, query and delete vectors.

    The wrapper embeds text with the supplied model and stores / searches the
    resulting vectors in the ``negative-mines-petal`` index, which is created
    on first use with the ``dotproduct`` metric.

    The Pinecone API key is no longer hard-coded. Pass it as ``api_key`` or set
    the ``PINECONE_API_KEY`` environment variable.
    """

    # Name of the one Pinecone index this wrapper manages.
    INDEX_NAME = 'negative-mines-petal'

    # Ids removed by delete_vectors() when neither a namespace nor explicit ids
    # are given. Kept only so existing calls behave as before.
    _LEGACY_DELETE_IDS = range(12263, 12294)

    def __init__(self, model, top_k=1, dimensions=768, api_key=None,
                 environment='us-west1-gcp'):
        """Connect to Pinecone and make sure the index exists.

        Args:
            model: Encoder exposing ``encode(...)`` (a SentenceTransformer in
                this notebook).
            top_k: Number of matches requested by ``query_pinecone``.
            dimensions: Vector dimension used if the index has to be created.
            api_key: Pinecone API key. Falls back to the ``PINECONE_API_KEY``
                environment variable when omitted.
            environment: Pinecone environment name.

        Raises:
            ValueError: If no API key is supplied by either route.
        """
        # Local import: in notebook order `os` is only imported in a later cell.
        import os

        # SECURITY: a literal key used to be committed here. It must be treated
        # as leaked and rotated; credentials are now supplied at runtime.
        api_key = api_key or os.environ.get('PINECONE_API_KEY')
        if not api_key:
            raise ValueError(
                "Pinecone API key missing: pass api_key=... or set the "
                "PINECONE_API_KEY environment variable."
            )
        pinecone.init(api_key=api_key, environment=environment)

        # Create the index if it does not exist yet.
        if self.INDEX_NAME not in pinecone.list_indexes():
            pinecone.create_index(
                self.INDEX_NAME, dimension=dimensions,
                metric='dotproduct', pods=1, pod_type='p1'  # limit of pods=1 for free plan
            )
        self.index = pinecone.Index(self.INDEX_NAME)
        # Print the actual statistics (the original printed the Index object).
        print("Index Stats: ", self.index.describe_index_stats())

        self.batch_size = 16
        self.dimension = dimensions
        self.model = model
        self.top_k = top_k

    def upload_pinecone(self, haystack_docs, namespace):
        """Embed documents and upsert them into ``namespace`` in batches.

        Args:
            haystack_docs: Sequence of dicts shaped like
                ``{"content": str, "meta": {"doc_dir": str}}``.
            namespace: Pinecone namespace that receives the vectors.

        Vector ids are consecutive integers (as strings) starting at the
        index's current ``total_vector_count``.
        """
        if not haystack_docs:
            # Nothing to embed; skip the encoder and the network round trips.
            print("\nUPLOAD PINECONE SKIPPED: no documents to upload.")
            return

        docs = [doc['content'] for doc in haystack_docs]

        print("\nUPLOAD PINECONE START!*******!")
        docs_emb = self.model.encode(docs, convert_to_tensor=True, show_progress_bar=True)
        print(f"Document Embeddings Shape : {docs_emb.shape}")

        index_data = self.index.describe_index_stats()
        print(f"Index Data before Adding : {index_data}")
        # Offset new ids by the current vector count, as the original did.
        id_offset = int(index_data['total_vector_count'])

        for start in tqdm.tqdm(range(0, len(docs_emb), self.batch_size)):
            end = min(start + self.batch_size, len(docs_emb))
            batch_emb = docs_emb[start:end, :].tolist()
            batch_metadata = [
                {"text": haystack_docs[pos]['content'],
                 "doc_dir": haystack_docs[pos]['meta']['doc_dir']}
                for pos in range(start, end)
            ]
            batch_ids = [str(pos + id_offset) for pos in range(start, end)]

            upload_vectors = list(zip(batch_ids, batch_emb, batch_metadata))
            # Log a summary only; dumping every embedding floods the output.
            print(f"\nBatch Upload : ids {batch_ids[0]}..{batch_ids[-1]} "
                  f"({len(upload_vectors)} vectors)\n")
            self.index.upsert(vectors=upload_vectors, namespace=namespace)

        index_data = self.index.describe_index_stats()
        print(f"Index Data after Adding : {index_data}")
        print("\nUPLOAD PINECONE END!*******!")

    def delete_vectors(self, namespace=None, ids=None):
        """Delete vectors from the index.

        Args:
            namespace: When given without ``ids``, every vector in this
                namespace is deleted.
            ids: Optional iterable of vector ids to delete (converted to
                strings). When both ``namespace`` and ``ids`` are omitted, the
                legacy hard-coded id range 12263..12293 is deleted, exactly as
                before.
        """
        index_data = self.index.describe_index_stats()
        print(f"Index Data before deleting : {index_data}")

        if namespace and ids is None:
            self.index.delete(delete_all=True, namespace=namespace)
        else:
            source_ids = self._LEGACY_DELETE_IDS if ids is None else ids
            to_delete = [str(vector_id) for vector_id in source_ids]
            print(f"Index to Delete : {to_delete}")
            delete_kwargs = {"ids": to_delete}
            if namespace:
                delete_kwargs["namespace"] = namespace
            self.index.delete(**delete_kwargs)

        index_data = self.index.describe_index_stats()
        print(f"Index Data after deleting : {index_data}")

    def query_pinecone(self, query, namespace):
        """Return the ``top_k`` matches for ``query`` as Haystack-style dicts.

        Args:
            query: Query text, embedded with ``self.model``.
            namespace: Namespace to search; a falsy value omits the namespace
                argument from the request.

        Returns:
            A list of dicts ``{"content", "context-type", "meta": {"id",
            "score", "doc_dir"}}``, one per match. Matches without usable
            metadata get ``content=""`` and ``doc_dir="NA"``.
        """
        query_emb = self.model.encode(query).tolist()

        # Always request metadata: the text and source path live there. The
        # original skipped it when no namespace was given and then fell back to
        # an undefined global.
        query_kwargs = {"top_k": self.top_k, "include_metadata": True}
        if namespace:
            query_kwargs["namespace"] = namespace
        # NOTE(review): the embedding is passed wrapped in a list, as in the
        # original code - confirm this matches the installed pinecone-client.
        res = self.index.query([query_emb], **query_kwargs)
        print(f"Pinecone Results : {res}")

        answers = []
        for match in res['matches']:
            vector_id = int(match.id)
            score = match.score
            try:
                text = match['metadata']['text']
                doc_dir = match['metadata']['doc_dir']
            except (KeyError, AttributeError, TypeError):
                # Vector stored without the expected metadata.
                print(f"No text metadata for vector {vector_id}; using empty content.")
                text = ""
                doc_dir = "NA"

            answers.append({
                "content": text,
                "context-type": "text",
                "meta": {
                    "id": vector_id,
                    "score": score,
                    "doc_dir": doc_dir,
                },
            })

        return answers

# LXML

# Integrating LFQA and Haystack

# Get Answers from LFQA

In [None]:
import torch
import haystack
from haystack import Document
#from haystack.generator.transformers import Seq2SeqGenerator
from haystack.nodes import Seq2SeqGenerator
from haystack.utils import print_answers


Custom function to read the documents found in the input directory

In [None]:
from pathlib import Path
import os

def get_doc(all_doc_dir, threshold=300):
    """Read every file in a directory and split it into retrievable passages.

    The original body appended an undefined ``result_dict`` (NameError on the
    first file) and left ``threshold`` / ``out`` unused. The chunking below is
    a reconstruction: each file is split into passages of at most
    ``threshold`` whitespace-separated words.
    NOTE(review): confirm word-count chunking matches the lost original logic.

    Args:
        all_doc_dir: Directory containing the text documents.
        threshold: Maximum number of words per passage (default 300).

    Returns:
        A list of dicts ``{"content": str, "meta": {"doc_dir": str}}`` - the
        shape ``PineconeWrapper.upload_pinecone`` reads - where ``doc_dir`` is
        the path of the source file.

    Raises:
        ValueError: If ``threshold`` is not a positive integer.
    """
    if threshold <= 0:
        raise ValueError("threshold must be a positive number of words")

    doc_results = []

    # Sorted so passage order (and therefore vector ids) is reproducible.
    entries = sorted(os.listdir(all_doc_dir))
    print(f"Documents Found : {entries}")

    for document in entries:
        doc_dir = os.path.join(all_doc_dir, document)
        if not os.path.isfile(doc_dir):
            # Skip sub-directories such as Colab's .ipynb_checkpoints.
            continue
        print(f"Processing Document : {doc_dir}")
        text = Path(doc_dir).read_text(encoding="utf-8").replace("\n", " ")
        text = text.replace('"', "")

        words = text.split()
        for start in range(0, len(words), threshold):
            passage = " ".join(words[start:start + threshold])
            doc_results.append({
                "content": passage,
                "meta": {"doc_dir": doc_dir},
            })

    return doc_results

In [None]:
# Generators already loaded in this session, keyed by model name.
_LFQA_GENERATORS = {}


def _get_lfqa_generator(model_name_or_path="vblagoje/bart_lfqa"):
    """Return the Seq2SeqGenerator for the given model, loading it only once."""
    if model_name_or_path not in _LFQA_GENERATORS:
        _LFQA_GENERATORS[model_name_or_path] = Seq2SeqGenerator(
            model_name_or_path=model_name_or_path
        )
    return _LFQA_GENERATORS[model_name_or_path]


def get_answers_wrapper(query, doc_dir, model, namespace, top_k):
    """Upload the documents, retrieve the best passages and generate an answer.

    Steps: read the documents in ``doc_dir``, upload their embeddings to
    Pinecone under ``namespace``, query the ``top_k`` passages for ``query``
    and pass them to the LFQA generator.

    Args:
        query: Question text.
        doc_dir: Directory with the source documents.
        model: Encoder used both for the upload and for the query.
        namespace: Pinecone namespace to upload to and search in.
        top_k: Number of passages to retrieve.

    Returns:
        A flat dict with ``query``, per-passage ``top_<i>_content`` /
        ``top_<i>_dotscore`` / ``top_<i>_document`` entries, and the generated
        ``answer`` and ``score``. Both are ``None`` when the generator returns
        no answer (the original raised UnboundLocalError in that case).
    """
    doc_results = get_doc(doc_dir)

    pinecone_wrapper = PineconeWrapper(model, top_k=top_k, dimensions=768)
    pinecone_wrapper.upload_pinecone(doc_results, namespace)
    biencoder_results = pinecone_wrapper.query_pinecone(query, namespace)

    data = {'query': query}
    document_store = []
    for idx, doc in enumerate(biencoder_results):
        prefix = f"top_{idx}_"
        data[prefix + "content"] = doc['content']
        data[prefix + "dotscore"] = doc['meta']['score']
        data[prefix + "document"] = doc['meta']['doc_dir']
        document_store.append(Document(doc['content']))

    print(f"All Documents : {document_store}")

    # Reuse the generator across calls instead of reloading it for every query.
    generator = _get_lfqa_generator("vblagoje/bart_lfqa")

    result = generator.predict(
        query=query,
        documents=document_store,
        top_k=1
    )

    print_answers(result, details="minimum")

    # Only the first answer is reported; tolerate an empty answer list.
    answers = result['answers']
    first_answer = answers[0] if answers else None
    data['answer'] = first_answer.answer if first_answer is not None else None
    data['score'] = first_answer.score if first_answer is not None else None
    return data

In [None]:
def run_inference_refactored(query, doc_dir, model, top_k=5):
    """Answer one query from a clean 'artemis' namespace.

    All vectors in the namespace are deleted first; the documents are then
    uploaded again and queried by ``get_answers_wrapper`` using ``model``.

    Args:
        query: Question text.
        doc_dir: Directory with the source documents.
        model: Encoder used for indexing and querying.
        top_k: Number of passages to retrieve (default 5).

    Returns:
        The result dict produced by ``get_answers_wrapper``.
    """
    target_namespace = 'artemis'
    # Empty the namespace so results come only from this run's upload.
    PineconeWrapper(model, top_k=top_k).delete_vectors(namespace=target_namespace)
    return get_answers_wrapper(
        query, doc_dir, model, namespace=target_namespace, top_k=top_k
    )

Specify the query and the document directory to use. We first get the list of texts from the documents. We then pass the model and the query to the query_pinecone function to obtain the top 3 documents. The combined result contains the documents from the texts as well as the top 3 documents from the query_pinecone function. For all the documents in the combined results we create a document store. We then pass this document store to LFQA to obtain the answers.

# CSV Export

Function that runs every query in a list against both the trained and the pretrained model and returns the results as a DataFrame.

In [None]:
# Reload both retrievers before the CSV export. This repeats the loading done
# in the "Model Selection" cell, with the same names and the same paths.
from sentence_transformers import SentenceTransformer
# NOTE(review): `sentences` is not used by any cell visible in this notebook.
sentences = ["This is an example sentence", "Each sentence is converted"]

# Pretrained baseline.
old_model_name = "msmarco-distilbert-base-tas-b"
old_model = SentenceTransformer(old_model_name)
# GPL-trained checkpoint on Google Drive; the commented-out paths below are
# alternative checkpoints that are currently disabled.
MODEL_PATH='/content/drive/MyDrive/Astrophysics/biencoder-spacesciencemodel-gtebase'
#MODEL_PATH='/content/drive/MyDrive/Petal/biencoder-arxiv-v2-petal'

# new_model_name = "/content/notebooks/biencoder-arxiv"
new_model_name = MODEL_PATH
new_model = SentenceTransformer(new_model_name)

In [None]:
import pandas as pd

def get_csv(queries, all_doc_dir):
    """Run each query through both retrievers and tabulate the results.

    For every query the GPL-trained model (``new_model``) is evaluated first,
    then the pretrained baseline (``old_model``), each with ``top_k=3``. Each
    result dict is tagged with a ``model_type`` of ``'trained'`` or
    ``'pretrained'``.

    Args:
        queries: Iterable of question strings.
        all_doc_dir: Directory with the source documents.

    Returns:
        A pandas DataFrame with one row per (query, model) pair, in the order
        they were run.
    """
    rows = []
    for question in queries:
        # Trained model first, then the baseline - this fixes the row order.
        for retriever, label in ((new_model, 'trained'), (old_model, 'pretrained')):
            record = run_inference_refactored(question, all_doc_dir, retriever, top_k=3)
            record['model_type'] = label
            rows.append(record)

    return pd.DataFrame(rows)

# GET CSV

In [None]:
# Mount Google Drive at /content/drive (repeats the mount from the first cell,
# e.g. for when this section is run on its own).
from google.colab import drive
drive.mount('/content/drive')

In [None]:

# Evaluation questions; each one is answered with both models by get_csv.
queries=['What are passenger behavior problems reported to ASRS?',
'What is significance of reporting to the ASRS? ',
'Tell me about Special reporting forms ?',
'State importance of risk management in high reliability systems ',
'Why did the ratio of cabin crew-related reports to database reports increase?',
'What is a top priority for all participants in aviation operations?',
]

# Directory holding the text documents to index.
all_docs_dir = '/content/Text/'
# Alternative document location on Google Drive (currently disabled):
#"/content/drive/MyDrive/AviationDocs"
df = get_csv(queries, all_docs_dir)
# Bare expression so the notebook displays the resulting DataFrame.
df

# Save to excel

In [None]:
# Write the comparison table to an Excel file in the current working directory.
# NOTE(review): index=None presumably suppresses the index column like
# index=False (the documented boolean form) - confirm.
df.to_excel("Results_AviationDoc.xlsx", index=None)