## Optimizing the RAG performance
- better retrievar
- better computational load

### Dataset preparation
- Using Stanford Q and A dataset (SQuAD)

In [1]:
# %pip install datasets
# %pip install einops
import nest_asyncio
nest_asyncio.apply()

In [2]:

from datasets import load_dataset

# loading the dataset
dataset = load_dataset("squad")

# Extract unique context from the dataset

data = [item["context"] for item in dataset["train"]]

texts = list(set(data))


In [3]:
texts[0]

'The United States is the chief remaining nation to assign official responsibilities to a region called the Near East. Within the government the State Department has been most influential in promulgating the Near Eastern regional system. The countries of the former empires of the 19th century have in general abandoned the term and the subdivision in favor of Middle East, North Africa and various forms of Asia. In many cases, such as France, no distinct regional substructures have been employed. Each country has its own French diplomatic apparatus, although regional terms, including Proche-Orient and Moyen-Orient, may be used in a descriptive sense. The most influential agencies in the United States still using Near East as a working concept are as follows.'

### Embed dataset
Embedding for each context-level, Each element of the above text list will be embedded into a single vector

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from tqdm import tqdm
def batch_iterate(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i : i + batch_size]
    
class EmbedData:
    """
    A class for generating and managing text embeddings using a Hugging Face embedding model.
    This class handles the loading of an embedding model and batch processing of text data
    to generate embeddings.
    Attributes:
        embed_model_name (str): Name of the Hugging Face model to use for embeddings.
            Defaults to "nomic-ai/nomic-embed-text-v1.5".
        embed_model: Loaded Hugging Face embedding model instance.
        batch_size (int): Number of texts to process in each batch. Defaults to 32.
        embeddings (list): Storage for generated embeddings.
    Example:
        >>> embed_data = EmbedData()
        >>> texts = ["Sample text 1", "Sample text 2"]
        >>> embed_data.embed(texts)
        >>> embeddings = embed_data.embeddings
    """
    def __init__(self, 
                 embed_model_name="nomic-ai/nomic-embed-text-v1.5",
                 batch_size = 32):
        self.embed_model_name = embed_model_name
        self.embed_model = self._load_embed_model()
        self.batch_size = batch_size
        self.embeddings = []

    def _load_embed_model(self):
        """
        Load and initialize a HuggingFace embedding model with specified configurations.

        Returns:
            HuggingFaceEmbedding: Initialized embedding model instance configured with the model name.
        """
        embed_model = HuggingFaceEmbedding(model_name=self.embed_model_name,
                                           trust_remote_code=True,
                                           cache_folder='./hf_cache')
        return embed_model
    
    def generate_embedding(self, context):
        return self.embed_model.get_text_embedding_batch(context)
    

    def embed(self, contexts):
        """
        Embeds a list of contexts into vector representations using batched processing.
        This method processes the input contexts in batches and generates embeddings 
        for each context using the underlying embedding model. The embeddings are stored
        internally in the class instance.
        Args:
            contexts (list): List of text contexts to be embedded.
                             Each context should be a string.
        Example:
            embedder = EmbeddingModel()
            contexts = ["text1", "text2", "text3"]
            embedder.embed(contexts)
        """
        self.contexts = contexts
        
        for batch_context in tqdm(batch_iterate(contexts, self.batch_size),
                                  total=len(contexts)//self.batch_size,
                                  desc="Embedding data in batches"):
                                  
            batch_embeddings = self.generate_embedding(batch_context)
            
            self.embeddings.extend(batch_embeddings)


In [None]:
batch_size = 32

embeddata = EmbedData(batch_size=batch_size)

embeddata.embed(texts)

<All keys matched successfully>
Embedding data in batches:  27%|██▋       | 160/590 [05:35<21:48,  3.04s/it]

In [11]:
# # # Writing the embedings to pickle file
import pickle
with open("data/squad_embedded_full.pickle", "wb") as h:
    pickle.dump(embeddata,h)

In [17]:
import dill as pickle

with open("data/squad_embedded_full.pickle", "rb") as h:
    embeddata = pickle.load(h)

ModuleNotFoundError: No module named 'transformers_modules.nomic-ai.nomic-bert-2048.c1b1fd7a715b8eb2e232d34593154ac782c98ac9'

### Vector Database
as we have embedded our dataset, we can define a vector database and dump our embeddings in it.

In [13]:
## Qdrant
from qdrant_client import models
from qdrant_client import QdrantClient
class QdrantVDB:
    def __init__(self, collection_name, vector_dim=768, batch_size=512):
        self.collection_name = collection_name
        self.batch_size = batch_size
        self.vector_dim = vector_dim
    def define_client(self):
        self.client = QdrantClient(url="http://localhost:6333",
                                   prefer_grpc=True)
        
    def create_collection(self):
        if not self.client.collection_exists(collection_name=self.collection_name):
            self.client.create_collection(collection_name=self.collection_name,

                                          vectors_config=models.VectorParams(
                                                              size=self.vector_dim,
                                                              distance=models.Distance.DOT,
                                                              on_disk=True),
                                          optimizers_config=models.OptimizersConfigDiff(
                                                                            default_segment_number=5,
                                                                            indexing_threshold=0)
                                         )
    
    def ingest_data(self, embeddata):
        for batch_context, batch_embeddings in tqdm(zip(batch_iterate(embeddata.contexts, self.batch_size),
                                                        batch_iterate(embeddata.embeddings, self.batch_size)),
                                                    total=len(embeddata.contexts)//self.batch_size,
                                                    desc = "Ingesting in batches"):
            self.client.upload_collection(collection_name=self.collection_name,
                                          vectors=batch_embeddings,
                                          payload=[{"context": context} for context in batch_context])
            
            self.client.update_collection(collection_name=self.collection_name,
                                        optimizer_config=models.OptimizersConfigDiff(indexing_threshold=20000))

In [None]:
database = QdrantVDB(collection_name="squad_collection")
database.define_client()
database.create_collection()
database.ingest_data(embeddata)

## Retriever 
## Search and Retrieve from VectorDB

In [None]:
class Retriever:
    def __init__(self, vector_db, embeddata):
        self.vector_db = vector_db
        self.embeddata = embeddata

In [None]:
def search(self, query):
    query_embedding = self.embeddata.embed_model.get_query_embedding(query)

    # Start the timer for logging the time taken for search
    start_time = time.time()

    result = self.vector_db.client.search()