In [1]:
import os

import torch
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index.core import PromptTemplate
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core import get_response_synthesizer
from llama_index.core.indices.vector_store.retrievers.retriever import VectorIndexRetriever
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.vector_stores.postgres import PGVectorStore
from sqlalchemy import make_url
from transformers import BitsAndBytesConfig

In [2]:
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
embedding_model_name = "BAAI/bge-large-en-v1.5"
# Context Window specifies how many tokens to use as context for the LLM
context_window = 4096
# Max New Tokens specifies how many new tokens to generate for the LLM
max_new_tokens = 256
chunk_size = 1024
chunk_overlap = 256
# Device specifies which device to use for the LLM
device = "cuda"
connection_string = "postgresql://postgres:test123@localhost:5432"
db_name = "chatbotdb"
table_name = 'companyDocEmbeddings'
# This is the prompt that will be used to instruct the model behavior
system_prompt = """
    You are an AI chatbot that is designed to answer questions related to E2E Networks. 
    You are provided with a context and a question. You need to answer the question based on the context provided. 
    If the context is not helpful, do not answer based on prior knowledge, instead, redirect the user to the E2E Networks Support team. 
    You should also provide links that you got from context that are relevant to the answer. 
    You are allowed to answer in first person only, like I/Us/Our; It should feel like a human is answering the question. 
    You should only provide the links and not like [E2E Networks Official Website](https://www.e2enetworks.com/)
    You're not allowed to say something like "Based on the context, I think the answer is...", instead, you should directly answer the question.
    When in confusion, you can ask for more information from the user.

    Here is an example of how you should answer:

    Question: What is the pricing for E2E Networks?
    Context: E2E Networks is a cloud computing company that provides cloud infrastructure and cloud services to businesses and startups.
    Unacceptable Answer: Based on the context, I think the pricing for E2E Networks is...
    Acceptable Answer: The pricing for E2E Networks is...
"""
top_k_index_to_return = 16

In [3]:
class AIChatBot:
    def __init__(self, model_name, embedding_model_name, system_prompt, database_connection_string, 
    database_name, table_name, context_window, max_new_tokens, device, chunk_size, chunk_overlap, top_k_index_to_return):
        # Initialize model configuration and database connection here
        self.model_name = model_name
        self.embedding_model_name = embedding_model_name
        self.database_connection_string = database_connection_string
        self.database_name = database_name
        self.table_name = table_name
        self.system_prompt = system_prompt
        self.context_window = context_window
        self.max_new_tokens = max_new_tokens
        self.device = device
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.top_k_index_to_return = top_k_index_to_return

    def setup_model(self):
        # This will wrap the default prompts that are internal to llama-index
        query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

        # Create the LLM using the HuggingFaceLLM class
        llm = HuggingFaceLLM(
            context_window=self.context_window,
            max_new_tokens=self.max_new_tokens,
            system_prompt=self.system_prompt,
            query_wrapper_prompt=query_wrapper_prompt,
            tokenizer_name=self.model_name,
            model_name=self.model_name,
            device_map=self.device,
            generate_kwargs={"temperature": 0.2, "top_k": 5, "top_p": 0.95, "do_sample": True},
            # uncomment this if using CUDA to reduce memory usage
            # model_kwargs={
            #     # "torch_dtype": torch.float16
            #     'quantization_config':quantization_config
            # }
        )

        # Create the embedding model using the HuggingFaceBgeEmbeddings class
        embed_model = LangchainEmbedding(
        HuggingFaceBgeEmbeddings(model_name=embedding_model_name)
        )

        # Get the embedding dimension of the model by doing a forward pass with a dummy input
        embed_dim = len(embed_model.get_text_embedding("Hello world")) # 1024

        self.embed_dim = embed_dim
        self.llm = llm
        self.embed_model = embed_model

    def apply_settings(self):
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model


        Settings.chunk_size = self.chunk_size
        Settings.chunk_overlap = self.chunk_overlap

    def get_index_from_database(self):
        # Creates a URL object from the connection string
        url = make_url(self.database_connection_string)

        # Create the vector store
        vector_store = PGVectorStore.from_params(
            database=self.database_name,
            host=url.host,
            password=url.password,
            port=url.port,
            user=url.username,
            table_name=self.table_name,
            embed_dim=self.embed_dim,
        )

        # Load the index from the vector store of the database
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
        self.index = index

    def setup_engine(self):
        # Create the retriever that manages the index and the number of results to return
        retriever = VectorIndexRetriever(
            index=self.index,
            similarity_top_k=self.top_k_index_to_return,
        )

        # Create the response synthesizer that will be used to synthesize the response
        response_synthesizer = get_response_synthesizer(
            response_mode='simple_summarize',
        )

        # Create the query engine that will be used to query the retriever and synthesize the response
        engine = RetrieverQueryEngine(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
        )
        self.engine = engine

    def build_bot(self):
        self.setup_model()
        self.apply_settings()
        self.get_index_from_database()
        self.setup_engine()


    def process_query(self, query_text):
        response = self.engine.query(query_text)
        return response

In [8]:
# Wire the notebook-level configuration into the bot; keyword arguments make
# the mapping from config names to constructor parameters explicit.
e2e_chatbot = AIChatBot(
    model_name=model_name,
    embedding_model_name=embedding_model_name,
    system_prompt=system_prompt,
    database_connection_string=connection_string,
    database_name=db_name,
    table_name=table_name,
    context_window=context_window,
    max_new_tokens=max_new_tokens,
    device=device,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    top_k_index_to_return=top_k_index_to_return,
)

# Load the models, connect to the vector store and assemble the query engine.
e2e_chatbot.build_bot()

# Ask a sample question; the bare last expression displays the answer text.
response = e2e_chatbot.process_query("What is the business of E2E Networks?")
response.response

'E2E Networks is a cloud computing company that provides cloud infrastructure and cloud services to businesses and startups.'