In [1]:
import textwrap
wrapper = textwrap.TextWrapper(width=140)

In [2]:
# model_name = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

In [3]:
# from llama_index.core.llms import ChatMessage, MessageRole
# from llama_index.core import ChatPromptTemplate

# # Text QA Prompt
# chat_text_qa_msgs = [
#     ChatMessage(
#         role=MessageRole.SYSTEM,
#         content=(
#             "Always answer the question, even if the context isn't helpful."
#         ),
#     ),
#     ChatMessage(
#         role=MessageRole.USER,
#         content=(
#             "Context information is below.\n"
#             "---------------------\n"
#             "{context_str}\n"
#             "---------------------\n"
#             "Given the context information and not prior knowledge, "
#             "answer the question: {query_str}\n"
#         ),
#     ),
# ]
# text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

# # Refine Prompt
# chat_refine_msgs = [
#     ChatMessage(
#         role=MessageRole.SYSTEM,
#         content=(
#             "Always answer the question, even if the context isn't helpful."
#         ),
#     ),
#     ChatMessage(
#         role=MessageRole.USER,
#         content=(
#             "We have the opportunity to refine the original answer "
#             "(only if needed) with some more context below.\n"
#             "------------\n"
#             "{context_msg}\n"
#             "------------\n"
#             "Given the new context, refine the original answer to better "
#             "answer the question: {query_str}. "
#             "If the context isn't useful, output the original answer again.\n"
#             "Original Answer: {existing_answer}"
#         ),
#     ),
# ]
# refine_template = ChatPromptTemplate(chat_refine_msgs)

In [4]:
# %pip install llama-index-llms-llama-cpp

In [5]:
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [6]:
import torch
# from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import BitsAndBytesConfig
from llama_index.core import PromptTemplate

# Context Window specifies how many tokens to use as context for the LLM
context_window = 2048
# Max New Tokens specifies how many new tokens to generate for the LLM
max_new_tokens = 256
# Device specifies which device to use for the LLM
device = "cuda"

# This is the prompt that will be used to instruct the model behavior
system_prompt = """
    You are an AI chatbot that is designed to answer questions related to E2E Networks. 
    You are provided with a context and a question. You need to answer the question based on the context provided. 
    If the context is not helpful, do not answer based on prior knowledge, instead, redirect the user to the E2E Networks Support team. 
    You should also provide links that you got from context that are relevant to the answer. 
    You are allowed to answer in first person only, like I/Us/Our; It should feel like a human is answering the question. 
    You only provide the like and not like [E2E Networks Official Website](https://www.e2enetworks.com/)
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

# Create the LLM using the HuggingFaceLLM class
llm = HuggingFaceLLM(
    context_window=context_window,
    max_new_tokens=max_new_tokens,
    system_prompt=system_prompt,
    query_wrapper_prompt=PromptTemplate("<s>[INST] {query_str} [/INST] </s>\n"),
    tokenizer_name=model_name,
    model_name=model_name,
    device_map=device,
    generate_kwargs={"temperature": 0.2, "top_k": 5, "top_p": 0.95},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={
    #     # "torch_dtype": torch.float16
    #     'quantization_config':quantization_config
    # }
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
embedding_model_name = "BAAI/bge-large-en-v1.5"

In [8]:
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Create the embedding model using the HuggingFaceBgeEmbeddings class
embed_model = LangchainEmbedding(
  HuggingFaceBgeEmbeddings(model_name=embedding_model_name)
)

# Get the embedding dimension of the model by doing a forward pass with a dummy input
embed_dim = len(embed_model.get_text_embedding("Hello world")) # 1024

# database

In [9]:
connection_string = "postgresql://postgres:test123@localhost:5432"
db_name = "chatbotdb"
table_name = 'companyDocEmbeddings'

In [10]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter

Settings.llm = llm
Settings.embed_model = embed_model

Settings.chunk_size = 2048
Settings.chunk_overlap = 256

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

Settings.transformations = [SentenceSplitter(chunk_size=1024)]

In [11]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Creates a URL object from the connection string
url = make_url(connection_string)

# Create the vector store
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=table_name,
    embed_dim=embed_dim,
)

In [12]:
from llama_index.core import VectorStoreIndex
from llama_index.core import SummaryIndex

# Load the index from the vector store of the database
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
# summary_index = SummaryIndex.from_vector_store(vector_store=vector_store)

# Engine

In [13]:
from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.core.indices.vector_store.retrievers.retriever import VectorIndexRetriever
from llama_index.core import get_response_synthesizer

# Create the retriever that manages the index and the number of results to return
retriever = VectorIndexRetriever(
      index=index,
      similarity_top_k=5,
)

# Create the response synthesizer that will be used to synthesize the response
response_synthesizer = get_response_synthesizer(
      response_mode='simple_summarize',
)

# Create the query engine that will be used to query the retriever and synthesize the response
query_engine = RetrieverQueryEngine(
      retriever=retriever,
      response_synthesizer=response_synthesizer,
)

In [14]:
response = query_engine.query('Do you know about e2e networks? What is e2e networks? And what all services do they provide?')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [15]:
print(wrapper.fill(response.response))

I'm an AI chatbot designed to answer questions related to E2E Networks based on the context provided. According to the context, E2E Networks
Limited is a rapidly growing cloud computing player based in India. They offer various cloud services including Webuzo Linux Cloud, High
Memory Cloud, E2E Object Storage, E2E Volumes, and Windows Cloud. The Webuzo Linux Cloud provides fully-featured Multi-Account Webuzo
Control panel with plans that include varying numbers of vCPUs, CPU frequency, dedicated RAM, and disk space. The High Memory Cloud offers
access to in-memory data faster than latest NVMe flash storage and is ideal for memory-intensive workloads. E2E Object Storage is an SSD-
based S3-compatible object storage service designed for demanding workloads like machine learning and deep learning. E2E Volumes provides
block-level storage, and Windows Cloud offers Microsoft Windows Server 2016/2019 on E2E Networks Cloud for Windows Server workloads. For
more information, you can visit the 

In [16]:
response = query_engine.query('What is the price of HGX 8xH100?')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [17]:
print(wrapper.fill(response.response))

I'm sorry for any confusion, but the context provided does not mention the price of the HGX 8xH100 GPU. The context only provides
information about the H100 GPU and its different plans with varying vCPUs, dedicated RAM, and disk space, along with their hourly, weekly,
monthly, and longer-term billing options. If you have any other questions related to the context or E2E Networks in general, feel free to
ask and I'll do my best to help you out. If you need more specific pricing information for the HGX 8xH100, I would recommend reaching out to
E2E Networks' sales team at [+91-11-4084-4965](callto:%2B91-11-4084-4965) or [sales@e2enetworks.com](mailto:sales@e2enetworks.com). They
would be able to provide you with the most accurate and up-to-date pricing information.  Here's the link to the E2E Networks Support team:
[https://www.


In [19]:
response = query_engine.query('Is my server at E2E Networks automatically backed up?')
print(wrapper.fill(response.response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Based on the context provided, E2E Networks offers CDP (Continuous Data Protection) backup plans for servers. However, it's important to
note that maintaining your own backups is also an option. If you have signed up for a backup plan from E2E Networks, they will backup your
entire server minus excluded directories as per a backup schedule. If you prefer using a third-party backup service, you can choose from
options like Tarsnap, Barracuda, Rackspace, or Zmanda Cloud Backup.  For more information about E2E Networks CDP backup, you can visit their
website or contact their sales team at [+91-11-4084-4965](callto:%2B91-11-4084-4965) or
[sales@e2enetworks.com](mailto:sales@e2enetworks.com).  Here's the link to the E2E CDP Backup page on their website: [E2E CDP
Backup](https://www.e2enetworks.com/product/e2e-cd


In [20]:
from llama_index.core import tools

In [21]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

vector_tool = QueryEngineTool(
    index.as_query_engine(),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for specific facts."
    )
)

In [22]:
from llama_index.core.query_engine import RouterQueryEngine

query_engine = RouterQueryEngine.from_defaults(
    [vector_tool],
    select_multi=True
)



In [23]:
response = query_engine.query('Is my server at E2E Networks automatically backed up?')
print(wrapper.fill(response.response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Based on the context provided, E2E Networks offers CDP (Continuous Data Protection) backup plans for servers, but the billing for Microsoft
licenses is on a calendar month basis, regardless of usage. If you deprovision your private cloud server in the middle of a billing period,
you will receive a prorated refund, but not for the first month's service fee. E2E Networks provides a 99.95% uptime, and you can customize
your cloud servers by choosing from the available options in the public cloud. The internet bandwidth is not metered, but in exceptional
cases, dedicated bandwidth can be provided for an additional cost. You get root access to your server and can host multiple websites
depending on the resources in your Cloud server plan. The server provisioning process takes around 5-10 minutes once the payment is
completed.  Regarding your question, E2E Networks does offer CDP backup plans for servers, but the context does not provide information
about whether these backups are included i

In [24]:
response = query_engine.query("'What is the price of 8xH100?'")
print(wrapper.fill(response.response))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


I'm sorry for any confusion, but the context provided does not include information about the price of an 8xH100 configuration. The context
only mentions the prices for L40S, 2xL40S, and 4xL40S configurations. If you have more specific details about the hardware or plan you're
interested in, please let me know and I'll try to help you find that information. In the meantime, you can contact E2E Networks sales team
at [+91-11-4084-4965](callto:%2B91-11-4084-4965) or [sales@e2enetworks.com](mailto:sales@e2enetworks.com) for the most accurate and up-to-
date pricing information.  Here are some links that might be helpful for you:  * [E2E Networks L40S Cloud
GPU](https://www.e2enetworks.com/product/l40s-cloud-gpu) * [E2E Networks NVIDIA A100
