# Dependencies

In [1]:
# Packages used by this notebook. Every command is commented out;
# uncomment the ones that still need to be installed in the kernel's environment.
# !conda install psycopg2
# !pip install sqlalchemy
# !pip install langchain
# !pip install llama-index
# !pip install llama-index-llms-huggingface
# !pip install torch
# !pip install llama-index-embeddings-langchain
# !pip install bitsandbytes
# !pip install sentence_transformers
# %pip install llama-index-readers-web
# %pip install llama-index-vector-stores-postgres

# Embedding Model

In [2]:
# Model name handed to HuggingFaceBgeEmbeddings when the embedding model is built
embedding_model_name = "BAAI/bge-large-en-v1.5"

In [3]:
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Build the LangChain BGE embedder first, then wrap it in the LlamaIndex adapter
bge_embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_name)
embed_model = LangchainEmbedding(bge_embeddings)

# Embed a dummy sentence once and measure the vector length to learn the model's dimension
probe_vector = embed_model.get_text_embedding("Hello world")
embed_dim = len(probe_vector) # 1024

# Database

In [4]:
import os

# Connection settings for the PostgreSQL server.
# The connection string carries a password, so it can be supplied through the
# PG_CONNECTION_STRING environment variable instead of being edited into the
# notebook. The literal below is only the fallback used when the variable is unset.
connection_string = os.environ.get(
    "PG_CONNECTION_STRING", "postgresql://postgres:test123@localhost:5432"
)
# Name of the database this notebook creates and stores vectors in
db_name = "chatbotdb"
# Table name passed to the vector store
table_name = 'companyDocEmbeddings'

In [5]:
import psycopg2
from psycopg2 import sql

# Connect to the database server
conn = psycopg2.connect(connection_string)
# Set autocommit to True to avoid having to commit after every command
conn.autocommit = True

# Create the database
# If it already exists, then delete it and create a new one.
# The database name is composed as a quoted SQL identifier instead of being
# pasted into the statement text, so it reaches the server exactly as written
# in db_name (no case folding, no accidental SQL syntax).
db_identifier = sql.Identifier(db_name)
with conn.cursor() as c:
    c.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(db_identifier))
    c.execute(sql.SQL("CREATE DATABASE {}").format(db_identifier))

# Knowledge

In [6]:
from llama_index.readers.web import SimpleWebPageReader

# Pages that make up the knowledge base
page_urls = [
    "https://www.e2enetworks.com/",
    "https://www.e2enetworks.com/products",
    "https://www.e2enetworks.com/about-us",
    "https://www.e2enetworks.com/contact-us",
    "https://www.e2enetworks.com/contact-sales",
    "https://www.e2enetworks.com/policies/service-level-agreement",
    "https://www.e2enetworks.com/policies/terms-of-service",
    "https://www.e2enetworks.com/policies/privacy-policy",
    "https://www.e2enetworks.com/policies/refund-policy",
    "https://www.e2enetworks.com/policy-faq",
    "https://www.e2enetworks.com/countries-served",
]

# Load every page through a reader that has html_to_text switched on
web_reader = SimpleWebPageReader(html_to_text=True)
documents = web_reader.load_data(page_urls)

# Index the knowledge

In [7]:
from llama_index.core import Settings

In [8]:
# No LLM is configured here; the notice printed under this cell
# ("LLM is explicitly disabled. Using MockLLM.") is the result of this assignment.
Settings.llm = None
# Register the embedding model built earlier in the notebook
Settings.embed_model = embed_model

# Chunking parameters — presumably applied when documents are split into nodes; TODO confirm units
# NOTE(review): the overlap is half of the chunk size — confirm this much overlap is intended
Settings.chunk_size = 1024
Settings.chunk_overlap = 512

LLM is explicitly disabled. Using MockLLM.


In [9]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Parse the connection string so its individual parts can be read off
url = make_url(connection_string)

# Collect the server details taken from the URL together with the database,
# table and embedding-dimension settings, then build the vector store from them
store_params = {
    "database": db_name,
    "host": url.host,
    "password": url.password,
    "port": url.port,
    "user": url.username,
    "table_name": table_name,
    "embed_dim": embed_dim,
}
vector_store = PGVectorStore.from_params(**store_params)

In [10]:
from llama_index.core.storage.storage_context import StorageContext

# Storage context with defaults for everything except the vector store,
# which is the Postgres-backed store; it is handed to the index when indexing
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [11]:
from llama_index.core import VectorStoreIndex

# Build the index from the loaded documents, using the storage context
# created for the vector store and showing progress bars while it runs
index_options = {"storage_context": storage_context, "show_progress": True}
index = VectorStoreIndex.from_documents(documents, **index_options)

Parsing nodes:   0%|          | 0/11 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/117 [00:00<?, ?it/s]

In [None]:
# Close the psycopg2 connection that was opened to create the database
conn.close()