
Vector Databases

Vector databases are used to store embeddings and perform similarity searches. There are many vector databases available, some simple and some with advanced features.

In [2]:
pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.2.1-cp312-cp312-win_amd64.whl.metadata (9.1 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch>=1.11.0->sentence-transformers)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch>=1.11.0->sentence-transformers)
  Using cached MarkupSafe-3.0.2-cp312-cp312-win_

In [8]:
pip install --upgrade ipywidgets jupyter


Collecting ipywidgets
  Using cached ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Using cached widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.4.3-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Downloading nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.3-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0

In [9]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# Download the govinfo.gov HTML page and wrap it as LangChain documents.
loader = WebBaseLoader(
    "https://www.govinfo.gov/content/pkg/CDOC-110hdoc50/html/CDOC-110hdoc50.htm"
)
documents = loader.load()

# Break the documents into overlapping chunks. The separators are listed from
# coarsest (blank line) to finest (empty string), so the splitter can fall back
# to smaller units when a piece is still larger than chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)
chunks = text_splitter.split_documents(documents)
print("Number of chunks: ", len(chunks))

# Embedding model used to vectorise the chunks (and, later, the queries).
# Swap model_name for any other compatible Hugging Face model if desired.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

Number of chunks:  430


In [10]:

# Install FAISS-CPU
!pip install faiss-cpu --quiet

# Import FAISS class from vectorstore library
from langchain_community.vectorstores import FAISS

# Create the database
vectore_store = FAISS.from_documents(chunks, embeddings)

# Check the number of chunks that have been indexed
vectore_store.index.ntotal

430

In [11]:
# Pose a natural-language question and fetch the closest chunks from the store.
query = "how long does the president's term last?"
docs = vectore_store.similarity_search(query, k=3)  # k=3 -> the three best matches

# Show each hit: an 80-dash rule, the chunk text, then blank lines as spacing.
# A single print with sep="\n" emits exactly what three separate prints would.
for doc in docs:
    print("-" * 80, doc.page_content, "\n" * 2, sep="\n")

--------------------------------------------------------------------------------
Section 1. No person shall be elected to the office of the 
President more than twice, and no person who has held the 
office of President, or acted as President, for more than two 
years of a term of which some other person was elected 
President shall be elected to the office of the President more 
than once. But this Article shall not apply to any person 
holding the office of President when this Article was proposed 
by the Congress, and shall not prevent any person who may be 
holding the office of President, or acting as President, during 
the term within which this Article becomes operative from 
holding the office of President or acting as President during 
the remainder of such term.
    Section 2. This article shall be inoperative unless it 
shall have been ratified as an amendment to the Constitution by 
the legislatures of three-fourths of the several States within 
seven years from the date of

In [12]:
# Write the vector store to the local "vector_db" folder.
vectore_store.save_local(folder_path="vector_db")

# Read the store back from disk, rebinding the same variable. The same
# `embeddings` object is passed again so that queries are embedded consistently.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in to
# loading serialized (presumably pickled) data; reasonable here because the
# folder was just written by this cell - do not point this at untrusted files.
vectore_store = FAISS.load_local(
    folder_path="vector_db",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)
