## Vector Stores

Readings:

https://python.langchain.com/docs/concepts/vectorstores/

In [1]:
# Install the packages if they are not already installed
# !pip install -qU langchain-google-genai langchain-core

In [2]:
import getpass
import os

# Ask for the Gemini API key interactively only when the environment
# variable is missing or empty, so a key that is already exported is reused.
if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embeddings client passed to the vector stores created later in this notebook.
embeddings_model = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
)

#### In Memory Vector Store

In [3]:
from langchain_core.vectorstores import InMemoryVectorStore

# Create the in-memory store, handing it the embeddings model defined earlier.
in_mem_vector_store = InMemoryVectorStore(embedding=embeddings_model)

In [4]:
from langchain_core.documents import Document

# Three short sample documents; each carries metadata describing its origin.
document_1 = Document(
    metadata={"source": "tweet"},
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
)

document_2 = Document(
    metadata={"source": "news"},
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
)

document_3 = Document(
    metadata={
        "source": "wikipedia",
        "title": "Albert Einstein",
        "url": "https://en.wikipedia.org/wiki/Albert_Einstein",
        "last_updated": "2025-01-01",
        "lang": "en",
    },
    page_content="Albert Einstein was a theoretical physicist who developed the theory of relativity.",
)

documents = [document_1, document_2, document_3]

# Supplying explicit IDs is usually recommended: re-adding a document under
# an existing ID then updates it instead of storing the same document again.
docs = in_mem_vector_store.add_documents(
    documents=documents,
    ids=["doc1", "doc2", "doc3"],
)
docs

['doc1', 'doc2', 'doc3']

##### Delete

In [5]:
# Remove the entry stored under the ID "doc1" from the in-memory vector store.
in_mem_vector_store.delete(
    ids=["doc1"],
)


#### ChromaDB

In [6]:
# One-time setup: install the Chroma integration package
# !pip install -U langchain-chroma

from langchain_chroma import Chroma

# persist_directory is where the data is saved locally; remove that argument if it is not necessary.
vector_store_chromadb = Chroma(
    embedding_function=embeddings_model,
    collection_name="example_collection",
    persist_directory="./chroma_langchain_db",
)

In [7]:
# Add the sample documents to the Chroma store under explicit IDs;
# the value returned by the call is shown as the cell output.
docs = vector_store_chromadb.add_documents(
    ids=["doc1", "doc2", "doc3"],
    documents=documents,
)
docs

['doc1', 'doc2', 'doc3']

#### Search

Vector stores embed and store the documents that are added to them. If we pass in a query, the vector store will embed the query, perform a similarity search over the embedded documents, and return the most similar ones.

In [8]:
# Search the Chroma store for documents similar to the query text,
# leaving every other search argument at its default.
query = "Einstein"
result = vector_store_chromadb.similarity_search(query=query)
result

[Document(id='doc3', metadata={'lang': 'en', 'url': 'https://en.wikipedia.org/wiki/Albert_Einstein', 'source': 'wikipedia', 'title': 'Albert Einstein', 'last_updated': '2025-01-01'}, page_content='Albert Einstein was a theoretical physicist who developed the theory of relativity.'),
 Document(id='doc1', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(id='doc2', metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.')]

#### Search Parameters / Metadata filtering

Typical search parameters:
- query (str) - Text to look up documents similar to.
- k (int) - Number of documents to return. Defaults to 4.
- filter (dict | None) - Dictionary of argument(s) to filter on metadata

In [9]:
# Ask for at most two results and keep only documents whose
# "source" metadata equals "wikipedia".
vector_store_chromadb.similarity_search(
    query="Einstein",
    filter={"source": "wikipedia"},
    k=2,
)


[Document(id='doc3', metadata={'lang': 'en', 'last_updated': '2025-01-01', 'source': 'wikipedia', 'title': 'Albert Einstein', 'url': 'https://en.wikipedia.org/wiki/Albert_Einstein'}, page_content='Albert Einstein was a theoretical physicist who developed the theory of relativity.')]

In [10]:
# Same search, but filtering on "source" == "www": no result is expected for this source.
vector_store_chromadb.similarity_search(
    query="Einstein",
    filter={"source": "www"},
    k=2,
)

[]