In [1]:
!pip install langchain chromadb openai tiktoken pypdf langchain_openai langchain_community

Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pypdf
  Downloading pypdf-6.6.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_openai
  Downloading langchain_openai-1.1.7-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter

In [2]:
!pip install -U langchain-chroma

Collecting langchain-chroma
  Downloading langchain_chroma-1.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading langchain_chroma-1.1.0-py3-none-any.whl (12 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-1.1.0


In [6]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [5]:
from langchain_core.documents import Document

doc1 = Document(
    page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
    metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
)

doc2 = Document(
    page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
    metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
)

doc3 = Document(
    page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and ...",
    metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
)

docs = [doc1, doc2, doc3]

In [10]:
import os
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(api_key= OPENAI_API_KEY
),
    persist_directory="./chroma_db",
    collection_name="my_collection"
)

In [11]:
vector_store.add_documents(docs)

['cc8f6639-ca32-4b83-9f32-40dcc48bfdf7',
 'a1c9713e-1840-4499-94e1-935e8795fd50',
 '7d4ee936-fed8-4c0d-b640-5904ed5786bd']

In [13]:
vector_store.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['cc8f6639-ca32-4b83-9f32-40dcc48bfdf7',
  'a1c9713e-1840-4499-94e1-935e8795fd50',
  '7d4ee936-fed8-4c0d-b640-5904ed5786bd'],
 'embeddings': array([[-0.00776256, -0.03804565, -0.00274355, ..., -0.01363974,
         -0.00248025, -0.00390404],
        [-0.01003289, -0.02619806,  0.01095144, ..., -0.00847394,
         -0.01124899, -0.03516362],
        [-0.00118049, -0.0100914 ,  0.01644185, ...,  0.00656955,
         -0.02558494, -0.02232796]]),
 'documents': ['A bunch of scientists bring back dinosaurs and mayhem breaks loose',
  'Leo DiCaprio gets lost in a dream within a dream within a dream within a ...',
  'A psychologist / detective gets lost in a series of dreams within dreams within dreams and ...'],
 'uris': None,
 'included': ['embeddings', 'documents', 'metadatas'],
 'data': None,
 'metadatas': [{'year': 1993, 'rating': 7.7, 'genre': 'science fiction'},
  {'rating': 8.2, 'year': 2010, 'director': 'Christopher Nolan'},
  {'director': 'Satoshi Kon', 'year': 2006, 'rating

In [22]:
vector_store.similarity_search_with_relevance_scores(
    query="Which is about dinosaur?",
    k=1,
)

[(Document(id='cc8f6639-ca32-4b83-9f32-40dcc48bfdf7', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}, page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose'),
  0.7732003903175413)]

In [23]:
vector_store.delete(ids = ['cc8f6639-ca32-4b83-9f32-40dcc48bfdf7'])

In [24]:
vector_store.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['a1c9713e-1840-4499-94e1-935e8795fd50',
  '7d4ee936-fed8-4c0d-b640-5904ed5786bd'],
 'embeddings': array([[-0.01003289, -0.02619806,  0.01095144, ..., -0.00847394,
         -0.01124899, -0.03516362],
        [-0.00118049, -0.0100914 ,  0.01644185, ...,  0.00656955,
         -0.02558494, -0.02232796]]),
 'documents': ['Leo DiCaprio gets lost in a dream within a dream within a dream within a ...',
  'A psychologist / detective gets lost in a series of dreams within dreams within dreams and ...'],
 'uris': None,
 'included': ['embeddings', 'documents', 'metadatas'],
 'data': None,
 'metadatas': [{'director': 'Christopher Nolan', 'rating': 8.2, 'year': 2010},
  {'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'}]}

## Using Faiss

In [38]:
from langchain_community.vectorstores import FAISS
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [39]:

vector_store = FAISS.from_documents(docs,
    embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY)
)

In [43]:
vector_store.docstore.search(vector_store.index_to_docstore_id[0])

Document(id='3d60dc9b-cb09-4b85-928d-cfa3c3632232', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}, page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose')