Vector stores in langchain


In [18]:
!pip install langchain_openai
!pip install -U langchain-chroma
!pip install -U langchain-huggingface
!pip install pypdf tiktoken

Collecting langchain-chroma
  Using cached langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Using cached langchain_chroma-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-1.0.0


Collecting langchain-huggingface
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.1.0-py3-none-any.whl (29 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.1.0


KeyboardInterrupt: 

In [2]:
from langchain_openai import OpenAIEmbeddings


In [8]:
!pip uninstall langchain_chroma



Found existing installation: langchain-chroma 1.0.0
Uninstalling langchain-chroma-1.0.0:
  Would remove:
    /usr/local/lib/python3.12/dist-packages/langchain_chroma-1.0.0.dist-info/*
    /usr/local/lib/python3.12/dist-packages/langchain_chroma/*
Proceed (Y/n)? y
  Successfully uninstalled langchain-chroma-1.0.0


In [1]:
from langchain_chroma import Chroma

In [16]:
!pip install sentence-transformers



In [6]:
from langchain_core.documents import Document

doc1 = Document(
    page_content = "Virat Kohli is one of the most successful Indian cricketers, known for his consistency and aggressive batting style. He has been the captain of the Indian team and plays for Royal Challengers Bangalore in the IPL.",
    metadata = {"team": "Royal Challengers Bangalore"}
)

doc2 = Document(
    page_content = "Rohit Sharma is an explosive Indian batsman, renowned for his ability to score big hundreds, including multiple double centuries in One Day Internationals. He is the captain of Mumbai Indians in the IPL.",
    metadata = {"team": "Mumbai Indians"}
)

doc3 = Document(
    page_content = "Jasprit Bumrah is one of the best fast bowlers in modern-day cricket, known for his yorkers and death-over bowling. He plays for Mumbai Indians in the IPL and has been a key player for India in all formats.",
    metadata = {"team": "Mumbai Indians"}
)

doc4 = Document(
    page_content = "MS Dhoni is a legendary Indian cricketer, recognized for his calmness under pressure and incredible leadership skills. He has been the captain of Chennai Super Kings in the IPL and led India to numerous victories, including the 2007 T20 World Cup.",
    metadata = {"team": "Chennai Super Kings"}
)

doc5 = Document(
    page_content = "Ravindra Jadeja is an all-rounder known for his superb fielding, bowling, and batting skills. He plays for Chennai Super Kings in the IPL and is a key player for India in all three formats.",
    metadata = {"team": "Chennai Super Kings"}
)


In [7]:
docs = [doc1, doc2, doc3, doc4, doc5]

To use open-source embedding models, you typically need to install the `sentence-transformers` library, which provides access to a wide range of pre-trained models from Hugging Face.

Once installed, you can import `HuggingFaceEmbeddings` and use it as the `embedding_function` for your `Chroma` vector store. You can specify a model name, or it will default to a general-purpose model.

In [4]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Initialize HuggingFaceEmbeddings. You can choose a specific model,
# for example: 'sentence-transformers/all-MiniLM-L6-v2'
# If no model_name is provided, it defaults to 'sentence-transformers/all-MiniLM-L6-v2'
hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Now, create your vector store using the HuggingFaceEmbeddings
vector_store_hf = Chroma(
    embedding_function = hf_embeddings,
    persist_directory = "chroma_db", # Using a different directory to avoid conflicts
    collection_name= "sample_hf"
)

print("Chroma vector store initialized with HuggingFaceEmbeddings and documents added.")

NameError: name 'docs' is not defined

In [8]:
# add documents
vector_store_hf.add_documents(docs)


['3c558c0a-eef0-4aa1-9a4d-53116bd0f77d',
 '0e2f0530-b29b-4459-9ecd-3c77bf152103',
 'b6037d8f-613b-40d9-8fa2-cf1c4b12e540',
 'f1d45796-f22a-49df-8cdb-74fd4884da73',
 '31407f18-dc8b-467e-912f-8260c01e3433']

In [10]:
# view documents
vector_store_hf.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['3c558c0a-eef0-4aa1-9a4d-53116bd0f77d',
  '0e2f0530-b29b-4459-9ecd-3c77bf152103',
  'b6037d8f-613b-40d9-8fa2-cf1c4b12e540',
  'f1d45796-f22a-49df-8cdb-74fd4884da73',
  '31407f18-dc8b-467e-912f-8260c01e3433'],
 'embeddings': array([[ 0.02629065,  0.03684738, -0.03583898, ...,  0.00820491,
          0.03308463, -0.00751587],
        [ 0.01015616,  0.02171214, -0.05621425, ..., -0.0222393 ,
         -0.02329503, -0.01656128],
        [-0.0054234 , -0.02580199, -0.0462631 , ..., -0.10444629,
         -0.00320863,  0.09166257],
        [-0.05090713,  0.06426378,  0.04696449, ...,  0.00437634,
         -0.03580778, -0.04898571],
        [ 0.00071029,  0.0242068 , -0.01900118, ..., -0.08556581,
         -0.09028383,  0.0034117 ]]),
 'documents': ['Virat Kohli is one of the most successful Indian cricketers, known for his consistency and aggressive batting style. He has been the captain of the Indian team and plays for Royal Challengers Bangalore in the IPL.',
  'Rohit Sharma is an ex

In [13]:
# Search Documents
vector_store_hf.similarity_search(
    query = "who is the captain of chennai super kings",
    k = 2
)

[Document(id='f1d45796-f22a-49df-8cdb-74fd4884da73', metadata={'team': 'Chennai Super Kings'}, page_content='MS Dhoni is a legendary Indian cricketer, recognized for his calmness under pressure and incredible leadership skills. He has been the captain of Chennai Super Kings in the IPL and led India to numerous victories, including the 2007 T20 World Cup.'),
 Document(id='31407f18-dc8b-467e-912f-8260c01e3433', metadata={'team': 'Chennai Super Kings'}, page_content='Ravindra Jadeja is an all-rounder known for his superb fielding, bowling, and batting skills. He plays for Chennai Super Kings in the IPL and is a key player for India in all three formats.')]

In [14]:
# Search with similarity score

vector_store_hf.similarity_search_with_score(
    query = "who among these are bowlers",
    k=2
)

[(Document(id='b6037d8f-613b-40d9-8fa2-cf1c4b12e540', metadata={'team': 'Mumbai Indians'}, page_content='Jasprit Bumrah is one of the best fast bowlers in modern-day cricket, known for his yorkers and death-over bowling. He plays for Mumbai Indians in the IPL and has been a key player for India in all formats.'),
  1.0735973119735718),
 (Document(id='31407f18-dc8b-467e-912f-8260c01e3433', metadata={'team': 'Chennai Super Kings'}, page_content='Ravindra Jadeja is an all-rounder known for his superb fielding, bowling, and batting skills. He plays for Chennai Super Kings in the IPL and is a key player for India in all three formats.'),
  1.1793253421783447)]

In [19]:
# meta-date filtering

vector_store_hf.similarity_search_with_score(
    query = "",
    filter = {"name": "rohit sharma"}
)

[]

In [20]:
# update documents

updated_doc1 = Document(
    page_content = "Virat Kohli is regarded as one of the greatest modern-day batsmen, with numerous records to his name across all formats of the game. He is known for his aggressive yet technically sound batting, and has been a driving force for India in both Test and limited-overs cricket. Kohli played for Royal Challengers Bangalore in the IPL for many years and is also a former captain of the Indian cricket team, leading them to several international triumphs.",
    metadata = {"team": "Royal Challengers Bangalore"}
)

vector_store_hf.update_document(document_id = '3c558c0a-eef0-4aa1-9a4d-53116bd0f77d', document = updated_doc1)

In [21]:
# view documents
vector_store_hf.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['3c558c0a-eef0-4aa1-9a4d-53116bd0f77d',
  '0e2f0530-b29b-4459-9ecd-3c77bf152103',
  'b6037d8f-613b-40d9-8fa2-cf1c4b12e540',
  'f1d45796-f22a-49df-8cdb-74fd4884da73',
  '31407f18-dc8b-467e-912f-8260c01e3433'],
 'embeddings': array([[ 0.03658912,  0.05371374, -0.03403616, ..., -0.04829875,
          0.0299286 , -0.00068657],
        [ 0.01015616,  0.02171214, -0.05621425, ..., -0.0222393 ,
         -0.02329503, -0.01656128],
        [-0.0054234 , -0.02580199, -0.0462631 , ..., -0.10444629,
         -0.00320863,  0.09166257],
        [-0.05090713,  0.06426378,  0.04696449, ...,  0.00437634,
         -0.03580778, -0.04898571],
        [ 0.00071029,  0.0242068 , -0.01900118, ..., -0.08556581,
         -0.09028383,  0.0034117 ]]),
 'documents': ['Virat Kohli is regarded as one of the greatest modern-day batsmen, with numerous records to his name across all formats of the game. He is known for his aggressive yet technically sound batting, and has been a driving force for India in bot

In [22]:
# delete documents
vector_store_hf.delete(ids=['3c558c0a-eef0-4aa1-9a4d-53116bd0f77d'])

In [23]:
# view documents
vector_store_hf.get(include=['embeddings', 'documents', 'metadatas'])

{'ids': ['0e2f0530-b29b-4459-9ecd-3c77bf152103',
  'b6037d8f-613b-40d9-8fa2-cf1c4b12e540',
  'f1d45796-f22a-49df-8cdb-74fd4884da73',
  '31407f18-dc8b-467e-912f-8260c01e3433'],
 'embeddings': array([[ 0.01015616,  0.02171214, -0.05621425, ..., -0.0222393 ,
         -0.02329503, -0.01656128],
        [-0.0054234 , -0.02580199, -0.0462631 , ..., -0.10444629,
         -0.00320863,  0.09166257],
        [-0.05090713,  0.06426378,  0.04696449, ...,  0.00437634,
         -0.03580778, -0.04898571],
        [ 0.00071029,  0.0242068 , -0.01900118, ..., -0.08556581,
         -0.09028383,  0.0034117 ]]),
 'documents': ['Rohit Sharma is an explosive Indian batsman, renowned for his ability to score big hundreds, including multiple double centuries in One Day Internationals. He is the captain of Mumbai Indians in the IPL.',
  'Jasprit Bumrah is one of the best fast bowlers in modern-day cricket, known for his yorkers and death-over bowling. He plays for Mumbai Indians in the IPL and has been a key p