In [1]:
import chromadb

# Create the Chroma client with its default settings.
# NOTE(review): the default client is expected to keep data in memory only --
# confirm against the installed Chroma version.
client = chromadb.Client()

# get_or_create_collection makes this cell safe to re-run: a plain
# create_collection call raises once "my_collection" already exists on the
# client, which breaks re-execution of the cell in a live kernel.
collection = client.get_or_create_collection(name="my_collection")

In [2]:
# Seed the collection with two short documents. Each document is paired by
# position with the id at the same index ('id1' -> New York, 'id2' -> Delhi).
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about New York",
        "This document is about Delhi",
    ],
)

In [3]:
# Fetch every record currently stored in the collection (no id filter) and
# keep the result around; a later cell reads all_docs['ids'].
all_docs = collection.get()

# Bare expression so the notebook renders the returned dict as the cell output.
all_docs

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'metadatas': [None, None],
 'documents': ['This document is about New York',
  'This document is about Delhi'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [4]:
# Look up a single record by its id instead of fetching the whole collection.
documents = collection.get(
    ids=["id1"],
)

# Bare expression so the notebook renders the returned dict as the cell output.
documents

{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['This document is about New York'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [5]:
# Semantic (meaning-based) search.
# The query names a food item that is popular in Delhi but never mentions the
# city itself, so a keyword match could not find the document. This is where a
# vector database comes into play: it compares meaning rather than exact words.
#
# Reading the result: a smaller distance means a closer match. In the recorded
# output the Delhi document comes first with a distance of about 1.44, while
# the New York document is at about 1.86, so the query is tied to Delhi.
results = collection.query(
    n_results=2,
    query_texts=["Query is about Chole Bhature"],
)
results

{'ids': [['id2', 'id1']],
 'distances': [[1.443028450012207, 1.8624567985534668]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [6]:
# For this query the New York document is returned first, i.e. it is closer
# to the query than the Delhi document. In the recorded output the margin is
# small (about 1.66 versus about 1.71).
results = collection.query(
    n_results=2,
    query_texts=["Query is about Pizza"],
)
results


{'ids': [['id1', 'id2']],
 'distances': [[1.6637976169586182, 1.7067852020263672]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [7]:
# The more relevant the query is to a document, the smaller the distance.
# Here the New York document is the closest match: in the recorded output it
# is at about 1.10, while the Delhi document is at about 1.57.
results = collection.query(
    n_results=2,
    query_texts=["Query is about Brooklyn Bridge"],
)
results

{'ids': [['id1', 'id2']],
 'distances': [[1.095342755317688, 1.571172833442688]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [8]:
# Empty the collection so the same documents can be added again with metadata.
# The ids are read from the collection as it is right now rather than from the
# `all_docs` snapshot taken in an earlier cell, so this cell does not depend
# on stale notebook state.
stored_ids = collection.get()["ids"]

# Skip the call when nothing is stored (e.g. the cell is run twice).
# NOTE(review): some Chroma versions are believed to reject an empty id list
# in delete() -- confirm against the installed version.
if stored_ids:
    collection.delete(ids=stored_ids)

# Show what is left; every list in the result is expected to be empty.
collection.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [9]:
# Add the documents again, this time with metadata.
# Metadata is useful when building LLM tools: the LLM generates an answer, and
# the metadata tells us which source that answer refers to.
# Ids, documents and metadata entries are paired by position.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about New York",
        "This document is about Delhi",
    ],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/New_York_City"},
        {"url": "https://en.wikipedia.org/wiki/New_Delhi"},
    ],
)

In [10]:
# Run a food-item query against the collection now that the documents carry
# metadata: each hit in the result includes its 'url' metadata, which
# identifies the source of the matched document.
results = collection.query(
    n_results=2,
    query_texts=["Query is about Chole Bhature"],
)
results

{'ids': [['id2', 'id1']],
 'distances': [[1.443028450012207, 1.8624567985534668]],
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_Delhi'},
   {'url': 'https://en.wikipedia.org/wiki/New_York_City'}]],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}