In [1]:
!pip install chromadb



In [2]:
import chromadb
# Default Chroma client (no configuration arguments are passed).
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection named "my_collection" already exists on the client.
collection = client.get_or_create_collection(name="my_collection")

In [13]:
# Seed the collection with two short documents, each stored under an explicit id.
city_docs = {
    "id1": "This document is about New York",
    "id2": "This document is about Delhi",
}
collection.add(ids=list(city_docs), documents=list(city_docs.values()))

In [4]:
# Fetch every record in the collection (no ids or filters are passed), keep the
# result in `all_docs`, and print it.
all_docs = collection.get()
print(all_docs)

{'ids': ['id1', 'id2'], 'embeddings': None, 'documents': ['This document is about New York', 'This document is about Delhi'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [None, None]}


In [5]:
# get() also accepts an explicit list of ids; here only "id1" is requested.
documents = collection.get(ids=["id1"])
documents

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['This document is about New York'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None]}

In [6]:
# Chroma is a vector database: it is used here because it performs a
# meaning-based (semantic) search rather than plain word matching.
# Expectation: "Chhole Bhature" is more closely related to Delhi, so the Delhi
# document should be ranked first.
collection.query(n_results=2, query_texts=["Query is about Chhole Bhature"])

{'ids': [['id2', 'id1']],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.5588479042053223, 1.8114912509918213]]}

In [7]:
# Expectation: pizza is more closely related to New York, so the New York
# document should be ranked first.
collection.query(n_results=2, query_texts=["Query is about Pizza"])

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.6637976169586182, 1.7067852020263672]]}

In [8]:
# Query by country: ask for the two nearest documents to "India".
collection.query(n_results=2, query_texts=["Query is about India"])

{'ids': [['id2', 'id1']],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[0.7835131287574768, 1.596900224685669]]}

In [9]:
# New York is also known as the "Big Apple", so the New York document is
# expected to be ranked first.
collection.query(n_results=2, query_texts=["Query is about Big apple"])

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.6312181949615479, 1.867558479309082]]}

In [10]:
# Query by topic rather than by place name; return the two nearest documents.
collection.query(n_results=2, query_texts=["Query is about Air pollution"])

{'ids': [['id2', 'id1']],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.5176868438720703, 1.6557269096374512]]}

In [16]:
# Delete every document currently in the collection.
# The ids are fetched fresh instead of reusing the `all_docs` snapshot taken in
# an earlier cell: that snapshot misses anything added since, and makes this
# cell depend on the earlier cell having been run.
current_ids = collection.get()["ids"]
# Skip the call when the collection is already empty.
# NOTE(review): Chroma may reject an empty id list — confirm for the installed version.
if current_ids:
    collection.delete(ids=current_ids)
collection.get()



{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}

In [17]:
# Documents can also be removed by listing their ids explicitly.
collection.delete(ids=["id1", "id2"])
collection.get()

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}

In [18]:
# Add the two documents again under new ids, attaching a source URL to each
# one as metadata. Each record is (id, text, metadata).
docs_with_urls = [
    (
        "id3",
        "This document is about New York",
        {"url": "https://en.wikipedia.org/wiki/New_York_City"},
    ),
    (
        "id4",
        "This document is about Delhi",
        {"url": "https://en.wikipedia.org/wiki/New_Delhi"},
    ),
]
doc_ids, doc_texts, doc_metadatas = (list(column) for column in zip(*docs_with_urls))
collection.add(documents=doc_texts, ids=doc_ids, metadatas=doc_metadatas)


In [19]:
# Run the "Chhole Bhature" query against the documents that carry URL metadata
# and keep the response in `results` for display.
results = collection.query(n_results=2, query_texts=["Query is about Chhole Bhature"])
results

{'ids': [['id4', 'id3']],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_Delhi'},
   {'url': 'https://en.wikipedia.org/wiki/New_York_City'}]],
 'distances': [[1.5588479042053223, 1.8114913702011108]]}