## ChromaDB


In [1]:
# Uncomment to install Chroma into this kernel's environment (one-time setup).
# %pip install chromadb

## Create a DB

In [2]:
import chromadb
# Client() is called with no arguments, i.e. with Chroma's default settings.
chroma_client = chromadb.Client()


## Create a collection

In [3]:
# Create a new collection named "my_collection" on chroma_client and keep a handle to it.
collection = chroma_client.create_collection(name="my_collection")

## Add some text documents to the collection

In [4]:
# Two sample documents, each paired with the unique id at the same position.
sample_documents = [
    "This is a document about pineapple. This is a document about oranges",
    " Welcome to Natural Language Processing  It is one of the most exciting research areas as of today  We will see how Python can be used to work with text files. ",
]
sample_ids = ["id1", "id2"]

collection.add(documents=sample_documents, ids=sample_ids)


## Query the collection

In [5]:
# Chroma embeds the query text for us; ask for the two closest documents.
hawaii_query = "This is a query document about hawaii"
results = collection.query(query_texts=[hawaii_query], n_results=2)
print(results)

{'ids': [['id1', 'id2']], 'distances': [[1.082511067390442, 1.848189115524292]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['This is a document about pineapple. This is a document about oranges', ' Welcome to Natural Language Processing  It is one of the most exciting research areas as of today  We will see how Python can be used to work with text files. ']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [6]:
# Same query shape as before, this time with a text that mentions oranges.
oranges_query = "This is a query document about oranges"
results = collection.query(query_texts=[oranges_query], n_results=2)
print(results)

{'ids': [['id1', 'id2']], 'distances': [[0.5819712281227112, 1.859252691268921]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['This is a document about pineapple. This is a document about oranges', ' Welcome to Natural Language Processing  It is one of the most exciting research areas as of today  We will see how Python can be used to work with text files. ']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


## Persist the data

In [7]:
# Persist the database in a folder relative to the notebook's working directory.
# An absolute "/db/" would point at the filesystem root, which is not portable
# and is frequently not writable.
client = chromadb.PersistentClient(path="./db/")

In [8]:
# Returns a nanosecond heartbeat; useful for making sure the client remains connected.
client.heartbeat()

1727027208703287400

In [10]:
# Left disabled: the client above was created without Settings(allow_reset=True).
# NOTE(review): Chroma is expected to refuse reset() in that case — confirm.
# client.reset()

In [11]:
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

In [12]:
# Display the default tenant and database names as a pair.
DEFAULT_TENANT, DEFAULT_DATABASE

('default_tenant', 'default_database')

In [13]:
# Folder for this database, relative to the notebook's working directory.
# (An absolute "/db2/" targets the filesystem root, which is not portable and
# is frequently not writable.) Defined once so `path` and `persist_directory`
# cannot drift apart.
DB2_PATH = "./db2/"

# path     - must be a local path on the machine where Chroma is running. If the
#            path does not exist, it will be created. It can be relative or
#            absolute; if omitted, the default is ./chroma in the current
#            working directory.
# settings - Chroma settings object.
# tenant   - the tenant to use. Default is default_tenant.
# database - the database to use. Default is default_database.
client = chromadb.PersistentClient(
    path=DB2_PATH,
    settings=Settings(
        is_persistent=True,
        persist_directory=DB2_PATH,
        allow_reset=True,  # lets client.reset() be called on this client
        anonymized_telemetry=False,
    ),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

In [14]:
# Destructive: resets the database behind `client` (the client was configured with allow_reset=True).
# NOTE(review): per the Chroma docs this removes all collections and entries — confirm before running on real data.
client.reset()

True

## Creating, inspecting, and deleting Collections#

Chroma uses collection names in the url, so there are a few restrictions on naming them:

* The length of the name must be between 3 and 63 characters.
* The name must start and end with a lowercase letter or a digit, and it can contain dots, dashes, and underscores in between.
* The name must not contain two consecutive dots.
* The name must not be a valid IP address.

## Chroma collections are created with a name and an optional embedding function. If you supply an embedding function, you must supply it every time you get the collection.

In [15]:
from chromadb.utils import embedding_functions

In [16]:
# Build the sentence-transformers embedding function with its default arguments.
emb_fun = embedding_functions.SentenceTransformerEmbeddingFunction()

  from tqdm.autonotebook import tqdm, trange


In [17]:
# Name of the sentence-transformers model, passed explicitly this time.
model_name = "all-MiniLM-L6-v2"
# Rebinds emb_fun, replacing the embedding function that was built with default arguments.
emb_fun = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

In [19]:
# emb_fun(["foo"])

In [20]:
# Embed a single probe string and report how many components its vector has.
probe_vector = emb_fun(["foo"])[0]
len(probe_vector)

384

## Create collection

In [21]:
# Create the collection and attach the custom embedding function to it.
collection = client.create_collection(
    name="my_collection2",
    embedding_function=emb_fun,
)


## Get collection

In [22]:
# Look the collection up by name, supplying the same embedding function again.
collection = client.get_collection(
    name="my_collection2",
    embedding_function=emb_fun,
)

## Delete collection

In [23]:
# Delete the collection identified by its name.
client.delete_collection(name="my_collection2")

## If not sure if a collection exist or needs to be created

In [24]:
# Fetch "some_collection" if it already exists, otherwise create it.
collection = client.get_or_create_collection(
    name="some_collection",
)

In [25]:
# Display the collection's id and name.
collection

Collection(id=04a00942-b72f-4a32-96cf-234c2690e070, name=some_collection)

## Rename a collection

In [26]:
# Rename the collection; only the name changes, the collection id stays the same
# (compare the output of displaying `collection` before and after this cell).
collection.modify(name="new_name")

In [27]:
# Display the collection again to confirm the new name.
collection

Collection(id=04a00942-b72f-4a32-96cf-234c2690e070, name=new_name)

## Add documents

In [28]:
# Three short documents, each with chapter/verse metadata and a unique id.
collection.add(
    ids=["id1", "id2", "id3"],
    documents=["som2", "doc2", "doc3"],
    metadatas=[
        {"chapter": "3", "verse": "16"},
        {"chapter": "3", "verse": "5"},
        {"chapter": "29", "verse": "11"},
    ],
)

If Chroma is passed a list of documents, it will automatically tokenize and embed them with the collection's embedding function (the default will be used if none was supplied at collection creation). Chroma will also store the documents themselves. If the documents are too large to embed using the chosen embedding function, an exception will be raised.

Each document must have a unique associated id. Trying to .add the same ID twice will result in only the initial value being stored. An optional list of metadata dictionaries can be supplied for each document, to store additional information and enable filtering.

Alternatively, you can supply a list of document-associated embeddings directly, and Chroma will store the associated documents without embedding them itself.

In [None]:
collection.add(
    documents=["doc1", "doc2", "doc3"],
    # Optional: pass precomputed vectors instead of letting Chroma embed the documents.
    # Left disabled here; the 3-component vectors below are illustrative placeholders only.
#     embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2]],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}],
    ids=["id11", "id21", "id31"]
)

In [29]:
from sklearn.datasets import fetch_20newsgroups
# Load the 'train' subset of the 20 newsgroups corpus to use as sample documents.
newsgroups_train = fetch_20newsgroups(subset='train')

In [30]:
# Take the first 100 posts and record each post's character length as metadata.
doc_lists = list(newsgroups_train['data'][:100])
metda_list = [{'len_of_doc': len(post)} for post in doc_lists]

In [31]:
# Add the newsgroup posts together with their length metadata.
# The ids are generated from the actual number of documents, so the three lists
# always have matching lengths even if the size of doc_lists changes.
collection.add(
    documents=doc_lists,
    metadatas=metda_list,
    ids=[f"id_{i}" for i in range(len(doc_lists))],
)

## Peek at the first few records

In [32]:
# Returns the first 2 items in the collection (the argument is the number of items to show).
collection.peek(2)

{'ids': ['id1', 'id2'],
 'embeddings': [[-0.007276785094290972,
   -0.03586104139685631,
   -0.0585470050573349,
   0.04303989186882973,
   0.025623103603720665,
   -0.025396939367055893,
   0.033715952187776566,
   0.0034339618869125843,
   -0.02634759247303009,
   -0.04613979905843735,
   0.08306796848773956,
   -0.006422397680580616,
   -0.03559736907482147,
   -0.03969216346740723,
   0.0466601699590683,
   0.026125263422727585,
   0.08533192425966263,
   -0.016890788450837135,
   -0.001704622758552432,
   -0.018152790144085884,
   0.06352955847978592,
   -0.037103720009326935,
   0.05742988735437393,
   0.02375021018087864,
   -0.04897937923669815,
   0.034039758145809174,
   0.004022863693535328,
   0.0653342455625534,
   0.06259661912918091,
   -0.16734807193279266,
   0.07337174564599991,
   0.09654492139816284,
   -0.027098871767520905,
   0.00661829998716712,
   -0.08192151039838791,
   -0.028260089457035065,
   0.03995088115334511,
   -0.09238503128290176,
   -0.094994731247

## Get number of documents in a collection

In [33]:
# Returns the number of items in the collection
# (the 103 shown here = 3 hand-written documents + 100 newsgroup posts).
collection.count()

103

In [34]:
# Display the collection handle; it is still the renamed collection.
collection

Collection(id=04a00942-b72f-4a32-96cf-234c2690e070, name=new_name)

## Define an alternate distance function

In [None]:
# Create a collection that uses cosine distance instead of the default.
# A separate variable is used so that the `collection` handle used by the
# surrounding cells is not replaced by this new, empty collection.
cosine_collection = client.create_collection(
    name="collection_name",
    metadata={"hnsw:space": "cosine"},  # l2 is the default
)

Valid options for hnsw:space are:

| Parameter | Full name |
|-----------|-----------|
| "l2" | Squared L2 |
| "ip" | Inner product |
| "cosine" | Cosine similarity |

## Query the collections

In [None]:
# Template only: the `...` entry is a placeholder and must be replaced with
# real query vectors before this cell can run.
# NOTE(review): the example vectors have 3 components; they presumably need to
# match the dimensionality of the collection's embeddings — confirm.
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...],
    n_results=10,
    # `where` filters on metadata, `where_document` filters on document contents.
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains":"search_string"}
)

.get also supports the where and where_document filters. If no ids are supplied, it will return all items in the collection that match the where and where_document filters.

## Choosing which data is returned

When using get or query you can use the include parameter to specify which data you want returned - any of embeddings, documents, metadatas, and for query, distances. By default, Chroma will return the documents, metadatas and in the case of query, the distances of the results. embeddings are excluded by default for performance and the ids are always returned. You can specify which of these you want returned by passing an array of included field names to the include parameter of the query or get method.

In [35]:
# Return only the stored documents; the ids are always part of the result.
collection.get(include=["documents"])

{'ids': ['id1',
  'id2',
  'id3',
  'id_0',
  'id_1',
  'id_10',
  'id_11',
  'id_12',
  'id_13',
  'id_14',
  'id_15',
  'id_16',
  'id_17',
  'id_18',
  'id_19',
  'id_2',
  'id_20',
  'id_21',
  'id_22',
  'id_23',
  'id_24',
  'id_25',
  'id_26',
  'id_27',
  'id_28',
  'id_29',
  'id_3',
  'id_30',
  'id_31',
  'id_32',
  'id_33',
  'id_34',
  'id_35',
  'id_36',
  'id_37',
  'id_38',
  'id_39',
  'id_4',
  'id_40',
  'id_41',
  'id_42',
  'id_43',
  'id_44',
  'id_45',
  'id_46',
  'id_47',
  'id_48',
  'id_49',
  'id_5',
  'id_50',
  'id_51',
  'id_52',
  'id_53',
  'id_54',
  'id_55',
  'id_56',
  'id_57',
  'id_58',
  'id_59',
  'id_6',
  'id_60',
  'id_61',
  'id_62',
  'id_63',
  'id_64',
  'id_65',
  'id_66',
  'id_67',
  'id_68',
  'id_69',
  'id_7',
  'id_70',
  'id_71',
  'id_72',
  'id_73',
  'id_74',
  'id_75',
  'id_76',
  'id_77',
  'id_78',
  'id_79',
  'id_8',
  'id_80',
  'id_81',
  'id_82',
  'id_83',
  'id_84',
  'id_85',
  'id_86',
  'id_87',
  'id_88',
  'id_8

In [None]:
# Template only: the `...` entry is a placeholder and must be replaced with
# real query vectors before this cell can run.
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1],[1.1, 2.3, 3.2], ...],
    # Restrict the returned fields to the documents.
    include=["documents"]
)

## Using Where filters

Chroma supports filtering queries by metadata and document contents. The where filter is used to filter by metadata, and the where_document filter is used to filter by document contents.

## Filtering by metadata
In order to filter on metadata, you must supply a where filter dictionary to the query. The dictionary must have the following structure:

{
    "metadata_field": {
        <Operator>: <Value>
    }
}
    
Filtering metadata supports the following operators:

 * $eq - equal to (string, int, float)
 
 * $ne - not equal to (string, int, float)
    
 * $gt - greater than (int, float)
 
 * $gte - greater than or equal to (int, float)
    
 * $lt - less than (int, float)
 
 * $lte - less than or equal to (int, float)
    
## Filtering document contents for a search_string (where_document)
{
    "$contains": "search_string"
}

## Using logical operators
You can also use the logical operators $and and $or to combine multiple filters.

An $and operator will return results that match all of the filters in the list.

{
    "$and": [
        {
            "metadata_field": {
                <Operator>: <Value>
            }
        },
        {
            "metadata_field": {
                <Operator>: <Value>
            }
        }
    ]
}

An $or operator will return results that match any of the filters in the list.

{
    "$or": [
        {
            "metadata_field": {
                <Operator>: <Value>
            }
        },
        {
            "metadata_field": {
                <Operator>: <Value>
            }
        }
    ]
}

## Update

In [None]:
# Template only: every `...` is a placeholder for further entries and must be
# removed or replaced before this cell can run. The four lists are parallel:
# position i of each list belongs to the item with ids[i].
collection.update(
    ids=["id1", "id2", "id3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    documents=["doc1", "doc2", "doc3", ...],
)

## Find and Update if not found add

In [None]:
# Template only: every `...` is a placeholder for further entries and must be
# removed or replaced before this cell can run.
# upsert updates the items whose ids are found and adds the ones that are not.
collection.upsert(
    ids=["id1", "id2", "id3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}, ...],
    documents=["doc1", "doc2", "doc3", ...],
)

## Deleting data from a collection
Chroma supports deleting items from a collection by id using .delete. The embeddings, documents, and metadata associated with each item will be deleted. ⚠️ Naturally, this is a destructive operation, and cannot be undone.

In [None]:
# Template only: the `...` entry is a placeholder for further ids and must be
# removed or replaced before this cell can run.
# Destructive: deleted items cannot be restored.
# NOTE(review): when both `ids` and `where` are given, the metadata filter
# presumably narrows the listed ids — confirm against the Chroma docs.
collection.delete(
    ids=["id1", "id2", "id3", ...],
    where={"chapter": "20"},
)