# Data persistence

In [17]:
pip install chromadb -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
import chromadb

In [4]:
# Name of the collection and the on-disk folder handed to the client.
# NOTE(review): "peristed" looks like a typo for "persisted"; it is left
# unchanged because a different name would refer to a different collection.
collection_name = "peristed_collection"
persist_directory = "chromadb"

# A PersistentClient is given a filesystem path to keep its data in.
client = chromadb.PersistentClient(path=persist_directory)

# Fetch the collection if it is already there, otherwise create it.
collection = client.get_or_create_collection(name=collection_name)

In [5]:
# Populate the collection with eight records (id1..id8).
collection.add(
    ids=[f"id{i}" for i in range(1, 9)],
    documents=[f"doc{i}" for i in range(1, 9)],
    # Every record carries its image uri; only the second one uses "style2".
    metadatas=[
        {"uri": f"img{i}.png", "style": "style2" if i == 2 else "style1"}
        for i in range(1, 9)
    ],
    # The same two 3-d vectors alternate four times, giving eight embeddings.
    embeddings=[
        vector
        for _ in range(4)
        for vector in ([1.1, 2.3, 3.2], [4.5, 6.9, 4.4])
    ],
)

In [6]:
# Build a fresh client on the same directory, then look the existing
# collection up by name (get_collection, so nothing new is created here).
client = chromadb.PersistentClient(path=persist_directory)
collection = client.get_collection(name=collection_name)

In [11]:

# Nearest-neighbour lookup: ask for the single closest record to the query vector.
results = collection.query(
    n_results=1,
    query_embeddings=[[1.1, 2.3, 3.2]],
)

print(results)

{'ids': [['id1']], 'distances': [[5.1159076593562386e-15]], 'metadatas': [[{'style': 'style1', 'uri': 'img1.png'}]], 'embeddings': None, 'documents': [['doc1']]}


In [8]:
# Fetch the records without any filter, asking for embeddings, metadata and documents.
collection.get(
    include=["embeddings", "metadatas", "documents"],
)

{'ids': ['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8'],
 'embeddings': [[1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432],
  [1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432],
  [1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432],
  [1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432]],
 'metadatas': [{'style': 'style1', 'uri': 'img1.png'},
  {'style': 'style2', 'uri': 'img2.png'},
  {'style': 'style1', 'uri': 'img3.png'},
  {'style': 'style1', 'uri': 'img4.png'},
  {'style': 'style1', 'uri': 'img5.png'},
  {'style': 'style1', 'uri': 'img6.png'},
  {'style': 'style1', 'uri': 'img7.png'},
  {'style': 'style1', 'uri': 'img8.png'}],
 'documents': ['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8']}

# Where Clause

In [12]:
# Create (or reuse) the collection used for the filtering examples.
# get_or_create_collection is used rather than create_collection because the
# client persists to disk: on a second run of the notebook the collection
# already exists and create_collection would raise.
collection_name = "filter_example_collection"
collection = client.get_or_create_collection(name=collection_name)

In [13]:
# Populate the filter-example collection with eight records (id1..id8).
collection.add(
    ids=[f"id{i}" for i in range(1, 9)],
    documents=[
        "A document that discusses domestic policy",
        "A document that discusses international affairs",
        "A document that discusses kittens",
        "A document that discusses dogs",
        "A document that discusses chocolate",
        "A document that is sixth that discusses government",
        "A document that discusses international affairs",
        "A document that discusses global affairs",
    ],
    # Statuses alternate: odd-numbered ids are "read", even-numbered are "unread".
    metadatas=[
        {"status": status}
        for _ in range(4)
        for status in ("read", "unread")
    ],
    # The same two 3-d vectors alternate four times, giving eight embeddings.
    embeddings=[
        vector
        for _ in range(4)
        for vector in ([1.1, 2.3, 3.2], [4.5, 6.9, 4.4])
    ],
)

In [14]:
# Combine a metadata filter (status == "read") with a document-text filter
# (text contains "affairs"); only records matching both are returned.
collection.get(
    where={"status": "read"},
    where_document={"$contains": "affairs"},
)

{'ids': ['id7'],
 'embeddings': None,
 'metadatas': [{'status': 'read'}],
 'documents': ['A document that discusses international affairs']}

In [15]:
# Match documents whose text contains either "global affairs" or "domestic policy".
collection.get(
    where_document={
        "$or": [
            {"$contains": "global affairs"},
            {"$contains": "domestic policy"},
        ]
    }
)

{'ids': ['id1', 'id8'],
 'embeddings': None,
 'metadatas': [{'status': 'read'}, {'status': 'unread'}],
 'documents': ['A document that discusses domestic policy',
  'A document that discusses global affairs']}

In [16]:
# Ask for the 5 nearest neighbours of [0, 0, 0], restricted to documents containing "affairs".
# Only 3 results come back because just 3 documents in the collection mention "affairs".
collection.query(
    query_embeddings=[[0, 0, 0]],
    n_results=5,
    where_document={"$contains": "affairs"},
)

{'ids': [['id7', 'id8', 'id2']],
 'distances': [[16.740000000000006, 87.22000000000003, 87.22000000000003]],
 'metadatas': [[{'status': 'read'},
   {'status': 'unread'},
   {'status': 'unread'}]],
 'embeddings': None,
 'documents': [['A document that discusses international affairs',
   'A document that discusses global affairs',
   'A document that discusses international affairs']]}