In [None]:
!pip install chromadb

In [66]:
import chromadb

# Default client: the data lives only for this session and is not persisted.
client = chromadb.Client()
# Create an empty collection named "fruit" to hold the example documents.
collection = client.create_collection("fruit")

In [67]:
# Populate the collection from documents together with per-document metadata.
# No embeddings are supplied here; ChromaDB creates its own
# (the default is based on "Sentence transformers").
fruit_records = [
    # (id, document, metadata)
    ("1", "apple", {"color": "red", "weight": 180}),
    ("2", "banana", {"color": "yellow", "weight": 120}),
    ("3", "pineapple", {"color": "brown", "weight": 900}),
    ("4", "mango", {"color": "yellow", "weight": 200}),
    ("5", "dragonfruit", {"color": "pink", "weight": 600}),
    ("6", "passionfruit", {"color": "purple", "weight": 18}),
    ("7", "raspberry", {"color": "red", "weight": 4}),
]
# add() takes three parallel lists, so split each record into its columns.
collection.add(
    ids=[record_id for record_id, _, _ in fruit_records],
    documents=[document for _, document, _ in fruit_records],
    metadatas=[metadata for _, _, metadata in fruit_records],
)

In [68]:
# Browsing the collection
# get() with no arguments returns every stored record; print the raw dict.
all_data = collection.get()
print(all_data)

{'ids': ['1', '2', '3', '4', '5', '6', '7'], 'embeddings': None, 'documents': ['apple', 'banana', 'pineapple', 'mango', 'dragonfruit', 'passionfruit', 'raspberry'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'weight': 180, 'color': 'red'}, {'weight': 120, 'color': 'yellow'}, {'weight': 900, 'color': 'brown'}, {'weight': 200, 'color': 'yellow'}, {'weight': 600, 'color': 'pink'}, {'color': 'purple', 'weight': 18}, {'weight': 4, 'color': 'red'}]}


In [69]:
# pprint lays the same dict out across multiple lines, which is easier to
# read than the single-line print above.
from pprint import pprint
pprint(all_data)

{'data': None,
 'documents': ['apple',
               'banana',
               'pineapple',
               'mango',
               'dragonfruit',
               'passionfruit',
               'raspberry'],
 'embeddings': None,
 'ids': ['1', '2', '3', '4', '5', '6', '7'],
 'included': ['metadatas', 'documents'],
 'metadatas': [{'color': 'red', 'weight': 180},
               {'color': 'yellow', 'weight': 120},
               {'color': 'brown', 'weight': 900},
               {'color': 'yellow', 'weight': 200},
               {'color': 'pink', 'weight': 600},
               {'color': 'purple', 'weight': 18},
               {'color': 'red', 'weight': 4}],
 'uris': None}


In [70]:
# NOTE - 'embeddings' is None in the output above. Older versions of Chroma used the default Sentence Transformers model
# We will use OpenAI embeddings to recreate the same collection

# Clear the collection so that "fruit" can be created again below
client.delete_collection("fruit")

In [None]:
# Using OpenAI embedding function
!pip install openai
!pip install python-dotenv

In [71]:
import openai
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

In [72]:
# Load variables from a local .env file into the process environment.
# NOTE(review): no api_key is passed below, so the OpenAI key presumably comes
# from that environment — confirm which variable name the installed chromadb
# version reads.
load_dotenv()
embedding_func = OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small"  # or "text-embedding-3-large"
)

In [73]:
# Recreate "fruit", this time with the OpenAI embedding function attached.
collection = client.create_collection("fruit", embedding_function=embedding_func)

In [74]:
# Same seven fruits as before, now added to the collection that was created
# with the OpenAI embedding function.
fruit_records = [
    # (id, document, metadata)
    ("1", "apple", {"color": "red", "weight": 180}),
    ("2", "banana", {"color": "yellow", "weight": 120}),
    ("3", "pineapple", {"color": "brown", "weight": 900}),
    ("4", "mango", {"color": "yellow", "weight": 200}),
    ("5", "dragonfruit", {"color": "pink", "weight": 600}),
    ("6", "passionfruit", {"color": "purple", "weight": 18}),
    ("7", "raspberry", {"color": "red", "weight": 4}),
]
# add() takes three parallel lists, so split each record into its columns.
collection.add(
    ids=[record_id for record_id, _, _ in fruit_records],
    documents=[document for _, document, _ in fruit_records],
    metadatas=[metadata for _, _, metadata in fruit_records],
)

In [75]:
# Fetch everything again. As the output shows, only metadatas and documents
# are included by default, so 'embeddings' is still None here.
all_data = collection.get()
pprint(all_data)

{'data': None,
 'documents': ['apple',
               'banana',
               'pineapple',
               'mango',
               'dragonfruit',
               'passionfruit',
               'raspberry'],
 'embeddings': None,
 'ids': ['1', '2', '3', '4', '5', '6', '7'],
 'included': ['metadatas', 'documents'],
 'metadatas': [{'color': 'red', 'weight': 180},
               {'color': 'yellow', 'weight': 120},
               {'color': 'brown', 'weight': 900},
               {'color': 'yellow', 'weight': 200},
               {'color': 'pink', 'weight': 600},
               {'color': 'purple', 'weight': 18},
               {'color': 'red', 'weight': 4}],
 'uris': None}


In [None]:
# Gotcha moment!
# Embeddings are not returned by default to save bandwidth.
# Ask for them explicitly through `include` to see the stored vectors.
all_data = collection.get(include=["embeddings", "documents", "metadatas"])
pprint(all_data)

In [62]:
# Now that the data is stored with embeddings, let's query and do semantic search!
# Semantic Search: matches come back ordered by distance to the query text,
# smallest distance first (see 'distances' in the output).
tropical_fruits = collection.query(query_texts=["Tropical"])
pprint(tropical_fruits)

{'data': None,
 'distances': [[0.6120957732200623,
                0.6204509735107422,
                0.6238695979118347,
                0.6701418161392212,
                0.706540584564209,
                0.7711185216903687,
                0.7747601866722107]],
 'documents': [['pineapple',
                'mango',
                'passionfruit',
                'dragonfruit',
                'banana',
                'apple',
                'raspberry']],
 'embeddings': None,
 'ids': [['3', '4', '6', '5', '2', '1', '7']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'color': 'brown', 'weight': 900},
                {'color': 'yellow', 'weight': 200},
                {'color': 'purple', 'weight': 18},
                {'color': 'pink', 'weight': 600},
                {'color': 'yellow', 'weight': 120},
                {'color': 'red', 'weight': 180},
                {'color': 'red', 'weight': 4}]],
 'uris': None}


In [76]:
# Print required data
# Results are nested one list per query text; [0] selects our single query.
print(tropical_fruits["documents"][0])

['pineapple', 'mango', 'passionfruit', 'dragonfruit', 'banana', 'apple', 'raspberry']


In [77]:
# Metadata Filtering: the same semantic query, restricted to records whose
# metadata satisfies both conditions (color is yellow AND weight > 150).
yellow_over_150 = {
    "$and": [
        {"color": "yellow"},
        {"weight": {"$gt": 150}},
    ]
}
tropical_yellow = collection.query(query_texts=["Tropical"], where=yellow_over_150)

In [78]:
# Only mango passes the filter: banana is also yellow but weighs 120.
print(tropical_yellow["documents"][0])

['mango']


In [79]:
# Full Text Search
# where_document filters on the document text itself; $contains keeps the
# documents that include the string "fruit".
contains_fruit = collection.get(
    where_document={"$contains": "fruit"}
)
print(contains_fruit["documents"])

['dragonfruit', 'passionfruit']


In [None]:
# Types of Storage
# Ephemeral Client - What we did so far, NOT persisted
# Persistent Client - Saved to Disk
# Cloud Client - Saved to Cloud

In [80]:
# We will recreate the Fruits database with persistent client
# The data is saved to disk under `path` ("db", relative to the working directory).
persistent_directory = "db"
persistent_client = chromadb.PersistentClient(path=persistent_directory)

In [81]:
# Start from a clean slate, but only delete when the collection really exists:
# delete_collection raises for an unknown name, which would break the very
# first run against an empty "db" directory.
# list_collections() yields Collection objects or plain names depending on the
# chromadb version, so normalise both to names before checking.
existing_names = {
    getattr(entry, "name", entry) for entry in persistent_client.list_collections()
}
if "fruit" in existing_names:
    persistent_client.delete_collection("fruit")

In [82]:
# Create "fruit" on the persistent client, using the same OpenAI embedding function.
persistent_collection = persistent_client.create_collection("fruit", embedding_function=embedding_func)

In [83]:
# Same seven fruits once more, this time written to the persistent collection.
fruit_records = [
    # (id, document, metadata)
    ("1", "apple", {"color": "red", "weight": 180}),
    ("2", "banana", {"color": "yellow", "weight": 120}),
    ("3", "pineapple", {"color": "brown", "weight": 900}),
    ("4", "mango", {"color": "yellow", "weight": 200}),
    ("5", "dragonfruit", {"color": "pink", "weight": 600}),
    ("6", "passionfruit", {"color": "purple", "weight": 18}),
    ("7", "raspberry", {"color": "red", "weight": 4}),
]
# add() takes three parallel lists, so split each record into its columns.
persistent_collection.add(
    ids=[record_id for record_id, _, _ in fruit_records],
    documents=[document for _, document, _ in fruit_records],
    metadatas=[metadata for _, _, metadata in fruit_records],
)

In [None]:
# List the working directory, including hidden entries
# (presumably to spot the newly created "db" folder — confirm).
!ls -al

In [None]:
# Show the directory tree. NOTE(review): this shells out to the `tree`
# utility, so it only works where that command is installed.
!tree