In [None]:
!pip install chromadb

# run this command when you want to install chromadb

In [None]:
import chromadb

In [None]:
# DefaultEmbeddingFunction is the text-embedding function bundled with chromadb.
# Calling it on a list of strings turns each string into an embedding vector.
embedding_function = chromadb.utils.embedding_functions.DefaultEmbeddingFunction()

# Quick smoke test: embed a single sentence.
# The first call downloads the all-MiniLM-L6-v2 ONNX model into chromadb's local cache
# (see the download progress bar in the cell output).
embedding = embedding_function(['Some random text'])

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 90.8MiB/s]


In [None]:
# The commented-out code below is an example of embedding several texts in one call

# sample_embedding = embedding_function([
#     'I live in Gill, Massachusetts',
#     'Massachusetts is a state on the east coast of the US',
#     'The US is a country on the other side of the Atlantic'
#     ])

In [None]:
# The client is the entry point for talking to Chroma: collections are created and fetched through it.
# NOTE(review): per the Chroma docs, chromadb.Client() with default settings keeps data in memory only,
# so everything is lost when the kernel restarts — confirm for your setup.

client = chromadb.Client()


In [None]:
# Texts to store in the database; each one is later embedded as a vector.
docs = [
    "Humpty dumpty sat on the wall",
    "Humpty had a great fall",
    "All the kings horses and all the kings men",
    "Couldnt put Humpty together again",
]

In [None]:
# Create the collection named my_collection, or fetch it if it already exists.
# get_or_create_collection keeps this cell safe to re-run in the same session,
# whereas create_collection raises when the name is already taken.
collection = client.get_or_create_collection(
    name='my_collection',
)

In [None]:
# Store the documents together with their precomputed vector embeddings.
# - embeddings are computed explicitly with embedding_function and passed in alongside the texts
# - ids are derived from each document's position (id1, id2, ...), so the id list always
#   has exactly one entry per document in docs
# - upsert (instead of add) keeps this cell safe to re-run: records with an existing id
#   are overwritten rather than treated as duplicates
collection.upsert(
    documents=docs,
    embeddings=embedding_function(docs),
    ids=[f'id{position}' for position in range(1, len(docs) + 1)],
)



In [None]:
# .peek() returns a small sample of the stored records (ids, embeddings, documents, metadatas),
# which is handy for a quick sanity check of what is in the collection

collection.peek()

{'ids': ['id1', 'id2', 'id3', 'id4'],
 'embeddings': array([[ 0.06535835,  0.03916836,  0.04223052, ...,  0.0692008 ,
         -0.00860841, -0.03487474],
        [ 0.03438104,  0.03137168,  0.06576575, ..., -0.05009589,
          0.01887586,  0.00255676],
        [-0.02288095,  0.04789393, -0.00156947, ..., -0.02386645,
          0.02365233, -0.00899747],
        [ 0.02342279, -0.02867901,  0.03778486, ..., -0.02947035,
          0.00254525, -0.03437073]]),
 'documents': ['Humpty dumpty sat on the wall',
  'Humpty had a great fall',
  'All the kings horses and all the kings men',
  'Couldnt put Humpty together again'],
 'uris': None,
 'data': None,
 'metadatas': [None, None, None, None],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [None]:
# Search the collection for the stored texts that are closest to the query text.
# n_results=1 asks for only the single most similar document.
# NOTE(review): this collection was created without an "hnsw:space" setting, so it uses
# Chroma's default distance (L2 according to the Chroma docs), not cosine — confirm.
result_query = collection.query(
    query_texts=['How many kings men were there?'],
    n_results=1,
)

In [None]:
# Show the matched document texts: a list holding one inner list of matches per query text.
print(result_query['documents'])

[['All the kings horses and all the kings men']]


In [None]:
# Second collection, configured through its metadata: "hnsw:space": "cosine" makes
# this collection compare vectors with the cosine formula when measuring similarity.
collection2 = client.get_or_create_collection(
    name='cosine_database',
    metadata={'hnsw:space': 'cosine'},
)

In [None]:
# Store the same documents in the cosine collection.
# No embeddings are passed here, so Chroma embeds the documents itself using the
# collection's embedding function.
# - ids are derived from each document's position ('1', '2', ...), so there is always
#   exactly one id per document in docs
# - upsert (instead of add) keeps this cell safe to re-run: records with an existing id
#   are overwritten rather than treated as duplicates
collection2.upsert(
    documents=docs,
    ids=[str(position) for position in range(1, len(docs) + 1)],
)

In [None]:
# Query the cosine collection; n_results=1 returns only the best match.
# This reuses the name result_query, replacing the result of the earlier query.
result_query = collection2.query(
    query_texts=['Who is humpty dumpty'],
    n_results=1,
)

In [None]:
# 'documents' holds one list of matches per query text;
# [0][0] picks the top match for the first (and only) query.
print(result_query['documents'][0][0])

Humpty dumpty sat on the wall
