In [1]:
%pip install --upgrade chromadb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting chromadb
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.1.1-py3-none-any.whl.metadata (4.2 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.4.2-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting pulsar-client>=3.1.0 (from chromadb)
  Downloading pulsar_client-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.23.0-py3-none-any.whl.m

In [3]:
# # Install chromadb
# %pip install chromadb

# Install sentence transformers
# This is used to convert text to vector embeddings. In other words, it converts text to a bunch of numbers that represent the 'meaning' of the text.
%pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.32.0 (from sentence-transformers)
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m791.6 kB/s[0m eta [36m0:00:00[0m1m715.5 kB/s[0m eta [36m0:00:01[0m
Collecting tokenizers<0.19,>=0.14 (from transformers<5.0.0,>=4.32.0->sentence-transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━

In [29]:
import csv

# Load sample data (a restaurant menu of items)
with open('dataset/menu_items.csv') as file:
    lines = csv.reader(file)

    # Store the name of the menu items in this array. In Chroma, a "document" is a string i.e. name, sentence, paragraph, etc.
    documents = []

    # Store the corresponding menu item IDs in this array.
    metadatas = []

    # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
    ids = []
    id = 1

    # Loop thru each line and populate the 3 arrays.
    for i, line in enumerate(lines):
        if i==0:
            # Skip the first row (the column headers)
            continue

        documents.append(line[1])
        metadatas.append({"item_id": line[0]})
        ids.append(str(id))
        id+=1


# Reference: https://docs.trychroma.com/getting-started

import chromadb
from chromadb.utils import embedding_functions

# Instantiate chromadb instance. Data is stored on disk (a folder named 'my_vectordb' will be created in the same folder as this file).
chroma_client = chromadb.PersistentClient(path="my_vectordb")

# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Create the collection, aka vector database. Or, if database already exist, then use it. Specify the model that we want to use to do the embedding.
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)

# Add all the data to the vector database. ChromaDB automatically converts and stores the text as vector embeddings. This may take a few minutes.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

# Query mispelled word: 'vermiceli'. Expect to find the correctly spelled 'vermicelli' item
results = collection.query(
    query_texts=["vermiceli"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

# Query word variation: 'donut'. Expect to find the 'doughnut' item
results = collection.query(
    query_texts=["donut"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

# Query similar meaning: 'shrimp'. Expect to find the 'prawn' items
results = collection.query(
    query_texts=["shrimp"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Insert of existing embedding ID: 11
Insert of existing embedding ID: 12
Insert of existing embedding ID: 13
Insert of existing embedding ID: 14
Insert of existing embedding ID: 15
Insert of existing embedding ID: 16
Insert of existing embedding ID: 17
Insert of existing embedding ID: 18
Insert of existing embedding ID: 19
Insert of existing embedding ID: 20
Insert of existing embedding ID: 21
Insert of existing embedding ID: 22
Insert of existing embedding ID: 23
Insert of existing embedding ID: 24
Insert of existing embedding ID: 25
Insert of existing embedding ID: 26
Insert of existing embedding ID: 27
Insert of existing embedding ID: 28
I

[['Melon and Vermicelli', 'Vegetable Tempura', 'Veggie Lee Secret Drink', 'Assorted Vegetables', 'Broccoli with Szechuan Sauce']]
[['Chinese Doughnut', 'French Fries', 'Strawberry Smoothie', 'Soda', 'Pinenuts with Toons Fried Rice']]
[['Kung Pao Vegan Prawns', 'Salt and Pepper Vegan Prawns', 'Vegan Kung Pao Prawns', 'Vegan Prawns and Mushrooms with Black Pepper Sauce', 'Vegan Prawns with Mixed Nuts']]


In [None]:
import json


# # Load sample data (a restaurant menu of items)
# with open('dataset/merged_data.json') as file:
#     lines = json.load(file)

#     # Store the name of the menu items in this array. In Chroma, a "document" is a string i.e. name, sentence, paragraph, etc.
#     documents = []

#     # Store the corresponding menu item IDs in this array.
#     metadatas = []

#     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
#     ids = []
#     id = 1

    # Loop thru each line and populate the 3 arrays.
    # for i, line in enumerate(lines):
    #     if i==0:
    #         # Skip the first row (the column headers)
    #         continue

    #     documents.append(line[1])
    #     metadatas.append({"item_id": line[0]})
    #     ids.append(str(id))
    #     id+=1

import json
 
ids = []
with open('dataset/merged_data.json', 'r') as file:
  data = json.load(file)

# Extract desired data and create a dictionary for each article
articles = []
for item in data:
  article = {
      "metadata": {
          "headline": item["headline"],
          "author": item["author"] if isinstance(item["author"], list) else item["author"],  # Handle single author as string
          "article": item["article"],
          "summary": item["summary"],
          "category": item["category"],
          "url": item["url"],
      }
  }
  articles.append(article)


# Reference: https://docs.trychroma.com/getting-started

import chromadb
from chromadb.utils import embedding_functions

# Instantiate chromadb instance. Data is stored in memory only.
# chroma_client = chromadb.Client()

# Instantiate chromadb instance. Data is stored on disk (a folder named 'my_vectordb' will be created in the same folder as this file).
chroma_client = chromadb.PersistentClient(path="my_vectordb")


# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Use this to delete the database
# chroma_client.delete_collection(name="my_collection")

# Create the collection, aka vector database. Or, if database already exist, then use it. Specify the model that we want to use to do the embedding.
collection = chroma_client.get_or_create_collection(name="my_collection2", embedding_function=sentence_transformer_ef)


# Add all the data to the vector database. ChromaDB automatically converts and stores the text as vector embeddings. This may take a few minutes.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

# Query the vector database

# Query mispelled word: 'vermiceli'. Expect to find the correctly spelled 'vermicelli' item
results = collection.query(
    query_texts=["vermiceli"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

# Query word variation: 'donut'. Expect to find the 'doughnut' item
results = collection.query(
    query_texts=["donut"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

# Query similar meaning: 'shrimp'. Expect to find the 'prawn' items
results = collection.query(
    query_texts=["shrimp"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['documents'])

In [4]:
# from .autonotebook import tqdm as notebook_tqdm
import json

# Load sample data from a JSON file (assuming the structure is an array of objects)
with open('dataset/merged_data.json', 'r') as file:
    data = json.load(file)

    # Store the name of the menu items in this array.
    documents = []

    # Store the corresponding metadata in this array.
    metadatas = []

    # Each "document" needs a unique ID.
    ids = []
    id = 1

    # Loop through each item in the JSON data
    for item in data:
        documents.append(item['headline'])  # Assuming 'headline' holds the name of the menu item

        # Handle multiple authors by joining them into a single string
        authors = ', '.join(item.get('author', []))  # Ensure default value is an empty list if 'author' is missing

        # Ensure other fields are not None
        article = item.get('article', '')
        summary = item.get('summary', '')
        category = item.get('category', '')
        url = item.get('url', '')

        # Check if any metadata value is None and replace it with an empty string
        metadata = {
            'headline': item['headline'] if item['headline'] is not None else '',
            'author': authors,
            'article': article if article is not None else '',
            'summary': summary if summary is not None else '',
            'category': category if category is not None else '',
            'url': url if url is not None else ''
        }
        metadatas.append(metadata)
        # print('metadatas: ', (metadatas))
        ids.append(str(id))
        id += 1

# The rest of your code remains unchanged



# The rest of your code remains unchanged


# # Reference: https://docs.trychroma.com/getting-started

import chromadb
from chromadb.utils import embedding_functions

# Instantiate chromadb instance.
chroma_client = chromadb.PersistentClient(path="my_vectordb")

# Specify the model for embedding
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Create or retrieve the collection
collection = chroma_client.get_or_create_collection(name="my_collection3", embedding_function=sentence_transformer_ef)

# Add data to the vector database
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)


# results = collection.query(
#     query_texts=["donal trump"],
#     n_results=5,
#     include=['documents', 'distances', 'metadatas']
# )
# print(results['metadatas'])

 


Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 10
Add of existing embedding ID: 11
Add of existing embedding ID: 12
Add of existing embedding ID: 13
Add of existing embedding ID: 14
Add of existing embedding ID: 15
Add of existing embedding ID: 16
Add of existing embedding ID: 17
Add of existing embedding ID: 18
Add of existing embedding ID: 19
Add of existing embedding ID: 20
Add of existing embedding ID: 21
Add of existing embedding ID: 22
Add of existing embedding ID: 23
Add of existing embedding ID: 24
Add of existing embedding ID: 25
Add of existing embedding ID: 26
Add of existing embedding ID: 27
Add of existing embedding ID: 28
Add of existing embedding ID: 29
Add of existing embedding ID: 30
Add of existing emb

In [49]:
# results = collection.query(
#     query_texts=["donal trump"],
#     n_results=5,
#     include=['documents', 'distances', 'metadatas']
# )

# # Print multiple metadata fields for each result
# for result in results['metadatas']:
#     print("Headline:", result[0]['headline'])
#     print("Author:", result[0]['author'])
#     print("Summary:", result[0]['summary'])
#     print("Category:", result[0]['category'])
#     print("URL:", result[0]['url'])
#     print("\n")

results = collection.query(
    query_texts=["gaza"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)

# Print multiple metadata fields for each result
for result in results['metadatas']:
    for metadata in result:
        print("Headline:", metadata['headline'])
        print("Author:", metadata['author'])
        print("Summary:", metadata['summary'])
        print("Category:", metadata['category'])
        print("URL:", metadata['url'])
        print("\n")






Headline: Conflict in Israel and Gaza, in Photos
Author: The New York Times
Summary: As several countries sought to broker a cease-fire in the war in the Gaza Strip and the release of hostages held there, Israel pressed on in February with its devastating campaign in the territory against Hamas. The Gazan Health Ministry says Israels offensive, precipitated by a Hamas-led attack on Oct. 7 that killed 1,200 people and saw 240 others abducted into Gaza, has killed more than 26,000 people. The Israeli aerial bombardment and ground invasion war have displaced most of Gazas people and destroyed many of their homes, and its blockade has left them critically short of basic necessities. The families of the Israeli hostages have been taking more aggressive steps to demand the immediate release of their loved ones, pressuring Prime Minister Benjamin Netanyahu, who has vowed to pursue the war, to make a deal for their freedom.
Category: World
URL: https://www.nytimes.com/article/israel-gaza-photo

In [44]:
# Query the vector database

# Query mispelled word: 'vermiceli'. Expect to find the correctly spelled 'vermicelli' item
results = collection.query(
    query_texts=["donal trump"],
    n_results=5,
    include=['documents', 'distances', 'metadatas']
)
print(results['metadatas'])

# # Query word variation: 'donut'. Expect to find the 'doughnut' item
# results = collection.query(
#     query_texts=["donut"],
#     n_results=5,
#     include=['documents', 'distances', 'metadatas']
# )
# print(results['documents'])

# # Query similar meaning: 'shrimp'. Expect to find the 'prawn' items
# results = collection.query(
#     query_texts=["shrimp"],
#     n_results=5,
#     include=['documents', 'distances', 'metadatas']
# )
# print(results['documents'])

[[{'article': 'Former President Donald Trump jumped into the fray this week after former Democratic Rep. Tom Suozzi won a closely watched special election for a vacant House seat once held by former Republican Rep. George Santos, who was expelled from the chamber in December. Trump slammed Republican candidate Mazi Pilip after she lost to Suozzi, claiming it was because she did not endorse him. Meanwhile, Trump and his GOP primary rival Nikki Haley have their eyes on the next major contest on the 2024 Republican nominating calendar -- Haleys home state of South Carolina. Here\'s a snapshot of where the battle to lead the Republican Party stands. GOP DELEGATE COUNT AFTER NEVADA: DEMOCRAT DELEGATE COUNT AFTER NEVADA: ONE NEW VICTORY LAP: Former Democratic Rep. Tom Suozzi won the New York special election against Republican county lawmaker Mazi Pilip. Suozzi, speaking at his victory celebration, emphasized that "despite all the attacks and despite all the lies about Tom Suozzi and the squ