In [19]:
# Some of these libraries may not be necessary...leftovers from other notebooks.

import chromadb
# from chromadb.config import Settings
# from chromadb.utils import embedding_functions

simple_collection_name = "EO-Simple"
complex_collection_name = "EO-Complex"
cdb_path = "Chroma"

# Chroma client constructed with the local path `cdb_path`.
client = chromadb.PersistentClient(path=cdb_path)


def _open_collection(name):
    """Return the collection called `name`, creating it when the lookup fails.

    Args:
        name: Collection name to look up on the notebook-level `client`.

    Returns:
        The object returned by `client.get_collection(name)`, or, if that
        call raises, the object returned by `client.create_collection` with
        the metadata `{"hnsw:space": "cosine"}`.
    """
    try:
        return client.get_collection(name)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # propagate instead of silently triggering a create.
        # NOTE(review): the exact "collection not found" exception type is not
        # visible here and may differ between chromadb versions -- narrow
        # further once the installed version is confirmed.
        return client.create_collection(name=name, metadata={"hnsw:space": "cosine"})


# Initialise both collections (created on first run) and report their sizes.
collection1 = _open_collection(simple_collection_name)
item_count = collection1.count()
print(f"Count of items in simple collection: {item_count}")

collection2 = _open_collection(complex_collection_name)
item_count = collection2.count()
print(f"Count of items in complex collection: {item_count}")

Count of items in simple collection: 598
Count of items in complex collection: 2158


In [3]:
from openai import OpenAI

# Shared OpenAI client used by get_embedding below.
# NOTE(review): no API key is passed explicitly; presumably the client reads
# its credentials from the environment (e.g. OPENAI_API_KEY) -- confirm.
emb_client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002", client=None):
    """Return the embedding for `text` produced by `model`.

    Args:
        text: String to embed. Every newline is replaced with a single space
            before the request is made.
        model: Embedding model name forwarded unchanged to the API call.
        client: Optional object exposing `embeddings.create(input=..., model=...)`.
            When omitted, the notebook-level `emb_client` is used, which is
            the original behaviour.

    Returns:
        The `embedding` attribute of the first element of the response's
        `data` sequence.
    """
    if client is None:
        client = emb_client
    flattened = text.replace("\n", " ")
    response = client.embeddings.create(input=[flattened], model=model)
    return response.data[0].embedding

In [20]:
# Similarity lookup against the simple collection only -- no LLM involved yet.

myquery = "What is the COVID mask mandate?"
myembedding = get_embedding(myquery)

# A metadata filter can be added to the call, e.g.
#     where={"year": {"$eq": "2023"}}
results1 = collection1.query(query_embeddings=myembedding, n_results=5)

# Each result field is read at index 0; distances and metadata are paired
# by position within that first entry.
for rank, meta in enumerate(results1['metadatas'][0]):
    print(results1['distances'][0][rank])
    print("Source:", meta['source'])
    print("Link:", meta['link'])
    print("Year:", meta['year'])
    print("Chunks:", meta['page'])

    print("-" * 40)


0.16209673881530762
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 1
----------------------------------------
0.177090585231781
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 2
----------------------------------------
0.18631315231323242
Source: 2021-19927.pdf
Link: https://www.federalregister.gov/d/2021-19927
Year: 2021
Chunks: 1
----------------------------------------
0.18877273797988892
Source: 2021-01859.pdf
Link: https://www.federalregister.gov/d/2021-01859
Year: 2021
Chunks: 1
----------------------------------------
0.20658951997756958
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 3
----------------------------------------


In [34]:
# Similarity lookup against the complex collection -- still no LLM.
# Prints at most `unique_cnt` hits, keeping only the first hit seen for each
# distinct `parentid`.

unique_cnt = 5
unique_ids = []

myquery = "What is the COVID mask mandate?"
myembedding = get_embedding(myquery)

# Optional metadata filters that can be passed as `where=...`:
#     {"year": {"$eq": "2023"}}
#     {"type": {"$eq": "parent"}}
#     {"source": {"$eq": "2021-01766.pdf"}}
results2 = collection2.query(query_embeddings=myembedding, n_results=50)

for rank, meta in enumerate(results2['metadatas'][0]):
    parent_id = meta['parentid']
    if parent_id in unique_ids:
        # This parent has already been printed; skip the repeat.
        continue

    print(results2['distances'][0][rank])
    print("Source:", meta['source'])
    print("Link:", meta['link'])
    print("Year:", meta['year'])
    print("Chunks:", meta['page'])
    print("Parent:", parent_id)
    print("Type:", meta['type'])
    print(results2['documents'][0][rank])

    print("-" * 40)
    unique_ids.append(parent_id)
    if len(unique_ids) >= unique_cnt:
        break

0.15830600261688232
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 1
Parent: f0670fb8-8a21-472e-abda-2bf93499389c
Type: parent
Presidential Documents
7045 
Federal Register / Vol. 86, No. 14 / Monday, January 25, 2021 / Presidential Documents 
Executive Order 13991 of January 20, 2021 
Protecting the Federal Workforce and Requiring Mask-Wear-
ing 
By the authority vested in me as President by the Constitution and the 
laws of the United States of America, including section 7902(c) of title 
5, United States Code, it is hereby ordered as follows: 
Section 1. Policy. It is the policy of my Administration to halt the spread 
of coronavirus disease 2019 (COVID–19) by relying on the best available 
data and science-based public health measures. Such measures include wear-
ing masks when around others, physical distancing, and other related pre-
cautions recommended by the Centers for Disease Control and Prevention 
(CDC). Put simply, masks and o

In [37]:
# Print the stored text of every hit in results2, with a rule after each one.
separator = "-" * 60
for doc_text in results2['documents'][0]:
    print(doc_text)
    print(separator)

Presidential Documents
7045 
Federal Register / Vol. 86, No. 14 / Monday, January 25, 2021 / Presidential Documents 
Executive Order 13991 of January 20, 2021 
Protecting the Federal Workforce and Requiring Mask-Wear-
ing 
By the authority vested in me as President by the Constitution and the 
laws of the United States of America, including section 7902(c) of title 
5, United States Code, it is hereby ordered as follows: 
Section 1. Policy. It is the policy of my Administration to halt the spread 
of coronavirus disease 2019 (COVID–19) by relying on the best available 
data and science-based public health measures. Such measures include wear-
ing masks when around others, physical distancing, and other related pre-
cautions recommended by the Centers for Disease Control and Prevention 
(CDC). Put simply, masks and other public health measures reduce the 
spread of the disease, particularly when communities make widespread use 
of such measures, and thus save lives. 
Accordingly, to pro

In [33]:
# Print only the document text at zero-based position 6 of results2's first
# document list.
print(results2['documents'][0][6])

Presidential Documents
30891 
Federal Register / Vol. 88, No. 93 / Monday, May 15, 2023 / Presidential Documents 
Executive Order 14099 of May 9, 2023 
Moving Beyond COVID–19 Vaccination Requirements for Fed-
eral Workers 
By the authority vested in me as President by the Constitution and the 
laws of the United States of America, it is hereby ordered as follows: 
Section 1. Policy. In 2021, based on the best available data and guidance 
from our public health experts, I issued Executive Order 14043 of September 
9, 2021 (Requiring Coronavirus Disease 2019 Vaccination for Federal Employ-
ees), to direct executive departments and agencies (agencies) to require 
coronavirus disease 2019 (COVID–19) vaccination for their employees, and 
Executive Order 14042 of September 9, 2021 (Ensuring Adequate COVID 
Safety Protocols for Federal Contractors), to ensure that Federal contractors 
and subcontractors have adequate COVID–19 safety protocols. I issued those 
orders at a time when the highly 

In [17]:
# Delete the entries of one parent document from the complex collection.
#
# Lookup, id extraction and delete are kept in a single cell, in dependency
# order. Previously the delete cell came first and read `myids`, which was
# only assigned in a cell further down, so a fresh-kernel "Run All" failed
# with NameError.
#
# Id left over from the original cell: 5f134535-bc4b-48e0-9481-4489da1d28fb

results3 = collection2.query(
    query_embeddings=myembedding,
    n_results=50,
    where={"parentid": {"$eq": "868304bf-adda-4890-bc2e-8888f8cf7aee"}},
    # Alternative metadata filters:
    #     where={"type": {"$eq": "parent"}}
    #     where={"source": {"$eq": "2021-01766.pdf"}}
)

# NOTE(review): n_results=50 caps how many ids one run can collect; a parent
# with more matching entries would need this cell re-run until none remain.
myids = results3['ids'][0]

if myids:
    collection2.delete(ids=myids)
    print(f"Deleted {len(myids)} item(s) from the complex collection.")
else:
    # Nothing matched the filter: skip the call so delete never receives an
    # empty id list.
    print("No matching items found; nothing deleted.")