In [26]:
cyberlabels = ["2021-10460.pdf","2022-05471.pdf","2022-20450.pdf","2022-22531.pdf","2024-04573.pdf",
               "NIST.SP.800-172A.pdf","NIST.SP.1800-29.pdf"]

# Simple Index

In [27]:
# Some of these libraries may not be necessary...leftovers from other notebooks.

from os import listdir
from os.path import isfile, join
import fitz
import uuid

import chromadb
# from chromadb.config import Settings
# from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

emb_client = OpenAI()
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return emb_client.embeddings.create(input = [text], model=model).data[0].embedding

collection_name = "EO-Simple"
cdb_path = "Chroma"
doc_path = "content"

In [28]:
# This creates the chroma collection unless it already exists...in which case, it just initializes it. 

client = chromadb.PersistentClient(path=cdb_path)

try:
    collection = client.get_collection(collection_name)
except:
    collection = client.create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})
    
item_count = collection.count()
print(f"Count of items in collection: {item_count}")

Count of items in collection: 695


In [29]:
# Here are the files...

pdffiles = [f for f in listdir(doc_path) if isfile(join(doc_path, f)) \
            and f.lower().endswith('.pdf') \
#             and f.lower().startswith('nist') \
           ]
print(str(len(pdffiles))+" PDF File(s):")
print(pdffiles)


137 PDF File(s):
['2021-01761.pdf', '2023-15347.pdf', '2022-10076.pdf', '2021-27605.pdf', '2021-19927.pdf', '2022-27585.pdf', '2021-01762.pdf', '2023-08659.pdf', '2022-13391.pdf', '2021-21908.pdf', '2021-19924.pdf', '2022-02869.pdf', '2021-09263.pdf', '2021-25548.pdf', '2023-10407.pdf', '2022-28474.pdf', '2024-04012.pdf', '2021-10691.pdf', '2021-01767.pdf', '2022-03346.pdf', '2021-02252.pdf', '2022-22834.pdf', '2023-27318.pdf', 'NIST.SP.800-172A.pdf', '2022-21911.pdf', '2022-11810.pdf', '2023-13889.pdf', '2024-04573.pdf', '2021-01766.pdf', '2022-20167.pdf', '2022-20210.pdf', '2021-01759.pdf', '2021-01765.pdf', '2022-05949.pdf', '2022-15743.pdf', '2021-04281.pdf', '2022-07757.pdf', '2021-27114.pdf', '2023-28661.pdf', '2021-04280.pdf', '2021-27505.pdf', '2021-02034.pdf', '2021-01924.pdf', '2021-12019.pdf', '2023-03779.pdf', '2023-16570.pdf', '2021-01852.pdf', '2023-08955.pdf', '2021-25715.pdf', '2021-07239.pdf', '2023-28662.pdf', '2021-14127.pdf', '2021-05200.pdf', '2021-01923.pdf', '202

In [22]:
# Create chunks, embeddings, and vector index

chunksize = 5000

# I prevfer the LangChain recursive splitter since it's better at making logical chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunksize,
    chunk_overlap  = 0,
    length_function = len,)

# Iterate through all the PDFs
for z in pdffiles:
    mydata_all = ''
    fname = join(doc_path,z)
    # The PDF text layer comes down in paragraphs so I start with building up one text layer
    doc = fitz.open(fname) 
    this_doc = ''
    for page in doc:  # iterate the document pages
        this_doc = this_doc + page.get_text() + "\n"
    mydata_all = mydata_all + this_doc + '\n'  

    # I collect the metadata going into the index
    doc_source = z
    if z in cyberlabels:
        rbac_label = "IT"
    else:
        rbac_label = "PUBLIC"
    doc_year = z.split("-")[0]
    if z.split(".")[0] == 'NIST':
        doc_link = "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/"+z
    else:
        doc_link = "https://www.federalregister.gov/d/"+z.split(".")[0]
    print(z+" contains "+str(len(mydata_all))+" characters")
    texts = text_splitter.create_documents([mydata_all])
    print("You have "+str(len(texts))+" chunks")
    
    pagecount = 0  # I like to collect the chunk number in my metadata
    # Iterate through the chunks.
    for mytext in texts:
        pagecount += 1
        myembedding = get_embedding(mytext.page_content)
        metadatas={"source": doc_source,"page": pagecount, "year": doc_year, "link": doc_link, "rbac":rbac_label}
        myuuid = str(uuid.uuid4()) # I like to create my own IDs though it's usually safe to default it.
        collection.add(
            embeddings=myembedding,
            metadatas=metadatas,
            documents=mytext.page_content,
            ids=myuuid,
        )
        print(metadatas)


NIST.SP.800-172A.pdf contains 168082 characters
You have 52 chunks
{'source': 'NIST.SP.800-172A.pdf', 'page': 1, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 2, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 3, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 4, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 5, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 6, 'year': 'NIST.SP.800', 'link': 'https://nv

{'source': 'NIST.SP.800-172A.pdf', 'page': 50, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 51, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.800-172A.pdf', 'page': 52, 'year': 'NIST.SP.800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-172A.pdf', 'rbac': 'IT'}
NIST.SP.1800-29.pdf contains 166560 characters
You have 45 chunks
{'source': 'NIST.SP.1800-29.pdf', 'page': 1, 'year': 'NIST.SP.1800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.1800-29.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.1800-29.pdf', 'page': 2, 'year': 'NIST.SP.1800', 'link': 'https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.1800-29.pdf', 'rbac': 'IT'}
{'source': 'NIST.SP.1800-29.pdf', 'page': 3, 'year': 'NIST.SP.1800', 'link': 'https://nv

In [25]:
# Sample Query with just Chroma...no LLM yet. 

myquery = "What is the COVID mask mandate?"

myembedding = get_embedding(myquery)

results = collection.query(
    query_embeddings=myembedding,
    n_results= 5,
    where={"rbac":{"$eq": "IT"}}
#     where={"year": {"$eq": "2023"}}  # You can filter by the metadata
)

for x in range(len(results['metadatas'][0])):
    print(results['distances'][0][x])
    print("Source:",results['metadatas'][0][x]['source'])
    print("Link:",results['metadatas'][0][x]['link'])
    print("Year:",results['metadatas'][0][x]['year'])
    print("Chunks:",results['metadatas'][0][x]['page'])
    
    print("-"*40)


0.2495805025100708
Source: 2021-10460.pdf
Link: https://www.federalregister.gov/d/2021-10460
Year: 2021
Chunks: 1
----------------------------------------
0.25560736656188965
Source: 2024-04573.pdf
Link: https://www.federalregister.gov/d/2024-04573
Year: 2024
Chunks: 1
----------------------------------------
0.26290667057037354
Source: 2024-04573.pdf
Link: https://www.federalregister.gov/d/2024-04573
Year: 2024
Chunks: 3
----------------------------------------
0.2637559175491333
Source: 2024-04573.pdf
Link: https://www.federalregister.gov/d/2024-04573
Year: 2024
Chunks: 8
----------------------------------------
0.26559877395629883
Source: 2021-10460.pdf
Link: https://www.federalregister.gov/d/2021-10460
Year: 2021
Chunks: 4
----------------------------------------
0.2684970498085022
Source: 2024-04573.pdf
Link: https://www.federalregister.gov/d/2024-04573
Year: 2024
Chunks: 4
----------------------------------------
0.26873618364334106
Source: 2022-05471.pdf
Link: https://www.federa

In [None]:
#The most relevant block of text to your question...

print(results['documents'][0][0])

# Advanced Index

In [10]:
# Some of these libraries may not be necessary...leftovers from other notebooks.

from os import listdir
from os.path import isfile, join
import fitz
import uuid

import chromadb
# from chromadb.config import Settings
# from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

chat_client = OpenAI()
emb_client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return emb_client.embeddings.create(input = [text], model=model).data[0].embedding

def get_questions(text, model="gpt-4-0125-preview"):
    completion = chat_client.chat.completions.create(
      model=model,
      messages=[
        {"role": "system", "content": "Analyze the user's input and create five questions that could be answered using the user's input."},
        {"role": "user", "content": text}
      ]
    )
    return completion.choices[0].message.content

collection_name = "EO-Complex"
cdb_path = "Chroma"
doc_path = "content"

In [11]:
# This creates the chroma collection unless it already exists...in which case, it just initializes it. 

client = chromadb.PersistentClient(path=cdb_path)

try:
    collection = client.get_collection(collection_name)
except:
    collection = client.create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})
    
item_count = collection.count()
print(f"Count of items in collection: {item_count}")

# Here are the files...

pdffiles = [f for f in listdir(doc_path) if isfile(join(doc_path, f)) and f.lower().endswith('.pdf')]
print(str(len(pdffiles))+" PDF File(s):")
# print(pdffiles)

Count of items in collection: 0
135 PDF File(s):


In [5]:
# client.delete_collection(collection_name)

In [12]:
# Create chunks, embeddings, and vector index
pdffiles = ["2021-01766.pdf"]

largechunksize = 10000
smallchunksize = 2000

# I prevfer the LangChain recursive splitter since it's better at making logical chunks.
large_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = largechunksize,
    chunk_overlap  = 0,
    length_function = len,)
small_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = smallchunksize,
    chunk_overlap  = 0,
    length_function = len,)

# Iterate through all the PDFs
for z in pdffiles:
    mydata_all = ''
    fname = join(doc_path,z)
    # The PDF text layer comes down in paragraphs so I start with building up one text layer
    doc = fitz.open(fname) 
    this_doc = ''
    for page in doc:  # iterate the document pages
        this_doc = this_doc + page.get_text() + "\n"
    mydata_all = mydata_all + this_doc + '\n'  

    # I collect the metadata going into the index
    doc_source = z
    doc_year = z.split("-")[0]
    doc_link = "https://www.federalregister.gov/d/"+z.split(".")[0]
    print(z+" contains "+str(len(mydata_all))+" characters")
    texts = large_text_splitter.create_documents([mydata_all])
    print("You have "+str(len(texts))+" chunks")
    
    largepagecount = 0  # I like to collect the chunk number in my metadata
    # Iterate through the chunks.
    for mytext in texts:
        largepagecount += 1
        large_text = mytext.page_content
        if z in cyberlabels:
            rbac_label = "IT"
        else:
            rbac_label = "PUBLIC"

        myembedding = get_embedding(large_text)
        parentuuid = str(uuid.uuid4()) # I like to create my own IDs though it's usually safe to default it.
        collection.add(
            embeddings=myembedding,
            metadatas={"source": doc_source,"page": largepagecount, "year": doc_year, "link": doc_link, "parentid": parentuuid, "type": "parent", "rbac":rbac_label},
            documents=mytext.page_content,
            ids=parentuuid,
        )
#         print("parent")
        
        myquestions = get_questions(large_text)
        myembedding = get_embedding(myquestions)
        collection.add(
            embeddings=myembedding,
            metadatas={"source": doc_source,"page": largepagecount, "year": doc_year, "link": doc_link, "parentid": parentuuid, "type": "questions", "rbac":rbac_label},
            documents=large_text,
            ids=str(uuid.uuid4()),
        )
#         print(myquestions)
                                    
        smalltexts = small_text_splitter.create_documents([large_text])
        smallpagecount = 0
        for mystext in smalltexts:
            smallpagecount += 1
            small_text = mystext.page_content
            myembedding = get_embedding(small_text)
            collection.add(
                embeddings=myembedding,
                metadatas={"source": doc_source,"page": largepagecount, "subpage": smallpagecount, "year": doc_year, "link": doc_link, "parentid": parentuuid, "type": "child", "rbac":rbac_label},
                documents=large_text,
                ids=str(uuid.uuid4()),
            )
#             print(small_text)
        

2021-01766.pdf contains 12357 characters
You have 2 chunks


In [13]:
# Sample Query with just Chroma...no LLM yet. 

myquery = "What is the COVID mask mandate?"
myarray = []

myembedding = get_embedding(myquery)

results = collection.query(
    query_embeddings=myembedding,
    n_results= 50,
    where={"rbac":{"$eq":"PUBLIC"}}
#     where={"year": {"$eq": "2023"}}  # You can filter by the metadata
)

for x in range(len(results['metadatas'][0])):
    if results['metadatas'][0][x]['parentid'] not in myarray:
        print(results['distances'][0][x])
        print("Source:",results['metadatas'][0][x]['source'])
        print("Link:",results['metadatas'][0][x]['link'])
        print("Year:",results['metadatas'][0][x]['year'])
        print("Chunks:",results['metadatas'][0][x]['page'])
        print("Parent:",results['metadatas'][0][x]['parentid'])
        print("Type:",results['metadatas'][0][x]['type'])

        print("-"*40)
#         myarray.append(results['metadatas'][0][x]['parentid'])


Number of requested results 50 is greater than number of elements in index 12, updating n_results = 12


0.15830623086595264
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 1
Parent: 0e97d63e-b078-48c5-a1ea-54a54ef0d0b4
Type: parent
----------------------------------------
0.16293332381730874
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 1
Parent: 0e97d63e-b078-48c5-a1ea-54a54ef0d0b4
Type: child
----------------------------------------
0.17941037835148532
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 1
Parent: 0e97d63e-b078-48c5-a1ea-54a54ef0d0b4
Type: child
----------------------------------------
0.1899467747157506
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 2
Parent: e4eb0565-008a-4a4e-a74e-e4b8e907e95f
Type: questions
----------------------------------------
0.19832841344116625
Source: 2021-01766.pdf
Link: https://www.federalregister.gov/d/2021-01766
Year: 2021
Chunks: 2
Parent: e4eb0565-008a-4a4e-a7