In [3]:
from langchain_community.document_loaders import PyPDFLoader

import os
import glob
import tiktoken
import pandas as pd
from datetime import datetime

from utils import *
from chromadb_utils import *

## Chromadb configuration
import chromadb
chroma_client = chromadb.Client()
collection_name = 'story_collection'

already_stored_cdb = True

if not already_stored_cdb:
    
    # PDF data location/path
    root_directory = 'data'
    chunk_size = 1024
    overlap = 80
    doc_content_df = pd.DataFrame()
    
    # Listing all the pdf files listed in root_directory
    files = get_pdf_files(root_directory)
    
    # Extract the files content and persisting in DF
    for file in files:
        doc_content = extract_pdf_content(file)    
        entry  = {'doc_name':file,'doc_content' : doc_content}
        doc_content_df = doc_content_df.append(entry, ignore_index=True)
    
    # creating the chunks of files content of configured size
    encoding = tiktoken.get_encoding("cl100k_base")
    doc_content_df = doc_content_df.apply(lambda row: create_chunks(row, encoding, chunk_size, overlap), axis=1)
    
    # Creating the chromadb collection. if it already created, will return collection instance with exception - collection already present.
    collection = chroma_client.create_collection(name=collection_name)
    
    # Upsert the chunks in chroma db with respective ID's
    doc_content_df = doc_content_df.apply(lambda row: upsert_to_chromadb(row, collection), axis=1)
    
    timestamp = datetime.now().timestamp()
    
    doc_content_df.to_csv(f'data/{collection_name}-{timestamp}.csv')
    

# Test Your RAG retrival part.
if collection is None:
    collection = chroma_client.create_collection(name=collection_name)

results = collection.query(
    query_texts=["Explain me the wifi settings "],
    n_results=1)



{'ids': [['data/.ipynb_checkpoints/HT2000W-Satellite-Modem-User-Sheet-checkpoint.pdf_1']], 'distances': [[0.829623281955719]], 'metadatas': [[None]], 'embeddings': None, 'documents': [[' modem’s Wi -Fi service.  \n1. To access these pages, click t he Wi-Fi Settings  link on the side \npanel . You will see the login page as shown in Figure 8.  \n2. The default password is “admin.” Ent er this password in the dialog \nbox and click Login . \n \nFigure 8: Wi -Fi Settings login screen  \n \n \n \n 4 3 1 2 Do not block any of the modem’s \nventilation openings. Leave six \ninches around the top and sides of \nthe modem to ensure adequate \nventilation. Do not put the modem \nnear a heat source such as direct \nsunlight, a radiator, or a vent.1041318 -0001 Revision A  January 26 , 2017 \n8   \n  \n Once you have logged in, you will see the Wi -Fi Settings home screen, as \nshown in Figure 9.  \n \nFigure 9: Wi-Fi Settings home screen  \nWe recommend you change the default login password imme

In [7]:
print(results.get('ids')[0][0].split('/')[-1])

HT2000W-Satellite-Modem-User-Sheet-checkpoint.pdf_1


In [8]:
print(results.get('documents')[0][0])

 modem’s Wi -Fi service.  
1. To access these pages, click t he Wi-Fi Settings  link on the side 
panel . You will see the login page as shown in Figure 8.  
2. The default password is “admin.” Ent er this password in the dialog 
box and click Login . 
 
Figure 8: Wi -Fi Settings login screen  
 
 
 
 4 3 1 2 Do not block any of the modem’s 
ventilation openings. Leave six 
inches around the top and sides of 
the modem to ensure adequate 
ventilation. Do not put the modem 
near a heat source such as direct 
sunlight, a radiator, or a vent.1041318 -0001 Revision A  January 26 , 2017 
8   
  
 Once you have logged in, you will see the Wi -Fi Settings home screen, as 
shown in Figure 9.  
 
Figure 9: Wi-Fi Settings home screen  
We recommend you change the default login password immediately upon 
installation. Choose a password that is easy to remember but cannot be 
easily guessed. To do this:  
1. On the left panel, click  Administration . 
2. New option s will appear in the left panel.

In [7]:
from utils import *
from chromadb_utils import *