RAG ETL

In [16]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from chromadbx import UUIDGenerator
import chromadb
from tqdm import tqdm
import datetime

class FileHelper():
    """Filesystem helpers: discover files under a directory and load them as documents."""

    def populate_file_paths(self, dir_path, files):
        """Recursively collect the paths of all non-hidden files below ``dir_path``.

        Entries whose name starts with "." are skipped, whether they are files
        or directories (hidden directories are therefore never descended into).
        Discovered paths are appended to ``files`` in place; nothing is returned.

        Args:
            dir_path: Directory to scan.
            files: List that receives the discovered file paths.
        """
        # os.listdir returns entries in arbitrary order; sorting makes the
        # resulting file list identical on every run and platform.
        for entry_name in sorted(os.listdir(dir_path)):
            # ignore hidden files
            if entry_name.startswith("."):
                continue
            entry_path = os.path.join(dir_path, entry_name)
            if os.path.isfile(entry_path):
                files.append(entry_path)
            elif os.path.isdir(entry_path):
                self.populate_file_paths(entry_path, files)

    def read_documents(self, file_paths):
        """Load every path in ``file_paths`` with PyPDFLoader and return one flat list.

        The result is the concatenation, in input order, of whatever each
        loader's ``load_and_split()`` call returns.

        NOTE(review): ``load_and_split`` presumably applies a default text
        splitter, so the items may be text chunks rather than whole PDF pages
        -- confirm against the loader documentation.

        Args:
            file_paths: Iterable of file paths; every path is handed to
                PyPDFLoader without checking its extension.

        Returns:
            List of all loaded documents.
        """
        pages_all = []
        for file_path in file_paths:
            loader = PyPDFLoader(file_path)
            pages = loader.load_and_split()
            pages_all.extend(pages)

        return pages_all

class IterHelper():
    """Helpers for walking a sequence in fixed-size pieces."""

    def batched(self, iterable, n):
        """Lazily yield consecutive slices of ``iterable`` with at most ``n`` items each.

        ``iterable`` must support ``len()`` and slicing; each yielded batch is
        a slice of it, and the final batch may be shorter than ``n``.

        Raises:
            ValueError: if ``n`` is smaller than one. Because this is a
                generator, the error surfaces when iteration starts.
        """
        if n < 1:
            raise ValueError('n must be at least one')

        yield from (
            iterable[offset: offset + n]
            for offset in range(0, len(iterable), n)
        )

class TimeHelper():
    """Helpers for timezone-aware UTC timestamps and elapsed-time measurement."""

    def get_utc_now(self) -> datetime.datetime:
        """Return the current time as a timezone-aware datetime in UTC."""
        return datetime.datetime.now(datetime.timezone.utc)

    def get_elapsed_seconds(self, start_time: datetime.datetime) -> float:
        """Return the number of seconds between ``start_time`` and now.

        Args:
            start_time: Timezone-aware starting point, e.g. a value previously
                returned by ``get_utc_now``. A naive datetime cannot be
                subtracted from the aware "now" and makes Python raise
                TypeError.

        Returns:
            Elapsed time in seconds, including the fractional part.
        """
        return (self.get_utc_now() - start_time).total_seconds()

class ChomaDbRepository():
    """Thin wrapper around a single collection of a persistent Chroma client."""

    def __init__(self, collection_name):
        """Create a persistent client and fetch (or create) ``collection_name``."""
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient()
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_to_collection(self, documents_list: list[Document]) -> None:
        """Insert ``documents_list`` into the collection under freshly generated ids.

        One id is requested from UUIDGenerator per document; each document's
        ``metadata`` and ``page_content`` are stored alongside it.
        """
        generated_ids = UUIDGenerator(len(documents_list))
        metadata_per_doc = [doc.metadata for doc in documents_list]
        text_per_doc = [doc.page_content for doc in documents_list]
        self.collection.add(
            ids=generated_ids,
            metadatas=metadata_per_doc,
            documents=text_per_doc,
        )

    def get_data_by_source(self, source: str):
        """Return the collection's ``get`` result for entries whose "source" metadata equals ``source``."""
        source_filter = {"source": source}
        return self.collection.get(where=source_filter)

    def delete_data_by_source(self, source: str):
        """Delete every entry whose "source" metadata equals ``source``."""
        source_filter = {"source": source}
        self.collection.delete(where=source_filter)
    

In [17]:
# constants
# Root folder of the PDF corpus. The default is the original author's local
# path; set the RAG_PDF_ROOT_DIR environment variable to run this notebook on
# another machine without editing the cell.
ROOT_DIR = os.environ.get("RAG_PDF_ROOT_DIR", "/Users/pujanmaharjan/pdfs")

file_helper = FileHelper()

# Collect every non-hidden file below <ROOT_DIR>/agile, recursing into
# sub-directories; `files` is filled in place.
files = []
root_path = os.path.join(ROOT_DIR, "agile")
print('root path ', root_path)
file_helper.populate_file_paths(root_path, files)
files

root path  /Users/pujanmaharjan/pdfs/agile


['/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 '/Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf']

In [18]:
# Load every discovered file through PyPDFLoader; `pages` is one flat list
# holding the documents of all files, in the order the paths appear in `files`.
pages = file_helper.read_documents(files)

In [None]:
# Sanity check: how many document objects read_documents produced.
# NOTE(review): the items come from load_and_split(), so this may count split
# chunks rather than physical PDF pages -- confirm.
print("Total number of pages ", len(pages))

In [19]:
# Name of the Chroma collection that receives the documents.
collection_name = "docs_collection"
# Number of documents per insert call; the benchmark loop further down rebinds
# this module-level name on every iteration.
batch_size = 5
# Module-level repository instance; add_to_db below reads this global directly.
chroma_db_repository = ChomaDbRepository(collection_name)
# If documents from a source file already exist in the collection, they are
# deleted first and then added again.

def add_to_db(pages, batch_size):
    """Replace the stored entries for ``pages`` and time the batched insertion.

    For every distinct ``metadata['source']`` found in ``pages``, entries
    already stored for that source are deleted first. The pages are then
    inserted in batches of ``batch_size`` through the module-level
    ``chroma_db_repository``. Only the insertion phase is timed; the
    preceding look-ups and deletions are not.

    Args:
        pages: Sequence of documents exposing ``metadata['source']``.
        batch_size: Maximum number of documents per insert call.

    Returns:
        Dict with the keys "batch_size" (as passed in) and "elapsed"
        (insertion time in seconds).
    """
    sources = {p.metadata['source'] for p in pages}
    for source in sources:
        source_data = chroma_db_repository.get_data_by_source(source)
        if len(source_data["ids"]) > 0:
            print("previous data found so delete them ", source)
            chroma_db_repository.delete_data_by_source(source)

    time_helper = TimeHelper()
    start_time = time_helper.get_utc_now()
    iter_helper = IterHelper()

    # tqdm counts iterations, so the total must be a whole number of batches.
    # Ceiling division includes the final, possibly shorter, batch; a float
    # total made the bar show e.g. 321/320.67 or overrun its total.
    batch_count = (len(pages) + batch_size - 1) // batch_size
    for page_batch in tqdm(iter_helper.batched(pages, batch_size), total=batch_count):
        chroma_db_repository.add_to_collection(page_batch)

    total_elapsed = time_helper.get_elapsed_seconds(start_time)
    print(f"Added {len(pages)} records in {total_elapsed} seconds")
    return {"batch_size": batch_size, "elapsed": total_elapsed}


In [20]:
# Benchmark: re-ingest the same pages once per batch size and keep the timing
# dict that add_to_db returns for each run.
batch_sizes = [1,3,5,7,9]
batch_size_results = []
for batch_size in batch_sizes:
    print("Batch size ", batch_size)
    result = add_to_db(pages, batch_size)
    # Lists have no .add() method (that is the set API); append stores the result.
    batch_size_results.append(result)

Batch size  1
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf


100%|██████████| 962/962.0 [01:23<00:00, 11.47it/s]


Added 962 records in 83.851577 seconds
Batch size  3
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf


  full_bar = Bar(frac,
100%|██████████| 321/320.6666666666667 [01:23<00:00,  3.85it/s]


Added 962 records in 83.294511 seconds
Batch size  5
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf


193it [01:41,  1.90it/s]                           


Added 962 records in 101.583143 seconds
Batch size  7
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf


138it [06:22,  2.77s/it]                                        


Added 962 records in 382.770732 seconds
Batch size  9
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf


100%|██████████| 107/106.88888888888889 [09:03<00:00,  5.08s/it]


Added 962 records in 543.628044 seconds


In [21]:
import pandas as pd
# One row per benchmark run. Each element of batch_size_results is the dict
# returned by add_to_db, i.e. "batch_size" and "elapsed" (seconds).
batch_size_results_df = pd.DataFrame(batch_size_results)
batch_size_results_df