RAG ETL

In [31]:
# load imports
import os
import chromadb
from langchain_core.documents import Document
from chromadbx import UUIDGenerator
from tqdm import tqdm
import datetime
import pandas as pd
from joblib import Parallel, delayed
import uuid

In [32]:
class FileHelper():
    """Helpers for discovering files on disk and loading them into documents."""

    def populate_file_paths(self, dir_path, files=None):
        """Recursively collect the paths of all non-hidden files under ``dir_path``.

        Any entry (file or directory) whose name starts with "." is skipped,
        so hidden directories are not descended into.

        Args:
            dir_path: Directory to scan.
            files: Optional list to append the discovered paths to. A new
                list is created when omitted.

        Returns:
            The list that received the paths (``files`` itself when given).
        """
        if files is None:
            files = []

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # resulting file order stable between runs and machines.
        for entry in sorted(os.listdir(dir_path)):
            if entry.startswith("."):
                continue
            entry_path = os.path.join(dir_path, entry)
            if os.path.isfile(entry_path):
                files.append(entry_path)
            elif os.path.isdir(entry_path):
                self.populate_file_paths(entry_path, files)

        return files

    def load_documents(self, file_path, document_loader):
        """Load one file with ``document_loader`` and return its split documents.

        Args:
            file_path: Path handed to the loader's constructor.
            document_loader: Callable/class that takes a path and returns an
                object exposing ``load_and_split()``.

        Returns:
            Whatever ``load_and_split()`` returns for this file.
        """
        loader = document_loader(file_path)
        return loader.load_and_split()

    def read_documents(self, file_paths, document_loader, n_jobs=2):
        """Load several files in parallel and flatten the results into one list.

        Args:
            file_paths: Iterable of paths to load.
            document_loader: Loader class passed through to ``load_documents``.
            n_jobs: Number of joblib workers to use.

        Returns:
            A single list containing the documents of every file, in the
            order of ``file_paths``.
        """
        per_file_pages = Parallel(n_jobs=n_jobs)(
            delayed(self.load_documents)(file_path, document_loader)
            for file_path in file_paths)

        return [page for pages in per_file_pages for page in pages]

In [49]:
class IterHelper():
    """Helpers for iterating over sequences."""

    def batched(self, iterable, n):
        """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

        ``iterable`` must support ``len()`` and slicing. Because this is a
        generator, the ``ValueError`` for ``n < 1`` is raised when iteration
        starts, not when the method is called.
        """
        if n < 1:
            raise ValueError('n must be at least one')

        start, size = 0, len(iterable)
        while start < size:
            stop = start + n
            yield iterable[start:stop]
            start = stop

class TimeHelper():
    """Helpers for taking UTC timestamps and measuring elapsed time."""

    def get_utc_now(self) -> datetime.datetime:
        """Return the current time as a timezone-aware UTC ``datetime``."""
        return datetime.datetime.now(datetime.timezone.utc)

    def get_elapsed_seconds(self, start_time: datetime.datetime) -> float:
        """Return the seconds elapsed between ``start_time`` and now.

        Args:
            start_time: A timezone-aware datetime, typically a value returned
                earlier by ``get_utc_now()``. A naive datetime cannot be
                subtracted from the aware "now" and raises ``TypeError``.

        Returns:
            The elapsed time in seconds as a float.
        """
        return (self.get_utc_now() - start_time).total_seconds()

class IdHelper():
    """Generates random string identifiers."""

    def new_id(self):
        """Return a freshly generated random UUID (version 4) as a string."""
        fresh = uuid.uuid4()
        return str(fresh)

    def new_ids(self, num: int):
        """Return a list of ``num`` identifiers, each produced by ``new_id()``."""
        generated = []
        for _ in range(num):
            generated.append(self.new_id())
        return generated
   
class VectorDbRepository():
    """Backend-agnostic facade over a concrete vector-database repository.

    Every call is delegated to the backend instance created in ``__init__``.
    """

    def __init__(self, collection_name, repository):
        """Create the backend for ``collection_name``.

        Args:
            collection_name: Name of the collection the backend is bound to.
            repository: Backend class (or factory) that is called with
                ``collection_name`` and returns the repository instance.
        """
        self.collection_name = collection_name
        self.repository = repository(collection_name)

    def get_or_create_collection(self, collection_name=None):
        """Delegate to the backend's ``get_or_create_collection()``.

        The backend is already bound to a collection name, so its method takes
        no argument. ``collection_name`` is accepted only for backward
        compatibility.

        Raises:
            ValueError: If ``collection_name`` is given and differs from the
                name this repository was created with.
        """
        if collection_name is not None and collection_name != self.collection_name:
            raise ValueError(
                f"repository is bound to collection {self.collection_name!r}, "
                f"not {collection_name!r}")
        return self.repository.get_or_create_collection()

    def add_to_collection(self, documents_list: "list[Document]"):
        """Forward ``documents_list`` to the backend and return its result."""
        return self.repository.add_to_collection(documents_list)

    def get_data_by_source(self, source: str):
        """Return the backend's stored data whose ``source`` equals ``source``."""
        return self.repository.get_data_by_source(source)

    def delete_data_by_source(self, source: str):
        """Delete the backend's stored data whose ``source`` equals ``source``."""
        return self.repository.delete_data_by_source(source)

In [None]:
class ChromaDbRepository():
    """Repository that stores documents in a persistent Chroma collection."""

    def __init__(self, collection_name):
        """Bind to ``collection_name`` and open a ``chromadb.PersistentClient``."""
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient()

    def get_or_create_collection(self):
        """Return the bound collection, creating it if it does not exist yet."""
        return self.client.get_or_create_collection(self.collection_name)

    def add_to_collection(self, documents_list: "list[Document]") -> None:
        """Add documents to the collection under freshly generated ids.

        Each document's ``metadata`` and ``page_content`` are stored. An
        empty ``documents_list`` is a no-op.
        """
        # Chroma's add() rejects an empty id list, so skip the call entirely
        # instead of surfacing a validation error for an empty batch.
        if not documents_list:
            return None

        ids = IdHelper().new_ids(len(documents_list))
        return self.get_or_create_collection().add(
            ids=ids,
            metadatas=[p.metadata for p in documents_list],
            documents=[p.page_content for p in documents_list])

    def get_data_by_source(self, source: str):
        """Return the records whose ``source`` metadata equals ``source``."""
        return self.get_or_create_collection().get(where={"source": source})

    def delete_data_by_source(self, source: str):
        """Delete the records whose ``source`` metadata equals ``source``."""
        return self.get_or_create_collection().delete(where={"source": source})

In [None]:
class MilvusDbRepository():
    """Repository that stores documents in a Milvus collection.

    ``MilvusClient`` (from ``pymilvus``) must already be imported when an
    instance is created.
    """

    def __init__(self, collection_name, uri="http://localhost:19530"):
        """Bind to ``collection_name`` and connect a ``MilvusClient`` to ``uri``."""
        self.collection_name = collection_name
        self.client = MilvusClient(uri=uri)

    def get_or_create_collection(self):
        """Create the bound collection when the server does not have it yet."""
        # NOTE(review): the demo cell at the end of this notebook passes
        # `dimension=768` to create_collection; this call passes no dimension
        # or schema - confirm that Milvus accepts that before relying on it.
        if not self.client.has_collection(self.collection_name):
            self.client.create_collection(self.collection_name)

    def _source_filter(self, source: str) -> str:
        """Build the filter expression matching rows whose ``source`` equals ``source``."""
        # NOTE(review): `source` is interpolated verbatim; a value containing
        # a single quote would produce a malformed expression.
        return f"source == '{source}'"

    def add_to_collection(self, documents_list: "list[Document]"):
        """Insert one row per document and return the client's insert result.

        Each row is the document's metadata plus its content under ``'text'``
        and a freshly generated ``'id'``.
        """
        id_helper = IdHelper()
        data = []
        for document in documents_list:
            # Build the row from the loop's own document. The merge creates
            # a new dict, so the document's metadata is left untouched.
            single_data = document.metadata | {'text': document.page_content}
            single_data['id'] = id_helper.new_id()
            data.append(single_data)

        return self.client.insert(
            collection_name=self.collection_name,
            data=data)

    def get_data_by_source(self, source: str):
        """Query the rows whose ``source`` field equals ``source``."""
        return self.client.query(
            collection_name=self.collection_name,
            filter=self._source_filter(source))

    def delete_data_by_source(self, source: str):
        """Delete the rows whose ``source`` field equals ``source``."""
        return self.client.delete(
            collection_name=self.collection_name,
            filter=self._source_filter(source))

In [34]:
# constants
# The PDF root can be overridden with the RAG_PDF_ROOT environment variable,
# so the notebook is not tied to one machine's home directory. The default
# is the original location.
ROOT_DIR = os.environ.get("RAG_PDF_ROOT", "/Users/pujanmaharjan/pdfs")

file_helper = FileHelper()

# Recursively collect every non-hidden file under the "agile" sub-directory.
files = []
root_path = os.path.join(ROOT_DIR, "agile")
print('root path ', root_path)
file_helper.populate_file_paths(root_path, files)
files

root path  /Users/pujanmaharjan/pdfs/agile


['/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 '/Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf']

In [5]:
# Experiment pdf loaders: time each candidate loader on the same set of files
# and compare the elapsed wall-clock time and the number of pages produced.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PyPDFium2Loader

pdf_loader_results = []
pdf_loaders = [PyPDFLoader, PyMuPDFLoader, PyPDFium2Loader]
# To benchmark a single loader, narrow the list, e.g.:
# pdf_loaders = [PyPDFium2Loader]
time_helper = TimeHelper()
for pdf_loader in pdf_loaders:
    start_time = time_helper.get_utc_now()
    # `pages` is overwritten on every iteration; only its length is recorded.
    pages = file_helper.read_documents(files, pdf_loader, n_jobs=5)
    pdf_loader_results.append({"loader": pdf_loader.__name__,
                               "elapsed": time_helper.get_elapsed_seconds(start_time),
                               "pages_count": len(pages)})
    
pdf_loader_results_df = pd.DataFrame(pdf_loader_results)
pdf_loader_results_df.sort_values(by="elapsed")

# The recorded result shows PyMuPDFLoader is the fastest, with PyPDFium2Loader
# close behind and PyPDFLoader much slower; all three yield the same page count.



Unnamed: 0,loader,elapsed,pages_count
1,PyMuPDFLoader,1.155494,962
2,PyPDFium2Loader,1.429482,962
0,PyPDFLoader,14.880094,962


In [35]:
# Load every discovered file with PyMuPDFLoader using 5 parallel workers.
pages = file_helper.read_documents(files, PyMuPDFLoader, n_jobs=5)

I0000 00:00:1722176474.899582 29062266 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722176474.913677 29062266 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722176474.942068 29062266 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722176474.998655 29062266 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722176475.111387 29062266 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork


In [40]:
# Inspect the first loaded document to see which fields a page carries.
page = pages[0]
page

Document(metadata={'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf', 'file_path': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf', 'page': 0, 'total_pages': 447, 'format': 'PDF 1.4', 'title': '677757029', 'author': 'Unknown', 'subject': '', 'keywords': '', 'creator': 'calibre (6.17.0) [http://calibre-ebook.com]', 'producer': 'calibre (6.17.0) [http://calibre-ebook.com]', 'creationDate': "D:20240203035244+00'00'", 'modDate': "D:20240203105244+07'00'", 'trapped': ''}, page_content='Clean Code: An Agile Guide to Software Craft\n\xa0\nKameron Hussain and Frahaan Hussain\n\xa0\nPublished by Sonar Publishing, 2023.')

In [41]:
# Metadata dict of the sample page (includes the 'source' path used for lookups).
page.metadata

{'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 'file_path': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 'page': 0,
 'total_pages': 447,
 'format': 'PDF 1.4',
 'title': '677757029',
 'author': 'Unknown',
 'subject': '',
 'keywords': '',
 'creator': 'calibre (6.17.0) [http://calibre-ebook.com]',
 'producer': 'calibre (6.17.0) [http://calibre-ebook.com]',
 'creationDate': "D:20240203035244+00'00'",
 'modDate': "D:20240203105244+07'00'",
 'trapped': ''}

In [42]:
# Extracted text of the sample page.
page.page_content

'Clean Code: An Agile Guide to Software Craft\n\xa0\nKameron Hussain and Frahaan Hussain\n\xa0\nPublished by Sonar Publishing, 2023.'

In [43]:
# Flatten the sample page into one dict: its metadata plus the text under 'text'.
# The `|` merge builds a new dict, so page.metadata itself is not modified.
d = page.metadata | {'text': page.page_content}
d

{'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 'file_path': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 'page': 0,
 'total_pages': 447,
 'format': 'PDF 1.4',
 'title': '677757029',
 'author': 'Unknown',
 'subject': '',
 'keywords': '',
 'creator': 'calibre (6.17.0) [http://calibre-ebook.com]',
 'producer': 'calibre (6.17.0) [http://calibre-ebook.com]',
 'creationDate': "D:20240203035244+00'00'",
 'modDate': "D:20240203105244+07'00'",
 'trapped': '',
 'text': 'Clean Code: An Agile Guide to Software Craft\n\xa0\nKameron Hussain and Frahaan Hussain\n\xa0\nPublished by Sonar Publishing, 2023.'}

In [None]:
collection_name = "docs_collection"
chroma_db_repository = ChromaDbRepository(collection_name)
# If data for a source file already exists in the collection, it is deleted and added again.

def add_to_db(pages, batch_size, repository=None):
    """Replace the stored data for the sources in ``pages`` and time the load.

    For every distinct ``metadata['source']`` in ``pages``, previously stored
    records are deleted; then all pages are added in batches.

    Args:
        pages: Documents exposing ``metadata['source']``.
        batch_size: Maximum number of pages per insert call; must be >= 1.
        repository: Object providing ``get_data_by_source`` (returning a
            mapping with an ``"ids"`` list), ``delete_data_by_source`` and
            ``add_to_collection``. Defaults to the notebook-level
            ``chroma_db_repository``.

    Returns:
        ``{"batch_size": batch_size, "elapsed": <seconds as float>}``.

    Raises:
        ValueError: If ``batch_size`` is smaller than one.
    """
    # Validate before touching the store, so a bad batch size cannot delete
    # existing data and then fail before anything is re-added.
    if batch_size < 1:
        raise ValueError('batch_size must be at least one')

    repo = chroma_db_repository if repository is None else repository

    for source in {p.metadata['source'] for p in pages}:
        source_data = repo.get_data_by_source(source)
        if len(source_data["ids"]) > 0:
            print("previous data found so delete them ", source)
            repo.delete_data_by_source(source)

    time_helper = TimeHelper()
    start_time = time_helper.get_utc_now()

    # Ceiling division: the last, partially filled batch counts too, and
    # tqdm receives an integer total.
    total_batches = (len(pages) + batch_size - 1) // batch_size
    batches = IterHelper().batched(pages, batch_size)
    for page_batch in tqdm(batches, total=total_batches):
        repo.add_to_collection(page_batch)

    total_elapsed = time_helper.get_elapsed_seconds(start_time)
    print(f"Added {len(pages)} records in {total_elapsed} seconds")
    return {"batch_size": batch_size, "elapsed": total_elapsed}


In [None]:
# Compare insert throughput for different batch sizes. Every run calls
# add_to_db on the same pages, so existing data for those sources is deleted
# and re-added each time.
batch_sizes = [3,5]
batch_size_results = []
for batch_size in batch_sizes:
    print("Batch size ", batch_size)
    result = add_to_db(pages, batch_size)
    batch_size_results.append(result)

In [None]:

# Tabulate the timings, fastest batch size first.
batch_size_results_df = pd.DataFrame(batch_size_results)
batch_size_results_df.sort_values(by="elapsed")

In [6]:
# Scratch check: len() of a nested list counts only the outer elements.
l = [[1,2],[3,4]]
len(l)

2

In [22]:
import functools
def log_execution_time(func):
    """Decorator that prints how long each call to ``func`` took.

    The elapsed seconds are measured with ``TimeHelper`` and printed after
    ``func`` returns; the return value of ``func`` is passed through. The
    wrapper keeps the metadata of ``func`` (name, docstring) via
    ``functools.wraps``. Nothing is printed if ``func`` raises.
    """
    def _timed_call(*args, **kwargs):
        clock = TimeHelper()
        began_at = clock.get_utc_now()
        outcome = func(*args, **kwargs)
        seconds = clock.get_elapsed_seconds(began_at)
        print("Elapsed time ", seconds)
        return outcome

    return functools.wraps(func)(_timed_call)

@log_execution_time
def sum_num(a, b):
    """Return ``a + b``; a minimal function for trying out the timing decorator."""
    return a + b

# The decorator prints the elapsed time; the sum is displayed as the cell result.
sum_num(2,3)

Elapsed time  1e-05


5

In [24]:
# functools.wraps keeps the decorated function's own name instead of the wrapper's.
sum_num.__name__

'sum_num'

In [26]:
from pymilvus import MilvusClient

# Create a Milvus client pointed at http://localhost:19530.
client = MilvusClient(uri="http://localhost:19530")


I0000 00:00:1722173139.054879 29062266 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [None]:
# Start from a clean slate: drop "demo_collection" if it already exists, then create it.
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")
client.create_collection(
    collection_name="demo_collection",
    dimension=768,  # The vectors we will use in this demo have 768 dimensions
)
