RAG ETL

In [18]:
# load imports
import os
import chromadb
from langchain_core.documents import Document
from chromadbx import UUIDGenerator
from tqdm import tqdm
import datetime
import pandas as pd
from joblib import Parallel, delayed
import uuid
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PyPDFium2Loader
from pymilvus import MilvusClient
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection


  from pandas.core import (


ContextualVersionConflict: (grpcio 1.65.1 (/Users/pujanmaharjan/anaconda/anaconda3/lib/python3.10/site-packages), Requirement.parse('grpcio<=1.63.0,>=1.49.1'), {'pymilvus'})

In [3]:
class FileHelper():
    """
        Helper class for Files
    """
    def populate_file_paths(self, dir_path, files):
        directories = os.listdir(dir_path)
        for dircontent in directories:
            if dircontent.startswith("."):
                continue
            file_path = os.path.join(dir_path, dircontent)
            if os.path.isfile(file_path):
                files.append(file_path)
            elif os.path.isdir(file_path):
                self.populate_file_paths(file_path, files)

    def load_documents(self, file_path, document_loader):
        loader = document_loader(file_path)
        return loader.load_and_split()

    def read_documents(self, file_paths, document_loader, n_jobs=2):
        pages_all = []
        pages = Parallel(n_jobs=n_jobs)(delayed(self.load_documents)(file_path, document_loader) for file_path in file_paths)
        for page in pages:
            pages_all.extend(page)
        
        return pages_all

In [17]:
class IterHelper():
    def batched(self, iterable, n):
        if n < 1:
            raise ValueError('n must be at least one')

        for i in range(0, len(iterable), n):
            yield iterable[i: i + n]

class TimeHelper():
    def get_utc_now(self):
        return datetime.datetime.now(datetime.timezone.utc)

    def get_elapsed_seconds(self, start_time: datetime):
        return (self.get_utc_now() - start_time).total_seconds()

class IdHelper():
    def new_id(self):
        return str(uuid.uuid4())

    def new_ids(self, num: int):
        return [self.new_id() for x in range(num)]
   
class VectorDbRepository():
    def __init__(self, collection_name, repository):
        self.collection_name = collection_name
        self.repository = repository(collection_name)

    def delete_collection(self):
        self.repository.delete_collection()

    def get_or_create_collection(self, collection_name):
        return self.repository.get_or_create_collection(collection_name)

    def create(self, documents_list: list[Document]) -> None:
        return self.repository.create(documents_list)
        
    def get_data_by_source(self, source: str):
        return self.repository.get_data_by_source(source)

    def has_data_for_source(self, source: str):
        return self.repository.has_data_for_source(source)
    
    def delete_data_by_source(self, source: str):
        return self.repository.delete_data_by_source(source)

    def find(self, search_query: str):
        return self.repository.find(search_query)

NameError: name 'datetime' is not defined

In [54]:
# function decorator to log the execution time of function
import functools
def log_execution_time(func):
    @functools.wraps(func)
    def log_execution_time_wrapper(*args, **kwargs):
        time_helper = TimeHelper()
        start_time = time_helper.get_utc_now()
        print(f"{func} Execution Start time {start_time}")
        result = func(*args, **kwargs)
        elapsed = time_helper.get_elapsed_seconds(start_time)
        print(f"{func} Execution End time {time_helper.get_utc_now()}. Elapsed total seconds {elapsed}")
        return result
    
    return log_execution_time_wrapper

In [56]:
import time
@log_execution_time
def some_thing(a, b):
    time.sleep(1)
    return a + b

some_thing(2,3)

<function some_thing at 0x14acc6050> Execution Start time 2024-08-28 05:39:21.982048+00:00
<function some_thing at 0x14acc6050> Execution End time 2024-08-28 05:39:22.988464+00:00. Elapsed total seconds 1.006392


5

In [53]:
str(some_thing)

'<function some_thing at 0x14acc64d0>'

In [5]:
class EmbeddingHelper():
    def __init__(self):
        model_name = "BAAI/bge-small-en"
        model_kwargs = {"device": "mps"}
        encode_kwargs = {"normalize_embeddings": True}
        self.hf_bge_embedding = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs)

    def embed(self, query):
        print("Starting embedding")
       
        return self.hf_bge_embedding.embed_query(query)


In [6]:
class ChromaDbRepository():
    def __init__(self, collection_name):
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient()
        self.collection = self.get_or_create_collection()

    def delete_collection(self):
        self.client.delete_collection(self.collection_name)

    def get_or_create_collection(self):
        return self.client.get_or_create_collection(self.collection_name)

    def create(self, documents_list: list[Document]) -> None:
        ids = IdHelper().new_ids(len(documents_list))
        embedding_helper = EmbeddingHelper()
        return self.collection.add(ids=ids,
                metadatas=[p.metadata for p in documents_list],
                documents=[p.page_content for p in documents_list],
                embeddings=[embedding_helper.embed(p.page_content) for p in documents_list])
        
    def get_data_by_source(self, source: str):
        return self.collection.get(where={"source":source})

    def has_data_for_source(self, source: str):
        data_by_source = self.get_data_by_source(source)
        return len(data_by_source["ids"]) > 0
    
    def delete_data_by_source(self, source: str):
        return self.collection.delete(where={"source":source})

    def find(self, search_query: str):
        results = self.collection.query(query_texts=[search_query])
        result_documents = results.get("documents")
        if result_documents and len(result_documents) > 0:
            return ' '.join(result_documents[0])
        

In [7]:
class PdfCollectionHelper():
    def __init__(self):
        self.embedding_helper = EmbeddingHelper()

    def get_schema(self):
        pdf_id = FieldSchema(
            name="id",
            dtype=DataType.VARCHAR,
            is_primary=True,
            auto_id=False,
            max_length=100)
        source = FieldSchema(
            name="source",
            dtype=DataType.VARCHAR,
            max_length=200)
        page_number = FieldSchema(
            name="page",
            dtype=DataType.VARCHAR,
            max_length=10)
        text = FieldSchema(
            name="text",
            dtype=DataType.VARCHAR,
            max_length=6000)
        text_vector = FieldSchema(
            name="text_vector",
            dtype=DataType.FLOAT_VECTOR,
            dim=384)
        return CollectionSchema(
            fields=[pdf_id, source, page_number, text, text_vector],
            description="PDF Documents",
            enable_dynamic_field=True)

    def get_index_params(self, client):
        index_params = client.prepare_index_params()
        index_params.add_index(field_name="source", index_type="AUTOINDEX")
        index_params.add_index(field_name="text_vector", index_type="AUTOINDEX", metric_type="L2")
        return index_params

    def new_data(self, document: Document):
        return {
            "id": IdHelper().new_id(),
            "source": document.metadata.get("source"),
            "page": str(document.metadata.get("page")),
            "text": document.page_content,
            "text_vector": self.embedding_helper.embed(document.page_content)
        }

class MilvusDbRepository():
    def __init__(self, collection_name):
        self.collection_name = collection_name
        self.client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
        self.get_or_create_collection()
   
    def get_collection_helper(self):
        if self.collection_name == "pdfs":
            return PdfCollectionHelper()

    def delete_collection(self):
        if self.client.has_collection(self.collection_name):
            print(f"Deleting collection {self.collection_name}")
            self.client.drop_collection(collection_name = self.collection_name)

    def get_or_create_collection(self):
        if not self.client.has_collection(self.collection_name):
            print(f"Creating collection {self.collection_name}")
            collection_helper = self.get_collection_helper()      
            self.client.create_collection(
                collection_name = self.collection_name,
                schema = collection_helper.get_schema(),
                index_params = collection_helper.get_index_params(self.client))

    def create(self, documents_list: list[Document]) -> None:
        collection_helper = self.get_collection_helper()
        data = [collection_helper.new_data(d) for d in documents_list]
        print("data to add ", data)
        return self.client.insert(
            collection_name=self.collection_name,
            data=data)
        
    def get_data_by_source(self, source: str):
        return self.client.query(
            collection_name=self.collection_name,
            filter=f"source == '{source}'")

    def has_data_for_source(self, source: str):
        data_by_source = self.get_data_by_source(source)
        return len(data_by_source) > 0
    
    def delete_data_by_source(self, source: str):
        # Delete entities by a filter expression
        return self.client.delete(
            collection_name=self.collection_name,
            filter=f"source == '{source}'")

    def find(self, search_query: str):
        search_embedding = EmbeddingHelper().embed(search_query)
        search_result = self.client.search(collection_name=self.collection_name,
            data=[search_embedding],
            output_fields=["text"])
        print('Search result ', search_result)
        if len(search_result) > 0:
            docs = [x.get('entity').get('text') for x in search_result[0]]
            return ' '.join(docs)


In [8]:
# constants
ROOT_DIR = "/Users/pujanmaharjan/pdfs"

file_helper = FileHelper()

files = []
root_path = os.path.join(ROOT_DIR, "agile")
print('root path ', root_path)
file_helper.populate_file_paths(root_path, files)
files

root path  /Users/pujanmaharjan/pdfs/agile


['/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf',
 '/Users/pujanmaharjan/pdfs/agile/akka/Abraham F. Akka in Action (MEAP v13) 2ed 2023.pdf']

In [9]:
# # Experiment pdf loaders
# pdf_loader_results = []
# pdf_loaders = [PyPDFLoader, PyMuPDFLoader, PyPDFium2Loader]
# # pdf_loaders = [PyPDFium2Loader]
# time_helper = TimeHelper()
# for pdf_loader in pdf_loaders:
#     start_time = time_helper.get_utc_now()
#     pages = file_helper.read_documents(files, pdf_loader, n_jobs=5)
#     pdf_loader_results.append({"loader": pdf_loader.__name__,
#                                "elapsed": time_helper.get_elapsed_seconds(start_time),
#                                "pages_count": len(pages)})
    
# pdf_loader_results_df = pd.DataFrame(pdf_loader_results)
# pdf_loader_results_df.sort_values(by="elapsed")

# # The result shows PyMuPDFLoader is fastest

In [10]:
pages = file_helper.read_documents(files, PyMuPDFLoader, n_jobs=5)

In [11]:

@log_execution_time
def add_to_db(db_repository: VectorDbRepository, pages, batch_size):
    sources = list(set([p.metadata['source'] for p in pages]))
    for source in sources:
        if db_repository.has_data_for_source(source):
            print("previous data found so delete them ", source)
            db_repository.delete_data_by_source(source)

    iter_helper = IterHelper()
    for page_batch in tqdm(iter_helper.batched(pages, batch_size), total=len(pages)/batch_size):
        db_repository.create(page_batch)


In [12]:
db_repository = VectorDbRepository("pdfs", MilvusDbRepository)

In [13]:
db_repository.repository.client.get_load_state('pdfs')

{'state': <LoadState: Loaded>}

In [14]:
pages_small = pages[0:10]

In [15]:
db_repository_m = VectorDbRepository("pdfs", MilvusDbRepository)
add_to_db(db_repository_m, pages_small, 3)

Execution Start time 2024-07-29 10:06:46.380627+00:00
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf


  from tqdm.autonotebook import tqdm, trange


Starting embedding


 30%|███       | 1/3.3333333333333335 [00:08<00:20,  8.89s/it]

Starting embedding
Starting embedding
data to add  [{'id': '5630c232-c169-4057-9a87-901ad1156d87', 'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf', 'page': '0', 'text': 'Clean Code: An Agile Guide to Software Craft\n\xa0\nKameron Hussain and Frahaan Hussain\n\xa0\nPublished by Sonar Publishing, 2023.', 'text_vector': [-0.047733038663864136, -0.02504950761795044, -0.001811690628528595, -0.03612646460533142, 0.038309499621391296, -0.016000768169760704, -0.01492363028228283, 0.015392203815281391, -0.011930588632822037, 0.01908709481358528, 0.009967081248760223, 0.00401201331987977, 0.017264170572161674, -0.020557153970003128, 0.01816260814666748, 0.016420511528849602, 0.020166020840406418, 0.02830670401453972, 0.006382734514772892, 0.016605107113718987, 0.040077339857816696, 0.005303573794662952, 0.001837660907767713, -0.05891763046383858, 0.02775857225060463, 0.028868740424513817, -0.028928318992257118, -0.038494259119033813, -

 60%|██████    | 2/3.3333333333333335 [00:13<00:08,  6.18s/it]

data to add  [{'id': 'b41a18f4-4694-47c4-a93a-ec45f8d22d14', 'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf', 'page': '3', 'text': 'Chapter 10: Classes\n\xa0\nChapter 13: Concurrency\n\xa0\nChapter 15: JUnit Internals\n\xa0\nChapter 19: Appendix B: Decimal I/O\n\xa0\nChapter 20: Appendix C: How to Transform Employee', 'text_vector': [-0.10066168010234833, -0.045353103429079056, 0.027762454003095627, -0.0646040216088295, -0.01319350115954876, 0.0023368492256850004, 0.04312436655163765, 0.024412378668785095, -0.01708403415977955, -0.020047901198267937, 0.014584182761609554, -0.040169451385736465, 0.003421908477321267, 0.00972665473818779, 0.032264918088912964, 0.01330861821770668, 0.009933099150657654, 0.051750537008047104, -0.016256136819720268, 0.007255255710333586, 0.046167269349098206, -0.022910989820957184, -0.010091529227793217, 0.009895244613289833, 0.048443883657455444, 0.035476647317409515, -0.012466002255678177, -0.00

 90%|█████████ | 3/3.3333333333333335 [00:17<00:01,  5.43s/it]

data to add  [{'id': 'd7da8364-f916-41f1-8ec1-1840d74b3c30', 'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf', 'page': '6', 'text': '1.5.1 Readability and Adaptability\n\xa0\n1.5.2 Continuous Integration and Delivery\n\xa0\n1.5.3 Collaboration and Code Reviews\n\xa0\n1.5.4 Test-Driven Development (TDD)\n\xa0\n1.5.5 Agile Refactoring\n\xa0\n1.5.6 Reduced Technical Debt\n\xa0\n1.5.7 Empowering Cross-Functional Teams\n\xa0\nChapter 2: Meaningful Names\n\xa0\n2.1 The Importance of Good Names\n\xa0\n2.1.1 Readability and Comprehensibility\n\xa0\n2.1.2 Documentation Through Naming\n\xa0\n2.1.3 Maintainability\n\xa0\n2.1.4 Reducing Cognitive Load\n\xa0\n2.1.5 Consistency and Conventions', 'text_vector': [-0.029050303623080254, -0.0064878882840275764, 0.01341313123703003, -0.013475321233272552, 0.02947848290205002, 0.032723017036914825, 0.036923572421073914, 0.02753281593322754, -0.026885274797677994, 0.0104410070925951, 0.00017959681

4it [00:21,  5.36s/it]                                        

Starting embedding
data to add  [{'id': 'c895bf7a-0eb4-45c6-9aad-c0d8bdec5cf3', 'source': '/Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf', 'page': '9', 'text': 'Chapter 3: Functions\n\xa0\n3.1 The Role of Functions in Clean Code\n\xa0\n3.1.1 Functions as Abstractions\n\xa0\n3.1.2 Small and Focused Functions\n\xa0\n3.1.3 Function Names and Readability\n\xa0\n3.1.4 Function Arguments and Side Effects\n\xa0\n3.1.5 Encapsulation and Reusability\n\xa0\n3.2 Small and Focused Functions\n\xa0\n3.2.1 The Single Responsibility Principle (SRP)\n\xa0\n3.2.2 Benefits of Small and Focused Functions\n\xa0\n3.2.3 Guidelines for Creating Small and Focused Functions\n\xa0\n3.3 Function Arguments and Side Effects\n\xa0\n3.3.1 Minimizing Function Arguments\n\xa0\n3.3.2 Minimizing Side Effects', 'text_vector': [-0.02947571501135826, -0.007363153621554375, 0.020826030522584915, -0.04752424359321594, 0.03490579500794411, 0.00477295508608222, 0.05491790175437




In [16]:
db_repository_c = VectorDbRepository("pdfs", ChromaDbRepository)
add_to_db(db_repository_c, pages_small, 3)

Execution Start time 2024-07-29 10:07:08.901738+00:00
previous data found so delete them  /Users/pujanmaharjan/pdfs/agile/Kameron H. Clean Code. An Agile Guide to Software Craft 2023.pdf


 30%|███       | 1/3.3333333333333335 [00:03<00:08,  3.65s/it]

Starting embedding
Starting embedding
Starting embedding


 60%|██████    | 2/3.3333333333333335 [00:07<00:05,  3.91s/it]

Starting embedding
Starting embedding
Starting embedding


 90%|█████████ | 3/3.3333333333333335 [00:11<00:01,  3.92s/it]

Starting embedding
Starting embedding
Starting embedding


4it [00:15,  3.83s/it]                                        

Starting embedding
Execution End time 2024-07-29 10:07:24.640786+00:00. Elapsed total seconds 15.739042





In [17]:
chromadb_repo = ChromaDbRepository("pdfs")

In [18]:
chromadb_repo.find("clean code")

'Table of Contents\n\xa0\nTitle Page\n\xa0\nCopyright Page\n\xa0\nClean Code: An Agile Guide to Software Craft\n\xa0\nChapter 1: Clean Code\n\xa0\nChapter 2: Meaningful Names\n\xa0\nChapter 3: Functions\n\xa0\nChapter 4: Comments\n\xa0\nChapter 5: Formatting\n\xa0\nChapter 6: Objects and Data Structures\n\xa0\nChapter 7: Error Handling\n\xa0\nChapter 8: Boundaries\n\xa0\nChapter 9: Unit Tests Table of Contents\n\xa0\nChapter 1: Clean Code\n\xa0\n1.1 What Is Clean Code?\n\xa0\nThe Benefits of Clean Code\n\xa0\nCharacteristics of Clean Code\n\xa0\n1.2 Why Does Clean Code Matter?\n\xa0\n1.2.1 Readability and Understanding\n\xa0\n1.2.2 Maintainability\n\xa0\n1.2.3 Debugging and Error Detection\n\xa0\n1.2.4 Collaboration\n\xa0\n1.2.5 Code Reviews and Quality Assurance\n\xa0\n1.2.6 Long-Term Sustainability\n\xa0\n1.3 The Principles of Clean Code Chapter 3: Functions\n\xa0\n3.1 The Role of Functions in Clean Code\n\xa0\n3.1.1 Functions as Abstractions\n\xa0\n3.1.2 Small and Focused Functions\

In [19]:
milvus_repo = MilvusDbRepository("pdfs")

In [20]:
milvus_repo.find("clean code")

Starting embedding
Search result  data: ["[{'id': '2d61123d-8f06-4212-acea-fef1aa4b6e55', 'distance': 0.2022617608308792, 'entity': {'text': 'While every precaution has been taken in the preparation of this book, the\\npublisher assumes no responsibility for errors or omissions, or for\\ndamages resulting from the use of the information contained herein.\\n\\xa0\\nCLEAN CODE: AN AGILE GUIDE TO SOFTWARE CRAFT\\n\\xa0\\nFirst edition. October 15, 2023.\\n\\xa0\\nCopyright © 2023 Kameron Hussain and Frahaan Hussain.\\n\\xa0\\nWritten by Kameron Hussain and Frahaan Hussain.'}}, {'id': '313ac805-b133-4d9d-a121-3f2c6601c8f4', 'distance': 0.20561350882053375, 'entity': {'text': 'Table of Contents\\n\\xa0\\nTitle Page\\n\\xa0\\nCopyright Page\\n\\xa0\\nClean Code: An Agile Guide to Software Craft\\n\\xa0\\nChapter 1: Clean Code\\n\\xa0\\nChapter 2: Meaningful Names\\n\\xa0\\nChapter 3: Functions\\n\\xa0\\nChapter 4: Comments\\n\\xa0\\nChapter 5: Formatting\\n\\xa0\\nChapter 6: Objects and Data

'While every precaution has been taken in the preparation of this book, the\npublisher assumes no responsibility for errors or omissions, or for\ndamages resulting from the use of the information contained herein.\n\xa0\nCLEAN CODE: AN AGILE GUIDE TO SOFTWARE CRAFT\n\xa0\nFirst edition. October 15, 2023.\n\xa0\nCopyright © 2023 Kameron Hussain and Frahaan Hussain.\n\xa0\nWritten by Kameron Hussain and Frahaan Hussain. Table of Contents\n\xa0\nTitle Page\n\xa0\nCopyright Page\n\xa0\nClean Code: An Agile Guide to Software Craft\n\xa0\nChapter 1: Clean Code\n\xa0\nChapter 2: Meaningful Names\n\xa0\nChapter 3: Functions\n\xa0\nChapter 4: Comments\n\xa0\nChapter 5: Formatting\n\xa0\nChapter 6: Objects and Data Structures\n\xa0\nChapter 7: Error Handling\n\xa0\nChapter 8: Boundaries\n\xa0\nChapter 9: Unit Tests Table of Contents\n\xa0\nChapter 1: Clean Code\n\xa0\n1.1 What Is Clean Code?\n\xa0\nThe Benefits of Clean Code\n\xa0\nCharacteristics of Clean Code\n\xa0\n1.2 Why Does Clean Code Mat

In [7]:
from enum import Enum
class DocumentType(Enum):
    TXT = 1
    CSV = 2
    PDF = 3
DocumentType.TXT.name.lower()

'txt'

In [10]:
!pip install -r requirements.txt

Collecting pypdfium2 (from -r requirements.txt (line 3))
  Downloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)
Collecting chromadbx (from -r requirements.txt (line 5))
  Downloading chromadbx-0.0.3-py3-none-any.whl.metadata (4.4 kB)
Collecting ulid-py (from -r requirements.txt (line 6))
  Downloading ulid_py-1.1.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting nanoid (from -r requirements.txt (line 7))
  Downloading nanoid-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting pymilvus (from -r requirements.txt (line 8))
  Downloading pymilvus-2.4.5-py3-none-any.whl.metadata (5.6 kB)
Collecting langchain_milvus (from -r requirements.txt (line 11))
  Downloading langchain_milvus-0.1.4-py3-none-any.whl.metadata (2.1 kB)
Collecting jsonlines (from -r requirements.txt (line 17))
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting setuptools>69 (from pymilvus->-r requirements.txt (line 8))
  Downloading setuptools-74.0.0-py3-none-any.whl.meta

In [11]:

import os
import jsonlines

"""
    File related operations
"""

class FileHelper:
    def __init__(self) -> None:
        self.string_helper = None

    def validate_file_path(self, file_path: str):
        if self.string_helper.is_null_or_whitespace(file_path):
            raise ValueError("File path is null or empty")

    def extract_directory_path(self, file_path: str):
        """
        Extract directory path from the full file path
        """
        self.validate_file_path(file_path)
        return os.path.dirname(file_path)

    def save_file_to_disk(self, file_path, data):
        """
        Saves file to disk. If the directory doesn't exist, it creates the directory
        """
        self.validate_file_path(file_path)
        os.makedirs(self.extract_directory_path(file_path), exist_ok=True)
        with open(file_path, "w") as f:
            f.write(data)

    def write_jsonlines(self, file_path: str, data_obj):
        """
            Append json to jsonl
        """
        with jsonlines.open(file_path, "a") as writer:
            writer.write(data_obj)

    def read_jsonlines(self, file_path: str):
        """
            Read from jsonlines file
        """

        with jsonlines.open(file_path) as reader:
            for obj in reader:
                yield obj


file_helper = FileHelper()
fp = "pu.jsonl"
file_helper.write_jsonlines(fp, {"a":1})

In [12]:
file_helper.write_jsonlines(fp, {"a":2})

In [19]:
import datetime
class TimeHelper():
    def get_utc_now(self):
        return datetime.datetime.now(datetime.timezone.utc)

    def get_elapsed_seconds(self, start_time: datetime):
        return (self.get_utc_now() - start_time).total_seconds()
    
t = TimeHelper()
n = t.get_utc_now()

In [None]:
datetime.datetime()

In [21]:
n

datetime.datetime(2024, 8, 28, 3, 9, 16, 546779, tzinfo=datetime.timezone.utc)

In [22]:
ns = str(n)
ns

'2024-08-28 03:09:16.546779+00:00'

In [33]:
import datetime

class MetadataDto():
    def __init__(self,
                source_id: str = "",
                url: str = "",
                text: str = "",
                page: int = 0,
                created_date: datetime = None) -> None:
        self.source_id = source_id
        self.url = url
        self.text = text
        self.page = page
        self.created_date = created_date

    def from_dict(dict_obj):
        time_helper = TimeHelper()
        
        return MetadataDto(
            source_id=dict_obj.get("source_id"),
            url=dict_obj.get("url"),
            text=dict_obj.get("text"),
            page=dict_obj.get("page"),
            created_date=time_helper.convert_utc_str_to_datetime(dict_obj.get("created_date")))
    
    

In [37]:
m = MetadataDto("a","b","c",0,str(TimeHelper().get_utc_now()))

In [38]:
m.__dict__

{'source_id': 'a',
 'url': 'b',
 'text': 'c',
 'page': 0,
 'created_date': '2024-08-28 03:49:42.599298+00:00'}

In [43]:
import chromadb
host = "localhost"
port = "8085"
client = chromadb.HttpClient(host=host, port = port)

In [44]:
client.list_collections()

[Collection(id=f931b56d-3c16-4182-a70c-d1745c461d05, name=b9dcb130-9db2-4eb9-afd2-8c2b86d7cbde)]

In [45]:
coll = client.get_or_create_collection("b9dcb130-9db2-4eb9-afd2-8c2b86d7cbde")

In [53]:
get_result = coll.get()

In [None]:
gr = coll.get()

In [54]:
ids = get_result.get("ids")
documents = get_result.get("documents")
metadatas = get_result.get("metadatas")

In [62]:
len(metadatas)

597

In [57]:
ids[0]

'0103d7ae-915f-4b6e-b02f-87feba960a13'

In [63]:
documents[0]

'* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n Menu\n* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n[Login](https://app.experteaseai.com/login)\n[Sign up](https://app.experteaseai.com/login)'

In [64]:
metadatas[0]

{'created_date': '2024-08-30 06:29:11.327380+00:00',
 'encoding': '',
 'page': 0,
 'sheet': '',
 'source': 'https://experteaseai.com/',
 'url': 'https://experteaseai.com/choosing-the-right-chatbot-platform-for-your-needs/'}

In [66]:
p = {'id': ids[0]} | metadatas[0] | {'document': documents[0]}
p

{'id': '0103d7ae-915f-4b6e-b02f-87feba960a13',
 'created_date': '2024-08-30 06:29:11.327380+00:00',
 'encoding': '',
 'page': 0,
 'sheet': '',
 'source': 'https://experteaseai.com/',
 'url': 'https://experteaseai.com/choosing-the-right-chatbot-platform-for-your-needs/',
 'document': '* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n Menu\n* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](http

In [9]:
client.delete_collection("b9dcb130-9db2-4eb9-afd2-8c2b86d7cbde")

In [19]:
class MathHelper():
    def calculate_percentage(self, numerator, denominator, round_upto = 0):
        if int(denominator) == 0:
            raise ValueError("Denominator cannot be zero")
        
        return round(100 * numerator/denominator, round_upto)
    
MathHelper().calculate_percentage(3,10)

30.0

In [6]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
# markdown_document = "# Foo\n\n    ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly"
markdown_document = "\nFeatures of Self\\-Learning Chatbots \\| ExpertEase AI\n[Skip to content](#main)\n \n ![Beta logo](https://experteaseai.com/wp-content/uploads/2024/04/ddsvsd-1.png)![Beta logo](https://experteaseai.com/wp-content/uploads/2024/04/ddsvsd-1.png) \n \n[![megaphone icon](https://experteaseai.com/wp-content/uploads/2024/04/megaphone-1.png)![megaphone icon](https://experteaseai.com/wp-content/uploads/2024/04/megaphone-1.png) \n### ExpertEase AI: Quantum Level Conversational AI Automations](#)\n \n[Book A Free Consult](https://calendly.com/d-subasiadvisory/ai-consult)\n[![ExpertEase AI Logo](https://experteaseai.com/wp-content/uploads/2024/04/fd8e9b6604b8c00bc961abf4737ef5a3.png)![ExpertEase AI Logo](https://experteaseai.com/wp-content/uploads/2024/04/fd8e9b6604b8c00bc961abf4737ef5a3.png)](https://experteaseai.com/) \n \n* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n Menu\n* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n[Login](https://app.experteaseai.com/login)\n[Sign up](https://app.experteaseai.com/login)\n[![ExpertEase AI Logo](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20200%2054'%3E%3C/svg%3E)![ExpertEase AI Logo](https://experteaseai.com/wp-content/uploads/2024/04/fd8e9b6604b8c00bc961abf4737ef5a3.png)](https://experteaseai.com/) \n* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n Menu\n* [Features](https://experteaseai.com/features/)\n* [Pricing](https://experteaseai.com/pricing/)\n* [Use Cases](https://experteaseai.com/use-cases/)\n* [Resourses](#)\n\t+ [FAQ](https://experteaseai.com/faq/)\n\t+ [Security](https://experteaseai.com/security/)\n\t+ [Blog](https://experteaseai.com/blog/)\n* [Company](#)\n\t+ [About Us](https://experteaseai.com/about-us/)\n\t+ [Solutions](https://experteaseai.com/solutions/)\n \n[Login](https://app.experteaseai.com/login)\n[Sign up](https://app.experteaseai.com/login)\nFeatures\n========\n \n \n ExpertEase AI conversational AI systems offers an enterprise\\-grade set of capabilities to automate and enhance customer engagements across industries. \n \nSelf\\-Learning\n--------------\n \nAt the core, our AI assistants continuously self\\-improve from new data and conversations without any additional training or programming. Their knowledge expands exponentially over time as they process more information across topics through natural language understanding.\nWe utilize cutting\\-edge transformer\\-based neural networks like Google’s BERT and deep reinforcement learning to power the self\\-learning. Our proprietary algorithms enable the AI to self\\-correct when responses do not match user intent. This creates a virtuous cycle where conversations improve pattern recognition, expanding viable dialog trees.\n \n![Cognitive Conversational Interface (CCI)](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154205.png)![Cognitive Conversational Interface (CCI)](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154205.png) \n![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \n![Custom AI Solutions With ExpertEase AI](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20858%20668'%3E%3C/svg%3E)![Custom AI Solutions With ExpertEase AI](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154253-2.png) \nUnderstanding Self\\-Learning Chatbots\n-------------------------------------\n \nAt its core, a chatbot is an artificial intelligence (AI) program designed to simulate human\\-like conversations, enabling interactions via text or voice. Their rise in popularity is evident, especially in business contexts where they are deployed for automating customer service, providing 24/7 assistance, and handling various other tasks.\n \n![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%200%200'%3E%3C/svg%3E)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \nThe Evolution to Self\\-Learning Chatbots\n----------------------------------------\n \nSelf\\-learning chatbots, often referred to as AI or intelligent chatbots, represent a leap forward in this technology. They employ machine learning algorithms, continually refining their performance through ongoing use. This means these chatbots learn from each interaction, adapting their responses based on new information they gather.\n \n![Cognitive Conversational Interface (CCI)](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20885%20646'%3E%3C/svg%3E)![Cognitive Conversational Interface (CCI)](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154205.png) \n![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%200%200'%3E%3C/svg%3E)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \n![Custom AI Solutions With ExpertEase AI](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20858%20668'%3E%3C/svg%3E)![Custom AI Solutions With ExpertEase AI](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154253-2.png) \nBeyond Traditional Chatbot Limitations\n--------------------------------------\n \nTraditional chatbots typically respond to specific keywords or phrases. In contrast, self\\-learning chatbots harness natural language processing (NLP) and machine learning to grasp the underlying intent of user messages. This ability allows them to offer responses that are not only relevant but also contextually nuanced.\n \n![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%200%200'%3E%3C/svg%3E)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \nAutonomous Learning and Adaptation\n----------------------------------\n \nWhat sets self\\-learning chatbots apart is their capacity to improve autonomously, without human intervention. They utilize feedback mechanisms like user ratings and sentiment analysis to evaluate and fine\\-tune their performance. This adaptive approach enables them to keep pace with evolving user needs and preferences, leading to a significantly enhanced user experience.\n \n![Cognitive Conversational Interface (CCI)](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20885%20646'%3E%3C/svg%3E)![Cognitive Conversational Interface (CCI)](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154205.png) \n![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%200%200'%3E%3C/svg%3E)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \n![Custom AI Solutions With ExpertEase AI](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20858%20668'%3E%3C/svg%3E)![Custom AI Solutions With ExpertEase AI](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154253-2.png) \nMultilingual Conversations\n--------------------------\n \nOur platform provides native support for over 50 languages – enabling you to reach global audiences more effectively. Our assistants may comprehend regional dialects, local terminology and can articulate responses with culturally\\-appropriate etiquettes.\nWe optimize model architecture for textual or vocal inputs. Built\\-in anti\\-bias techniques also ensure more uniform quality regardless of gender, accents or background of speakers. \n \n![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%200%200'%3E%3C/svg%3E)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \nLifelike Avatars\n----------------\n \nFor humanized face\\-to\\-face engagements, our video animation technology generates lifelike visual avatar assistants using just a photo or a short video. Complete with accurate lip\\-synching, micro\\-expressions and gestures – making it seamlessly natural.\n \n![Cognitive Conversational Interface (CCI)](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20885%20646'%3E%3C/svg%3E)![Cognitive Conversational Interface (CCI)](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154205.png) \n![](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%200%200'%3E%3C/svg%3E)![](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154296-1-e1712308479152.png) \n Features\n \n![Theme](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20179%20160'%3E%3C/svg%3E)![Theme](https://experteaseai.com/wp-content/uploads/2024/04/image-17.png) \n![Unmatched Cost Savings with ExpertEase AI](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2058%2060'%3E%3C/svg%3E)![Unmatched Cost Savings with ExpertEase AI](https://experteaseai.com/wp-content/uploads/2024/04/Vector-2.png)### Unmatched Cost Savings\nBy leveraging the power of our CCI, you can now build a virtual customer support team that rivals the performance of human employees, at a fraction of the cost. Starting at just $5 a day, our CCI solution provides the equivalent support capability of hiring full\\-time employees with annual salaries ranging from $60,000 to $120,000\\. This unprecedented cost saving enables businesses of all sizes to deliver exceptional customer support without breaking the bank.\n \n![Theme](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20179%20160'%3E%3C/svg%3E)![Theme](https://experteaseai.com/wp-content/uploads/2024/04/image-17.png) \n![Multilingual Support and Personalization](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2041%2041'%3E%3C/svg%3E)![Multilingual Support and Personalization](https://experteaseai.com/wp-content/uploads/2024/04/Vector-3.png)### Superior Cognitive Capabilities\nNot only is our CCI solution cost\\-effective, but it also boasts an impressive level of intelligence. With an IQ surpassing 100, our CCI outperforms the average human in terms of cognitive abilities. This means that your customers will receive support from an AI system that can understand complex queries, provide accurate and contextually relevant responses, and even anticipate their needs based on previous interactions.\n \n![Theme](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20179%20160'%3E%3C/svg%3E)![Theme](https://experteaseai.com/wp-content/uploads/2024/04/image-17.png) \n![task icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2059%2060'%3E%3C/svg%3E)![task icon](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154137.png)### Versatile Application Across Industries\nFrom finance and healthcare to manufacturing and logistics, our CCI's high IQ and adaptability make it suitable for a wide range of industries and applications. Whether it's processing invoices, managing inventory, or analyzing medical records, our CCI can be customized to tackle industry\\-specific challenges and automate complex workflows.\n \n![Theme](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20179%20160'%3E%3C/svg%3E)![Theme](https://experteaseai.com/wp-content/uploads/2024/04/image-17.png) \n![Time icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2060%2060'%3E%3C/svg%3E)![Time icon](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154135.png)### 24/7 Availability and Scalability\nOne of the key advantages of our CCI solution is its round\\-the\\-clock availability. Unlike human employees who require breaks, vacations, and sick days, our CCI operates continuously, ensuring that your customers always have access to support when they need it. Moreover, our solution is infinitely scalable, allowing you to handle a growing volume of customer interactions without the need to hire and train additional staff.\n \n![Theme](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20179%20160'%3E%3C/svg%3E)![Theme](https://experteaseai.com/wp-content/uploads/2024/04/image-17.png) \n![Settings icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2061%2060'%3E%3C/svg%3E)![Settings icon](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261154136.png)### Multilingual Support and Personalization\nExpertEase AI's CCI supports a wide range of languages, enabling you to provide support to customers across the globe in their preferred language. Furthermore, our CCI leverages advanced natural language processing and machine learning techniques to understand the context and intent behind customer queries, allowing it to provide highly personalized and relevant responses. This level of personalization helps foster stronger customer relationships and improves overall satisfaction.\n \n \n[![ExpertEase AI Logo](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20200%2054'%3E%3C/svg%3E)![ExpertEase AI Logo](https://experteaseai.com/wp-content/uploads/2024/04/fd8e9b6604b8c00bc961abf4737ef5a3.png)](https://experteaseai.com) \nAdvanced conversational AI platform designed to deliver ultra\\-personalized guidance and support to customers.\n \nFollow Us\n---------\n \n[![Youtube Icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2040%2040'%3E%3C/svg%3E)![Youtube Icon](https://experteaseai.com/wp-content/uploads/2024/04/9999999999999999.png)](https://www.youtube.com/@ExpertEaseAI) \n[![Facebook icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2040%2039'%3E%3C/svg%3E)![Facebook icon](https://experteaseai.com/wp-content/uploads/2024/04/888888888888.png)](https://www.facebook.com/ExpertEaseAI) \n[![Twitter icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2041%2039'%3E%3C/svg%3E)![Twitter icon](https://experteaseai.com/wp-content/uploads/2024/04/66666666666666666.png)](https://twitter.com/ExpertEaseAI) \n[![LinkedIn Icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2040%2040'%3E%3C/svg%3E)![LinkedIn Icon](https://experteaseai.com/wp-content/uploads/2024/04/7777777777.png)](https://www.linkedin.com/company/experteaseai) \nLinks\n-----\n \n * [Privacy Policy](/privacy-policy)\n* [Terms \\& Conditions](https://experteaseai.com/terms-conditions/)\n* [Contact](/contact-us)\n* About Us\nNewsletter\n----------\n \n \nLeave this field empty if you're human: \n \n Enter your Email \nSubscribe\nContact Us\n----------\n \n[![Call icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2040%2040'%3E%3C/svg%3E)![Call icon](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261153188.png)](#)### [\\+61 8 8472 9801](tel:+61 8 8472 9801) [\\+61 4 2401 4661](tel:+61 4 2401 4661)\n \n[![Mail Icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2040%2040'%3E%3C/svg%3E)![Mail Icon](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261153189.png)](#)### [support@experteaseai.com](mailto:support@experteaseai.com)\n \n![Location icon](data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%2040%2040'%3E%3C/svg%3E)![Location icon](https://experteaseai.com/wp-content/uploads/2024/04/Group-1261153190.png)### Level 21/25 Grenfell Street Adelaide SA 5000\n \n \n"
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits

[Document(page_content='Features of Self\\-Learning Chatbots \\| ExpertEase AI\n[Skip to content](#main)  \n![Beta logo](https://experteaseai.com/wp-content/uploads/2024/04/ddsvsd-1.png)![Beta logo](https://experteaseai.com/wp-content/uploads/2024/04/ddsvsd-1.png)  \n[![megaphone icon](https://experteaseai.com/wp-content/uploads/2024/04/megaphone-1.png)![megaphone icon](https://experteaseai.com/wp-content/uploads/2024/04/megaphone-1.png)'),

In [2]:
!pip install pymupdf4llm

    opencv-python (>=3.) ; extra == 'all'
                  ~~~~^[0m[33m
[0m

In [3]:
import pymupdf4llm
md_text = pymupdf4llm.to_markdown("xframe.pdf")

In [4]:
md_text

'### Circular Interior Design Guide\n\nA handbook for circular economy interior design\n\n\n-----\n\nXFrame Circular Interior Design Guide | March 2023\n\nAuthored by G. Finch with support from K. Martin, L. Ransfield and B. Waddington.\nXFrame | Wellington, New Zealand and Adelaide, Australia\n\nAll images are the exclusive property of XFrame PTY Ltd.\n‘XFrame’ and the XFrame logo are registered trademarks of XFrame PTY Ltd.\nAll technical information supplied herewithin should be verified by independent\nprofessionals and/or XFrame.\n\n\n-----\n\n## Designed for now, built for later.\n\n\n-----\n\n-----\n\n#### Transforming Our Sector\n\nThe building and construction industry is the world’s largest consumer of raw virgin materials\nwhile also being the largest producer of solid waste. Little thought is given to how building\nmaterials might be efficiently recovered and reused. Economic pressures mean that low\nquality and chemically modified composite materials dominate modern constr

In [5]:
!pip install pymupdf

    opencv-python (>=3.) ; extra == 'all'
                  ~~~~^[0m[33m
[0m

In [6]:
import pymupdf
doc = pymupdf.open('xframe.pdf')

In [25]:
for page in doc:
    print('page num ', page.number)
    text = page.get_text()
    print('text ', text)
    if page.number == 3:
        images = page.get_images()
        print('images ', images)
        print('image svg ', page.get_svg_image())


page num  0
text  Circular Interior Design Guide
A handbook for circular economy interior design

page num  1
text  XFrame Circular Interior Design Guide | March 2023
Authored by G. Finch with support from K. Martin, L. Ransfield and B. Waddington. 
XFrame | Wellington, New Zealand and Adelaide, Australia
All images are the exclusive property of XFrame PTY Ltd. 
‘XFrame’ and the XFrame logo are registered trademarks of XFrame PTY Ltd. 
All technical information supplied herewithin should be verified by independent 
professionals and/or XFrame.
For more information, please inquire at hello@xframe.com.au

page num  2
text  Designed for now, 
built for later.

page num  3
text  ™

images  [(11, 0, 1562, 2206, 8, 'DeviceRGB', '', 'Im0', 'DCTDecode')]
image svg  <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="595.276" height="841.89" viewBox="0 0 595.276 841.89">
<defs>
<clipPath id="clip_1">
<path transform="matrix(1,0,0,-1,0,841.89)"

In [9]:
p.get

page 0 of xframe.pdf

In [10]:
from urllib.parse import urljoin, urlparse

url = "http://example.com/?a=text&q2=text2&q3=text3&q2=text4"

def remove_query_params(url: str):
    return urljoin(url, urlparse(url).path)

pp = remove_query_params(url)
print('pp ', pp)

pp  http://example.com/


In [None]:
client = chromadb.PersistentClient("./chroma_storage/")


In [1]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [2]:
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))

remove_punctuations("string. With. Punctuation")

'string With Punctuation'

In [6]:
len("string. With. Punctuation")

25

In [3]:
str(remove_punctuations)

'<function remove_punctuations at 0x10ace77f0>'

In [None]:
class DataTransformationDto():
    def __init__(self,
                source: str,
                url: str,
                operation: str,
                previous_text: str,
                current_text: str,
                previous_count: int,
                current_count: int) -> None:
        self.source = source
        self.url = url
        self.operation = operation
        self.previous_text = previous_text
        self.current_text = current_text
        self.previous_count = previous_count
        self.current_count = current_count


In [5]:
remove_punctuations.__name__

'remove_punctuations'