In [3]:
!pip install aiofiles

Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0


In [14]:
from dotenv import load_dotenv,find_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from pathlib import Path

from typing import AsyncIterator,Iterator
from langchain_core.document_loaders import BaseLoader, BaseBlobParser, Blob
from langchain_core.documents import Document
import aiofiles

In [17]:
filePath=Path("data").joinpath("meow.txt")

In [6]:
class CustomDocumentLoader(BaseLoader):
    """
        An example document loader that reads a file line by line
    """
    def __init__(self,filePath:str) ->None:
        """
            Initializes the loader with a file path
        Args:
            filePath: The path to the file to load
        """
        self.filePath=filePath

    def lazy_load(self) -> Iterator[Document]:
        """
            A Lazy Loader that reads a file line by line
            When you are implementing lazy load methods, you should use a generator to yield documents one by one
        Args:
            None
        """
        with open(self.filePath,encoding="utf-8") as f:
            lineNumber=0
            for line in f:
                yield Document(page_content=line, metadata={"line_number":lineNumber,"source":self.filePath})
                lineNumber+=1

    async def alazy_load(self) -> AsyncIterator[Document]:
        """
            An Async Loader that reads a file line by line 
        """
        async with aiofiles.open(file=self.filePath,encoding="utf-8") as f:
            lineNumber=0
            async for line in f:
                yield Document(page_content=line, metadata={"line_number":lineNumber,"source":self.filePath})
                lineNumber+=1

In [8]:
with open(file="./data/meow.txt",mode="w",encoding="utf-8") as f:
    qualityContent="meow meow🐱 \n meow meow🐱 \n meow😻😻"
    f.write(qualityContent)

In [18]:
loader=CustomDocumentLoader(filePath=filePath)

In [19]:
for doc in loader.lazy_load():
    print(doc)
    print(type(doc),"\n")

page_content='meow meow🐱 
' metadata={'line_number': 0, 'source': WindowsPath('data/meow.txt')}
<class 'langchain_core.documents.base.Document'> 

page_content=' meow meow🐱 
' metadata={'line_number': 1, 'source': WindowsPath('data/meow.txt')}
<class 'langchain_core.documents.base.Document'> 

page_content=' meow😻😻' metadata={'line_number': 2, 'source': WindowsPath('data/meow.txt')}
<class 'langchain_core.documents.base.Document'> 



In [20]:
async for doc in loader.alazy_load():
    print(doc)
    print(type(doc),"\n")

page_content='meow meow🐱 
' metadata={'line_number': 0, 'source': WindowsPath('data/meow.txt')}
<class 'langchain_core.documents.base.Document'> 

page_content=' meow meow🐱 
' metadata={'line_number': 1, 'source': WindowsPath('data/meow.txt')}
<class 'langchain_core.documents.base.Document'> 

page_content=' meow😻😻' metadata={'line_number': 2, 'source': WindowsPath('data/meow.txt')}
<class 'langchain_core.documents.base.Document'> 



In [21]:
# Eager Execution which fits all the contents into the memory all at once
loader.load()

[Document(metadata={'line_number': 0, 'source': WindowsPath('data/meow.txt')}, page_content='meow meow🐱 \n'),
 Document(metadata={'line_number': 1, 'source': WindowsPath('data/meow.txt')}, page_content=' meow meow🐱 \n'),
 Document(metadata={'line_number': 2, 'source': WindowsPath('data/meow.txt')}, page_content=' meow😻😻')]

<h3>Parsing throgh Base Blob Parser</h3>

In [50]:
class MyParser(BaseBlobParser):
    """
        A simple parser that creates a document from each line
    """
    def lazy_parse(self, blob:Blob) -> Iterator[Document]:
        """
            Parse a blob into a document line by line
        """
        lineNumber=0
        with blob.as_bytes_io() as f:
            for line in f:
                lineNumber+=1
                # Parsing is possible because of the Parent BaseBlobParser
                yield Document(page_content=line, 
                               metadata={"line_number":lineNumber,"source":blob.source}
                              )

In [42]:
blob=Blob.from_path(path=filePath)

In [43]:
print(blob.as_string())

meow meow🐱 
 meow meow🐱 
 meow😻😻


In [44]:
with blob.as_bytes_io() as f:
    for line in f:
        print(line)

b'meow meow\xf0\x9f\x90\xb1 \r\n'
b' meow meow\xf0\x9f\x90\xb1 \r\n'
b' meow\xf0\x9f\x98\xbb\xf0\x9f\x98\xbb'


In [45]:
blob.source

'data\\meow.txt'

In [46]:
parser=MyParser()

In [49]:
for parsed_item in parser.lazy_parse(blob=blob):
        print(parsed_item,"\n")


page_content='meow meow🐱 
' metadata={'line_number': 1, 'source': 'data\\meow.txt'} 

page_content=' meow meow🐱 
' metadata={'line_number': 2, 'source': 'data\\meow.txt'} 

page_content=' meow😻😻' metadata={'line_number': 3, 'source': 'data\\meow.txt'} 



In [53]:
blob=Blob(data=b"some data from memory\nmeow")
for parsed_item in parser.lazy_parse(blob=blob):
        print(parsed_item,"\n")

page_content='some data from memory
' metadata={'line_number': 1, 'source': None} 

page_content='meow' metadata={'line_number': 2, 'source': None} 

