In [1]:
from pathlib import Path
from time import perf_counter
from time import time

from dotenv import load_dotenv,find_dotenv
from langchain_community.document_loaders import DirectoryLoader,TextLoader,PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from unstructured.partition.pdf import partition_pdf


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder holding the sample letters, relative to the notebook's working directory.
documentPath = Path("data") / "letters"

<h3>Using the Glob Parameter</h3>

In [3]:
# Build a directory loader that only picks up files matching *.txt.
loader = DirectoryLoader(
    path=documentPath,
    glob="*.txt",
)

In [4]:
# Run the loader and keep everything it returns in `docs`.
docs=loader.load()

In [5]:
# Sanity check: how many documents the *.txt glob produced (recorded run: 3).
len(docs)

3

<h3>Showing the Progress Bar</h3>

In [6]:
# Time a full directory load with show_progress=True, which draws a tqdm
# progress bar while the files are being read.
# perf_counter() is used instead of time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas the
# wall clock can be coarse or jump if the system time is adjusted.
start = perf_counter()
loader = DirectoryLoader(path=documentPath, glob='*.txt', show_progress=True)
docs = loader.load()
print(perf_counter() - start)  # elapsed seconds

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.11it/s]

2.7049665451049805





In [7]:
# The progress bar is cosmetic: the document count should be unchanged (recorded run: 3).
len(docs)

3

<h3>Using Multithreading</h3>

In [8]:
# Same load as the previous timing cell, now with use_multithreading=True so
# the elapsed time can be compared against the single-threaded run.
# perf_counter() is used instead of time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
start = perf_counter()
loader = DirectoryLoader(
    path=documentPath,
    glob="*.txt",
    show_progress=True,
    use_multithreading=True,
)
docs = loader.load()
print(perf_counter() - start)  # elapsed seconds

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.33it/s]

2.2687950134277344





In [9]:
# Multithreading should not change what gets loaded (recorded run: 3).
len(docs)

3

<h3>Changing from Unstructured Loader to Text Loader</h3>

In [12]:

# Time the load again, this time reading each file with TextLoader
# (loader_cls) instead of the loader DirectoryLoader uses by default.
# perf_counter() is used instead of time(): this cell finishes in well under
# a second, so a monotonic, high-resolution clock is needed for a
# trustworthy figure.
start = perf_counter()

loader = DirectoryLoader(
    path=documentPath,
    glob="*.txt",
    show_progress=True,
    use_multithreading=True,
    loader_cls=TextLoader,
    # loader_kwargs is forwarded to TextLoader. autodetect_encoding=True asks
    # it to try detecting a file's encoding when the default decode fails,
    # rather than erroring out on files that are not in the default encoding.
    loader_kwargs={"autodetect_encoding": True},
)

docs = loader.load()
print(perf_counter() - start)  # elapsed seconds


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 34.88it/s][A

0.09299755096435547





In [13]:
# Switching to TextLoader should still yield one entry per letter (recorded run: 3).
len(docs)

3

In [14]:
# Print the text of every loaded document; the trailing run of newlines
# leaves a visible gap between consecutive documents.
for doc in docs:
    print(doc.page_content, "\n" * 5, sep=" ")

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
House of Commons
From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the
second week of May, only a rapid retreat to Amiens and the south could have saved the British and
French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact was
not immediately realized. The French High Command hoped they would be able to close the gap, and
the Armies of the north were under their orders. Moreover, a retirement of this kind would have
involved almost certainly the destruction of the fine Belgian Army of over 20 divisions and the
abandonment of the whole of Belgium. Therefore, when the force and scope of the German
penetration were realized and when a new French Generalissimo, General Weygand, assumed
command in place of General Gamelin, an effort was made by the French and British Armies in
Belgium to keep on holding the 

<h3>Failing Silently with silent_errors</h3>

In [15]:
# Deliberate failure demo: point TextLoader at the PDF files.
# With silent_errors=True the loader reports each file it cannot read and
# carries on instead of raising; the recorded output shows one
# "Error loading file ..." line per PDF while the cell still completes.
# NOTE(review): `docs` is presumably empty after this cell - confirm with
# len(docs) before relying on it.
# perf_counter() is used instead of time(): this cell finishes in
# milliseconds, which a wall clock may not resolve reliably.
start = perf_counter()

loader = DirectoryLoader(
    path=documentPath,
    glob="*.pdf",
    show_progress=True,
    use_multithreading=False,
    loader_cls=TextLoader,
    silent_errors=True,
)

docs = loader.load()
print(perf_counter() - start)  # elapsed seconds


  0%|                                                                                            | 0/5 [00:00<?, ?it/s][AError loading file data\letters\Accenture Reliving Letter.pdf: Error loading data\letters\Accenture Reliving Letter.pdf
Error loading file data\letters\Morgan Stanley Relieving Letter.pdf: Error loading data\letters\Morgan Stanley Relieving Letter.pdf
Error loading file data\letters\Quantiphi Relieving Letter.pdf: Error loading data\letters\Quantiphi Relieving Letter.pdf
Error loading file data\letters\TCS Reliving Letter.pdf: Error loading data\letters\TCS Reliving Letter.pdf
Error loading file data\letters\Wipro Reliving Letter.pdf: Error loading data\letters\Wipro Reliving Letter.pdf
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 713.95it/s]

0.013000249862670898





<h3>Loading PDF Files</h3>

In [16]:

# Load the same PDF files with a loader that understands the format:
# PyPDFLoader is passed as loader_cls in place of TextLoader.
# perf_counter() is used instead of time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
start = perf_counter()

loader = DirectoryLoader(
    path=documentPath,
    glob="*.pdf",
    show_progress=True,
    use_multithreading=True,
    loader_cls=PyPDFLoader,
)

docs = loader.load()
print(perf_counter() - start)  # elapsed seconds


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4

 40%|█████████████████████████████████▌                                                  | 2/5 [00:00<00:00, 14.65it/s][A
 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:00<00:00, 18.72it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 13.01it/s][A

0.3880128860473633





In [17]:
# Number of documents produced from the PDFs (recorded run: 5, matching the 5 files in the progress bar).
len(docs)

5