In [1]:
# Document Structure
from langchain_core.documents import Document

In [2]:
doc = Document(
    page_content = "This is the page content I am adding to the document",
    metadata = {
        "source":"example.txt",
        "author":"Renju Rajagopalan",
        "page_length":1
    }
)

doc

Document(metadata={'source': 'example.txt', 'author': 'Renju Rajagopalan', 'page_length': 1}, page_content='This is the page content I am adding to the document')

In [3]:
import os
os.makedirs("../data/sample_text_files", exist_ok=True)

sample_texts = {
 "../data/sample_text_files/python_intro.txt":
 "Python is a popular programming language. It was created by Guido van Rossum, and released in 1991.Python was designed for readability, and has some similarities to the English language with influence from mathematics.Python uses new lines to complete a command, as opposed to other programming languages which often use semicolons or parentheses.ython relies on indentation, using whitespace, to define scope; such as the scope of loops, functions and classes. Other programming languages often use curly-brackets for this purpose." ,
 
"../data/sample_text_files/deep_learning.txt":
"Deep Learning is transforming the way machines understand, learn and interact with complex data. Deep learning mimics neural networks of the human brain, it enables computers to autonomously uncover patterns and make informed decisions from vast amounts of unstructured data"
}



In [4]:
for file_path,content in sample_texts.items():
    with open(file_path,'w',encoding="utf-8") as f:
        f.write(content)

print("Sample text files created")

Sample text files created


In [5]:
### TextLoader
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/sample_text_files/python_intro.txt", encoding = "utf-8")
doc = loader.load()
print(doc)

  from .autonotebook import tqdm as notebook_tqdm


[Document(metadata={'source': '../data/sample_text_files/python_intro.txt'}, page_content='Python is a popular programming language. It was created by Guido van Rossum, and released in 1991.Python was designed for readability, and has some similarities to the English language with influence from mathematics.Python uses new lines to complete a command, as opposed to other programming languages which often use semicolons or parentheses.ython relies on indentation, using whitespace, to define scope; such as the scope of loops, functions and classes. Other programming languages often use curly-brackets for this purpose.')]


In [6]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/sample_text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"},
    show_progress=False
)

In [7]:
documents = dir_loader.load()
documents

[Document(metadata={'source': '..\\data\\sample_text_files\\deep_learning.txt'}, page_content='Deep Learning is transforming the way machines understand, learn and interact with complex data. Deep learning mimics neural networks of the human brain, it enables computers to autonomously uncover patterns and make informed decisions from vast amounts of unstructured data'),
 Document(metadata={'source': '..\\data\\sample_text_files\\python_intro.txt'}, page_content='Python is a popular programming language. It was created by Guido van Rossum, and released in 1991.Python was designed for readability, and has some similarities to the English language with influence from mathematics.Python uses new lines to complete a command, as opposed to other programming languages which often use semicolons or parentheses.ython relies on indentation, using whitespace, to define scope; such as the scope of loops, functions and classes. Other programming languages often use curly-brackets for this purpose.'

In [8]:
from langchain_community.document_loaders import PyMuPDFLoader
## load all the PDF files from the directory
pdf_loader = DirectoryLoader(
    "../data/PDFs",
    glob = '**/*.pdf',
    loader_cls = PyMuPDFLoader,
    show_progress = False
)

In [9]:
pdf_docs = pdf_loader.load()
pdf_docs

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-06-03T02:14:40+00:00', 'source': '..\\data\\PDFs\\Diffusion Model.pdf', 'file_path': '..\\data\\PDFs\\Diffusion Model.pdf', 'total_pages': 56, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-06-03T02:14:40+00:00', 'trapped': '', 'modDate': 'D:20250603021440Z', 'creationDate': 'D:20250603021440Z', 'page': 0}, page_content='MIT Class 6.S184: Generative AI With Stochastic Differential Equations, 2025\nAn Introduction to Flow Matching and Diffusion Models\nPeter Holderrieth and Ezra Erives\nWebsite: https://diffusion.csail.mit.edu/\n1\nIntroduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n2\n1.1\nOverview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n2\n1.2\nCourse Structure . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .