## Data Ingestion

### Document Data Structure

In [None]:
from langchain_core.documents import Document

In [2]:
# Build a sample Document: a metadata dict describing the text, plus the text itself.
doc = Document(
    metadata={
        "source": "example.txt",
        "page": 1,
        "author": "Priyanshi Jajoo",
        # NOTE(review): this key holds a date-like string; "data" may be a typo
        # for "date" -- confirm before any downstream code relies on it.
        "data": "2025-10-24",
    },
    page_content="This is the main text content I am using to create RAG",
)

In [3]:
# Display the Document's repr to inspect its metadata and page_content.
doc

Document(metadata={'source': 'example.txt', 'page': 1, 'author': 'Priyanshi Jajoo', 'data': '2025-10-24'}, page_content='This is the main text content I am using to create RAG')

In [5]:
# Make sure the folder that will hold the sample .txt files exists.
# exist_ok=True keeps this cell safe to re-run when the folder is already there.

import os

os.makedirs(name="../data/text_files", exist_ok=True)

In [None]:
# Sample corpus: maps each target file path to the text stored for that file.
# The triple-quoted strings are kept verbatim, so their leading newline and
# indentation are part of the stored text.
sample_texts={
    "../data/text_files/python_intro.txt": """
    Python is a high-level, general-purpose programming language known for its readability and versatility. Created by Guido van Rossum and first released in 1991, it has become one of the most popular programming languages globally.
    Key Characteristics and Features:
        Easy to Learn and Use: Python's syntax is designed to be clear and concise, resembling natural language, making it an excellent choice for beginners.
        Versatile and Multi-Paradigm: Python supports various programming paradigms, including procedural, object-oriented, and functional programming. It can be used for a wide range of applications.
        Extensive Standard Library and Ecosystem: Python boasts a large and comprehensive standard library, providing modules and packages for diverse tasks. Additionally, a vast ecosystem of third-party libraries and frameworks extends its capabilities even further.
        Interpreted Language: Python code is executed line by line by an interpreter, eliminating the need for a separate compilation step and enabling rapid prototyping and development.
        Cross-Platform Compatibility: Python applications can run on various operating systems, including Windows, macOS, and Linux, without significant modifications.
        Large and Active Community: Python benefits from a vibrant and supportive community of developers, offering abundant resources, documentation, and assistance.

    """,
    "../data/text_files/machine_learning_intro.txt": """ Machine learning is a subfield of AI where computers learn to 
    identify patterns and make decisions from data without being explicitly 
    programmed for every task. It involves feeding data into algorithms to train models, 
    which can then make predictions or classify new, unseen data. 
    This allows systems to improve their performance as they are exposed to 
    more data and are used in applications like image recognition, fraud detection, 
    and personalized recommendations.
    """
}

# Write each sample text to its target path as UTF-8 (overwriting any existing
# file), then report success.
for filepath, text in sample_texts.items():
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)

print("Sample Text files created successfully!")


Sample Text files created successfully!


In [10]:
# TextLoader: read a single UTF-8 text file and load it as Document objects.
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="../data/text_files/python_intro.txt", encoding="utf-8")
docs = loader.load()

print(docs)

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="\n   Python is a high-level, general-purpose programming language known for its readability and versatility. Created by Guido van Rossum and first released in 1991, it has become one of the most popular programming languages globally.\n    Key Characteristics and Features:\n        Easy to Learn and Use: Python's syntax is designed to be clear and concise, resembling natural language, making it an excellent choice for beginners.\n        Versatile and Multi-Paradigm: Python supports various programming paradigms, including procedural, object-oriented, and functional programming. It can be used for a wide range of applications.\n        Extensive Standard Library and Ecosystem: Python boasts a large and comprehensive standard library, providing modules and packages for diverse tasks. Additionally, a vast ecosystem of third-party libraries and frameworks extends its capabilities even further.\n        Int

In [13]:
# DirectoryLoader: load every file under ../data/text_files that matches the
# "**/*.txt" glob, using TextLoader (with UTF-8 encoding) for each file.

from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
)
documents = dir_loader.load()

documents


[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="\n   Python is a high-level, general-purpose programming language known for its readability and versatility. Created by Guido van Rossum and first released in 1991, it has become one of the most popular programming languages globally.\n    Key Characteristics and Features:\n        Easy to Learn and Use: Python's syntax is designed to be clear and concise, resembling natural language, making it an excellent choice for beginners.\n        Versatile and Multi-Paradigm: Python supports various programming paradigms, including procedural, object-oriented, and functional programming. It can be used for a wide range of applications.\n        Extensive Standard Library and Ecosystem: Python boasts a large and comprehensive standard library, providing modules and packages for diverse tasks. Additionally, a vast ecosystem of third-party libraries and frameworks extends its capabilities even further.\n        Int

In [15]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

# Load every file under ../data/pdf that matches the "**/*.pdf" glob, using
# PyMuPDFLoader for each file. (PyPDFLoader is imported but not used in this cell.)
pdf_dir_loader = DirectoryLoader(
    path="../data/pdf",
    loader_cls=PyMuPDFLoader,
    glob="**/*.pdf",
)

pdf_documents = pdf_dir_loader.load()

pdf_documents





[Document(metadata={'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_path': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'format': 'PDF 1.7', 'title': '', 'author': 'prnk.mshr@gmail.com', 'subject': '', 'keywords': '', 'moddate': '2019-04-20T14:21:38+05:30', 'trapped': '', 'modDate': "D:20190420142138+05'30'", 'creationDate': "D:20190420142028+05'30'", 'page': 0}, page_content=''),
 Document(metadata={'producer': 'Microsoft® Word for Office 365', 'creator': 'Microsoft® Word for Office 365', 'creationdate': '2019-04-20T14:20:28+05:30', 'source': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'file_path': '../data/pdf/PT-365-INTERNATIONAL-RELATIONS-2019.pdf', 'total_pages': 42, 'format': 'PDF 1.7', 'title': '', 'author': 'prnk.mshr@gmail.com', 'subject': '', 'keywords': '', 'moddate': '2019-04-20

In [16]:
# Check the type of the first entry returned by the PDF directory loader.
type(pdf_documents[0])

langchain_core.documents.base.Document