### Data Ingestion

In [26]:
### Document structure: a page_content string plus a metadata dict
from langchain_core.documents import Document

# Hand-built example Document; the metadata keys and values are arbitrary sample data.
doc = Document(
    metadata={
        "source": "profile.txt",
        "pages": 1,
        "author": "Govind",
        "Page_created": "30/10/2003",
    },
    page_content="Polireddi Govind",
)
doc

Document(metadata={'source': 'profile.txt', 'pages': 1, 'author': 'Govind', 'Page_created': '30/10/2003'}, page_content='Polireddi Govind')

In [27]:
## Create the sample text files
import os

# Make sure the target folder exists; re-running the cell is harmless.
os.makedirs("../data/textfiles", exist_ok=True)

# Maps each output path to the text written into it.
# NOTE: the continuation lines of each triple-quoted string keep their leading
# spaces (and the trailing space before each line break), so that whitespace
# ends up in the written files as well.
sample_files = {}
sample_files["../data/textfiles/file1.txt"] = """Machine learning (ML) is a branch of artificial intelligence that enables systems to learn patterns 
    from data and make predictions or decisions without being explicitly programmed. It includes techniques like supervised learning, 
    unsupervised learning, and reinforcement learning. ML powers applications such as recommendation systems, fraud detection, and 
    self-driving cars."""
sample_files["../data/textfiles/file2.txt"] = """K-means clustering is an unsupervised machine learning algorithm used to group data points into 
    K distinct clusters based on similarity. It works by repeatedly assigning points to the nearest cluster center and then updating the 
    centers until the groups stabilize. This method is widely used in customer segmentation, image compression, and pattern recognition 
    because it is simple, fast, and effective for large datasets."""

# Write every entry to disk, overwriting any existing file of the same name.
for filename, text in sample_files.items():
    with open(filename, mode="w", encoding="utf-8") as f:
        f.write(text)

In [28]:
### TextLoader: load a single text file
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="../data/textfiles/file1.txt", encoding="utf-8")
# load() returns a list of Document objects.
# The variable name (including its typo) is left unchanged so any other cell
# that refers to it keeps working.
ducument_loader = loader.load()
print(ducument_loader)

[Document(metadata={'source': '../data/textfiles/file1.txt'}, page_content='Machine learning (ML) is a branch of artificial intelligence that enables systems to learn patterns \n    from data and make predictions or decisions without being explicitly programmed. It includes techniques like supervised learning, \n    unsupervised learning, and reinforcement learning. ML powers applications such as recommendation systems, fraud detection, and \n    self-driving cars.')]


In [29]:
### DirectoryLoader: load all .txt files matched by the glob under a folder
from langchain_community.document_loaders import DirectoryLoader

# Matched files are handed to TextLoader, which receives loader_kwargs
# (here: the file encoding).
dir_loader = DirectoryLoader(
    "../data/textfiles",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False,
)
documents = dir_loader.load()
documents

[Document(metadata={'source': '../data/textfiles/file2.txt'}, page_content='K-means clustering is an unsupervised machine learning algorithm used to group data points into \n    K distinct clusters based on similarity. It works by repeatedly assigning points to the nearest cluster center and then updating the \n    centers until the groups stabilize. This method is widely used in customer segmentation, image compression, and pattern recognition \n    because it is simple, fast, and effective for large datasets.'),
 Document(metadata={'source': '../data/textfiles/file1.txt'}, page_content='Machine learning (ML) is a branch of artificial intelligence that enables systems to learn patterns \n    from data and make predictions or decisions without being explicitly programmed. It includes techniques like supervised learning, \n    unsupervised learning, and reinforcement learning. ML powers applications such as recommendation systems, fraud detection, and \n    self-driving cars.')]

In [30]:
### PDF loading: run PyMuPDFLoader over the .pdf files matched by the glob
from langchain_community.document_loaders import PyPDFLoader,PyMuPDFLoader

# PyPDFLoader is imported above but not used in this cell.
# NOTE: this rebinds dir_loader, which previously held the text-file loader.
dir_loader = DirectoryLoader(
    "../data/pdffiles",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)
pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-09-22T13:41:50+05:30', 'source': '../data/pdffiles/RETAIL_project.pdf', 'file_path': '../data/pdffiles/RETAIL_project.pdf', 'total_pages': 18, 'format': 'PDF 1.7', 'title': '', 'author': 'Polireddi Govind', 'subject': '', 'keywords': '', 'moddate': '2025-09-22T13:41:50+05:30', 'trapped': '', 'modDate': "D:20250922134150+05'30'", 'creationDate': "D:20250922134150+05'30'", 'page': 0}, page_content='RETAIL – CASE - STUDY \nNAME: Govind Polireddi \n \n \n \n \n \nEmployee Id: AS1552 \n \nProblem Statement \nRetail companies struggle with integrating in-store POS transactions, online orders, and \ninventory systems into a single analytics-ready platform. \nThey need: \n• \nBatch processing for daily sales reconciliation. \n• \nReal-time ingestion for fraud detection & stock alerts. \n• \nCloud-native scalability to support peak season sales. \n• \nUnified warehouse (Snowflake / S