In [10]:
import os
from typing import List,Dict,Any
import pandas as pd



In [71]:
from langchain_core.documents import Document
from langchain_text_splitters import (RecursiveCharacterTextSplitter
                                       ,CharacterTextSplitter,
                                       TokenTextSplitter)

### Understanding the Document Structure

In [12]:
doc =Document(
    page_content="This is the main content of the document.",
    metadata={
        "source":"example.txt",
        'page':1,
        "author":"Santhosh",
        "date_Created":"2025-10-23",
        "custom_field":"Custom Value"
    }
)
print("Document_Structure")
print("Document Content:", doc.page_content)
print("Document Metadata:", doc.metadata)

### Understanding the Document Structure
print("\n Metadata is crucial for RAG as it helps in tracing the origin of information, ensuring data integrity, and providing context during retrieval and generation processes.")
print("\n -Filtering search results")
print("\n -Trcaking document sources")
print("\n -Providing context in responses")
print("\n -debugging and auditing")

Document_Structure
Document Content: This is the main content of the document.
Document Metadata: {'source': 'example.txt', 'page': 1, 'author': 'Santhosh', 'date_Created': '2025-10-23', 'custom_field': 'Custom Value'}

 Metadata is crucial for RAG as it helps in tracing the origin of information, ensuring data integrity, and providing context during retrieval and generation processes.

 -Filtering search results

 -Trcaking document sources

 -Providing context in responses

 -debugging and auditing


### Creating text document and perform splittin on that document \

In [15]:
os.makedirs("sample/text_files",exist_ok=True)

In [35]:
sample_text={
"sample/text_files/langchain_splitter.txt":"""This is a sample text file to demonstrate text splitting using LangChain.
This is about the langchain framework. 
It provides various utilities for building applications with LLMs.
Text splitting is essential for handling large documents effectively.
Various text splitters are available in LangChain, such as CharacterTextSplitter and RecursiveCharacterText""",
"sample/text_files/langchain_loaders.txt":"""LangChain offers a variety of document loaders to facilitate data ingestion.
These loaders support multiple file formats, including text, PDF, and HTML.
Using the appropriate loader ensures efficient data processing and integration with LLMs.
Document loaders are designed to handle different data sources seamlessly.
They play a crucial role in preparing data for downstream tasks in RAG applications."""
}
for file_path,content in sample_text.items():
    with open(file_path,"w")as f:
        f.write(content)
print("\n Sample text files created.")


 Sample text files created.


### TextLoader -read Single file

In [36]:
from langchain_community.document_loaders import TextLoader

loader=TextLoader(file_path="sample/text_files/langchain_splitter.txt",encoding="utf-8")
loader

<langchain_community.document_loaders.text.TextLoader at 0x2457e2bc3d0>

In [37]:
documents=loader.load()
documents

[Document(metadata={'source': 'sample/text_files/langchain_splitter.txt'}, page_content='This is a sample text file to demonstrate text splitting using LangChain.\nThis is about the langchain framework. \nIt provides various utilities for building applications with LLMs.\nText splitting is essential for handling large documents effectively.\nVarious text splitters are available in LangChain, such as CharacterTextSplitter and RecursiveCharacterText')]

### Directory loader - read multiple files from a directory

In [53]:
from langchain_community.document_loaders import DirectoryLoader

director_loader=DirectoryLoader(
    path='sample/text_files',
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding":"utf-8"}
)

In [54]:
director_loader.load()

[Document(metadata={'source': 'sample\\text_files\\langchain_loaders.txt'}, page_content='LangChain offers a variety of document loaders to facilitate data ingestion.\nThese loaders support multiple file formats, including text, PDF, and HTML.\nUsing the appropriate loader ensures efficient data processing and integration with LLMs.\nDocument loaders are designed to handle different data sources seamlessly.\nThey play a crucial role in preparing data for downstream tasks in RAG applications.'),
 Document(metadata={'source': 'sample\\text_files\\langchain_splitter.txt'}, page_content='This is a sample text file to demonstrate text splitting using LangChain.\nThis is about the langchain framework. \nIt provides various utilities for building applications with LLMs.\nText splitting is essential for handling large documents effectively.\nVarious text splitters are available in LangChain, such as CharacterTextSplitter and RecursiveCharacterText')]

In [40]:
print("Directory loader characteristics:")
print("Advantages of Directory Loader:")
print("\n -Bulk Loading: Can load multiple files from a specified directory.")
print("\n -File Filtering: Supports glob patterns to filter specific file types.")
print("\n -Custom Loaders: Allows specifying different loader classes for various file formats.")
print("\n -Scalability: Efficient for large datasets organized in directories.")
print("Disadvantages of Directory Loader:")
print("\n -Complexity: Slightly more complex setup compared to single file loaders.")
print("\n -Overhead: May introduce overhead if the directory contains many irrelevant files.")
print("\n -Error Handling: Requires robust error handling for diverse file types and structures.")

Directory loader characteristics:
Advantages of Directory Loader:

 -Bulk Loading: Can load multiple files from a specified directory.

 -File Filtering: Supports glob patterns to filter specific file types.

 -Custom Loaders: Allows specifying different loader classes for various file formats.

 -Scalability: Efficient for large datasets organized in directories.
Disadvantages of Directory Loader:

 -Complexity: Slightly more complex setup compared to single file loaders.

 -Overhead: May introduce overhead if the directory contains many irrelevant files.

 -Error Handling: Requires robust error handling for diverse file types and structures.


In [49]:
char_splitter=CharacterTextSplitter(
    separator="\n",
    chunk_size=100,
    chunk_overlap=40,
)

In [68]:
char_chunks=char_splitter.split_text(documents[0].page_content)
char_chunks

['This is a sample text file to demonstrate text splitting using LangChain.',
 'This is about the langchain framework.',
 'It provides various utilities for building applications with LLMs.',
 'Text splitting is essential for handling large documents effectively.',
 'Various text splitters are available in LangChain, such as CharacterTextSplitter and RecursiveCharacterText']

In [64]:
print(char_chunks[0])
print("\n Total Chunks Created:", len(char_chunks))
print(char_chunks[1])
print(char_chunks[2])


This is a sample text file to demonstrate text splitting using LangChain.

 Total Chunks Created: 5
This is about the langchain framework.
It provides various utilities for building applications with LLMs.


In [69]:
recursive_splitter=RecursiveCharacterTextSplitter(
    separators=["\n\n","\n","."," ",""],
    chunk_size=80,
    chunk_overlap=20
)

In [70]:
recursive_splitter.split_text(documents[0].page_content)

['This is a sample text file to demonstrate text splitting using LangChain.',
 'This is about the langchain framework.',
 'It provides various utilities for building applications with LLMs.',
 'Text splitting is essential for handling large documents effectively.',
 'Various text splitters are available in LangChain, such as',
 'LangChain, such as CharacterTextSplitter and RecursiveCharacterText']

In [85]:
token_splitter=TokenTextSplitter(
    chunk_size=30,
    chunk_overlap=10
)

In [86]:
text="This is a sample text to demonstrate token-based text splitting. Tokenization helps in managing text data effectively, especially when dealing with language models that operate on tokens rather than raw text."
chunks=token_splitter.split_text(text)


In [87]:
print("total chunks created:",len(chunks))

total chunks created: 2


In [89]:
for i,chunk in enumerate(chunks):
    print(f"chunks {i} is : {chunk}")

chunks 0 is : This is a sample text to demonstrate token-based text splitting. Tokenization helps in managing text data effectively, especially when dealing with language models that operate
chunks 1 is :  effectively, especially when dealing with language models that operate on tokens rather than raw text.


In [None]:
print("\n TextCharacterSplitter")
print("advantages:  ")
print("\n -Simplicity: Easy to implement and understand.")
print("\n -Control: Provides direct control over chunk size and overlap.")
print("\n disadvantages:  ")
print("\n -Context Loss: May split sentences or ideas, leading to loss of context.")
print("---------------------------------------------------------\n recursive character splitter")
print("advantages:  ")
print("\n -Context Preservation: Attempts to maintain the integrity of sentences and ideas by using multiple separators.")
print("\n -Flexibility: Adapts to different text structures by trying various separators.")
print("\n disadvantages:  ")
print("\n -Complexity: More complex to configure and may require tuning of separators.")
print("-----------------------------------------------------------\n token-based splitter")
print("advantages:  ")
print("\n -Language Model Alignment: Works well with models that operate on tokens, ensuring better compatibility.")
print("\n -Efficiency: Can lead to more efficient use of model input limits by focusing on token counts.")
print("\n disadvantages:  ")
print("\n -Tokenization Dependency: Requires a reliable tokenization method, which may vary between models.")


 TextCharacterSplitter
advantages:  

 -Simplicity: Easy to implement and understand.

 -Control: Provides direct control over chunk size and overlap.

 disadvantages:  

 -Context Loss: May split sentences or ideas, leading to loss of context.
---------------------------------------------------------
 recursive character splitter
advantages:  

 -Context Preservation: Attempts to maintain the integrity of sentences and ideas by using multiple separators.

 -Flexibility: Adapts to different text structures by trying various separators.

 disadvantages:  

 -Complexity: More complex to configure and may require tuning of separators.

 token-based splitter
advantages:  

 -Language Model Alignment: Works well with models that operate on tokens, ensuring better compatibility.

 -Efficiency: Can lead to more efficient use of model input limits by focusing on token counts.

 disadvantages:  

 -Tokenization Dependency: Requires a reliable tokenization method, which may vary between models.

### Document loader with Pypdf and pymudf loader

In [395]:
from langchain_community.document_loaders import PyMuPDFLoader,PyPDFLoader

pypdfloader=PyPDFLoader(file_path="attention-is-all-you-need-Paper.pdf")
pypdf_documents=pypdfloader.load()
print(f"Total pages in PDF document: {len(pypdf_documents)}")

print(f"page metadata{pypdf_documents[0].metadata}")
print(f"page content snippet: {pypdf_documents[0].page_content[:500]}")  # Print first 500 characters of the first page


Total pages in PDF document: 11
page metadata{'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble 

In [398]:
try:
    pymudfloader=PyMuPDFLoader(file_path="attention-is-all-you-need-Paper.pdf")
    pymudf_documents=pymudfloader.load()
    print(f"Total pages in PDF document using PyMuPDFLoader: {len(pymudf_documents)}")
    print(f"page metadata{pymudf_documents[0].metadata}")
    print(f"pymudf docs:{pymudf_documents}")
except Exception as e:
    print("Error loading PDF with PyMuPDFLoader:", e)

Total pages in PDF document using PyMuPDFLoader: 11
page metadata{'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'attention-is-all-you-need-Paper.pdf', 'file_path': 'attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, ≈Åukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}
pymudf docs:[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': 'attention-is-all-you-need-Paper.pdf', 'file_path': 'attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aid

In [405]:
class PdfProcessor:
    def __init__(self,chunk_size,chunk_overlap):
        self.chunk_size=chunk_size
        self.chunk_overlap=chunk_overlap
        self.text_splitter=RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "]
        )

    def preprocess(self,path:str)->List[Document]:
        """Preprocess the PDF document and return list of Document chunks"""
        loader=PyPDFLoader(file_path=path)
        pages=loader.load()
        all_chunks=[]

        for page_num,page in enumerate(pages):

            clean_text=self.clean_text(page.page_content)
            if len(clean_text.strip())<=50:
                continue
            chunks=self.text_splitter.create_documents(
                texts=[clean_text],
                metadatas=[{
                    **page.metadata,
                    "page":page_num+1,
                    "total_pages":len(pages),
                    "chunk_size":self.chunk_size,
                    "char_count":len(clean_text)
                }])
            all_chunks.extend(chunks)
        return all_chunks
    def clean_text(self,text:str)->str:
        """Clean the text by removing unwanted characters"""
        text=" ".join(text.split())
        return text.replace("\n"," ").replace("\r"," ").strip()


In [408]:
preprocessor=PdfProcessor(chunk_size=500,chunk_overlap=50)
documents=preprocessor.preprocess(path="attention-is-all-you-need-Paper.pdf")
documents[0].metadata

{'producer': 'PyPDF2',
 'creator': 'PyPDF',
 'creationdate': '',
 'subject': 'Neural Information Processing Systems http://nips.cc/',
 'publisher': 'Curran Associates, Inc.',
 'language': 'en-US',
 'created': '2017',
 'eventtype': 'Poster',
 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On English-to-

### Load the Word Documents and preprocess it 

In [1]:
from docx import Document as DocxDocument
from langchain_community.document_loaders import Docx2txtLoader,UnstructuredWordDocumentLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Using Docx2txtLoader")

doc2txtloader=Docx2txtLoader(file_path="SanthoshReddyBora.docx")
documents_docs=doc2txtloader.load()
print(f"Total pages in DOCX document: {len(documents_docs)}")
print(f"page metadata{documents_docs[0].metadata}")
documents_docs[0].page_content[:500]

Using Docx2txtLoader
Total pages in DOCX document: 1
page metadata{'source': 'SanthoshReddyBora.docx'}


'Santhosh Reddy Bora\n\nüìû +91 9182335865 | üìß borasanthosh921@gmail.com | üåê LinkedIn \n\n\n\nProfile Summary\n\nMachine Learning Engineer with 2.5 years of ML experience and 4.5 years of IT industry experience. Skilled in building end-to-end ML pipelines, deploying scalable AI solutions, and applying MLOps best practices. Experienced in computer vision, NLP, predictive modeling. Proficient in Python, SQL, TensorFlow, Keras, MLflow, and Azure Cloud. Strong track record of improving model performance, reduc'

In [None]:
# print("Using UnstructuredWordDocumentLoader")
# unstructured_docloader=UnstructuredWordDocumentLoader(file_path="SanthoshReddyBora.docx",mode="single")
# unstructured_documents=unstructured_docloader.load()
# print(f"Total pages in DOCX document using UnstructuredWordDocumentLoader: {len(unstructured_documents)}")
# print(f"metadata:{unstructured_documents[0].metadata}")
# print(f"page content snippet: {unstructured_documents[0].page_content[:500]}")



Using UnstructuredWordDocumentLoader


### CSV and excel file -Structured Data

In [2]:
import pandas as pd


In [5]:
pd.read_csv("data/data.csv")

Unnamed: 0,name,age,gender,occupation
0,Alice,30,Female,Engineer
1,Bob,25,Male,Designer
2,Charlie,35,Male,Teacher
3,Diana,28,Female,Doctor
4,Ethan,40,Male,Lawyer
5,Fiona,32,Female,Architect
6,George,29,Male,Chef
7,Hannah,27,Female,Scient
8,George,29,Male,Chef
9,Hannah,27,Female,Scientist


In [6]:
from langchain_community.document_loaders import CSVLoader,UnstructuredCSVLoader

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
csv_loader=CSVLoader(
    file_path="data/data.csv",
    encoding="utf-8"
)
csv_documents=csv_loader.load()
print(f"Total documents loaded from CSV: {len(csv_documents)}")
print(f"Metadata of first document: {csv_documents[0].metadata}")
print(f"Content of first document: {csv_documents[0].page_content[:500]}")  # Print first 500 characters of the content


Total documents loaded from CSV: 11
Metadata of first document: {'source': 'data/data.csv', 'row': 0}
Content of first document: name: Alice
age: 30
gender: Female
occupation: Engineer


In [17]:
from typing import List
from langchain_core.documents import Document
def process_csv_file(file_path:str)->List[Document]:
    
    df=pd.read_csv(file_path)
    documents=[]

    for idx,row in df.iterrows():
        content=f"""customer Information
        NAME: {row['name']}\n Age: {row['age']}\n Gender: {row['gender']}\n occupation: {row['occupation']}
        """

        doc=Document(
            page_content=content,
            metadata={
                "source":file_path,
                "file_type":"csv",
                "row_index":idx
            }
        )
        documents.append(doc)
    return documents

In [18]:
process_csv_file(file_path="data/data.csv")

[Document(metadata={'source': 'data/data.csv', 'file_type': 'csv', 'row_index': 0}, page_content='customer Information\n        NAME: Alice\n Age: 30\n Gender: Female\n occupation: Engineer\n        '),
 Document(metadata={'source': 'data/data.csv', 'file_type': 'csv', 'row_index': 1}, page_content='customer Information\n        NAME: Bob\n Age: 25\n Gender: Male\n occupation: Designer\n        '),
 Document(metadata={'source': 'data/data.csv', 'file_type': 'csv', 'row_index': 2}, page_content='customer Information\n        NAME: Charlie\n Age: 35\n Gender: Male\n occupation: Teacher \n        '),
 Document(metadata={'source': 'data/data.csv', 'file_type': 'csv', 'row_index': 3}, page_content='customer Information\n        NAME: Diana\n Age: 28\n Gender: Female\n occupation: Doctor  \n        '),
 Document(metadata={'source': 'data/data.csv', 'file_type': 'csv', 'row_index': 4}, page_content='customer Information\n        NAME: Ethan\n Age: 40\n Gender: Male\n occupation: Lawyer\n     

### Excel Processing

In [22]:
print("Pandas based excel processing")
def excel_preprocessing(file_path:str)->List[Document]:
    documents=[]

    excel=pd.ExcelFile(file_path)

    for sheet_name in excel.sheet_names:
        df=pd.read_excel(file_path,sheet_name=sheet_name)

        sheet_content=f"Sheet:{sheet_name}\n"
        sheet_content+=f"columns:{df.columns}"
        sheet_content+=f"Rows:{len(df)}\n\n"
        sheet_content+=df.to_string(index=False)

        doc=Document(
            page_content=sheet_content,
            metadata={
                "source":file_path,
                "sheet_name":sheet_name,
                "rows":len(df),
                "columns":len(df.columns)
            }
        )
        documents.append(doc)
    return documents

Pandas based excel processing


In [23]:
excel_preprocessing(file_path="data/Actor.xlsx")

[Document(metadata={'source': 'data/Actor.xlsx', 'sheet_name': 'Sheet1', 'rows': 5, 'columns': 5}, page_content="Sheet:Sheet1\ncolumns:Index(['Name', 'Age ', 'Country', 'Occupation', 'salary'], dtype='object')Rows:5\n\n Name  Age  Country Occupation    salary\n  Bob    50  India       Actor 100000000\nNakka    42  India      Actor    1000000\n Lion    60  India      Actor   10000000\n King    65  India       Actor   5000000\nChiru    70  India       Actor  44555555"),
 Document(metadata={'source': 'data/Actor.xlsx', 'sheet_name': 'Sheet2', 'rows': 5, 'columns': 5}, page_content="Sheet:Sheet2\ncolumns:Index(['Name', 'Age ', 'Country', 'Occupation', 'salary'], dtype='object')Rows:5\n\n Name  Age  Country Occupation    salary\n  Bob    50  India       Actor 100000000\nNakka    42  India      Actor    1000000\n Lion    60  India      Actor   10000000\n King    65  India       Actor   5000000\nChiru    70  India       Actor  44555555")]

In [24]:
from langchain_community.document_loaders import UnstructuredExcelLoader

In [None]:
excel_loader=UnstructuredExcelLoader(
    file_path="data/Actor.xlsx",
    mode='single'
)
doc_load = excel_loader.load()
doc_load

[Document(metadata={'source': 'data/Actor.xlsx'}, page_content='Name Age Country Occupation salary Bob 50 India Actor 100000000 Nakka 42 India Actor 1000000 Lion 60 India Actor 10000000 King 65 India Actor 5000000 Chiru 70 India Actor 44555555\n\nName Age Country Occupation salary Bob 50 India Actor 100000000 Nakka 42 India Actor 1000000 Lion 60 India Actor 10000000 King 65 India Actor 5000000 Chiru 70 India Actor 44555555')]

In [35]:
json_data={
    "company":"TCS",
    "empolyees":[
        {
            "id":1,
            "name":"Nagendra",
            "age":26,
            "role":"SAP",
            "projects":[
                {"name":"SAP ABAP","status":"In Progress"},
                {"name":"Data Pipeline","status":"Completed"}
            ]
        },
        {
            "id":2,
            "name":"Santhosh",
            "age":26,
            "role":"SQL",
            "projects":[
                {"name":"Humana","status":"In progress"},
                {"name":"nationwide","status":"Completed"}
            ]
        }
    ],
    "Company":"Air india",
    "empolyee":[
        {
            "id":1,
            "name":"Vishnu",
            "age":26,
            "role":".NET",
            "projects":[
                {"name":"Humana","status":"In Progress"},
                {"name":"Data Pipeline","status":"Completed"}
            ]
        }
    ]
}
json_data


{'company': 'TCS',
 'empolyees': [{'id': 1,
   'name': 'Nagendra',
   'age': 26,
   'role': 'SAP',
   'projects': [{'name': 'SAP ABAP', 'status': 'In Progress'},
    {'name': 'Data Pipeline', 'status': 'Completed'}]},
  {'id': 2,
   'name': 'Santhosh',
   'age': 26,
   'role': 'SQL',
   'projects': [{'name': 'Humana', 'status': 'In progress'},
    {'name': 'nationwide', 'status': 'Completed'}]}],
 'Company': 'Air india',
 'empolyee': [{'id': 1,
   'name': 'Vishnu',
   'age': 26,
   'role': '.NET',
   'projects': [{'name': 'Humana', 'status': 'In Progress'},
    {'name': 'Data Pipeline', 'status': 'Completed'}]}]}

In [37]:
import json
with open('data/company.json',"w")as f:
    json.dump(json_data,f,indent=2)

In [43]:
from langchain_community.document_loaders import JSONLoader

json_loader=JSONLoader(file_path="data/company.json",jq_schema='.empolyees[]',text_content=False)
json_loader

<langchain_community.document_loaders.json_loader.JSONLoader at 0x2b94a907550>

In [47]:
docs=json_loader.load()
print(f"len of the documents {len(docs)}")
print(f"{docs[0].page_content}")
print(docs)

len of the documents 2
{"id": 1, "name": "Nagendra", "age": 26, "role": "SAP", "projects": [{"name": "SAP ABAP", "status": "In Progress"}, {"name": "Data Pipeline", "status": "Completed"}]}
[Document(metadata={'source': 'D:\\GENAI\\RAG\\01-DataIngestionParsing\\data\\company.json', 'seq_num': 1}, page_content='{"id": 1, "name": "Nagendra", "age": 26, "role": "SAP", "projects": [{"name": "SAP ABAP", "status": "In Progress"}, {"name": "Data Pipeline", "status": "Completed"}]}'), Document(metadata={'source': 'D:\\GENAI\\RAG\\01-DataIngestionParsing\\data\\company.json', 'seq_num': 2}, page_content='{"id": 2, "name": "Santhosh", "age": 26, "role": "SQL", "projects": [{"name": "Humana", "status": "In progress"}, {"name": "nationwide", "status": "Completed"}]}')]


In [51]:
jsonl_data=[
    {"timestamp":"2024-01-01", "event":"user_login", "user_id":123},
    {"timestamp":"2024-01-01", "event":"user_login", "user_id":124}
]

In [80]:
with open("data/user_data.jsonl",'w')as f:
    json.dump(jsonl_data,f,indent=2)

In [81]:
from langchain_community.document_loaders import JSONLoader
jsonl_loader=JSONLoader(
    file_path="data/user_data.jsonl",
    jq_schema='.',
    text_content=False
)

In [91]:
jsonl=jsonl_loader.load()
print(f"len of the documents {len(jsonl)}")
print(f"{jsonl[0].page_content}")
print(jsonl)

len of the documents 1
[{"timestamp": "2024-01-01", "event": "user_login", "user_id": 123}, {"timestamp": "2024-01-01", "event": "user_login", "user_id": 124}]
[Document(metadata={'source': 'D:\\GENAI\\RAG\\01-DataIngestionParsing\\data\\user_data.jsonl', 'seq_num': 1}, page_content='[{"timestamp": "2024-01-01", "event": "user_login", "user_id": 123}, {"timestamp": "2024-01-01", "event": "user_login", "user_id": 124}]')]


## DataBase parsing

In [97]:
import sqlite3
import os
os.makedirs('data/databases',exist_ok=True)

In [115]:
conn=sqlite3.connect('data/databases/company.db')
cursor=conn.cursor()

In [117]:
cursor.execute("create table if not exists employees (id integer primary key,name text)")


<sqlite3.Cursor at 0x2b94b6a4dc0>

In [118]:
cursor.execute("Create table if not exists projects (id integer primaary key,name text)")

<sqlite3.Cursor at 0x2b94b6a4dc0>

In [119]:
empolyees=[
    (1,'Santhosh'),
    (2,'nagendra'),
    (3,'jampa')
]
projects=[
    (1,'humana'),
    (2,'pepisco'),
    (3,'Ariba')
]

In [120]:
cursor.executemany('Insert or replace into employees values(?,?)',empolyees)
cursor.executemany('Insert or replace into projects values(?,?)',projects)

<sqlite3.Cursor at 0x2b94b6a4dc0>

In [121]:
cursor.execute('select * from employees').fetchall()

[(1, 'Santhosh'), (2, 'nagendra'), (3, 'jampa')]

In [122]:
conn.commit()
conn.close()

## Database content extraction

In [123]:
from langchain_community.utilities import SQLDatabase
from langchain_community.document_loaders import SQLDatabaseLoader

In [125]:
db=SQLDatabase.from_uri("sqlite:///data/databases/company.db")

In [126]:
db.get_usable_table_names()

['employees', 'projects']

In [129]:
print(db.get_table_info())


CREATE TABLE employees (
	id INTEGER, 
	name TEXT, 
	PRIMARY KEY (id)
)

/*
3 rows from employees table:
id	name
1	Santhosh
2	nagendra
3	jampa
*/


CREATE TABLE projects (
	id INTEGER, 
	name TEXT
)

/*
3 rows from projects table:
id	name
1	humana
2	pepisco
3	Ariba
*/


In [153]:
from typing import List
from langchain_core.documents import Document

def sql_to_documents(path:str)->List[Document]:
    documents=[]
    conn=sqlite3.connect(path)
    cursor=conn.cursor()
    cursor.execute("select name from sqlite_master where type='table';")
    tables=cursor.fetchall()
    # print(tables)
    for table in tables:
        table_name=table[0]
        cursor.execute(f"PRAGMA table_info({table_name});")
        columns=cursor.fetchall()
        columns_names=[col[1] for col in columns]

        #get table data
        cursor.execute(f"select * from {table_name}")
        rows=cursor.fetchall()

        table_content=f"table: {table_name}\n"
        table_content+=f"columns:{','.join(columns_names)}\n"
        table_content+=f"rows: {len(rows)}\n"

        table_content+="Sample records\n"
        for i in rows:
            record=dict(zip(columns,i))
            table_content+=f"records :{record}"
        # print(table_content)

        doc=Document(
            page_content=table_content,
            metadata={
                'source':path,
                'table_name':table_name,
                "num_records":len(rows),
                'data_type':'sql_data'
            }
        )
        documents.append(doc)
    return documents




documents_from_sql=sql_to_documents('data/databases/company.db')


In [166]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
recursive =RecursiveCharacterTextSplitter(chunk_size=100,chunk_overlap=80)


In [164]:
recursive.split_documents(documents_from_sql)

[Document(metadata={'source': 'data/databases/company.db', 'table_name': 'employees', 'num_records': 3, 'data_type': 'sql_data'}, page_content='table: employees\ncolumns:id,name\nrows: 3'),
 Document(metadata={'source': 'data/databases/company.db', 'table_name': 'employees', 'num_records': 3, 'data_type': 'sql_data'}, page_content='rows: 3\nSample records'),
 Document(metadata={'source': 'data/databases/company.db', 'table_name': 'employees', 'num_records': 3, 'data_type': 'sql_data'}, page_content="records :{(0, 'id', 'INTEGER', 0, None, 1): 1,"),
 Document(metadata={'source': 'data/databases/company.db', 'table_name': 'employees', 'num_records': 3, 'data_type': 'sql_data'}, page_content="0, None, 1): 1, (1, 'name', 'TEXT', 0, None, 0):"),
 Document(metadata={'source': 'data/databases/company.db', 'table_name': 'employees', 'num_records': 3, 'data_type': 'sql_data'}, page_content="0, None, 0): 'Santhosh'}records :{(0, 'id',"),
 Document(metadata={'source': 'data/databases/company.db',

In [None]:
def nearest_customers(data, target, k):
    sorted_data=sorted(data,key=lambda x:(abs(x[1]-target),x[1]))
    return [cust_id for cust_id,_ in sorted_data[:k]]



data = [(1, 200), (2, 150), (3, 180), (4, 300), (5, 210)]
target = 190
k = 3
print(nearest_customers(data, target, k))


In [116]:
def top_spenders(transactions):
    spend_dict={}
    for cust_id,amount in transactions:
        if cust_id in spend_dict:
            spend_dict[cust_id]+=amount
        else:
            spend_dict[cust_id]=amount
    sorted_spender=sorted(spend_dict.items(),key=lambda x:x[1],reverse=True)
    return [id for id,value in sorted_spender if value ==max(spend_dict.values())]
transactions = [(1, 200), (2, 150), (1, 100), (3, 300), (2, 100)]
print(top_spenders(transactions))


[1, 3]


In [133]:
from collections import Counter 
text = " IBM is great and IBM is global global"
counter=Counter(text.split())
list(counter.keys())[0]

'IBM'

In [136]:
dict1={}
for word in text.split():
    if word in dict1:
        dict1[word]+=1
    else:
        dict1[word]=1
list(dict1.keys())[0]

'IBM'

In [None]:
temps = [73, 74, 75, 71, 69, 72, 76, 73]

list1=[0]*len(temps)

for i in range(len(temps)):
    for j in range(i+1,len(temps)):
        if temps[j]>temps[i]:
            list1[i]=j-i
            break
        else:
            continue

print(list1)

[1, 1, 4, 2, 1, 1, 0, 0]


In [158]:
from itertools import combinations
nums = [1, 5, 7, -1, 5]
target=6

perms=set(combinations(nums,2))
print(perms)
count=0
for i in perms:
    if sum(i)==target:
        print(i)
        count+=1
print(count)

{(7, -1), (5, 5), (1, 5), (5, -1), (1, -1), (5, 7), (1, 7), (7, 5), (-1, 5)}
(7, -1)
(1, 5)
2


In [199]:
s = "abc123def45fghfdd"

char_count=0
max_len=0
for char in s:
    if char.isalpha():
        char_count+=1
        max_len=max(max_len,char_count)
    else:
        char_count=0
print(max_len)


6


In [201]:
nums = [10, 20, 30, 40, 50]

normalized=[(x-min(nums))/(max(nums)-min(nums)) for x in nums]
normalized

[0.0, 0.25, 0.5, 0.75, 1.0]

In [None]:
begin = "hit"
end = "cog"
word_list = ["hot","dot","dog","lot","log","cog"]
words=[]
for i in list('abcdefghijklmnopqrstuvwxyz'):
    current_word=begin
    print("current_word:",current_word)
    for j in range(len(current_word)):
        new_word=current_word[:j]+i+current_word[j+1:]
        if new_word in word_list:
            words.append(new_word)
print(words)

In [225]:
from collections import deque
queue =deque([(begin,1)])
queue

deque([('hit', 1)])

In [242]:
import string
begin = "hit"
end = "cog"
word_list = ["hot","dot","dog","lot","log","cog"]
def ladder_Steps(begin,end,word_list):
    word_set=set(word_list)
    queue =deque([(begin,1)])

    while queue:
        words,steps=queue.popleft()
        print("Current_word:",words," Steps:",steps)
        if words==end:
            return steps
        for i in range(len(words)):
            for c in string.ascii_lowercase:
                new_word=words[:i]+c+words[i+1:]
                if new_word in word_set:
                    queue.append((new_word,steps+1))
                    word_set.remove(new_word)
    return 0    
ladder_Steps(begin,end,word_list)

Current_word: hit  Steps: 1
Current_word: hot  Steps: 2
Current_word: dot  Steps: 3
Current_word: lot  Steps: 3
Current_word: dog  Steps: 4
Current_word: log  Steps: 4
Current_word: cog  Steps: 5


5

In [243]:
class MyQueue:
    def __init__(self):
        self.items=[]
    
    def is_empty(self):
        return len(self.items)==0
    
    def enqueue(self,item):
        self.items.append(item)
        print(f"Enqueued: {item}")
    
    def dequeue(self):
        if not self.is_empty():
            item=self.items.pop(0)
            print(f"Dequeued: {item}")
            return item
        else:
            print("Queue is empty. Cannot dequeue.")
            return None
    
    def front(self):
        if not self.is_empty():
            return self.items[0]
        else:
            print("Queue is empty.")
            return None

In [261]:
qu=MyQueue()
qu.enqueue(10)
qu.enqueue(20)
# qu.dequeue()
qu.front()
qu.items

Enqueued: 10
Enqueued: 20


[10, 20]

In [324]:
from collections import deque
stack=[]
def reverse_kelements(list1,k):
    n=len(list1)
    que=deque(list1)
    for i in range(k):
        stack.append(que.popleft())
    while stack:
        que.append(stack.pop())

    for i in range(n - k):
        que.append(que.popleft())
    print(que)



reverse_kelements([1,2,3,4,5,6,7],4)

deque([4, 3, 2, 1, 5, 6, 7])


In [353]:
from collections import deque

def minOperationsToSort(arr):
    n = len(arr)
    print("Array Length:",n)
    d = deque(arr)
    
    # Case 1: already sorted
    if list(d) == sorted(d):
        return 0
    
    # Step 2: find where order breaks
    drop_index = -1
    for i in range(n - 1):
        print(arr[i],arr[i+1])
        if arr[i] > arr[i + 1]:
            drop_index = i
            print("Drop Index Found at:",drop_index)
            break
    
    # If no break found ‚Üí already sorted
    if drop_index == -1:
        return 0
    
    # Step 3: rotate right by (n - drop_index - 1)
    rotations = n - (drop_index + 1)
    print("Rotations Needed:",rotations)
    d.rotate(rotations)
    print("deque after rotation:",d)
    
    # Step 4: check if sorted after rotation
    if list(d) == sorted(arr):
        return rotations
    else:
        return -1

print(minOperationsToSort([3,4,5,6,1]))  # Output: 2

Array Length: 5
3 4
4 5
5 6
6 1
Drop Index Found at: 3
Rotations Needed: 1
deque after rotation: deque([1, 3, 4, 5, 6])
1


In [360]:
d1=deque([3,4,5,1,2])
d1.rotate(-3)
d1

deque([1, 2, 3, 4, 5])

In [None]:
print(minOperationsToSortfromleft([3,4,5,6,1]))  # Output: 2

In [368]:
def minOperationsToSortfromleft(arr):
    n=len(arr)
    que=deque(arr)
    if arr==sorted(arr):
        return 0
    
    drop_index=-1
    for i in range(n-1):
        if arr[i]>arr[i+1]:
            drop_index=i
            break
    if drop_index==-1:
        return 0
    que.rotate(- (drop_index + 1))

    if list(que)==sorted(arr):
        return drop_index + 1
    else:
        return -1
print(minOperationsToSortfromleft([3,4,5,1,2]))  # Output: 2

3


In [380]:
lis=[1,0,1,0,1]

actual_indexes=[i for i,val in enumerate(lis) if val==1]
sum1=0
median_index=len(actual_indexes)//2
print("median_index:",median_index)
median_value=actual_indexes[median_index]
print("median_value:",median_value)

for i,val in enumerate(actual_indexes):
    target_pos=median_value-median_index + i
    print("target_pos:",target_pos)
    sum1+=abs(val - target_pos)
    




    

print(sum1)

median_index: 1
median_value: 2
target_pos: 1
target_pos: 2
target_pos: 3
2
