In [1]:
# Kernel smoke test: the value of the bare expression is echoed as the cell output.
1 + 1

2

In [3]:
# Scratch check: bind two integers, add them, and print the sum.
a, b = 1, 2
c = a + b
print(c)

3


In [4]:
### Introduction to the data ingestion module


In [5]:
import os
from typing import List, Dict, Any
import pandas as pd
import json


In [10]:
# LangChain names used by this notebook: the Document container and three
# text splitter classes.
from langchain_core.documents import Document
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Reaching this line means every import above resolved.
print("Data ingestion module loaded successfully.")

Data ingestion module loaded successfully.


## Understand the document structure

In [16]:
## Create a simple document
# A Document is built from the text itself (page_content) plus a metadata dict
# describing where that text came from.
sample_metadata = {
    "source": "sample.txt",
    "page": 1,
    "title": "Sample Document",
    "author": "John Doe",
}
doc = Document(
    page_content="This is a sample document for data ingestion.",
    metadata=sample_metadata,
)

# Show both halves of the document; the metadata is rendered as JSON.
print("Document Structure:")
print(doc.page_content)
print(json.dumps(doc.metadata))

# Reasons for keeping metadata, printed one per line.
print("\n Metadata is crucial for filtering and retrieval in RAG systems.")
for reason in (
    "Tracking document source",
    "providing context in responses",
    "debuging and auditing",
):
    print(reason)

Document Structure:
This is a sample document for data ingestion.
{"source": "sample.txt", "page": 1, "title": "Sample Document", "author": "John Doe"}

 Metadata is crucial for filtering and retrieval in RAG systems.
Tracking document source
providing context in responses
debuging and auditing


In [17]:
# Inspect the concrete class of the `doc` object; the value is echoed as the cell output.
type(doc)

langchain_core.documents.base.Document

In [None]:
import os

# Ensure a "data" folder exists relative to the working directory;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs("data", exist_ok=True)

In [29]:
import os

# Create data/txt_files folder in the current Data_Ingestion_Parsing directory
txt_files_dir = os.path.join(os.getcwd(), 'data', 'txt_files')
os.makedirs(txt_files_dir, exist_ok=True)

print(f"Current working directory: {os.getcwd()}")
print(f"Text files directory: {txt_files_dir}\n")

# Maps each destination file path to the text that is written into it.
sample_text = {
    os.path.join(txt_files_dir, "python.intro.txt"): """Python is a high-level, interpreted programming language known for its simple and readable syntax. It supports multiple programming paradigms and is widely used in web development, data science, machine learning, and automation.""",
    os.path.join(txt_files_dir, "machine_intro.txt"): """Machine Learning is a subset of artificial intelligence that enables computer systems to learn and improve from data without being explicitly programmed. ML algorithms identify patterns and make predictions, powering applications like recommendation systems, image recognition, and natural language processing.""",
}

print("Sample text dictionary created:")
print(f"Output directory: {txt_files_dir}\n")

# Paths whose write failed. Tracked so the closing message cannot claim
# success after an error was reported for one of the files.
failed_files = []

for key, value in sample_text.items():
    print(f"Writing: {os.path.basename(key)}")
    print(f"  Preview: {value[:60]}...")
    try:
        # Mode 'w' replaces any existing file of the same name.
        with open(key, 'w', encoding="utf-8") as f:
            f.write(value)
    except OSError as e:
        # Only I/O failures are reported and skipped; any other exception
        # indicates a bug and is allowed to surface.
        failed_files.append(key)
        print(f"  ✗ Error: {e}\n")
    else:
        print("  ✓ Successfully written\n")

if failed_files:
    print(f"{len(failed_files)} sample text file(s) could not be written: {failed_files}")
else:
    print("Sample text files written to 'data/txt_files/' directory (under Data_Ingestion_Parsing).")

Current working directory: /home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing
Text files directory: /home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files

Sample text dictionary created:
Output directory: /home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files

Writing: python.intro.txt
  Preview: Python is a high-level, interpreted programming language kno...
  ✓ Successfully written

Writing: machine_intro.txt
  Preview: Machine Learning is a subset of artificial intelligence that...
  ✓ Successfully written

Sample text files written to 'data/txt_files/' directory (under Data_Ingestion_Parsing).


In [30]:
### Read a single file and create a Document object

In [32]:
from langchain_community.document_loaders import TextLoader


def _print_load_summary(heading, loaded_docs):
    """Print the heading, then the type, length, first 100 characters and
    metadata of the first entry of ``loaded_docs``."""
    first = loaded_docs[0]
    print(heading)
    print(f"Type: {type(loaded_docs)}")
    print(f"Number of documents: {len(loaded_docs)}")
    print(f"Content preview: {first.page_content[:100]}...\n")
    print(f"metadata: {first.metadata}\n")


# First sample file (python.intro.txt)
python_file_path = os.path.join(txt_files_dir, "python.intro.txt")
loader = TextLoader(python_file_path, encoding="utf-8")
python_docs = loader.load()
_print_load_summary("Loaded Python intro document:", python_docs)

# Second sample file (machine_intro.txt); `loader` is rebound to the new file.
ml_file_path = os.path.join(txt_files_dir, "machine_intro.txt")
loader = TextLoader(ml_file_path, encoding="utf-8")
ml_docs = loader.load()
_print_load_summary("Loaded Machine Learning intro document:", ml_docs)

Loaded Python intro document:
Type: <class 'list'>
Number of documents: 1
Content preview: Python is a high-level, interpreted programming language known for its simple and readable syntax. I...

metadata: {'source': '/home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/python.intro.txt'}

Loaded Machine Learning intro document:
Type: <class 'list'>
Number of documents: 1
Content preview: Machine Learning is a subset of artificial intelligence that enables computer systems to learn and i...

metadata: {'source': '/home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/machine_intro.txt'}



### DirectoryLoader - multiple files ingestion

In [48]:
from langchain_community.document_loaders import DirectoryLoader

# Load all text files from the txt_files directory
loader = DirectoryLoader(
    path=txt_files_dir,
    glob="**/*.txt",
    loader_cls=TextLoader,
    show_progress=True,
    loader_kwargs={'encoding': 'utf-8'},
)

# Later cells read docs[0], so the order must not depend on how the
# directory happens to be traversed. Sorting by source path makes it
# deterministic (machine_intro.txt sorts before python.intro.txt).
docs = sorted(loader.load(), key=lambda loaded: loaded.metadata['source'])
print("Loaded documents from DirectoryLoader:")
print(f"Type: {type(docs)}")

for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print(f"  Content preview: {doc.page_content[:100]}...")
    print(f"  Metadata: {doc.metadata}")
    # Two equivalent ways to read one metadata field: indexing raises
    # KeyError when the key is missing, .get() returns None instead.
    print(f"  source: {doc.metadata['source']}")
    print(f"  source: {doc.metadata.get('source')}")

# advantages and disadvantages of DirectoryLoader
print("\nAdvantages of DirectoryLoader:")
print("1. Efficiently loads multiple files from a directory structure.")
print("2. Supports various file types through different loader classes.")
print("3. Simplifies batch processing of documents for ingestion.")

print("\nDisadvantages of DirectoryLoader:")
print("1. Limited to files within a specified directory; may miss external sources.")
print("2. Requires appropriate loader classes for different file formats.")
print("3. can be memory intensive for larger directory.")

100%|██████████| 2/2 [00:00<00:00, 810.73it/s]

Loaded documents from DirectoryLoader:
Type: <class 'list'>

Document 1:
  Content preview: Machine Learning is a subset of artificial intelligence that enables computer systems to learn and i...
  Metadata: {'source': '/home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/machine_intro.txt'}
  source: /home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/machine_intro.txt
  source: /home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/machine_intro.txt

Document 2:
  Content preview: Python is a high-level, interpreted programming language known for its simple and readable syntax. I...
  Metadata: {'source': '/home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/python.intro.txt'}
  source: /home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/python.intro.txt
  source: /home/ram/apps/langchain/project1/rag_langchain/Data_Inges




### Text splitter Strategy

In [49]:
### Different Text Splitting Strategies
# The splitter classes are imported again here so this section can be read on
# its own.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

# Dump the documents loaded by the DirectoryLoader cell before splitting them.
print(docs)

[Document(metadata={'source': '/home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/machine_intro.txt'}, page_content='Machine Learning is a subset of artificial intelligence that enables computer systems to learn and improve from data without being explicitly programmed. ML algorithms identify patterns and make predictions, powering applications like recommendation systems, image recognition, and natural language processing.\nKey features of Machine Learning include:\n1. Data-driven learning\n2. Pattern recognition\n3. Predictive analytics\n4. Adaptability to new data\n5. Automation of decision-making processes\n6. Support for various data types (structured, unstructured)\n7. Integration with big data technologies\n8. Continuous improvement through feedback loops\n9. Wide range of algorithms (supervised, unsupervised, reinforcement learning)\n10. Applications across diverse industries such as healthcare, finance, and marketing.\nSome popular Machine Le

In [54]:
### Method -1 - Character Text Splitter
# Splits the text on one fixed separator and packs the pieces into chunks.
# A piece that contains no separator is kept whole, which is why the recorded
# run warns about a 310-character chunk despite chunk_size=200.
print(docs[0].page_content[:100], docs[0].metadata)
text = docs[0].page_content
char_splitter = CharacterTextSplitter(
    separator="\n", # split by new line
    chunk_size=200, # number of characters in each chunk
    chunk_overlap=50, # number of characters to overlap between chunks
    length_function=len, # function to calculate length
    is_separator_regex=False,
)

char_chunks = char_splitter.split_text(text)
print(f"\nCharacter Text Splitter produced {len(char_chunks)} chunks:")
# Show at most the first two chunks. A text without any newline cannot be
# split by this separator, so indexing char_chunks[1] directly would raise
# IndexError in that case.
for chunk_number, chunk in enumerate(char_chunks[:2], start=1):
    print(f"Chunk {chunk_number}: {chunk}")

Created a chunk of size 310, which is longer than the specified 200


Machine Learning is a subset of artificial intelligence that enables computer systems to learn and i {'source': '/home/ram/apps/langchain/project1/rag_langchain/Data_Ingestion_Parsing/data/txt_files/machine_intro.txt'}

Character Text Splitter produced 6 chunks:
Chunk 1: Machine Learning is a subset of artificial intelligence that enables computer systems to learn and improve from data without being explicitly programmed. ML algorithms identify patterns and make predictions, powering applications like recommendation systems, image recognition, and natural language processing.
Chunk 2: Key features of Machine Learning include:
1. Data-driven learning
2. Pattern recognition
3. Predictive analytics
4. Adaptability to new data
5. Automation of decision-making processes


In [55]:
# Print the full list of chunks so the overlap between neighbouring chunks
# (repeated trailing/leading lines) can be inspected.
print(char_chunks)

['Machine Learning is a subset of artificial intelligence that enables computer systems to learn and improve from data without being explicitly programmed. ML algorithms identify patterns and make predictions, powering applications like recommendation systems, image recognition, and natural language processing.', 'Key features of Machine Learning include:\n1. Data-driven learning\n2. Pattern recognition\n3. Predictive analytics\n4. Adaptability to new data\n5. Automation of decision-making processes', '5. Automation of decision-making processes\n6. Support for various data types (structured, unstructured)\n7. Integration with big data technologies\n8. Continuous improvement through feedback loops', '8. Continuous improvement through feedback loops\n9. Wide range of algorithms (supervised, unsupervised, reinforcement learning)', '10. Applications across diverse industries such as healthcare, finance, and marketing.', 'Some popular Machine Learning frameworks and libraries are TensorFlow

In [58]:
print("\n---\n")
print("RECURSIVE CHARACTER TEXT SPLITTER")
### Method -2 - Recursive Character Text Splitter
# Works on the same `text` as the character splitter, but is given an ordered
# list of separators instead of a single one, with a smaller overlap (20).
recursive_char_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""], # hierarchy of separators
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)

recursive_char_chunks = recursive_char_splitter.split_text(text)
print(f"\nRecursive Character Text Splitter produced {len(recursive_char_chunks)} chunks:")
# Number the chunks from their position so the second one is labelled
# "Chunk 2", and print at most two so a single-chunk result does not raise
# IndexError.
for chunk_number, chunk in enumerate(recursive_char_chunks[:2], start=1):
    print(f"Chunk {chunk_number}: {chunk}")


---

RECURSIVE CHARACTER TEXT SPLITTER

Recursive Character Text Splitter produced 6 chunks:
Chunk 1: Machine Learning is a subset of artificial intelligence that enables computer systems to learn and improve from data without being explicitly programmed. ML algorithms identify patterns and make
Chunk 1: patterns and make predictions, powering applications like recommendation systems, image recognition, and natural language processing.
