In [None]:
# importing dependencies
# Pull in the specialized tools for reading local text and spreadsheet files
from langchain_community.document_loaders import TextLoader, CSVLoader

# Bring in the utility for scraping and cleaning content from websites
from langchain_community.document_loaders import UnstructuredURLLoader

# Use the modern standard for smart text splitting to keep context intact
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Document Loaders In LangChain

#### TextLoader & CSVLoader

In [1]:
# --- Plain-text source -------------------------------------------------------
# TextLoader wraps a single file; .load() hands back a list of Document objects.
text_loader = TextLoader(file_path="nvda_news_1.txt")
text_docs = text_loader.load()

# --- CSV source --------------------------------------------------------------
# `source_column` tells the loader which column's value to record under
# metadata["source"] for each row it loads (here: the movie title).
csv_loader = CSVLoader(file_path="movies.csv", source_column="title")
csv_docs = csv_loader.load()

# Sanity check: read the "source" metadata of the first loaded row and show it.
first_movie_source = csv_docs[0].metadata["source"]
print(f"First Movie Title: {first_movie_source}")

First Movie Title: K.G.F: Chapter 2


#### UnstructuredURLLoader

In [3]:
# Articles to fetch; each entry is a full article URL on moneycontrol.com.
urls = [
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]

# Build the loader over the URL list, then fetch everything in one call.
url_loader = UnstructuredURLLoader(urls=urls)
web_data = url_loader.load()

# Report how many items the loader actually returned.
loaded_page_count = len(web_data)
print(f"Successfully loaded {loaded_page_count} web pages.")

Successfully loaded 2 web pages.


### Text Splitters

In [4]:
# Placeholder input: a short (truncated) description of Interstellar.
big_text = "Interstellar is a 2014 epic science fiction film..."

# Configure the recursive splitter. Separators are tried in the order given,
# from the coarsest boundary (blank line) down to the empty string.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,                      # upper bound on chunk length
    chunk_overlap=20,                    # characters shared between neighbouring chunks
    length_function=len,                 # chunk length is measured with len()
    separators=["\n\n", "\n", " ", ""],  # split points, in priority order
)

# Split the sample text into chunks.
chunks = r_splitter.split_text(big_text)

# Show at most the first three chunks: 1-based position, full chunk size,
# and the first 50 characters followed by a literal "...".
for position, chunk in enumerate(chunks[:3], start=1):
    preview = chunk[:50]
    print(f"Chunk {position} (Size {len(chunk)}): {preview}...")

Chunk 1 (Size 51): Interstellar is a 2014 epic science fiction film.....
