In [1]:
!pip install -q langchain pypdf requests beautifulsoup4 langchain-community

from pathlib import Path

import requests
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- PDF Loading and Processing (Local File) ---
# Loads a local PDF into LangChain documents, then splits them into
# overlapping text chunks. On any failure `pdf_documents` stays an empty list
# so the rest of the cell still runs.
pdf_file_path = "academic-paper.pdf"  # Replace with your PDF file name.

# Start from "nothing loaded"; only a successful load replaces this.
pdf_documents = []

# Check for the file explicitly. PyPDFLoader validates the path itself and
# reports a missing file as ValueError rather than FileNotFoundError, so an
# `except FileNotFoundError` clause would never show the upload hint below.
if not Path(pdf_file_path).is_file():
    print(f"Error: PDF file '{pdf_file_path}' not found. Please upload the file to Google Colab's file explorer.")
else:
    try:
        pdf_loader = PyPDFLoader(pdf_file_path)
        pdf_documents = pdf_loader.load()
    except Exception as e:
        # Notebook-level boundary: report the parsing/IO problem and carry on
        # with an empty document list instead of aborting the whole cell.
        print(f"An error occurred during PDF loading: {e}")
    else:
        print(f"Successfully loaded {len(pdf_documents)} documents from {pdf_file_path}")

# Split into chunks of at most `chunk_size`, with `chunk_overlap` shared
# between neighbouring chunks. Splitting an empty list yields an empty list.
pdf_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pdf_chunks = pdf_text_splitter.split_documents(pdf_documents)
if pdf_documents:
    print(f"Split PDF into {len(pdf_chunks)} chunks.")

# Show the metadata of the first chunk as a sample.
if pdf_chunks:
    print("\nPDF Chunk Metadata Example:")
    print(pdf_chunks[0].metadata)

# --- Web Page Loading and Processing (Wikipedia Example) ---
# Fetches one web page into LangChain documents, then splits it into
# overlapping text chunks. On any failure `web_documents` is an empty list so
# the rest of the cell still runs.
wikipedia_url = "https://en.wikipedia.org/wiki/LangChain"
# Browser-like User-Agent sent with both the pre-flight request and the loader.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# `requests` applies no timeout unless one is given, so a stalled connection
# would block the cell forever. Use one limit (in seconds) for every request.
request_timeout_seconds = 10

try:
    # Pre-flight check: fail fast with a clear HTTP error before loading.
    response = requests.get(wikipedia_url, headers=headers, timeout=request_timeout_seconds)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

    # `requests_kwargs` is forwarded to the loader's own HTTP call, so the
    # same headers and timeout apply there too.
    web_loader = WebBaseLoader(
        wikipedia_url,
        requests_kwargs={'headers': headers, 'timeout': request_timeout_seconds},
    )
    web_documents = web_loader.load()
    print(f"\nSuccessfully loaded {len(web_documents)} documents from {wikipedia_url}")
except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and the HTTPError from raise_for_status().
    print(f"Error loading web page {wikipedia_url}: {e}")
    web_documents = []
except Exception as e:
    # Notebook-level boundary: report anything else (e.g. parsing problems)
    # and carry on with an empty document list.
    print(f"An unexpected error occurred: {e}")
    web_documents = []

# Split into chunks of at most `chunk_size`, with `chunk_overlap` shared
# between neighbouring chunks. Splitting an empty list yields an empty list.
web_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
web_chunks = web_text_splitter.split_documents(web_documents)
if web_documents:
    print(f"Split Wikipedia content into {len(web_chunks)} chunks.")

# Show the metadata of the first chunk as a sample.
if web_chunks:
    print("\nWikipedia Chunk Metadata Example:")
    print(web_chunks[0].metadata)



Successfully loaded 7 documents from academic-paper.pdf
Split PDF into 32 chunks.

PDF Chunk Metadata Example:
{'producer': 'Microsoft® Word Microsoft 365 için', 'creator': 'Microsoft® Word Microsoft 365 için', 'creationdate': '2023-07-22T11:45:02+03:00', 'author': 'TuRGuT', 'moddate': '2023-07-22T11:45:02+03:00', 'source': 'academic-paper.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1'}

Successfully loaded 1 documents from https://en.wikipedia.org/wiki/LangChain
Split Wikipedia content into 23 chunks.

Wikipedia Chunk Metadata Example:
{'source': 'https://en.wikipedia.org/wiki/LangChain', 'title': 'LangChain - Wikipedia', 'language': 'en'}
