<a href="https://colab.research.google.com/github/sriharshamutnuru/AI_Learning/blob/main/Day16_Document_Chunking_%26_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ============================================================
# 📘 Day 16 — Working Version (Chunking Real PDFs)
# ============================================================

!pip install --quiet pymupdf langchain tiktoken pandas matplotlib requests

import os, requests, fitz
from pathlib import Path
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import matplotlib.pyplot as plt

# ============================================================
# STEP 1 — Download Stable, Public PDFs
# ============================================================
os.makedirs("/content/sample_pdfs", exist_ok=True)

pdf_urls = {
    "ai_overview.pdf": "https://arxiv.org/pdf/2001.09977.pdf",  # AI survey
    "data_engineering.pdf": "https://storage.googleapis.com/gweb-cloudblog-publish/images/Data_Lake_on_GCP_whitepaper.max-1300x1300.jpgpagespeed.ce.U6kIvYz8-F.pdf"  # Google Data Lake whitepaper
}

for name, url in pdf_urls.items():
    print(f"⬇️ Downloading {name} ...")
    r = requests.get(url)
    with open(f"/content/sample_pdfs/{name}", "wb") as f:
        f.write(r.content)

print("\n✅ PDFs ready in /content/sample_pdfs/")
!ls -lh /content/sample_pdfs

# ============================================================
# STEP 2 — Extract Text, Chunk & Save
# ============================================================
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

def count_tokens(text):
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def chunk_text(text, chunk_size=1000, overlap=150):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "!", "?"]
    )
    return splitter.split_text(text)

pdf_dir = Path("/content/sample_pdfs")
all_chunks = []

for pdf_file in pdf_dir.glob("*.pdf"):
    print(f"\n📘 Processing: {pdf_file.name}")
    text = extract_text_from_pdf(pdf_file)
    print(f"🧾 Text length: {len(text)} characters")

    chunks = chunk_text(text, chunk_size=1000, overlap=150)
    print(f"✅ Extracted {len(chunks)} chunks")

    for i, chunk in enumerate(chunks, 1):
        all_chunks.append({
            "pdf_name": pdf_file.name,
            "chunk_id": i,
            "token_count": count_tokens(chunk),
            "text_preview": chunk[:300].replace("\n", " ") + "..."
        })

# ============================================================
# STEP 3 — Save Results & Visualize
# ============================================================
if all_chunks:
    df = pd.DataFrame(all_chunks)
    df.to_csv("chunked_output.csv", index=False)
    print("\n✅ Chunked output saved as chunked_output.csv")
    print(f"📊 Total Chunks: {len(df)}")
    display(df.head(5))

    plt.figure(figsize=(8,4))
    df["token_count"].hist(bins=20)
    plt.title("Distribution of Token Counts per Chunk")
    plt.xlabel("Tokens per Chunk")
    plt.ylabel("Frequency")
    plt.grid(False)
    plt.show()
else:
    print("⚠️ No valid text extracted. Check PDF sources.")

⬇️ Downloading ai_overview.pdf ...
⬇️ Downloading data_engineering.pdf ...

✅ PDFs ready in /content/sample_pdfs/
total 1.7M
-rw-r--r-- 1 root root 1012K Oct 28 12:21 ai_overview.pdf
-rw-r--r-- 1 root root  1.2K Oct 28 12:17 azure_cloud_overview.pdf
-rw-r--r-- 1 root root  660K Oct 28 12:17 data_engineering_basics.pdf
-rw-r--r-- 1 root root   264 Oct 28 12:21 data_engineering.pdf

📘 Processing: data_engineering.pdf


FileDataError: Failed to open file '/content/sample_pdfs/data_engineering.pdf'.