In [8]:
import pdfplumber
import re
import json
from pathlib import Path
from typing import List, Dict
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# OPTIMIZED: Fixed performance issue, added error handling
def extract_text_from_pdf(path: str) -> str:
    """Extract text from PDF with proper error handling"""
    if not Path(path).exists():
        raise FileNotFoundError(f"PDF not found: {path}")
    
    try:
        text_parts = []  # Use list instead of string concatenation (faster!)
        
        with pdfplumber.open(path) as pdf:
            logger.info(f"Processing {len(pdf.pages)} pages from {path}")
            
            for page_num, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                
                if page_text:
                    text_parts.append(page_text)
                else:
                    logger.warning(f"Page {page_num + 1} has no extractable text")
            
            if not text_parts:
                raise ValueError(f"No extractable text found in {path}")
            
            full_text = "\n".join(text_parts)
            logger.info(f"Extracted {len(full_text)} characters")
            return full_text
            
    except Exception as e:
        logger.error(f"Failed to extract from {path}: {e}")
        raise RuntimeError(f"PDF extraction failed: {e}") from e

# Test it
sample_pdf = "./data/terminal.pdf"
text = extract_text_from_pdf(sample_pdf)
print(f"Extracted {len(text)} characters")
print(text[:500])

INFO:__main__:Processing 2 pages from ./data/terminal.pdf
INFO:__main__:Extracted 1570 characters


Extracted 1570 characters
Linux & Terminal Command Cheat Sheet
1. File & Directory Navigation
pwd
ls
ls -l
ls -a
cd folder
cd ..
cd ../..
cd /path/to/dir
cd ~
cd -
2. File & Directory Management
touch file.txt
mkdir folder
mkdir -p a/b/c
rm file
rm -r folder
rm -rf folder
cp f1 f2
cp -r dir1 dir2
mv old new
cat file.txt
head file.txt
tail file.txt
tail -f file.txt
3. Editing Files
nano file.txt
vim file.txt
code .
4. Permissions & Ownership
chmod 755 file
chmod +x script.sh
chown user:group file
sudo command
5. Search & 


In [9]:
# OPTIMIZED: Added input validation
def basic_clean(text: str) -> str:
    """Clean extracted text"""
    if not isinstance(text, str):
        raise TypeError(f"Expected string, got {type(text)}")
    
    text = text.replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# Test it
clean_text = basic_clean(text)
print(f"Cleaned text length: {len(clean_text)}")
print(clean_text[:500])


Cleaned text length: 1570
Linux & Terminal Command Cheat Sheet
1. File & Directory Navigation
pwd
ls
ls -l
ls -a
cd folder
cd ..
cd ../..
cd /path/to/dir
cd ~
cd -
2. File & Directory Management
touch file.txt
mkdir folder
mkdir -p a/b/c
rm file
rm -r folder
rm -rf folder
cp f1 f2
cp -r dir1 dir2
mv old new
cat file.txt
head file.txt
tail file.txt
tail -f file.txt
3. Editing Files
nano file.txt
vim file.txt
code .
4. Permissions & Ownership
chmod 755 file
chmod +x script.sh
chown user:group file
sudo command
5. Search & 


In [10]:
# OPTIMIZED: Added min length filter
def split_into_paragraphs(text: str, min_paragraph_length: int = 10) -> List[str]:
    """Split text into paragraphs"""
    raw_paragraphs = re.split(r"\n{2,}", text)
    paragraphs = [
        p.strip() 
        for p in raw_paragraphs 
        if p.strip() and len(p.strip()) >= min_paragraph_length
    ]
    return paragraphs

# Test it
paragraphs = split_into_paragraphs(clean_text)
print(f"Number of paragraphs: {len(paragraphs)}")
print(f"First paragraph preview:\n{paragraphs[0][:300]}")

Number of paragraphs: 1
First paragraph preview:
Linux & Terminal Command Cheat Sheet
1. File & Directory Navigation
pwd
ls
ls -l
ls -a
cd folder
cd ..
cd ../..
cd /path/to/dir
cd ~
cd -
2. File & Directory Management
touch file.txt
mkdir folder
mkdir -p a/b/c
rm file
rm -r folder
rm -rf folder
cp f1 f2
cp -r dir1 dir2
mv old new
cat file.txt
head


In [11]:
# NEW: Sliding window chunking (works for ALL document types)
def build_chunks_sliding_window(
    text: str,
    chunk_size: int = 300,
    overlap: int = 50,
    source_name: str = "document.pdf"
) -> List[Dict]:
    """Build chunks using sliding window approach"""
    
    # ========== INPUT VALIDATION ==========
    if not isinstance(text, str):
        raise TypeError(f"text must be string, got {type(text)}")
    
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    
    if overlap >= chunk_size:
        raise ValueError(f"overlap ({overlap}) must be less than chunk_size ({chunk_size})")
    
    if not text.strip():
        logger.warning("Empty text provided")
        return []
    
    # ========== SLIDING WINDOW ==========
    words = text.split()
    chunks = []
    step = chunk_size - overlap
    
    for i in range(0, len(words), step):
        chunk_words = words[i:i + chunk_size]
        
        # Don't create tiny final chunks
        if len(chunk_words) < 10 and chunks:
            chunks[-1]["text"] += " " + " ".join(chunk_words)
            chunks[-1]["n_words"] = len(chunks[-1]["text"].split())
        else:
            chunk_text = " ".join(chunk_words)
            chunks.append({
                "id": len(chunks),
                "text": chunk_text,
                "n_words": len(chunk_words),
                "char_count": len(chunk_text),
                "word_range": (i, i + len(chunk_words)),
                "source": source_name
            })
    
    logger.info(f"Created {len(chunks)} chunks from {len(words)} words")
    return chunks

# Test it with the clean text
chunks = build_chunks_sliding_window(clean_text, chunk_size=300, overlap=50, source_name="terminal.pdf")
print(f"Number of chunks: {len(chunks)}")
print(f"First chunk ({chunks[0]['n_words']} words):")
print(chunks[0]['text'][:400])


INFO:__main__:Created 2 chunks from 278 words


Number of chunks: 2
First chunk (278 words):
Linux & Terminal Command Cheat Sheet 1. File & Directory Navigation pwd ls ls -l ls -a cd folder cd .. cd ../.. cd /path/to/dir cd ~ cd - 2. File & Directory Management touch file.txt mkdir folder mkdir -p a/b/c rm file rm -r folder rm -rf folder cp f1 f2 cp -r dir1 dir2 mv old new cat file.txt head file.txt tail file.txt tail -f file.txt 3. Editing Files nano file.txt vim file.txt code . 4. Permi


In [12]:
# OPTIMIZED: Added error handling
def save_chunks(chunks: List[Dict], output_path: str) -> None:
    """Save chunks to JSONL file"""
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    try:
        with output_path.open("w", encoding="utf-8") as f:
            for chunk in chunks:
                f.write(json.dumps(chunk, ensure_ascii=False) + "\n")
        
        logger.info(f"Saved {len(chunks)} chunks to {output_path}")
    except Exception as e:
        logger.error(f"Failed to save chunks: {e}")
        raise

# Save the chunks
output_path = "data/chunks/terminal_chunks.jsonl"
save_chunks(chunks, output_path)
print(f"Saved {len(chunks)} chunks to {output_path}")

INFO:__main__:Saved 2 chunks to data/chunks/terminal_chunks.jsonl


Saved 2 chunks to data/chunks/terminal_chunks.jsonl


In [13]:
# BONUS: Complete end-to-end pipeline
def process_pdf(
    pdf_path: str,
    output_dir: str = "data/chunks",
    chunk_size: int = 300,
    overlap: int = 50
) -> List[Dict]:
    """Complete pipeline: PDF -> Chunks -> Save"""
    logger.info(f"Processing {pdf_path}")
    
    # Extract and clean
    text = extract_text_from_pdf(pdf_path)
    clean_text = basic_clean(text)
    
    # Build chunks
    source_name = Path(pdf_path).stem + ".pdf"
    chunks = build_chunks_sliding_window(clean_text, chunk_size, overlap, source_name)
    
    # Save
    output_path = Path(output_dir) / f"{Path(pdf_path).stem}_chunks.jsonl"
    save_chunks(chunks, output_path)
    
    return chunks



In [14]:
# Test with smaller chunks to see the sliding window in action
print("\n" + "="*60)
print("TESTING WITH DIFFERENT CHUNK SIZES")
print("="*60)

# Test 1: Large chunks (current)
chunks_large = build_chunks_sliding_window(clean_text, chunk_size=300, overlap=50, source_name="terminal.pdf")
print(f"\nChunk size 300: {len(chunks_large)} chunks")

# Test 2: Medium chunks
chunks_medium = build_chunks_sliding_window(clean_text, chunk_size=100, overlap=20, source_name="terminal.pdf")
print(f"Chunk size 100: {len(chunks_medium)} chunks")

# Test 3: Small chunks
chunks_small = build_chunks_sliding_window(clean_text, chunk_size=50, overlap=10, source_name="terminal.pdf")
print(f"Chunk size 50: {len(chunks_small)} chunks")

# Show details for medium chunks
print(f"\n--- Medium Chunk Breakdown ---")
for i, chunk in enumerate(chunks_medium):
    preview = chunk['text'][:60].replace('\n', ' ')
    print(f"Chunk {i}: {chunk['n_words']} words - '{preview}...'")




INFO:__main__:Created 2 chunks from 278 words
INFO:__main__:Created 4 chunks from 278 words
INFO:__main__:Created 7 chunks from 278 words



TESTING WITH DIFFERENT CHUNK SIZES

Chunk size 300: 2 chunks
Chunk size 100: 4 chunks
Chunk size 50: 7 chunks

--- Medium Chunk Breakdown ---
Chunk 0: 100 words - 'Linux & Terminal Command Cheat Sheet 1. File & Directory Nav...'
Chunk 1: 100 words - 'Ownership chmod 755 file chmod +x script.sh chown user:group...'
Chunk 2: 100 words - 'apt install pkg sudo apt remove pkg 9. Git Commands git init...'
Chunk 3: 38 words - 'script.sh 13. Docker CLI Essentials docker pull image docker...'
