# Data Engineering and Extraction

### Imports and Setup

In [1]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.5


In [2]:
import re
import fitz
from pathlib import Path
from PIL import Image
import io

# for OCR of images
try:
    import pytesseract
    OCR_AVAILABLE = True
except:
    OCR_AVAILABLE = False

### PDF Extractor and Cleaner

In [3]:
class PDFProcessor:
    def __init__(self, pdf_paths, output_dir):
        self.pdf_paths = pdf_paths
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
    
    def run(self):
        # Main pipeline execution
        print("="*80)
        print("PDF TO DATASET PIPELINE")
        print("="*80)
        
        # Extract
        print("\nExtracting from PDFs...")
        all_content = []
        for pdf_path in self.pdf_paths:
            print(f"  - {Path(pdf_path).name}")
            content = self._extract_pdf(pdf_path)
            all_content.append(content)
        
        # Clean
        print("\nCleaning and structuring...")
        combined = "\n\n" + "="*80 + "\n\n".join(all_content)
        final_text = self._clean(combined)
        
        # Save
        output = self.output_dir / "master_dataset.txt"
        output.write_text(final_text, encoding='utf-8')
        
        print(f"\n✓ Complete: {output}")
        print(f"  Size: {len(final_text):,} chars")
        return final_text
    
    def _extract_pdf(self, path):
        # Extract text and images from single file
        doc = fitz.open(path)
        pages = []
        
        for page_num, page in enumerate(doc):
            text = page.get_text()
            
            # Process images
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                try:
                    xref = img_info[0]
                    img_data = doc.extract_image(xref)
                    img = Image.open(io.BytesIO(img_data["image"]))
                    
                    # OCR if available
                    ocr_text = ""
                    if OCR_AVAILABLE and img:
                        try:
                            ocr_text = pytesseract.image_to_string(img).strip()
                        except:
                            pass
                    
                    # Create figure tag
                    fig = f"<figure>Page {page_num+1}, Image {img_idx+1}"
                    if ocr_text:
                        fig += f": {ocr_text[:150]}"
                    fig += "</figure>"
                    text += f"\n\n{fig}\n\n"
                except:
                    pass
            
            pages.append(text)
        
        doc.close()
        return "\n\n".join(pages)
    
    def _clean(self, text):
        # Fix ligatures
        for old, new in [('ﬁ','fi'), ('ﬂ','fl'), ('ﬀ','ff'), ('ﬃ','ffi'), ('ﬄ','ffl')]:
            text = text.replace(old, new)
        
        # Fix encoding artifacts
        for old, new in [('—','-'), ('–','-'), (''',"'"), (''',"'"), ('"','"'), ('"','"')]:
            text = text.replace(old, new)
        
        # Fix broken code spacing
        text = self._fix_code(text)
        
        # Remove PDF artifacts
        lines = text.split('\n')
        keep = []
        
        # Patterns to skip
        skip = [
            r'^Sec\.\s+\d+',                    # Sec. 1.1 Headers
            r'^\d+\s+Chap\.',                   # Chapter markers
            r'^Chapter\s+\d+.+\d+$',            # Chapter headers
            r'^[ivxlcdm]+$',                    # Roman numerals
            r'^\d+$',                            # Page numbers
            r'^Page\s+\d+',                      # Page markers
            r'^SOURCE:',                         # Source tags
            r'<metadata',                        # Metadata
        ]
        
        for line in lines:
            if not any(re.search(p, line.strip(), re.I) for p in skip):
                keep.append(line)
        
        text = '\n'.join(keep)
        
        # Fix hyphenation across lines
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
        
        # Normalize spaces
        text = re.sub(r' {2,}', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        
        # Clean line endings
        text = '\n'.join(line.rstrip() for line in text.split('\n'))
        
        return text.strip() + '\n'
    
    def _fix_code(self, text):
        # Fix code blocks with excessive spacing (r a n g e → range)
        lines = text.split('\n')
        fixed = []
        
        for line in lines:
            words = line.split()
            
            # if >60% are single chars, likely broken code
            if len(words) > 5 and sum(len(w)==1 for w in words)/len(words) > 0.6:
                # Remove spaces between single characters
                line = re.sub(r'\b([a-zA-Z0-9])\s+(?=[a-zA-Z0-9]\b)', r'\1', line)
                
                # Fix common keywords
                for broken, fixed_word in [
                    (r'f\s*o\s*r\b', 'for'),
                    (r'i\s*n\b', 'in'),
                    (r'i\s*f\b', 'if'),
                    (r'd\s*e\s*f\b', 'def'),
                    (r'r\s*a\s*n\s*g\s*e', 'range'),
                    (r'o\s*r\b', 'or'),
                    (r'a\s*n\s*d\b', 'and'),
                    (r'n\s*o\s*t\b', 'not'),
                    (r'T\s*r\s*u\s*e', 'True'),
                    (r'F\s*a\s*l\s*s\s*e', 'False'),
                ]:
                    line = re.sub(broken, fixed_word, line)
            
            fixed.append(line)
        
        return '\n'.join(fixed)


### Main execution

In [5]:
books = ["algo-li_yin", "computational_algos-jorg", "dsa_analysis-cliff", "ods-python"]
PDF_PATHS = [f"/kaggle/input/dsa-books/{book}.pdf" for book in books]

# Run data extraction pipeline
pipeline = PDFProcessor(PDF_PATHS, output_dir="/kaggle/working")
pipeline.run()

print("\nReady for RAG!")

PDF TO DATASET PIPELINE

Extracting from PDFs...
  - algo-li_yin.pdf
  - computational_algos-jorg.pdf
  - dsa_analysis-cliff.pdf
  - ods-python.pdf

Cleaning and structuring...

✓ Complete: /kaggle/working/master_dataset.txt
  Size: 4,753,030 chars

Ready for RAG!
