In [None]:
!python3.10 -m pip install --upgrade pip
!python3.10 -m pip install torch torchvision torchaudio
!python3.10 -m pip install transformers pymupdf pytesseract pillow

/Users/mac/.zshenv:1: command not found: 0x0:0x0


In [8]:
# Core dependencies
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install torch torchvision torchaudio
!{sys.executable} -m pip install transformers pymupdf pytesseract pillow

# LLM and text processing
!{sys.executable} -m pip install --force-reinstall --no-cache-dir llama-cpp-python
!{sys.executable} -m pip install safetensors

# Additional utilities
!{sys.executable} -m pip install tqdm numpy opencv-python

/Users/mac/.zshenv:1: command not found: 0x0:0x0
/Users/mac/.zshenv:1: command not found: 0x0:0x0
Collecting torch
  Using cached torch-2.2.2-cp310-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting torchvision
  Using cached torchvision-0.17.2-cp310-cp310-macosx_10_13_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio
  Using cached torchaudio-2.2.2-cp310-cp310-macosx_10_13_x86_64.whl.metadata (6.4 kB)
Collecting filelock (from torch)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting numpy (from torchvision)
  Using cached numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl.metadata (6

In [16]:
import sys
# Print the interpreter the kernel is running, so the package list below can be
# matched to the right environment.
print(f"Python path: {sys.executable}")
# Ask that same interpreter's pip for its installed packages and keep only the
# ones whose names match the grep pattern.
!{sys.executable} -m pip list | grep -E "torch|transformers|pymupdf|llama|safetensors"

Python path: /Users/mac/Desktop/pdf_extracter/.venv/bin/python
/Users/mac/.zshenv:1: command not found: 0x0:0x0
llama_cpp_python   0.3.9
safetensors        0.5.3
torch              2.2.2
torchaudio         2.2.2
torchvision        0.17.2
transformers       4.52.4


In [20]:
from llama_cpp import Llama

# Load the GGUF model file from a local path into a module-level `llm`.
# NOTE(review): the path is absolute and machine-specific; `initialize_models`
# constructs its own Llama instance from a `model_path` argument, so this
# global looks like a one-off load check - confirm before relying on it.
llm = Llama(model_path="/Users/mac/Desktop/pdf_extracter/tinyllama-1.1b-chat-v1.0.Q6_K.gguf")

llama_model_load_from_file_impl: using device Metal (AMD Radeon Pro 560X) - 4087 MiB free
llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /Users/mac/Desktop/pdf_extracter/tinyllama-1.1b-chat-v1.0.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv

In [21]:
import os
import json
import fitz
import torch
from PIL import Image
from llama_cpp import Llama
from transformers import BlipProcessor, BlipForConditionalGeneration
from typing import Dict, List, Optional, Tuple, Any


### Initialising BLIP and TinyLlama

In [22]:
def initialize_models(model_path: str) -> Tuple[Llama, BlipProcessor, BlipForConditionalGeneration]:
    """Build the language model and the BLIP processor/model pair.

    Args:
        model_path: Value passed to ``Llama`` as ``model_path``.

    Returns:
        A ``(llm, processor, model)`` tuple holding the ``Llama`` instance, the
        ``BlipProcessor`` and the ``BlipForConditionalGeneration`` model, in
        that order.
    """
    # Processor and model are both loaded from the same checkpoint id with the
    # same loading options.
    blip_checkpoint = "Salesforce/blip-image-captioning-base"
    blip_options = {"use_safetensors": True}

    language_model = Llama(model_path=model_path)
    caption_processor = BlipProcessor.from_pretrained(blip_checkpoint, **blip_options)
    caption_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint, **blip_options)
    return language_model, caption_processor, caption_model

### Chunking page text

In [23]:
def chunk_text(text: str, max_length: int = 400) -> List[str]:
    """Split text into chunks of whole sentences.

    The text is split on ``'.'``; each non-empty fragment is stripped and gets
    its period back, and consecutive sentences are packed into a chunk (joined
    with a single space) until adding the next one would push the running
    sentence length past ``max_length``.

    Notes:
        * Every ``'.'`` is treated as a sentence boundary, so decimals and
          abbreviations are split as well.
        * The running length counts sentence characters only, not the joining
          spaces, so a chunk can exceed ``max_length`` by up to one character
          per join.
        * A single sentence longer than ``max_length`` is kept whole as its
          own chunk.

    Args:
        text: Text to split.
        max_length: Target upper bound on the summed sentence lengths of a chunk.

    Returns:
        The chunks in document order; an empty list when ``text`` contains no
        sentence content.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for fragment in text.split('.'):
        fragment = fragment.strip()
        # Skip empty fragments (text ending in '.', "..", blank input);
        # otherwise they turn into stray "." sentences in the output.
        if not fragment:
            continue

        sentence = fragment + '.'
        if current_chunk and current_length + len(sentence) > max_length:
            # The current chunk is full: flush it and start a new one.
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += len(sentence)

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

### Summarising text

In [None]:
def summarize_text(text: str, llm: Llama) -> str:
    """Summarise text chunk by chunk with the LLM and join the results.

    The text is split with ``chunk_text(text, max_length=300)``; each chunk is
    placed into a fixed prompt and sent to ``llm``. Non-empty completions are
    collected and joined with single spaces.

    Args:
        text: Text to summarise.
        llm: Callable model; called with the prompt and sampling keyword
            arguments, and expected to return a mapping whose
            ``["choices"][0]["text"]`` is the completion.

    Returns:
        The joined summaries, ``"Empty text"`` when ``text`` is blank, or
        ``"Summary generation failed"`` when no chunk produced a summary.
    """
    if not text.strip():
        return "Empty text"

    prompt_template = """Please provide a comprehensive summary of the following text. 
Focus on key points and maintain clarity:

Text to summarize:
----------------
{text}
----------------

Guidelines:
- Capture main ideas and important details
- Use clear, professional language
- Maintain logical flow
- Be concise but informative

Summary:"""

    partial_summaries = []
    for piece in chunk_text(text, max_length=300):
        prompt = prompt_template.format(text=piece)
        try:
            # Generation stops at the delimiter line or at the first blank line.
            completion = llm(
                prompt,
                max_tokens=150,
                stop=["----------------", "\n\n"],
                temperature=0.3,
                top_p=0.9,
                repeat_penalty=1.2,
            )
            piece_summary = completion["choices"][0]["text"].strip()
        except Exception as e:
            # A failing chunk is reported and skipped; the remaining chunks still run.
            print(f"Error summarizing chunk: {e}")
        else:
            if piece_summary:
                partial_summaries.append(piece_summary)

    if partial_summaries:
        return " ".join(partial_summaries)
    return "Summary generation failed"

### Preprocessing images

In [25]:
def preprocess_image(image_path: str, target_size: Tuple[int, int] = (384, 384)) -> Optional[Image.Image]:
    """Load an image and centre it on a white canvas of ``target_size``.

    The image is converted to RGB, shrunk in place with ``thumbnail`` (LANCZOS
    resampling) so it fits inside ``target_size``, then pasted onto the middle
    of a new white RGB image of exactly ``target_size``.

    Args:
        image_path: Path of the image file to open.
        target_size: ``(width, height)`` of the returned canvas.

    Returns:
        The padded image, or ``None`` if any step raised (the error is printed).
    """
    try:
        with Image.open(image_path) as source:
            rgb_image = source.convert('RGB')
            rgb_image.thumbnail(target_size, Image.Resampling.LANCZOS)

            # Top-left corner that centres the shrunken image on the canvas.
            left = (target_size[0] - rgb_image.size[0]) // 2
            top = (target_size[1] - rgb_image.size[1]) // 2

            canvas = Image.new('RGB', target_size, (255, 255, 255))
            canvas.paste(rgb_image, (left, top))
            return canvas
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

### Generating image captions

In [26]:
def generate_image_caption(image: Image.Image, processor: BlipProcessor, 
                         model: BlipForConditionalGeneration) -> str:
    """Caption an image with the BLIP processor/model pair.

    Args:
        image: Image to describe.
        processor: Called on the image to build the model inputs, and used to
            decode the generated ids back to text.
        model: Model whose ``generate`` produces the caption ids.

    Returns:
        The decoded caption with special tokens skipped, or
        ``"Caption generation failed"`` if any step raised (the error is printed).
    """
    # NOTE(review): padding / truncation / max_length look like text-tokenizer
    # options; confirm they have any effect when only an image is passed.
    processor_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 77,
    }
    generation_options = {
        "max_length": 30,
        "num_beams": 4,
        "min_length": 5,
        "no_repeat_ngram_size": 2,
    }

    try:
        encoded = processor(image, **processor_options)
        generated = model.generate(**encoded, **generation_options)
        # Only the first generated sequence is decoded.
        first_sequence = generated[0]
        return processor.decode(first_sequence, skip_special_tokens=True)
    except Exception as e:
        print(f"Error generating caption: {e}")
        return "Caption generation failed"

### Saving extracted data to output_dir and metadata to a JSON file

In [33]:
def save_outputs(output_dir: str, metadata: Dict[str, Any], 
                full_text: str, summaries: Dict[int, str]) -> None:
    """Write the extracted text, per-page summaries and metadata to disk.

    Three UTF-8 files are written inside ``output_dir``:

    * ``extracted_text.txt`` - ``full_text`` verbatim.
    * ``page_summaries.txt`` - a banner followed by one ``Page N`` section per
      entry of ``summaries``, in the mapping's iteration order.
    * ``summary.json`` - ``metadata`` dumped as JSON with 4-space indentation.

    Args:
        output_dir: Destination directory; created (with parents) if missing.
            An empty string means the current working directory.
        metadata: JSON-serialisable mapping to store in ``summary.json``.
        full_text: Complete extracted text.
        summaries: Mapping of page number to summary text.

    Raises:
        OSError: If the directory or any file cannot be created or written.
        TypeError: If ``metadata`` contains values ``json.dump`` cannot serialise.
    """
    # Create the directory here so the function also works when called on its
    # own. os.makedirs("") raises, so the empty-string case is skipped.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save text
    text_file = os.path.join(output_dir, "extracted_text.txt")
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(full_text)

    # Save summaries
    summaries_file = os.path.join(output_dir, "page_summaries.txt")
    with open(summaries_file, "w", encoding="utf-8") as f:
        f.write("PDF CONTENT SUMMARIES\n")
        f.write("=" * 80 + "\n\n")

        for page_num, summary in summaries.items():
            f.write(f"Page {page_num}\n")
            f.write("-" * 40 + "\n")
            f.write(f"{summary}\n\n")

    # Save metadata (json.dump writes non-string dict keys, such as integer
    # page numbers, as strings).
    json_file = os.path.join(output_dir, "summary.json")
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)

### Storing image binaries

In [37]:
def process_page_image(img: Dict, page_num: int, img_index: int, 
                      doc: fitz.Document, output_dir: str) -> Optional[str]:
    """Extract one embedded image from the document and write it to disk.

    The file is named ``page_<page_num>_img_<img_index + 1>.<ext>``, where
    ``ext`` comes from the extraction result.

    Args:
        img: Image entry for the page; its element ``0`` is used as the xref.
            NOTE(review): annotated as ``Dict`` but indexed positionally, and
            the caller passes items from ``page.get_images(full=True)`` -
            confirm the real item type.
        page_num: Page number used in the file name.
        img_index: Zero-based index of the image on the page.
        doc: Open document the image is extracted from.
        output_dir: Directory the image file is written into.

    Returns:
        The path of the written file, or ``None`` if extraction or writing
        raised (the error is printed).
    """
    try:
        extracted = doc.extract_image(img[0])
        extension = extracted['ext']
        image_path = os.path.join(
            output_dir, f"page_{page_num}_img_{img_index+1}.{extension}"
        )

        # Raw bytes are written unchanged, in the image's own format.
        with open(image_path, "wb") as handle:
            handle.write(extracted["image"])

        print(f"Saved: {image_path}")
        return image_path
    except Exception as e:
        print(f"Error extracting image: {e}")
        return None

### Running all functions

In [38]:
def extract_pdf_content(pdf_path: str, output_dir: str = "output1", 
                       model_path: str = "/Users/mac/Desktop/pdf_extracter/tinyllama-1.1b-chat-v1.0.Q6_K.gguf") -> Dict[str, Any]:
    """Extract text, images, captions and summaries from a PDF.

    For each page (numbered from 1) the page text is appended to the full text
    under a ``--- Page N ---`` header and summarised with the LLM. Every image
    listed for the page is saved into ``output_dir``, preprocessed and
    captioned. The results are then written with ``save_outputs``.

    Args:
        pdf_path: Path of the PDF to open.
        output_dir: Directory for all output files; created if missing.
        model_path: Path of the model file passed to ``initialize_models``.
            NOTE(review): the default is an absolute, machine-specific path.

    Returns:
        The metadata dict with keys ``total_pages``, ``total_images``,
        ``text_file``, ``image_captions`` (list of ``page`` / ``image`` /
        ``caption`` dicts) and ``summaries`` (page number -> summary).
    """
    os.makedirs(output_dir, exist_ok=True)
    llm, processor, model = initialize_models(model_path)

    text_parts = []  # joined once at the end instead of repeated string +=
    total_images = 0
    image_captions = []
    summaries = {}

    # The context manager closes the document (and its file handle) even if a
    # page fails part-way through; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # Read the page count while the document is still open.
        total_pages = len(doc)

        for page_num, page in enumerate(doc, start=1):
            # Process text
            page_text = page.get_text()
            text_parts.append(f"\n--- Page {page_num} ---\n{page_text}")
            print(f"Summarizing page {page_num}...")
            summaries[page_num] = summarize_text(page_text, llm)

            # Process images
            for img_index, img in enumerate(page.get_images(full=True)):
                image_path = process_page_image(img, page_num, img_index, doc, output_dir)
                if not image_path:
                    continue
                # Count every saved image, even if preprocessing fails below.
                total_images += 1
                raw_image = preprocess_image(image_path)
                if raw_image:
                    caption = generate_image_caption(raw_image, processor, model)
                    image_captions.append({
                        "page": page_num,
                        "image": os.path.basename(image_path),
                        "caption": caption
                    })

    full_text = "".join(text_parts)
    metadata = {
        "total_pages": total_pages,
        "total_images": total_images,
        "text_file": "extracted_text.txt",
        "image_captions": image_captions,
        "summaries": summaries
    }

    save_outputs(output_dir, metadata, full_text, summaries)
    return metadata



In [36]:
# Run the full pipeline on a sample PDF with the default output directory and
# model path, keeping the returned metadata in a global for inspection.
# NOTE(review): "pdf2.pdf" is a relative path, resolved against the kernel's
# working directory.
if __name__ == "__main__":
    pdf_path = "pdf2.pdf"
    metadata = extract_pdf_content(pdf_path)

llama_model_load_from_file_impl: using device Metal (AMD Radeon Pro 560X) - 4087 MiB free
llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /Users/mac/Desktop/pdf_extracter/tinyllama-1.1b-chat-v1.0.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv

Summarizing page 1...


llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: prompt eval time =    2872.71 ms /   100 tokens (   28.73 ms per token,    34.81 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2873.11 ms /   101 tokens
Llama.generate: 16 prefix-match hit, remaining 99 prompt tokens to eval
llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: prompt eval time =    2261.98 ms /    99 tokens (   22.85 ms per token,    43.77 tokens per second)
llama_perf_context_print:        eval time =     280.89 ms /     7 runs   (   40.13 ms per token,    24.92 tokens per second)
llama_perf_context_print:       total time =    2544.92 ms /   106 tokens
Llama.generate: 16 prefix-match hit, remaining 57 prompt tokens to eval
llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: p

Saved: output1/page_1_img_1.jpeg
Summarizing page 2...


Llama.generate: 16 prefix-match hit, remaining 121 prompt tokens to eval
llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: prompt eval time =    1533.02 ms /   121 tokens (   12.67 ms per token,    78.93 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1533.79 ms /   122 tokens
Llama.generate: 16 prefix-match hit, remaining 95 prompt tokens to eval
llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: prompt eval time =    1917.07 ms /    95 tokens (   20.18 ms per token,    49.55 tokens per second)
llama_perf_context_print:        eval time =    2798.81 ms /    71 runs   (   39.42 ms per token,    25.37 tokens per second)
llama_perf_context_print:       total time =    4735.95 ms /   166 tokens
Llama.generate: 16 prefix-match hit, remaining 102 prompt tokens to eval
llama_perf_c

Saved: output1/page_2_img_1.jpeg
Summarizing page 3...


Llama.generate: 16 prefix-match hit, remaining 99 prompt tokens to eval
llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: prompt eval time =    1713.30 ms /    99 tokens (   17.31 ms per token,    57.78 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1714.28 ms /   100 tokens
Llama.generate: 16 prefix-match hit, remaining 111 prompt tokens to eval
llama_perf_context_print:        load time =    2872.92 ms
llama_perf_context_print: prompt eval time =    1628.47 ms /   111 tokens (   14.67 ms per token,    68.16 tokens per second)
llama_perf_context_print:        eval time =    2957.53 ms /    70 runs   (   42.25 ms per token,    23.67 tokens per second)
llama_perf_context_print:       total time =    4607.31 ms /   181 tokens
Llama.generate: 16 prefix-match hit, remaining 85 prompt tokens to eval
llama_perf_co

Saved: output1/page_3_img_1.png
Saved: output1/page_3_img_2.jpeg


ggml_metal_free: deallocating
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
ggml_metal_mem_pool_free: freeing memory pool, num heaps = 0 (total = 0)
