# dots.ocr Inference Notebook

Run OCR on your documents using the quantized dots.ocr model.

**Setup:**
1. Enable GPU: Settings → Accelerator → **GPU T4 x2**
2. Enable Internet: Settings → Internet → **On**
3. Upload your image or PDF in the left panel (as a dataset under `/kaggle/input`, or as a file in `/kaggle/working`)

In [None]:
# Install required dependencies (Kaggle)
# NOTE: Kaggle often ships a torchvision build that is incompatible with Pillow 12+.
# Pin Pillow < 12 and Transformers < 5 to avoid common import conflicts.
# After this cell finishes, use Kaggle: "Restart session" (top right) and rerun from the top.
!pip -q install -U --force-reinstall \
  "pillow<12" \
  "transformers>=4.46.0,<5" \
  "accelerate>=0.26.0,<1" \
  "pdf2image" \
  "qwen-vl-utils"

In [None]:
# (Optional) sanity check: report the interpreter and key library versions
import sys

import PIL
import torch
import transformers

print(
    f"python: {sys.version}",
    f"torch: {torch.__version__}",
    f"transformers: {transformers.__version__}",
    f"pillow: {PIL.__version__}",
    sep="\n",
)

# A Pillow/torchvision import error at this point usually means the freshly installed
# packages are not loaded yet: restart the session and rerun from the top.

In [None]:
# Verify GPU: print the NVIDIA driver's device table.
# If the command is missing or lists no devices, enable the GPU accelerator in Settings first.
!nvidia-smi

In [None]:
import torch

# Confirm that PyTorch itself can reach the GPU, then name the first device.
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    first_gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {first_gpu_name}")

## Step 1: Load Model

In [None]:
import torch
from transformers import AutoImageProcessor, AutoModelForCausalLM, AutoTokenizer

# Your quantized model
MODEL_ID = "sugam24/dots-ocr-awq-4bit"

print(f"Loading {MODEL_ID}...")

# IMPORTANT:
# - `device_map="auto"` lets Accelerate place (and, if needed, shard) the weights across
#   every visible GPU. A single-device string such as "cuda" is also accepted by
#   Transformers, but it would put the whole model on one GPU.
# - `AutoProcessor` can fail on some setups for this model (video-processor auto-detection).
#   We avoid it by using `tokenizer` + `AutoImageProcessor` directly.
# - SECURITY: `trust_remote_code=True` executes Python code shipped in the model repo.
#   Only use repos you trust; pass `revision="<commit sha>"` to pin code you have reviewed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Padding needs a pad token; fall back to EOS when the tokenizer does not define one.
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token

# Sum over all GPUs: with device_map="auto" the weights may be split across devices,
# and `torch.cuda.memory_allocated()` without an argument reports the current device only.
gpu_mem_bytes = sum(
    torch.cuda.memory_allocated(index) for index in range(torch.cuda.device_count())
)
print(f"✓ Model loaded! GPU mem: {gpu_mem_bytes/1024**3:.2f} GB")

## Step 2: Upload Your Image

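Upload your image or PDF using the file browser on the left, then run the cell below. The cell does not upload anything itself: it picks the first supported file it finds under `/kaggle/input` or `/kaggle/working` and stores its path in `IMAGE_PATH`.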

In [None]:
# Kaggle: put your file in either:
# - /kaggle/input/<your-dataset>/...
# - /kaggle/working/...
# Then set IMAGE_PATH below, or leave it as None to auto-pick a file.

from pathlib import Path
from IPython.display import display, Image as IPImage

# File types the auto-picker accepts. Suffixes are compared case-insensitively,
# so "scan.JPG" is found as well as "scan.jpg".
SUPPORTED_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff", ".pdf"}
SEARCH_ROOTS = ("/kaggle/input", "/kaggle/working")
PREVIEW_WIDTH = 700  # pixels

# Option A (recommended): set this manually. A non-None value is never overwritten below.
IMAGE_PATH = None
# IMAGE_PATH = "/kaggle/input/your-dataset/image.jpg"
# IMAGE_PATH = "/kaggle/input/your-dataset/document.pdf"

candidates = []
if IMAGE_PATH is None:
    # Option B: auto-pick the first image/pdf found under /kaggle/input or /kaggle/working.
    # Sorting makes the choice reproducible instead of depending on filesystem order.
    candidates = sorted(
        found
        for root in map(Path, SEARCH_ROOTS)
        if root.exists()
        for found in root.rglob("*")
        if found.is_file() and found.suffix.lower() in SUPPORTED_SUFFIXES
    )
    if not candidates:
        raise FileNotFoundError(
            "No input files found. Add a dataset (left panel) or upload a file into /kaggle/working, then set IMAGE_PATH."
        )
    IMAGE_PATH = str(candidates[0])
elif not Path(IMAGE_PATH).is_file():
    # Fail here with a clear message rather than later inside the OCR step.
    raise FileNotFoundError(f"IMAGE_PATH does not point to a file: {IMAGE_PATH}")

print(f"✓ Using: {IMAGE_PATH}")

# Display if it's an image
if Path(IMAGE_PATH).suffix.lower() != ".pdf":
    try:
        display(IPImage(filename=IMAGE_PATH, width=PREVIEW_WIDTH))
    except ValueError as preview_error:
        # IPython can only embed a few formats and raises ValueError for others
        # (e.g. TIFF). The file is still valid OCR input, so only the preview is skipped.
        print(f"(Preview skipped: {preview_error})")

In [None]:
# Manual override: uncomment ONE line below to replace the IMAGE_PATH chosen above (e.g. for a file you uploaded via the file browser)
# IMAGE_PATH = "/kaggle/input/your-dataset/image.png"
# or
# IMAGE_PATH = "/kaggle/working/uploaded_image.png"

## Step 3: Run OCR

In [None]:
import torch
from PIL import Image
from pathlib import Path


def load_image_or_pdf(path: str) -> Image.Image:
    """Load an image file, or the first page of a PDF, as an RGB PIL image.

    Args:
        path: Filesystem path to an image file or to a ``.pdf`` document.

    Returns:
        An RGB ``PIL.Image.Image``. For PDFs only page 1 is rendered.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the PDF renders to zero pages.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(path)

    if p.suffix.lower() == ".pdf":
        # First page only (you can extend this later).
        # Imported lazily so image-only runs do not need pdf2image at import time.
        from pdf2image import convert_from_path

        pages = convert_from_path(str(p), first_page=1, last_page=1)
        if not pages:
            raise ValueError(f"No pages could be rendered from PDF: {path}")
        return pages[0].convert("RGB")

    # `Image.open` is lazy and keeps the file handle open (multi-frame formats such as
    # TIFF never release it on their own). `convert` loads the pixels and returns a new
    # image, so the source can be closed as soon as the conversion is done.
    with Image.open(p) as source:
        return source.convert("RGB")


def run_ocr(image_path: str, prompt: str = "Extract all the text from this image.", max_new_tokens: int = 1024) -> str:
    """Run OCR on one image or PDF and return the text the model generates.

    Args:
        image_path: Path handed to ``load_image_or_pdf``.
        prompt: Instruction placed after the image in the single user turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens skipped),
        stripped of surrounding whitespace.
    """
    page = load_image_or_pdf(image_path)
    print(f"Image size: {page.size}")

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": page},
            {"type": "text", "text": prompt},
        ],
    }

    # Text and pixels are prepared separately (tokenizer + image processor) instead of
    # going through `AutoProcessor`.
    # NOTE(review): a combined processor usually expands the image placeholder to match the
    # number of vision patches - confirm the chat template / remote model code covers that.
    chat_text = tokenizer.apply_chat_template([user_turn], tokenize=False, add_generation_prompt=True)
    text_inputs = tokenizer([chat_text], padding=True, return_tensors="pt")
    image_inputs = image_processor(images=page, return_tensors="pt")

    device = next(model.parameters()).device
    on_cuda = device.type == "cuda"
    model_dtype = torch.float16 if on_cuda else torch.float32

    def _to_model_device(value):
        """Move tensors to the model device; floating-point tensors are also cast to `model_dtype`."""
        if not isinstance(value, torch.Tensor):
            return value
        moved = value.to(device)
        return moved.to(model_dtype) if torch.is_floating_point(moved) else moved

    inputs = {
        key: _to_model_device(value)
        for key, value in {**text_inputs, **image_inputs}.items()
    }

    # Greedy decoding; the same arguments are used on both the CUDA and the CPU path.
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

    with torch.no_grad():
        if on_cuda:
            with torch.autocast(device_type="cuda", dtype=model_dtype):
                output_ids = model.generate(**inputs, **generation_kwargs)
        else:
            output_ids = model.generate(**inputs, **generation_kwargs)

    # `generate` returns prompt + continuation; keep only what follows the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0][prompt_length:]
    decoded = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return decoded.strip()

In [None]:
# Run OCR on the selected file and print the text between two banner lines
print("Running OCR...")
result = run_ocr(IMAGE_PATH)

print(
    "\n" + "=" * 50,
    "EXTRACTED TEXT:",
    "=" * 50,
    result,
    "=" * 50,
    sep="\n",
)

## Step 4: Try Different Prompts (Optional)

In [None]:
# Re-run OCR on the same file with a more specific instruction
custom_prompt = (
    "Extract all text including headers, paragraphs, "
    "and any tables from this document."
)

result = run_ocr(image_path=IMAGE_PATH, prompt=custom_prompt)
print(result)

In [None]:
# Process multiple images
import glob

PREVIEW_CHARS = 500  # print at most this many characters per image

# Find all images in a folder (sorted so the processing order is reproducible)
image_files = sorted(glob.glob("/kaggle/working/*.png") + glob.glob("/kaggle/working/*.jpg"))

for img_path in image_files:
    print(f"\n--- Processing: {img_path} ---")
    # Deliberately NOT named `result`: the save cell writes `result` together with
    # IMAGE_PATH, so overwriting it here would save the last batch image's text
    # under the wrong input path and prompt.
    batch_result = run_ocr(img_path)
    if len(batch_result) > PREVIEW_CHARS:
        print(batch_result[:PREVIEW_CHARS] + "...")
    else:
        print(batch_result)

## Step 5: Save Results

In [None]:
# Save OCR result to Markdown (Kaggle output)
import re
from datetime import datetime

output_file = "/kaggle/working/ocr_result.md"

# Use the custom prompt when the optional custom-prompt cell was run; otherwise
# fall back to the default prompt of `run_ocr`.
prompt_used = globals().get("custom_prompt", "Extract all the text from this image.")
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# A fenced block ends at the first line that repeats its fence, so text containing
# backtick runs would break out of a fixed three-backtick fence. Use a fence one
# backtick longer than the longest run in the content (never shorter than three).
backtick_runs = re.findall(r"`+", f"{prompt_used}\n{result}")
longest_run = max((len(run) for run in backtick_runs), default=0)
fence = "`" * max(3, longest_run + 1)

markdown_content = f"""# OCR Results - dots.ocr AWQ 4-bit

## Inference Details
- **Model**: {MODEL_ID}
- **Input**: {IMAGE_PATH}
- **Timestamp**: {timestamp}

## Prompt
{fence}text
{prompt_used}
{fence}

## Extracted Text
{fence}text
{result}
{fence}
"""

with open(output_file, "w", encoding="utf-8") as f:
    f.write(markdown_content)

print(f"✓ Saved to {output_file}")
print("You can download it from the Kaggle 'Output' pane.")