# AWQ Quantization for dots.ocr Model

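Using **llm-compressor** (vLLM's official successor to AutoAWQ). Note: the recipe in Step 5 is built with `QuantizationModifier` (scheme `W4A16`), not llm-compressor's `AWQModifier`.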

**⚠️ BEFORE RUNNING - Verify GPU is enabled:**
1. Right panel → Settings → Accelerator → **GPU T4 x2**
2. Right panel → Settings → Internet → **On**
3. You should see "GPU T4 x2" in the top-right corner

## Step 0: Verify GPU is Active

In [None]:
# Run this FIRST to verify GPU
# The leading `!` hands the line to the system shell (IPython shell escape),
# so this runs the `nvidia-smi` command-line tool and shows its output.
!nvidia-smi

In [None]:
import torch

# Report the PyTorch/CUDA build and stop early when no GPU is visible:
# later cells load the model with device_map="cuda" and would fail anyway.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

if not torch.cuda.is_available():
    print("\n❌ NO GPU DETECTED!")
    print("\nTo fix:")
    print("1. Go to Settings (right panel)")
    print("2. Under 'Accelerator', select 'GPU T4 x2'")
    print("3. Wait for session to restart")
    print("4. Re-run this cell")
    raise RuntimeError("GPU not available. Enable it in Kaggle settings.")

# The setup instructions ask for two GPUs, so describe every visible device
# instead of only device 0.
gpu_count = torch.cuda.device_count()
print(f"GPU count: {gpu_count}")
for gpu_index in range(gpu_count):
    gpu_props = torch.cuda.get_device_properties(gpu_index)
    print(f"GPU {gpu_index} name: {gpu_props.name}")
    print(f"GPU {gpu_index} memory: {gpu_props.total_memory / 1024**3:.2f} GB")
print("\n✓ GPU is ready!")

## Step 1: Install Dependencies

In [None]:
# Install the two packages this notebook adds on top of the base image.
# Each line is a shell escape (`!`); --quiet reduces pip's log output.
!pip install llmcompressor --quiet
!pip install accelerate --quiet

## Step 2: Download Model

In [None]:
from huggingface_hub import snapshot_download
import os

# Hub repository to quantize, plus the two working directories that the
# later cells read from (MODEL_DIR) and write to (OUTPUT_DIR).
MODEL_ID = "rednote-hilab/dots.ocr"
MODEL_DIR = "/kaggle/working/dots_ocr_original"
OUTPUT_DIR = "/kaggle/working/dots_ocr_awq_4bit"

print(f"Downloading {MODEL_ID}...")
# `local_dir_use_symlinks` is deprecated in huggingface_hub and no longer
# needed: with `local_dir` set, the files are written into that directory.
snapshot_download(repo_id=MODEL_ID, local_dir=MODEL_DIR)
print(f"Downloaded to {MODEL_DIR}")

In [None]:
# Print every regular file in the downloaded snapshot with its size in MB;
# anything that is not a file (e.g. a sub-directory) is skipped.
for entry_name in os.listdir(MODEL_DIR):
    entry_path = os.path.join(MODEL_DIR, entry_name)
    if not os.path.isfile(entry_path):
        continue
    size_mb = os.path.getsize(entry_path) / 1024**2
    print(f"{entry_name}: {size_mb:.1f} MB")

## Step 3: Load Model, Tokenizer, and Processor

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
import gc

# Free memory before loading the model. Run the garbage collector first so
# that objects kept alive only by reference cycles are destroyed, and only
# then ask the CUDA caching allocator to release its unused blocks;
# in the opposite order the blocks freed by the collector stay cached.
gc.collect()
torch.cuda.empty_cache()

# Device label used for reporting; falls back to CPU when CUDA is absent.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

In [None]:
# Build the tokenizer from the local snapshot (remote code is trusted).
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)

# When the tokenizer ships without a pad token, reuse the EOS token for it.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = tokenizer.vocab_size
print(f"Tokenizer loaded. Vocab size: {vocab_size}")

In [None]:
# Build the processor from the same local snapshot as the tokenizer and
# print its concrete class name as a quick sanity check.
print("Loading processor...")
processor = AutoProcessor.from_pretrained(MODEL_DIR, trust_remote_code=True)
processor_class_name = type(processor).__name__
print(f"Processor loaded: {processor_class_name}")

In [None]:
# Load the full-precision checkpoint from MODEL_DIR as float16 and place it
# on CUDA, then report how much GPU memory PyTorch has allocated.
print("Loading model to GPU...")
model_load_kwargs = {
    "trust_remote_code": True,
    "torch_dtype": torch.float16,
    "device_map": "cuda",  # explicit CUDA placement
}
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, **model_load_kwargs)
allocated_gb = torch.cuda.memory_allocated() / 1024**3
print(f"Model loaded! GPU mem: {allocated_gb:.2f} GB")

## Step 4: Prepare Calibration Data

In [None]:
from datasets import Dataset

# Text-only prompts used as calibration samples (one sample per string).
# NOTE(review): these are instructions without any image input — confirm that
# text-only calibration is adequate for this OCR model.
calibration_texts = [
    "What text is shown in this image?",
    "Please read and transcribe the text from this document.",
    "Extract all the text content from the provided image.",
    "Can you identify and read the text in this picture?",
    "Transcribe the handwritten text in this image.",
    "What does this document say?",
    "Read the printed text from this image.",
    "Please OCR this image and provide the text.",
    "Extract text content including numbers and special characters.",
    "Identify all readable text elements in this image.",
    "What is written on this page?",
    "Transcribe the following scanned document.",
    "Please read the text from this screenshot.",
    "Extract and format the text content visible in the image.",
    "Read all text including headers and footnotes.",
    "What are the words shown in this picture?",
    "Describe what you see in the image.",
    "List all words visible in this photograph.",
    "Convert the printed text to digital format.",
    "Read and output the document content.",
    "Extract text from this scanned page.",
    "What words appear in this image?",
    "Transcribe the visible text accurately.",
    "Identify all text elements in the picture.",
    "Read the content of this document image.",
    "Extract and list all readable text.",
    "What is the text content of this image?",
    "Please digitize the text in this scan.",
    "Convert image text to editable format.",
    "Read aloud what this document says.",
    "Extract every word from this image.",
    "What textual information is present?",
]

# Wrap the prompts in a single-column ("text") datasets.Dataset.
calibration_columns = {"text": calibration_texts}
calibration_dataset = Dataset.from_dict(calibration_columns)
sample_count = len(calibration_dataset)
print(f"Calibration dataset: {sample_count} samples")

In [None]:
# Tokenize calibration data
def tokenize_function(examples):
    """Tokenize one batch of calibration prompts.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per sample. Samples are left unpadded so that pad tokens do not end
        up in the calibration data.
    """
    return tokenizer(examples["text"], padding=False, truncation=True, max_length=512)


# Drop the raw "text" column so the dataset handed to calibration contains
# only the tokenizer's outputs.
tokenized_dataset = calibration_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=calibration_dataset.column_names,
)
print("Calibration data tokenized!")

## Step 5: Quantize with AWQ

In [None]:
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor import oneshot

# Weight-only 4-bit recipe: every module matched by "Linear" is targeted and
# "lm_head" (the output layer) is left unquantized.
# NOTE(review): QuantizationModifier applies the W4A16 scheme directly;
# llm-compressor provides its AWQ algorithm through a separate AWQModifier —
# confirm which one is intended given this notebook's "AWQ" title.
# NOTE(review): with only lm_head ignored, Linear layers of the vision encoder
# are presumably quantized too — verify that this is desired.
quant_scheme = "W4A16"  # 4-bit weights, 16-bit activations
skipped_modules = ["lm_head"]
recipe = QuantizationModifier(
    targets="Linear",
    scheme=quant_scheme,
    ignore=skipped_modules,
)

# Human-readable summary of the recipe above.
print("Quantization config:")
for summary_line in (
    "  - Scheme: W4A16 (4-bit weights)",
    "  - Target: Linear layers",
    "  - Ignored: lm_head",
):
    print(summary_line)

In [None]:
# Run llm-compressor's one-shot pass with the recipe and the tokenized
# calibration set; every calibration prompt is used, and OUTPUT_DIR is passed
# as the output directory with save_compressed=True.
banner = "=" * 50

print(banner)
print("Starting Quantization (15-30 mins)...")
print(banner)

oneshot_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "dataset": tokenized_dataset,
    "recipe": recipe,
    "output_dir": OUTPUT_DIR,
    "num_calibration_samples": len(calibration_texts),
    "save_compressed": True,
}
oneshot(**oneshot_kwargs)

print(banner)
print("Quantization Complete!")
print(banner)

## Step 6: Save Processor and Config Files

In [None]:
import shutil

# Save tokenizer and processor next to the quantized weights.
tokenizer.save_pretrained(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)
print("Saved tokenizer and processor")

# Extra files to carry over from the original snapshot when they exist.
extra_files = {
    "preprocessor_config.json", "generation_config.json", "chat_template.jinja",
    "modeling_dots.py", "modeling_dots_vision.py", "configuration_dots.py",
    "image_processing_dots.py", "processing_dots.py",
}
# The model is loaded with trust_remote_code=True, i.e. it depends on Python
# modules shipped in the repository. Copy every .py file found in the
# snapshot so the export does not silently miss a module whose name differs
# from the hard-coded list above.
extra_files.update(name for name in os.listdir(MODEL_DIR) if name.endswith(".py"))

for name in sorted(extra_files):
    src = os.path.join(MODEL_DIR, name)
    if os.path.isfile(src):
        shutil.copy2(src, os.path.join(OUTPUT_DIR, name))
        print(f"Copied: {name}")

print(f"\nSaved to: {OUTPUT_DIR}")

In [None]:
# List the regular files in OUTPUT_DIR (sorted by name) with their sizes in
# MB, then print the combined size in MB and GB.
print("\nQuantized model files:")
total = 0
for file_name in sorted(os.listdir(OUTPUT_DIR)):
    file_path = os.path.join(OUTPUT_DIR, file_name)
    if not os.path.isfile(file_path):
        continue
    size = os.path.getsize(file_path) / 1024**2
    total += size
    print(f"  {file_name}: {size:.1f} MB")
total_gb = total / 1024
print(f"\nTotal: {total:.1f} MB ({total_gb:.2f} GB)")

## Step 7: Verify Quantized Model

In [None]:
# Drop the full-precision model and reclaim its memory. The garbage
# collector runs before empty_cache() so that anything it frees can also be
# released by the CUDA caching allocator.
del model
gc.collect()
torch.cuda.empty_cache()

# Reload the exported checkpoint from OUTPUT_DIR onto CUDA as a load check.
print("Loading quantized model...")
quantized_model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR,
    trust_remote_code=True,
    device_map="cuda",
)
print(f"Loaded! GPU mem: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

In [None]:
# Quick text-only smoke test: greedy-decode up to 20 new tokens from a short
# prompt and print the decoded sequence.
# NOTE(review): no image is passed, so this does not exercise the vision path.
test_input = tokenizer("What text is in this image?", return_tensors="pt").to("cuda")
with torch.no_grad():  # inference only, no gradients needed
    output = quantized_model.generate(**test_input, max_new_tokens=20, do_sample=False)
print(f"Output: {tokenizer.decode(output[0], skip_special_tokens=True)}")
print("\n✓ Quantized model working!")

## Step 8: Create ZIP for Download

In [None]:
import shutil

# Build the archive with the standard library so that its location and name
# are derived from OUTPUT_DIR instead of being repeated as literals, and so
# that a re-run replaces the archive rather than updating a stale one.
archive_root, archive_folder = os.path.split(OUTPUT_DIR.rstrip("/"))
zip_path = shutil.make_archive(
    os.path.join(archive_root, archive_folder),  # ".zip" is appended
    "zip",
    root_dir=archive_root,
    base_dir=archive_folder,  # keep the folder name as the top-level entry
)

zip_size = os.path.getsize(zip_path) / 1024**3
print(f"\n✓ Created: {os.path.basename(zip_path)} ({zip_size:.2f} GB)")
print("\nDownload from Output tab →")

## Done! 🎉

**Usage:**
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor

model = AutoModelForCausalLM.from_pretrained(
    "dots_ocr_awq_4bit",
    trust_remote_code=True,
    device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained("dots_ocr_awq_4bit", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("dots_ocr_awq_4bit", trust_remote_code=True)
```