# Upscale-Augmented Spot-the-Difference Pipeline

This notebook builds a reproducible workflow to upscale and enhance image pairs before running detection. It also constrains all detection prompts to the vocabulary extracted from `train.csv`.

**Dependencies**

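Ensure `realesrgan`, `basicsr`, `transformers`, and `accelerate` are installed before running the pipeline, together with the libraries imported in the first code cell (`torch`, `torchvision`, `numpy`, `pandas`, `matplotlib`, `pillow`, and `requests`).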

In [1]:
# Optional dependency install (run if packages are missing)
#%pip install -q realesrgan basicsr transformers accelerate einops safetensors

In [22]:
import logging
import os
import re
import time
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from IPython.display import display
from PIL import Image, ImageEnhance

import torch
import torchvision.transforms as T

# Route every pipeline message through a single timestamped logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("upscale_pipeline")

# Prefer the GPU whenever PyTorch can see one.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

DATA_ROOT = Path("data")


def _locate_csv(filename: str) -> Path:
    """Return ``filename`` from the working directory if it exists, else its path under DATA_ROOT."""
    local_copy = Path(filename)
    return local_copy if local_copy.exists() else DATA_ROOT / filename


TRAIN_CSV = _locate_csv("train.csv")
TEST_CSV = _locate_csv("test.csv")

# Image folders to probe, in priority order. Every entry is derived from
# DATA_ROOT so that changing the root moves all candidates with it, and each
# location appears only once.
CANDIDATE_IMAGE_DIRS = [
    DATA_ROOT / "data",
    DATA_ROOT / "raw",
    DATA_ROOT,
]
# First candidate that exists on disk, or None when none of them do.
IMAGE_DIR = next((candidate for candidate in CANDIDATE_IMAGE_DIRS if candidate.exists()), None)
if IMAGE_DIR is None:
    IMAGE_DIR = DATA_ROOT
    logger.warning("Defaulting image directory to %s", IMAGE_DIR.resolve())

# Upscaled copies are written to their own folder so the originals stay untouched.
UPSCALED_DIR = Path("outputs") / "upscaled"
UPSCALED_DIR.mkdir(parents=True, exist_ok=True)

logger.info("Using device %s", DEVICE)
logger.info("Images will be read from %s", IMAGE_DIR.resolve())
logger.info("Upscaled images will be stored in %s", UPSCALED_DIR.resolve())

2025-10-16 07:27:06,221 - INFO - Using device cuda
2025-10-16 07:27:06,221 - INFO - Images will be read from C:\Users\This PC\Downloads\Octwave\data\data
2025-10-16 07:27:06,221 - INFO - Upscaled images will be stored in C:\Users\This PC\Downloads\Octwave\outputs\upscaled


In [23]:
# The training labels are mandatory: stop immediately when they are missing.
if not TRAIN_CSV.exists():
    raise FileNotFoundError(f"Could not find train.csv at {TRAIN_CSV}")

train_df = pd.read_csv(TRAIN_CSV)
logger.info("Loaded %d training rows", len(train_df))
display(train_df.head())

# The test split is optional; downstream cells must cope with test_df being None.
test_df = pd.read_csv(TEST_CSV) if TEST_CSV.exists() else None
if test_df is None:
    logger.info("Test CSV not found at %s", TEST_CSV.resolve())
else:
    logger.info("Loaded %d test rows", len(test_df))

2025-10-16 07:27:06,245 - INFO - Loaded 4536 training rows


Unnamed: 0,img_id,added_objs,removed_objs,changed_objs
0,35655,none,none,none
1,30660,none,person vehicle,none
2,34838,man person,car person,none
3,34045,person,none,car
4,30596,none,bicycle person,none


2025-10-16 07:27:06,257 - INFO - Loaded 1482 test rows


In [24]:
# Delimiters that separate labels inside one CSV cell: ';', ',', '/', and the
# standalone words "and" / "or" (word boundaries keep e.g. "sand" intact).
TOKEN_SPLIT_REGEX = re.compile(r"[;,/]|\band\b|\bor\b")
# Placeholder values that are skipped by the tokenizer and never counted.
STOP_WORDS = {"", "none", "null", "nan"}
# Terms that are still counted, but are filtered out of the vocabulary list
# returned by extract_training_vocabulary.
DROP_TERMS = {"object", "item", "thing"}

def clean_token(token: str) -> str:
    """Normalise one raw label fragment.

    The text is lower-cased, every character other than ``a-z``, ``0-9``,
    ``-`` and space is replaced by a space, runs of whitespace are collapsed
    to a single space, and surrounding whitespace is removed.

    Args:
        token: Raw fragment of a label string.

    Returns:
        The normalised fragment (possibly an empty string).
    """
    collapsed = re.sub(r"\s+", " ", re.sub(r"[^a-z0-9\- ]", " ", token.lower()))
    return collapsed.strip()

def tokenize_label_string(label_str: str) -> list:
    """Split one label cell into individual lower-case word tokens.

    The cell is lower-cased and split on TOKEN_SPLIT_REGEX; each fragment is
    normalised with clean_token and then split on whitespace. Fragments and
    words listed in STOP_WORDS are discarded.

    Args:
        label_str: Content of one label cell, e.g. ``"man person"``.

    Returns:
        The surviving tokens in order of appearance (duplicates are kept).
    """
    tokens = []
    for fragment in TOKEN_SPLIT_REGEX.split(label_str.lower()):
        normalized = clean_token(fragment)
        # A fragment that is itself a placeholder ("none", "") contributes nothing.
        if normalized not in STOP_WORDS:
            tokens.extend(word for word in normalized.split() if word not in STOP_WORDS)
    return tokens

def extract_training_vocabulary(df: pd.DataFrame, min_freq: int = 1):
    """Count label tokens across the three change columns and derive a vocabulary.

    Args:
        df: Frame that may contain the ``added_objs``, ``removed_objs`` and
            ``changed_objs`` columns; absent columns are ignored.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``(vocab, counter)`` tuple. ``vocab`` lists tokens from most to
        least frequent, excluding DROP_TERMS and tokens rarer than
        ``min_freq``. ``counter`` is the unfiltered ``Counter`` of all tokens.
    """
    label_columns = ("added_objs", "removed_objs", "changed_objs")
    counter = Counter()
    for column in label_columns:
        if column in df.columns:
            for value in df[column].dropna():
                # Only string cells can be tokenized; other values are ignored.
                if isinstance(value, str):
                    counter.update(tokenize_label_string(value))
    vocab = [
        token
        for token, freq in counter.most_common()
        if not (freq < min_freq or token in DROP_TERMS)
    ]
    return vocab, counter

MIN_FREQUENCY = 1
train_vocab, vocab_counts = extract_training_vocabulary(train_df, min_freq=MIN_FREQUENCY)
logger.info("Vocabulary size (min freq %d): %d", MIN_FREQUENCY, len(train_vocab))

# The preview is built from the raw counter, so it also shows tokens that
# DROP_TERMS removes from train_vocab.
vocab_preview = (
    pd.DataFrame(list(vocab_counts.items()), columns=["token", "frequency"])
    .sort_values("frequency", ascending=False)
    .head(25)
)
display(vocab_preview)

# Cap the number of text prompts handed to the detector.
MAX_VOCAB_TERMS = 128
detection_vocab = train_vocab[:MAX_VOCAB_TERMS]
logger.info("Detection vocabulary truncated to %d terms", len(detection_vocab))

2025-10-16 07:27:06,370 - INFO - Vocabulary size (min freq 1): 49


Unnamed: 0,token,frequency
1,person,3216
4,car,2146
2,vehicle,1104
0,man,301
9,object,103
5,guy,51
27,traffic,32
12,umbrella,29
11,cart,27
3,group,20


2025-10-16 07:27:06,377 - INFO - Detection vocabulary truncated to 49 terms


**Pipeline Overview**

1. Build or load a Real-ESRGAN upscaler (scale x4 by default).
2. Upscale both images in each pair and apply lightweight contrast/sharpness enhancements.
3. Persist upscaled images in `outputs/upscaled` with an `_up` suffix so the originals stay untouched.
4. Run detection against the upscaled imagery with the training-derived vocabulary only.
5. Compare detections between original and enhanced imagery to audit recall gains.

In [25]:
# Real-ESRGAN is an optional dependency. When either package is missing, both
# names are set to None so build_realesrgan can report the problem on use.
try:
    from realesrgan import RealESRGANer
    from basicsr.archs.rrdbnet_arch import RRDBNet
except ImportError:
    RealESRGANer = RRDBNet = None
    logger.warning("Real-ESRGAN dependencies missing. Install realesrgan and basicsr to enable upscaling.")

# Local cache folder for downloaded weights.
MODEL_DIR = Path("models") / "realesrgan"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Upscaling factor -> URL of the matching pretrained checkpoint.
MODEL_URLS = {
    4: "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.3.0/RealESRGAN_x4plus.pth",
}

def download_file(url: str, destination: Path, chunk_size: int = 8192) -> None:
    """Stream ``url`` to ``destination`` without ever leaving a partial file there.

    The payload is first written to a sibling ``<name>.part`` file and moved
    onto ``destination`` only after the whole body has been received. Callers
    that treat "destination exists" as "download complete" therefore never
    pick up a truncated file from an interrupted run.

    Args:
        url: HTTP(S) address of the file to fetch.
        destination: Final location of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If fewer bytes arrive than the ``content-length`` header
            announced, or if writing/moving the file fails.
    """
    destination = Path(destination)
    partial_path = destination.with_name(destination.name + ".part")
    progress_step = 5.0  # log roughly every 5 percent
    next_milestone = progress_step
    try:
        # The context manager returns the connection to the pool even on failure.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            total = int(response.headers.get("content-length", 0))
            downloaded = 0
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    file.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        percent = (downloaded / total) * 100
                        if percent >= next_milestone:
                            logger.info("Download progress: %.1f%%", percent)
                            # Skip milestones that a single large chunk jumped over.
                            while next_milestone <= percent:
                                next_milestone += progress_step
        if total and downloaded < total:
            raise OSError(
                f"Incomplete download from {url}: received {downloaded} of {total} bytes."
            )
        # Replacing within the same directory keeps the final step a single rename.
        partial_path.replace(destination)
    except BaseException:
        # Also runs on KeyboardInterrupt so an aborted cell leaves no debris behind.
        partial_path.unlink(missing_ok=True)
        raise
    logger.info("Saved weights to %s", destination)

def build_realesrgan(scale: int = 4, fp32: bool = False, tile: int = 0):
    """Create a ready-to-use Real-ESRGAN upscaler, downloading weights on first use.

    Args:
        scale: Upscaling factor; must be a key of MODEL_URLS.
        fp32: When true, half precision is disabled even on CUDA.
        tile: Tile size forwarded to ``RealESRGANer``.

    Returns:
        A configured ``RealESRGANer`` instance bound to DEVICE.

    Raises:
        ImportError: If the Real-ESRGAN packages could not be imported.
        ValueError: If no checkpoint URL is configured for ``scale``.
    """
    if RealESRGANer is None or RRDBNet is None:
        raise ImportError("Real-ESRGAN libraries are not installed. Run the install cell above.")

    weights_url = MODEL_URLS.get(scale)
    if weights_url is None:
        raise ValueError(f"No weight url configured for scale {scale}.")

    # The checkpoint is cached under MODEL_DIR using the file name from the URL.
    weights_path = MODEL_DIR / weights_url.rsplit("/", 1)[-1]
    if not weights_path.exists():
        logger.info("Downloading Real-ESRGAN weights to %s", weights_path)
        download_file(weights_url, weights_path)

    network = RRDBNet(
        num_in_ch=3,
        num_out_ch=3,
        num_feat=64,
        num_block=23,
        num_grow_ch=32,
        scale=scale,
    )
    # Half precision is only requested on CUDA and only when fp32 was not asked for.
    use_half = DEVICE == "cuda" and not fp32
    upscaler = RealESRGANer(
        scale=scale,
        model_path=str(weights_path),
        model=network,
        tile=tile,
        tile_pad=10,
        pre_pad=0,
        half=use_half,
        device=DEVICE,
    )
    logger.info("Real-ESRGAN upscaler ready (scale x%d)", scale)
    return upscaler



In [26]:
def enhance_image(image: Image.Image, contrast: float = 1.15, sharpness: float = 1.05, brightness: float = 1.0, color: float = 1.0) -> Image.Image:
    """Apply contrast, sharpness, brightness and colour adjustments in that order.

    Args:
        image: Source image.
        contrast: Factor passed to ``ImageEnhance.Contrast``.
        sharpness: Factor passed to ``ImageEnhance.Sharpness``.
        brightness: Factor passed to ``ImageEnhance.Brightness``.
        color: Factor passed to ``ImageEnhance.Color``.

    Returns:
        The image produced by the last enhancement step.
    """
    stages = (
        (ImageEnhance.Contrast, contrast),
        (ImageEnhance.Sharpness, sharpness),
        (ImageEnhance.Brightness, brightness),
        (ImageEnhance.Color, color),
    )
    result = image
    for enhancer_cls, factor in stages:
        result = enhancer_cls(result).enhance(factor)
    return result

def upscale_and_enhance(image_path: Path, upscaler, output_path: Path, enhancement_kwargs: dict | None = None, outscale: int = 4) -> Path:
    """Upscale one image, apply the enhancement pass and save the result.

    Args:
        image_path: Image to read.
        upscaler: Object exposing ``enhance(array, outscale=...)`` that returns
            the upscaled array as the first element of a pair.
        output_path: Where the processed image is written; parent folders are
            created when needed.
        enhancement_kwargs: Optional overrides for ``contrast``, ``sharpness``,
            ``brightness`` and ``color``; other keys are ignored.
        outscale: Output scale forwarded to the upscaler.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If ``upscaler`` is ``None``.
    """
    if upscaler is None:
        raise ValueError("Upscaler is not initialized.")

    overrides = enhancement_kwargs or {}
    fallback_factors = {"contrast": 1.15, "sharpness": 1.05, "brightness": 1.0, "color": 1.0}
    factors = {name: overrides.get(name, default) for name, default in fallback_factors.items()}

    rgb_pixels = np.array(Image.open(image_path).convert("RGB"))
    # The upscaler is fed BGR channel order, so flip the last axis going in ...
    bgr_pixels = rgb_pixels[:, :, ::-1]
    upscaled_bgr, _ = upscaler.enhance(bgr_pixels, outscale=outscale)
    # ... and flip it back to RGB before handing the array to PIL.
    upscaled_image = Image.fromarray(upscaled_bgr[:, :, ::-1])

    finished = enhance_image(upscaled_image, **factors)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    finished.save(output_path)
    return output_path

In [27]:
def prepare_upscaled_dataset(
    df: pd.DataFrame,
    img_dir: Path,
    output_dir: Path,
    upscaler,
    enhancement_kwargs: dict | None = None,
    overwrite: bool = False,
    limit: int | None = None
) -> pd.DataFrame:
    if upscaler is None:
        raise ValueError("Upscaler must be initialized before processing.")
    rows = df if limit is None else df.head(limit)
    records = []
    for index, row in rows.iterrows():
        img_id = row["img_id"]
        src1 = img_dir / f"{img_id}_1.png"
        src2 = img_dir / f"{img_id}_2.png"
        if not src1.exists() or not src2.exists():
            logger.warning("Skipping %s due to missing original files.", img_id)
            continue
        dest1 = output_dir / f"{img_id}_1_up.png"
        dest2 = output_dir / f"{img_id}_2_up.png"
        start = time.time()
        try:
            if overwrite or not dest1.exists():
                upscale_and_enhance(src1, upscaler, dest1, enhancement_kwargs=enhancement_kwargs)
            if overwrite or not dest2.exists():
                upscale_and_enhance(src2, upscaler, dest2, enhancement_kwargs=enhancement_kwargs)
            elapsed = time.time() - start
            records.append({"img_id": img_id, "seconds": elapsed})
            if len(records) % 20 == 0:
                logger.info("Processed %d image pairs", len(records))
        except Exception as err:
            logger.exception("Failed to process %s: %s", img_id, err)
    return pd.DataFrame(records)

In [28]:
# Enhancement factors used for the sample run.
ENHANCEMENT_DEFAULTS = dict(contrast=1.2, sharpness=1.1, brightness=1.05, color=1.0)

# Missing Real-ESRGAN packages are reported but do not stop the notebook;
# later cells check `upscaler` for None.
try:
    upscaler = build_realesrgan(scale=4)
except ImportError as exc:
    print(exc)
    upscaler = None

if upscaler is None:
    print("Install Real-ESRGAN dependencies and rerun to generate upscaled samples.")
else:
    # A small fixed-seed sample keeps the smoke test quick and repeatable.
    sample_pairs = train_df.sample(min(3, len(train_df)), random_state=42)
    processing_log = prepare_upscaled_dataset(
        sample_pairs,
        IMAGE_DIR,
        UPSCALED_DIR,
        upscaler,
        enhancement_kwargs=ENHANCEMENT_DEFAULTS,
        overwrite=False,
    )
    display(processing_log)

Real-ESRGAN libraries are not installed. Run the install cell above.
Install Real-ESRGAN dependencies and rerun to generate upscaled samples.


In [29]:
def show_before_after(img_id: str, frame_index: int = 1) -> None:
    """Plot one original frame next to its upscaled counterpart.

    Args:
        img_id: Identifier used in the image file names.
        frame_index: Which frame of the pair to show.

    Prints a notice and returns without plotting when either file is missing.
    """
    original_path = IMAGE_DIR / f"{img_id}_{frame_index}.png"
    upscaled_path = UPSCALED_DIR / f"{img_id}_{frame_index}_up.png"
    if not (original_path.exists() and upscaled_path.exists()):
        print(f"Missing files for {img_id}_{frame_index}, skipping.")
        return

    original = Image.open(original_path).convert("RGB")
    upscaled = Image.open(upscaled_path).convert("RGB")
    panels = ((original, "Original"), (upscaled, "Upscaled + Enhanced"))

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, (picture, caption) in zip(axes, panels):
        axis.imshow(picture)
        axis.set_title(caption)
        axis.axis("off")
    plt.tight_layout()
    plt.show()

# Preview the first processed pair, but only when the sample run produced a non-empty log.
if "processing_log" in globals():
    if not processing_log.empty:
        show_before_after(processing_log.iloc[0]["img_id"], frame_index=1)

In [30]:
# Detection cannot run without transformers, so a missing package is fatal here.
try:
    from transformers import OwlViTForObjectDetection, OwlViTProcessor
except ImportError as exc:
    raise ImportError("Install transformers to enable detection.") from exc

# One checkpoint name shared by the processor and the model so they cannot drift apart.
OWLVIT_CHECKPOINT = "google/owlvit-base-patch32"
processor = OwlViTProcessor.from_pretrained(OWLVIT_CHECKPOINT)
owlvit_model = OwlViTForObjectDetection.from_pretrained(OWLVIT_CHECKPOINT).to(DEVICE)

def detect_with_vocab(image_path: Path, vocab_terms: list[str], threshold: float = 0.12):
    """Run OWL-ViT on one image using ``vocab_terms`` as the text queries.

    Args:
        image_path: Image to analyse.
        vocab_terms: Text queries; any iterable of strings is accepted.
        threshold: Minimum score for a detection to be kept.

    Returns:
        A ``(boxes, scores, labels)`` tuple: ``boxes`` and ``scores`` are NumPy
        arrays taken from the post-processed result, and ``labels`` holds the
        vocabulary term of each detection. With an empty vocabulary the result
        is an empty ``(0, 4)`` array, an empty ``(0,)`` array and ``[]``.
    """
    # Materialise once so that tuples, Index objects or generators behave like a list.
    vocab_terms = list(vocab_terms)
    if not vocab_terms:
        # Nothing to query for: return empty results instead of failing in the processor.
        return (
            np.empty((0, 4), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
            [],
        )

    image = Image.open(image_path).convert("RGB")
    inputs = processor(text=vocab_terms, images=image, return_tensors="pt")
    inputs = {key: value.to(DEVICE) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = owlvit_model(**inputs)
    # PIL reports (width, height); the post-processor expects (height, width).
    # Create the tensor on DEVICE so it lives next to the model outputs.
    target_sizes = torch.tensor([image.size[::-1]], device=DEVICE)
    results = processor.post_process_object_detection(
        outputs=outputs,
        target_sizes=target_sizes,
        threshold=threshold
    )[0]
    boxes = results["boxes"].detach().cpu().numpy()
    scores = results["scores"].detach().cpu().numpy()
    label_indices = results["labels"].detach().cpu().numpy()
    # Map each predicted query index back to its vocabulary term.
    labels = [vocab_terms[int(idx)] for idx in label_indices]
    return boxes, scores, labels

In [31]:
def run_detection_comparison(img_id: str, vocab_terms: list[str], threshold: float = 0.12) -> None:
    """Print detections for the original and upscaled version of both frames.

    Args:
        img_id: Identifier used in the image file names.
        vocab_terms: Text queries handed to ``detect_with_vocab``.
        threshold: Score threshold handed to ``detect_with_vocab``.

    A frame is skipped with a warning when its original or upscaled file is missing.
    """
    for frame_index in (1, 2):
        orig_path = IMAGE_DIR / f"{img_id}_{frame_index}.png"
        up_path = UPSCALED_DIR / f"{img_id}_{frame_index}_up.png"
        if not (orig_path.exists() and up_path.exists()):
            logger.warning("Skipping %s_%d due to missing files.", img_id, frame_index)
            continue
        # Same report for both variants; only labels and scores are printed.
        for variant, path in (("original", orig_path), ("upscaled", up_path)):
            print(f"Image {img_id}_{frame_index} {variant} detections:")
            _, scores, labels = detect_with_vocab(path, vocab_terms, threshold=threshold)
            print(list(zip(labels, np.round(scores, 3))))
        print("---")
        print("---")

# The comparison needs an initialised upscaler and a non-empty processing log.
if upscaler is None or "processing_log" not in globals() or processing_log.empty:
    print("Generate upscaled samples before running the detection comparison.")
else:
    run_detection_comparison(processing_log.iloc[0]["img_id"], detection_vocab, threshold=0.12)

Generate upscaled samples before running the detection comparison.


**Next Steps**

- Re-run `prepare_upscaled_dataset` on the full training and test sets once the sample results look good.
- Feed `detection_vocab` into downstream matching logic so every module shares the same vocabulary.
- Track precision/recall deltas between original and upscaled runs to quantify the benefit of enhancement.