# 01 — Preprocessing Pipeline (X-ray Images + COCO Annotations)

This notebook demonstrates the **preprocessing pipeline** used in the bone fracture detection project:

- Resize raw X-ray images to a fixed resolution (default: **512×512**)  
- Update/scale COCO bounding boxes to match the resized images  
- Perform quick visual sanity checks (before/after + bounding boxes)



In [None]:
# --- Setup imports & repo paths ---
from pathlib import Path
import sys
import json
import cv2
import matplotlib.pyplot as plt

# Make sure the repo root is on sys.path so `src/` imports work.
# This notebook lives in notebooks/, so ".." is resolved against the kernel's
# current working directory.
ROOT = Path("..").resolve()
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

print("Repo root:", ROOT)

# Import your preprocessing functions
from src.preprocessing.preprocess import resize_images, update_coco_annotations


## 1) Configure paths (edit these to match your local data)

Expected local structure (example):

```
raw_data/
  images/
  train_annotations.coco.json
  valid_annotations.coco.json

processed_data/
  images/
  annotations/
```



In [None]:

# --- Inputs: raw images and their COCO annotation files (under raw_data/) ---
RAW_IMAGE_FOLDER = ROOT.joinpath("raw_data", "images").resolve()
TRAIN_ANN_IN = ROOT.joinpath("raw_data", "train_annotations.coco.json").resolve()
VALID_ANN_IN = ROOT.joinpath("raw_data", "valid_annotations.coco.json").resolve()

# --- Outputs: resized images and updated annotation files (under processed_data/) ---
PROCESSED_IMAGE_FOLDER = ROOT.joinpath("processed_data", "images").resolve()
TRAIN_ANN_OUT = ROOT.joinpath("processed_data", "annotations", "train_resized.json").resolve()
VALID_ANN_OUT = ROOT.joinpath("processed_data", "annotations", "valid_resized.json").resolve()

# Target resolution handed to both the resize step and the annotation update step.
TARGET_SIZE = (512, 512)

# Echo the configuration so a wrong path is visible before any processing runs.
for _label, _value in (
    ("RAW_IMAGE_FOLDER:", RAW_IMAGE_FOLDER),
    ("PROCESSED_IMAGE_FOLDER:", PROCESSED_IMAGE_FOLDER),
    ("TRAIN_ANN_IN:", TRAIN_ANN_IN),
    ("VALID_ANN_IN:", VALID_ANN_IN),
    ("TRAIN_ANN_OUT:", TRAIN_ANN_OUT),
    ("VALID_ANN_OUT:", VALID_ANN_OUT),
    ("TARGET_SIZE:", TARGET_SIZE),
):
    print(_label, _value)


## 2) Preview a raw image (sanity check)
If no raw images are found, the cell prints a message and skips the preview.


In [None]:
from glob import glob

def show_image(path, title):
    """Display the image stored at ``path`` in a 5x5-inch figure titled ``title``.

    If OpenCV cannot read the file, a message is printed and nothing is plotted.
    """
    bgr = cv2.imread(str(path))
    if bgr is None:
        print(f"Could not read image: {path}")
        return

    # Convert from OpenCV's BGR channel order to RGB before handing off to matplotlib.
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    _, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(rgb)
    ax.axis("off")
    ax.set_title(title)
    plt.show()

# Gather the PNG/JPEG files sitting directly in the raw folder, in sorted order.
raw_candidates = sorted(
    image_path
    for pattern in ("*.png", "*.jpg", "*.jpeg")
    for image_path in RAW_IMAGE_FOLDER.glob(pattern)
)

# Preview only the first match; an empty result is reported rather than raised.
if raw_candidates:
    show_image(raw_candidates[0], f"Raw image: {raw_candidates[0].name}")
else:
    print("No raw images found. Add a few images locally to run this preview.")


## 3) Resize images


In [None]:

# Resize every raw image into the processed folder; stop with an explicit
# message when the raw image folder is absent.
if RAW_IMAGE_FOLDER.exists():
    resize_images(
        input_folder=str(RAW_IMAGE_FOLDER),
        output_folder=str(PROCESSED_IMAGE_FOLDER),
        target_size=TARGET_SIZE,
    )
else:
    raise FileNotFoundError(f"RAW_IMAGE_FOLDER not found: {RAW_IMAGE_FOLDER}")


## 4) Update COCO annotations (scale bounding boxes)



In [None]:
# Update the COCO annotation JSON for each split whose input file is present.
# A missing input file is reported and skipped rather than treated as an error.
for split_label, ann_in, ann_out in (
    ("Train", TRAIN_ANN_IN, TRAIN_ANN_OUT),
    ("Validation", VALID_ANN_IN, VALID_ANN_OUT),
):
    if not ann_in.exists():
        print(f"{split_label} annotation JSON not found at: {ann_in}")
        continue
    update_coco_annotations(str(ann_in), str(ann_out), TARGET_SIZE)


## 5) Preview a resized image (sanity check)

In [None]:
# Gather the PNG/JPEG files sitting directly in the processed folder, in sorted order.
processed_candidates = sorted(
    image_path
    for pattern in ("*.png", "*.jpg", "*.jpeg")
    for image_path in PROCESSED_IMAGE_FOLDER.glob(pattern)
)

# Preview only the first match; an empty result is reported rather than raised.
if processed_candidates:
    show_image(processed_candidates[0], f"Processed image ({TARGET_SIZE[0]}×{TARGET_SIZE[1]}): {processed_candidates[0].name}")
else:
    print("No processed images found yet.")


## 6) Visualize bounding boxes from the resized COCO JSON

Use this plot to check visually that the bounding boxes still line up with the image after resizing.

- Looks for the first image entry in `train_resized.json`
- Finds its annotations
- Draws the bboxes on the corresponding resized image


In [None]:
def _find_image_entry(images, image_path):
    """Return the COCO image entry whose file name matches ``image_path``, or None."""
    wanted = Path(image_path).name
    for entry in images:
        # Compare base names only, so a `file_name` that carries a sub-directory still matches.
        if Path(str(entry.get("file_name", ""))).name == wanted:
            return entry
    return None


def draw_coco_bboxes_on_image(image_path, coco_json_path, max_boxes=10):
    """Draw COCO bounding boxes on an image and display the result with matplotlib.

    The image entry is looked up by the base name of ``image_path`` so that the
    boxes drawn belong to the image shown. If no entry carries that file name,
    the first entry is used (the previous behavior) and a warning is printed.

    Args:
        image_path: Path to the image file to draw on.
        coco_json_path: Path to a COCO-style JSON file with "images" and
            "annotations" lists.
        max_boxes: Maximum number of boxes drawn for the selected image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
        ValueError: If the JSON contains no image entries.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(coco_json_path, "r", encoding="utf-8") as f:
        coco = json.load(f)

    images = coco.get("images") or []
    if not images:
        raise ValueError(f"No image entries found in COCO JSON: {coco_json_path}")

    img_entry = _find_image_entry(images, image_path)
    if img_entry is None:
        # Best-effort fallback: keep the old "first entry" behavior, but say so,
        # because the boxes may then belong to a different image.
        img_entry = images[0]
        print(
            f"Warning: no COCO image entry matches {Path(image_path).name}; "
            f"using the first entry ({img_entry['file_name']}) instead."
        )
    image_id = img_entry["id"]
    file_name = img_entry["file_name"]

    # A file without an "annotations" list is treated as having no boxes.
    anns = [a for a in coco.get("annotations", []) if a["image_id"] == image_id][:max_boxes]

    vis = img.copy()
    for ann in anns:
        x, y, w, h = ann["bbox"]  # COCO bbox format: [x, y, width, height]
        # Round (rather than truncate) fractional coordinates to the nearest pixel.
        x1, y1 = int(round(x)), int(round(y))
        x2, y2 = int(round(x + w)), int(round(y + h))
        cv2.rectangle(vis, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Convert from OpenCV's BGR channel order to RGB before handing off to matplotlib.
    vis_rgb = cv2.cvtColor(vis, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(6, 6))
    plt.imshow(vis_rgb)
    plt.axis("off")
    plt.title(f"COCO bboxes (resized) on: {file_name}")
    plt.show()

# Visualize the first image listed in train_resized.json, if that file exists.
if not TRAIN_ANN_OUT.exists():
    print("train_resized.json not found. Run the annotation update step first.")
else:
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(TRAIN_ANN_OUT, "r", encoding="utf-8") as f:
        coco_tmp = json.load(f)

    if not coco_tmp.get("images"):
        print("train_resized.json has no images.")
    else:
        # The resized image is looked up in the processed folder under the
        # `file_name` recorded in the JSON.
        fname = coco_tmp["images"][0]["file_name"]
        candidate_img = PROCESSED_IMAGE_FOLDER / fname
        if candidate_img.exists():
            draw_coco_bboxes_on_image(candidate_img, TRAIN_ANN_OUT)
        else:
            print("Could not find the resized image referenced by COCO JSON:")
            print("Expected:", candidate_img)
            print("Tip: ensure your processed images keep the same filenames as COCO 'file_name'.")
