In [None]:
!unzip "/content/drive/MyDrive/MSCOCO/Flicker8k_Dataset.zip" -d "/content"

In [2]:
import os
import cv2
import csv
import numpy as np
import tensorflow as tf
from typing import List, Dict
from tqdm import tqdm
import tensorflow_hub as hub

# ---------------------------
# CONFIG
# ---------------------------
# Directory that is scanned for input images (.jpg / .jpeg / .png).
DATASET_PATH = "/content/Flicker8k_Dataset"
# Destination of the generated CSV (one row per image).
CSV_FILE = "/content/spatial_features_padded.csv"

# Number of object slots written per image; extra detections are truncated
# and missing ones are zero-padded.
MAX_OBJECTS = 10
# Number of values stored per object slot; must match the length of the
# per-object vector built in SpatialObjectDetector.extract_features.
FEATURE_DIM = 10

# ---------------------------
# SPATIAL OBJECT DETECTOR
# ---------------------------
class SpatialObjectDetector:
    """Detect COCO objects in an image and encode their spatial layout.

    A TF-Hub detection model is loaded once at construction time.
    ``detect_objects`` returns the raw detections in pixel coordinates and
    ``extract_features`` turns them into one normalized feature vector per
    object, ordered by distance from the largest detected object.
    """

    DEFAULT_MODEL_URL = "https://tfhub.dev/tensorflow/ssd_mobilenet_v2/2"

    # Category ids used by the COCO label map of TensorFlow detection models.
    # The ids run from 1 to 90 but are sparse (12, 26, 29, 30, 45, 66, 68,
    # 69, 71 and 83 are unused), so ``raw_id - 1`` is NOT a valid index into
    # the 80-entry name list. Entry ``i`` here is the raw id of
    # ``class_names[i]``.
    _COCO_CATEGORY_IDS = (
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
        13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        27, 28,
        31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
        46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
        56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
        67,
        70,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
        84, 85, 86, 87, 88, 89, 90,
    )
    # Raw label-map id -> index into ``class_names``.
    _COCO_ID_TO_INDEX = {
        cat_id: idx for idx, cat_id in enumerate(_COCO_CATEGORY_IDS)
    }

    def __init__(self, model_url=DEFAULT_MODEL_URL):
        """Load the detection model and the class-name table.

        Args:
            model_url: TF-Hub handle of the detection model to load.
        """
        self.model = hub.load(model_url)
        self.class_names = self._load_coco_names()

    def _load_coco_names(self) -> List[str]:
        """Return the 80 COCO class names in label-map order."""
        return [
            'person','bicycle','car','motorcycle','airplane','bus','train',
            'truck','boat','traffic light','fire hydrant','stop sign',
            'parking meter','bench','bird','cat','dog','horse','sheep',
            'cow','elephant','bear','zebra','giraffe','backpack','umbrella',
            'handbag','tie','suitcase','frisbee','skis','snowboard',
            'sports ball','kite','baseball bat','baseball glove','skateboard',
            'surfboard','tennis racket','bottle','wine glass','cup','fork',
            'knife','spoon','bowl','banana','apple','sandwich','orange',
            'broccoli','carrot','hot dog','pizza','donut','cake','chair',
            'couch','potted plant','bed','dining table','toilet','tv',
            'laptop','mouse','remote','keyboard','cell phone','microwave',
            'oven','toaster','sink','refrigerator','book','clock','vase',
            'scissors','teddy bear','hair drier','toothbrush'
        ]

    # ---------------------------
    # OBJECT DETECTION
    # ---------------------------
    def detect_objects(self, image, conf_threshold=0.5) -> List[Dict]:
        """Run the detector on a BGR image and return the kept detections.

        Args:
            image: BGR image array as returned by ``cv2.imread``.
            conf_threshold: Detections scoring below this value are dropped.

        Returns:
            A list of dicts with keys ``bbox`` ([x1, y1, x2, y2] in pixels),
            ``center`` ((cx, cy) in pixels), ``area`` (pixels squared),
            ``class_id`` (index into ``self.class_names``) and ``confidence``.
        """
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        input_tensor = tf.convert_to_tensor(rgb)[tf.newaxis, ...]
        detections = self.model(input_tensor)

        boxes = detections['detection_boxes'][0].numpy()
        classes = detections['detection_classes'][0].numpy().astype(int)
        scores = detections['detection_scores'][0].numpy()

        h, w = image.shape[:2]
        return self._detections_to_objects(
            boxes, classes, scores, h, w, conf_threshold
        )

    def _detections_to_objects(self, boxes, classes, scores, h, w,
                               conf_threshold) -> List[Dict]:
        """Filter raw model outputs and convert them to pixel-space dicts.

        ``boxes`` rows are read as normalized [ymin, xmin, ymax, xmax];
        ``classes`` holds raw label-map ids.
        """
        objects = []
        for box, raw_id, score in zip(boxes, classes, scores):
            if score < conf_threshold:
                continue

            # Translate the sparse label-map id into a class_names index;
            # ids that are not part of the 80 named classes are skipped.
            class_id = self._COCO_ID_TO_INDEX.get(int(raw_id))
            if class_id is None or class_id >= len(self.class_names):
                continue

            ymin, xmin, ymax, xmax = box
            x1, y1 = int(xmin * w), int(ymin * h)
            x2, y2 = int(xmax * w), int(ymax * h)

            objects.append({
                "bbox": [x1, y1, x2, y2],
                "center": ((x1 + x2) / 2, (y1 + y2) / 2),
                "area": (x2 - x1) * (y2 - y1),
                "class_id": class_id,
                "confidence": float(score)
            })

        return objects

    # ---------------------------
    # FEATURE PIPELINE
    # ---------------------------
    def extract_features(self, image, conf_threshold=0.5):
        """Build one normalized feature vector per detected object.

        Objects are sorted by the distance of their center from the center
        of the largest detection (which therefore comes first). Each row is
        ``[class_id, x1/w, y1/h, x2/w, y2/h, cx/w, cy/h, area/(w*h),
        distance/max(w, h), rank]``.

        Args:
            image: BGR image array, or ``None`` (e.g. a failed
                ``cv2.imread``).
            conf_threshold: Forwarded to ``detect_objects``.

        Returns:
            A float32 array of shape (num_objects, 10), or ``None`` when the
            image is missing/empty or nothing was detected.
        """
        # cv2.imread signals failure by returning None; treat that (and an
        # empty array) like "no objects" instead of crashing in cvtColor.
        if image is None or getattr(image, "size", 0) == 0:
            return None

        objects = self.detect_objects(image, conf_threshold)
        if not objects:
            return None

        largest = max(objects, key=lambda o: o["area"])
        lx, ly = largest["center"]

        for obj in objects:
            ox, oy = obj["center"]
            obj["distance"] = np.sqrt((ox - lx)**2 + (oy - ly)**2)

        objects = sorted(objects, key=lambda o: o["distance"])

        h, w = image.shape[:2]
        features = []

        for rank, obj in enumerate(objects):
            x1, y1, x2, y2 = obj["bbox"]
            cx, cy = obj["center"]

            features.append([
                obj["class_id"],
                x1 / w, y1 / h, x2 / w, y2 / h,
                cx / w, cy / h,
                obj["area"] / (w * h),
                obj["distance"] / max(w, h),
                rank
            ])

        return np.array(features, dtype=np.float32)

# ---------------------------
# PADDING FUNCTIONS
# ---------------------------
def pad_object_features(features, max_objects=MAX_OBJECTS, feature_dim=FEATURE_DIM):
    """Return a fixed-size (max_objects, feature_dim) float32 feature matrix.

    Rows of ``features`` are copied in order; rows beyond ``max_objects`` are
    discarded and unused slots stay zero. ``None`` yields an all-zero matrix.
    """
    out = np.zeros((max_objects, feature_dim), dtype=np.float32)
    if features is not None:
        keep = min(max_objects, features.shape[0])
        out[:keep] = features[:keep]
    return out

def create_object_mask(features, max_objects=MAX_OBJECTS):
    """Return a float32 vector flagging which padded slots hold real objects.

    The first ``min(len(features), max_objects)`` entries are 1.0 and the
    rest 0.0; ``None`` yields an all-zero mask.
    """
    flags = np.zeros((max_objects,), dtype=np.float32)
    if features is not None:
        valid = min(max_objects, features.shape[0])
        flags[:valid] = 1.0
    return flags

# ---------------------------
# MAIN DATASET LOOP
# ---------------------------
detector = SpatialObjectDetector()

# Sort the listing so the CSV row order is reproducible between runs;
# os.listdir returns entries in arbitrary order.
image_files = sorted(
    f for f in os.listdir(DATASET_PATH)
    if f.lower().endswith((".jpg", ".jpeg", ".png"))
)

# HEADER: image name, then MAX_OBJECTS * FEATURE_DIM feature columns
# (object-major), then one mask column per object slot.
header = ["image"]
header.extend(
    f"obj{i}_feat{j}" for i in range(MAX_OBJECTS) for j in range(FEATURE_DIM)
)
header.extend(f"mask{i}" for i in range(MAX_OBJECTS))

# Names of files cv2 could not decode; reported once the run is finished.
unreadable = []

with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # PROCESS IMAGES
    for img_name in tqdm(image_files):
        img_path = os.path.join(DATASET_PATH, img_name)
        image = cv2.imread(img_path)

        if image is None:
            # cv2.imread returns None instead of raising on a corrupt or
            # unsupported file. Keep one row per image (all zeros, empty
            # mask) so a single bad file does not abort the whole run.
            unreadable.append(img_name)
            features = None
        else:
            features = detector.extract_features(image)

        padded = pad_object_features(features)
        mask = create_object_mask(features)

        row = [img_name]
        row.extend(padded.flatten().tolist())
        row.extend(mask.tolist())

        writer.writerow(row)

if unreadable:
    print(
        f"\n Warning: {len(unreadable)} image(s) could not be read and were "
        "written as all-zero rows:",
        ", ".join(unreadable),
    )

print("\n CSV saved to:", CSV_FILE)

100%|██████████| 8091/8091 [14:29<00:00,  9.31it/s]


 CSV saved to: /content/spatial_features_padded.csv



