# Build a Visual Scene Recognition System

In this assignment, we will implement a visual scene recognition system using the Bag-of-Visual-Words (BoVW) model. The system comprises three main components:

    Convert Image to Word Map
    Use a precomputed visual dictionary (e.g., obtained by clustering SIFT descriptors or filter responses) and assign each pixel or keypoint in an image to its closest visual word.

    Get Image Features
    From the word map, compute a histogram (or other feature vector) that represents the image’s visual content.

    Build Recognition System – Nearest Neighbors
    Build a scene classifier using a nearest neighbors approach where images are classified based on the similarity of their feature histograms.

In [1]:
import numpy as np
import cv2
import os
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from typing import List, Dict, Any

### Part 1: Convert Image to Word Map

In [4]:
def convert_image_to_wordmap(image: np.ndarray, dictionary: np.ndarray) -> np.ndarray:
    """
    Convert an RGB image into a map of nearest visual-word indices.

    The image is converted to grayscale and each pixel's intensity is used as a
    one-element feature vector. Every feature is assigned to the dictionary row
    at the smallest Euclidean distance; ties go to the lowest row index.

    Args:
        image: Input image (RGB) as a numpy array.
        dictionary: A 2D numpy array where each row is a visual word (centroid).
            The single-element pixel feature is broadcast across the
            dictionary's columns. NOTE(review): a dictionary with more than one
            column presumably expects richer per-pixel features -- confirm
            against how the dictionary was built.

    Returns:
        wordmap: A 2D integer numpy array with the same height and width as the
                 input image, where each value is the index of the nearest
                 visual word.

    Raises:
        ValueError: If the dictionary is not 2D or contains no visual words.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    height, width = gray.shape

    words = np.asarray(dictionary, dtype=float)
    if words.ndim != 2 or words.shape[0] == 0:
        raise ValueError(
            "dictionary must be a non-empty 2D array of shape (num_words, feature_dim)"
        )

    # Cast to float so the subtraction below can never wrap around in
    # unsigned-integer arithmetic (grayscale pixels are typically uint8).
    features = gray.reshape(-1, 1).astype(float)  # Shape: (H*W, 1)

    # Process pixels in chunks so the (chunk, num_words, feature_dim) difference
    # tensor stays bounded in memory while the per-pixel Python loop is avoided.
    num_words, word_dim = words.shape
    max_elements = 1 << 22
    chunk_size = max(1, max_elements // (num_words * max(1, word_dim)))

    wordmap = np.empty(features.shape[0], dtype=int)
    for start in range(0, features.shape[0], chunk_size):
        stop = start + chunk_size
        diffs = words[np.newaxis, :, :] - features[start:stop, np.newaxis, :]
        distances = np.linalg.norm(diffs, axis=2)  # Shape: (chunk, num_words)
        wordmap[start:stop] = np.argmin(distances, axis=1)

    return wordmap.reshape(height, width)

### Part 2: Get Image Features

In [6]:
def get_image_features(wordmap: np.ndarray, dict_size: int) -> np.ndarray:
    """
    Summarize a wordmap as a normalized histogram of visual-word indices.

    Args:
        wordmap: 2D numpy array where each element is an index of a visual word.
        dict_size: The number of visual words; the histogram has one bin per word.

    Returns:
        feature_hist: 1D float numpy array of length dict_size holding each
                      bin's count divided by the total count plus a small
                      epsilon.
    """
    # Integer edges 0, 1, ..., dict_size give one unit-wide bin per word index.
    bin_edges = np.arange(dict_size + 1)
    counts = np.histogram(wordmap, bins=bin_edges)[0]

    # The epsilon keeps the division finite when no value lands in any bin.
    total = np.sum(counts) + 1e-6
    feature_hist = counts.astype(float) / total

    return feature_hist

### Part 3: Build Recognition System - Nearest Neighbors

In [8]:
def build_recognition_system(
    train_features: np.ndarray,
    train_labels: np.ndarray,
    n_neighbors: int = 5,
) -> Any:
    """
    Build a recognition system using a nearest neighbors classifier.

    Args:
        train_features: 2D numpy array where each row is an image feature histogram.
        train_labels: 1D numpy array of labels corresponding to the training images.
        n_neighbors: Number of neighbors that vote on each prediction. Defaults
            to 5. The value is capped at the number of training samples,
            because scikit-learn rejects queries that ask for more neighbors
            than were fitted.

    Returns:
        knn: Trained nearest neighbors classifier.
    """
    # Cap k at the training-set size, but never below 1 so that an empty
    # training set still surfaces scikit-learn's own error from fit().
    effective_k = max(1, min(n_neighbors, len(train_features)))

    knn = KNeighborsClassifier(n_neighbors=effective_k)
    knn.fit(train_features, train_labels)
    return knn

def evaluate_recognition_system(knn: Any, test_features: np.ndarray, test_labels: np.ndarray) -> None:
    """
    Print the test-set accuracy and confusion matrix of a trained classifier.

    Args:
        knn: The trained nearest neighbors classifier; only its predict()
             method is used.
        test_features: 2D numpy array of test image features.
        test_labels: 1D numpy array of test image labels.
    """
    predictions = knn.predict(test_features)

    # Accuracy is the fraction of predictions that equal the true label.
    hit_rate = np.mean(predictions == test_labels)
    percent = hit_rate * 100
    print(f"Accuracy: {percent:.2f}%")

    # Imported here, as in the original cell, so it is only needed at evaluation time.
    from sklearn.metrics import confusion_matrix

    # True labels are passed first, predictions second.
    matrix = confusion_matrix(test_labels, predictions)
    print("Confusion Matrix:")
    print(matrix)

In [12]:
import os
import shutil
import random

# Split every Caltech-101 category into train (80%) and test (20%) folders.
src_dir = 'dataset/caltech-101/caltech-101/101_ObjectCategories'
train_dir = 'dataset/caltech-101/caltech-101/train'
test_dir = 'dataset/caltech-101/caltech-101/test'

# A dedicated, seeded generator combined with sorted listings makes the split
# identical on every run. With an unseeded shuffle, re-running this cell copies
# a different split into the same folders, so images end up in both train and
# test. If the folders were populated by an earlier unseeded run, delete them
# before re-running.
split_rng = random.Random(42)
image_exts = ('.jpg', '.jpeg', '.png')

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

for category in sorted(os.listdir(src_dir)):
    src_category_path = os.path.join(src_dir, category)
    # Skip stray files and the background/clutter category.
    if not os.path.isdir(src_category_path) or category == 'BACKGROUND_Google':
        continue

    # Sort before shuffling so the result does not depend on the order in which
    # the filesystem lists files; match extensions case-insensitively.
    files = sorted(
        f for f in os.listdir(src_category_path) if f.lower().endswith(image_exts)
    )
    split_rng.shuffle(files)
    split_idx = int(len(files) * 0.8)

    train_files = files[:split_idx]
    test_files = files[split_idx:]

    os.makedirs(os.path.join(train_dir, category), exist_ok=True)
    os.makedirs(os.path.join(test_dir, category), exist_ok=True)

    for f in train_files:
        shutil.copy(os.path.join(src_category_path, f), os.path.join(train_dir, category, f))
    for f in test_files:
        shutil.copy(os.path.join(src_category_path, f), os.path.join(test_dir, category, f))


Run the full pipeline on the train/test split to verify the implementations.

In [15]:
train_dir = "dataset/caltech-101/caltech-101/train"
test_dir = "dataset/caltech-101/caltech-101/test"

IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')


def _list_class_dirs(split_dir: str) -> List[str]:
    """Return the sorted names of the class subfolders inside split_dir."""
    return sorted(
        name for name in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, name))
    )


def _extract_split_features(
    split_dir: str,
    class_to_label: Dict[str, int],
    dictionary: np.ndarray,
    dict_size: int,
) -> tuple:
    """
    Compute a feature histogram and an integer label for every image in a split.

    Args:
        split_dir: Folder whose subfolders are classes containing image files.
        class_to_label: Mapping from class folder name to integer label. It is
            shared by all splits so a class always gets the same label.
        dictionary: Visual dictionary passed to convert_image_to_wordmap.
        dict_size: Number of visual words (histogram length).

    Returns:
        (features, labels): a 2D array with one histogram per row and a 1D
        array of the matching integer labels.
    """
    features = []
    labels = []
    for cls in _list_class_dirs(split_dir):
        if cls not in class_to_label:
            # A class with no label in the shared mapping cannot be scored.
            print(f"Skipping class '{cls}' in {split_dir}: not in the training set")
            continue
        label = class_to_label[cls]
        cls_dir = os.path.join(split_dir, cls)
        for filename in sorted(os.listdir(cls_dir)):
            if not filename.lower().endswith(IMAGE_EXTENSIONS):
                continue
            img_path = os.path.join(cls_dir, filename)
            image = cv2.imread(img_path)
            if image is None:
                # cv2.imread returns None instead of raising for unreadable files.
                print(f"Skipping unreadable image: {img_path}")
                continue
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (256, 256))
            wordmap = convert_image_to_wordmap(image, dictionary)
            features.append(get_image_features(wordmap, dict_size))
            labels.append(label)
    return np.array(features), np.array(labels)


# The visual dictionary is assumed to be precomputed.
# In a full system, you would extract features from training images and run k-means.
# dictionary = np.random.rand(dict_size, 1)
dictionary = np.load("dictionary.npy")
dict_size = dictionary.shape[0]
print(f"Loaded dictionary with {dict_size} visual words")

# Build a single class -> label mapping from the sorted training classes.
# Enumerating os.listdir() separately for each split (in arbitrary order) could
# assign different indices to the same class in train and test.
train_classes = _list_class_dirs(train_dir)  # each subfolder is a class
class_to_label = {cls: label for label, cls in enumerate(train_classes)}

# Process training images
train_features, train_labels = _extract_split_features(
    train_dir, class_to_label, dictionary, dict_size
)

# Build recognition system
knn = build_recognition_system(train_features, train_labels)

# Process test images
test_classes = _list_class_dirs(test_dir)
test_features, test_labels = _extract_split_features(
    test_dir, class_to_label, dictionary, dict_size
)

# Evaluate recognition system
evaluate_recognition_system(knn, test_features, test_labels)

Loaded dictionary with 100 visual words
Accuracy: 13.05%
Confusion Matrix:
[[  4   0   0 ...   0   0   0]
 [  0 149   0 ...   0   0   0]
 [  0   5   0 ...   0   0   0]
 ...
 [  0  10   0 ...   0   0   0]
 [  0   8   0 ...   0   0   0]
 [  0   6   0 ...   0   0   0]]
