
## Run 2:
You should develop a set of linear classifiers (an ensemble of 15 one-vs-all classifiers) using a bag-of-visual-words feature based on fixed size densely-sampled pixel patches. We recommend that you start with 8x8 patches, sampled every 4 pixels in the x and y directions. A sample of these should be clustered using K-Means to learn a vocabulary (try ~500 clusters to start). You might want to consider mean-centring and normalising each patch before clustering/quantisation. Note: we're not asking you to use SIFT features here - just take the pixels from the patches and flatten them into a vector & then use vector quantisation to map each patch to a visual word.d.

In [4]:
#Import packages
import os
import cv2
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [5]:
# learn the training images
# Set the path for the training data
img_path = 'training\\training\\livingroom\\2.jpg'
image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
print(image.shape)

(220, 292)


In [5]:
# Patch size for extracting features
patch_size = 16
# Step size for extracting patches
patch_step = 8
# Number of clusters for KMeans
num_clusters = 500

# Extract patches from images
def extract_patches(image):
    patches = []
    for y in range(0, image.shape[0] - patch_size + 1, patch_step):
        for x in range(0, image.shape[1] - patch_size + 1, patch_step):
            patch = image[y:y+patch_size, x:x+patch_size]
            patches.append(patch.flatten())
    return np.array(patches)

# Mean-centring and normalising each patch
def mean_center_and_normalize(patch):
    patch_means = np.mean(patch)
    patch_mean_centered = patch - patch_means
    patch_stds = np.std(patch_mean_centered)
    patch_normalized = patch_mean_centered / patch_stds
    return patch_normalized

# Load and preprocess training data
def load_data(data_dir):
    X = []
    y = []
    all_class_names = []
    for class_name in os.listdir(data_dir):
        if class_name == '.DS_Store':  # Skip .DS_Store files
            continue
        class_dir = os.path.join(data_dir, class_name)
        all_class_names.append(class_name)
        for filename in os.listdir(class_dir):
            if filename == '.DS_Store':  # Skip .DS_Store files
                continue
            img_path = os.path.join(class_dir, filename)
            image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            image = cv2.resize(image, (256, 256))
            # Extract patches from images
            patches = extract_patches(image)
            # Mean-centring and normalising each patch
            patches = mean_center_and_normalize(patches)
            X.append(patches)
            y.append(class_name)
    label_encoder.fit(all_class_names)
    y = label_encoder.transform(y)
    return np.array(X),np.array(y)

# Initialize LabelEncoder
label_encoder = LabelEncoder()
train_dir = "training\\training"
# Load and preprocess training data
X_train, y_train = load_data(train_dir)

In [31]:
# Print the shape of the training data (features and labels)
print("Training data:", X_train.shape, y_train.shape)
# Print the number of classes encoded by the LabelEncoder
print("class_num: ", label_encoder.classes_)

Training data: (1500, 961, 256) (1500,)
class_num:  ['Coast' 'Forest' 'Highway' 'Insidecity' 'Mountain' 'Office' 'OpenCountry'
 'Street' 'Suburb' 'TallBuilding' 'bedroom' 'industrial' 'kitchen'
 'livingroom' 'store']


In [7]:
#  Learn a vocabulary and cluster using k-means
kmeans = KMeans(n_clusters=num_clusters, n_init=1, algorithm='elkan')
# Train KMeans model
kmeans.fit(X_train.reshape(-1, patch_size*patch_size))

In [8]:
import joblib
# Save the KMeans model
joblib.dump(kmeans, 'kmeans_model.pkl')

['kmeans_model.pkl']

In [9]:
# Display the input data size for KMeans clustering
print("kmeans input data size: ", X_train.reshape(-1, patch_size*patch_size).shape)
# Predict a label using KMeans for a single data point
lable = kmeans.predict(X_train[0][0].reshape(1, -1))
# Display the output data size after predicting with KMeans
print("kmeans one output data size: ", X_train[0][0].reshape(1, -1).shape)
print(lable)

(1441500, 256)
(1, 256)
[301]


In [10]:
# Vector quantization to learn bag-of-visual-words feature 
def quantize_features(X,y,kmeans_model):
    features = []
    labels = []
    for i, img in enumerate(X):
        img_hist = np.zeros(num_clusters)
        for patch in img:
            word = kmeans_model.predict(patch.reshape(1, -1))
            hist, _ = np.histogram(word, bins=range(num_clusters + 1))
            img_hist += hist
        if i % 100 == 0:
            print(i)
        features.append(img_hist)
        labels.append(y[i])
    return np.array(features), np.array(labels)

In [12]:
# Get training features
features, lables = quantize_features(X_train,y_train, kmeans)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400


In [13]:
# Save the NumPy array
np.save('features.npy', features)
np.save('lables.npy', lables)

In [14]:
# Inverse transform to get the class name from encoded label
print(label_encoder.inverse_transform([0]))

# Display the shape of the extracted features
print(features.shape)   

['Coast']
(1500, 500)


In [15]:
# Evaluate classifiers
def evaluate_classifiers(classifiers, X_test, y_test):
    predictions = np.zeros((len(X_test), len(classifiers)))
    for i, (_, classifier) in enumerate(classifiers.items()):
        predictions[:, i] = classifier.predict(X_test)
    # Predicted class label is the one with maximum score
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(y_test, predicted_labels)
    return accuracy

# Split the data into training and evaluation sets
x_train, X_eval, Y_train, y_eval = train_test_split(features, lables, test_size=0.2, random_state=42)
print("Train data:", X_train.shape, y_train.shape)
print("Eval data:", X_eval.shape, y_eval.shape)

# Train one-vs-all linear classifiers
classifiers = {}
for class_label in range(15):
        # Create binary labels for the current class vs the rest
        binary_labels = (Y_train == class_label).astype(int)
        # Train a Support Vector Machine classifier for the current class
        classifier = SVC(kernel='linear')
        classifier.fit(x_train, binary_labels)
        classifiers[class_label] = classifier

# Evaluate the trained classifiers on the evaluation dataset
test_accuracy = evaluate_classifiers(classifiers, X_eval, y_eval)
print("Test Accuracy:", test_accuracy)

Train data: (1500, 961, 256) (1500,)
Eval data: (300, 500) (300,)
Test Accuracy: 0.34


In [16]:
# Load and preprocess test data
def load_test_data(test_dir):
    X_test = []
    for filename in os.listdir(test_dir):
        img_path = os.path.join(test_dir, filename)
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (256, 256))
        image.astype(np.float32)
        patches = extract_patches_test(image)
        patches = mean_center_and_normalize(patches)
        X_test.append(patches)
    return np.array(X_test)

# Extract patches from images
def extract_patches_test(image):
    patches = []
    for y in range(0, image.shape[0] - patch_size + 1, patch_step):
        for x in range(0, image.shape[1] - patch_size + 1, patch_step):
            patch = image[y:y+patch_size, x:x+patch_size]
            patches.append(patch.flatten())
    return np.array(patches)

# Mean-centring and normalising each patch
def mean_center_and_normalize_test(patch):
    patch_means = np.mean(patch)
    patch_mean_centered = patch - patch_means
    patch_stds = np.std(patch_mean_centered)
    patch_normalized = patch_mean_centered / patch_stds
    return patch_normalized

# Load a single test image and preprocess it
img_path = 'testing\\testing\\4.jpg'
image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
print("one img size", image.shape)
# Load and preprocess test data
test_dir = "testing\\testing"
X_test = load_test_data(test_dir)
print("Test data:", X_test.shape)

(256, 256)
Test data: (2985, 961, 256)


In [None]:
# Vector quantization to learn bag-of-visual-words feature 
def quantize_features(X, kmeans_model):
    features = []
    for img in X:
        img_hist = np.zeros(num_clusters)
        for patch in img:
            word = kmeans_model.predict(patch.reshape(1, -1))
            hist, _ = np.histogram(word, bins=range(num_clusters + 1))
            img_hist += hist
        features.append(img_hist)
    return np.array(features)

# Evaluate classifiers
def evaluate_classifiers(classifiers, X_test):
    predictions = np.zeros((len(X_test), len(classifiers)))
    for i, (_, classifier) in enumerate(classifiers.items()):
        predictions[:, i] = classifier.predict(X_test)
    # Predicted class label is the one with maximum score
    predicted_labels = np.argmax(predictions, axis=1)
    return predicted_labels

# Get test data's bag-of-visual-words feature 
features_test = quantize_features(X_test, kmeans)

In [32]:
# Print the shape of the test data features
print('features_test.shape',features_test.shape)
# Print the features of the first sample in the test data
print("features_test[1]",features_test[1])
# Save the test data features to a NumPy file
np.save('features_test.npy', features_test)

features_test.shape (2985, 500)
features_test[1] [ 1.  2.  2.  5.  0.  0.  2.  4.  1.  5.  4.  3.  2.  4.  4.  2.  0.  2.
  1.  1.  0.  5.  0.  3.  2.  0.  1.  0.  0.  2.  2.  0.  0.  0.  0.  0.
  0.  3.  3.  5.  3.  2.  2.  1.  1.  0.  3.  1.  0.  4.  0.  1.  2.  3.
  0.  4.  0.  2.  0.  2.  0.  1.  0.  0.  0.  0.  4.  3.  0.  2.  0.  3.
  3.  0.  2.  0.  0.  5.  2.  1.  1.  4.  1.  3.  0.  5.  0.  1.  6.  0.
  0.  3.  0.  0.  0.  7.  0.  2.  2.  0.  1.  0.  0.  1.  0.  2. 23.  0.
  0.  1.  5.  0.  6.  4.  1.  0.  0.  5.  8.  0.  0.  0.  0.  0.  0.  3.
  2. 11.  8.  0.  4.  1.  0.  0.  2.  0.  2.  1.  7.  3.  0.  1.  0.  1.
  6.  4.  1.  5.  4.  4.  3.  0.  0.  2.  0.  5.  0.  7.  1.  1.  0.  4.
  0.  5.  3.  5.  2.  0.  0.  0.  4.  0.  0.  2.  2.  0.  3.  1.  7.  2.
  0.  0.  4.  0.  3.  1.  3.  3.  4.  0.  0.  0.  4.  1.  0.  3.  4.  2.
  3.  0.  1.  3.  1.  2.  2.  1.  1.  6.  3.  0.  1.  0.  1.  0.  1.  0.
  9.  0.  0.  0.  1.  0.  1.  1.  0.  0.  0.  1.  6.  4.  5.  4.  6.  7.
  

In [26]:
# Initialize an empty list to store the predictions
predictions = []
prediction = np.zeros((len(features_test), len(classifiers)))
# Iterate over each classifier and make predictions on the test features
for i, (_, classifier) in enumerate(classifiers.items()):
        prediction[:, i] = classifier.predict(features_test)
# Predicted class label is the one with maximum score
predictions = np.argmax(prediction, axis=1)

7


In [27]:
# Print the predicted class label for the 0th test sample
print(predictions[0])
# Inverse transform the predicted class label to get the original class name
print(label_encoder.inverse_transform([predictions[0]]))

7
['Street']


In [30]:
# Save the predictions to a run2.txt file
test_dir = "testing\\testing"
output_file = "run2.txt"
with open(output_file, 'w') as f:
    for filename, predicted_class in zip(os.listdir(test_dir), predictions):
        class_name = label_encoder.inverse_transform([predicted_class])[0]
        f.write(f"{filename} {class_name}\n")