# **REVERSE IMAGE SEARCH ENGINE - FEATURE EXTRACTION**

# IMPORTS

In [1]:
import numpy as np
from numpy.linalg import norm
import os
import random
import math
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

# UTILITIES

In [2]:
def extract_features(img_path, feature_model):
    img = load_img(img_path, target_size=(img_width, img_height))
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)

    features = feature_model.predict(img_array)
    flattened = features.flatten()
    normalized = flattened / norm(flattened)
    return normalized

In [3]:
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']

def get_file_paths(root_dir):
    file_list = []
    for root, directories, filenames in os.walk(root_dir):
        for filename in filenames:
            if any(ext in filename for ext in extensions):
                filepath = os.path.join(root, filename)
                if os.path.exists(filepath):
                  file_list.append(filepath)
                else:
                  print(filepath)
    return file_list

In [4]:
def define_model():
    base_model = ResNet50(include_top=False, input_shape=(img_width, img_height, 3), pooling='avg')
    for layer in base_model.layers:
        layer.trainable = False

    input_tensor = Input(shape=(img_width, img_height, 3))
    x = base_model(input_tensor)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.4)(x)
    output_tensor = Dense(num_classes, activation='softmax')(x)

    full_model = Model(inputs=input_tensor, outputs=output_tensor)
    return full_model, base_model

# DATA DOWNLOAD

In [5]:
os.makedirs('/content/caltech101', exist_ok=True)

!curl -L "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip?download=1" -o "/content/caltech101/caltech-101.zip"
!unzip "/content/caltech101/caltech-101.zip" -d "/content/caltech101/"
!tar -xzf /content/caltech101/caltech-101/101_ObjectCategories.tar.gz -C /content/caltech101/caltech-101/
!rm -rf /content/101_ObjectCategories/BACKGROUND_Google
!rm -rf /content/caltech101/__MACOSX
!rm -rf /content/caltech101/caltech-101.zip
!rm -rf /content/caltech101/caltech-101/101_ObjectCategories/BACKGROUND_Google

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   476  100   476    0     0    795      0 --:--:-- --:--:-- --:--:--   794
100  131M  100  131M    0     0  8688k      0  0:00:15  0:00:15 --:--:-- 8355k
Archive:  /content/caltech101/caltech-101.zip
   creating: /content/caltech101/caltech-101/
  inflating: /content/caltech101/__MACOSX/._caltech-101  
  inflating: /content/caltech101/caltech-101/101_ObjectCategories.tar.gz  
  inflating: /content/caltech101/__MACOSX/caltech-101/._101_ObjectCategories.tar.gz  
  inflating: /content/caltech101/caltech-101/show_annotation.m  
  inflating: /content/caltech101/__MACOSX/caltech-101/._show_annotation.m  
  inflating: /content/caltech101/caltech-101/Annotations.tar  
  inflating: /content/caltech101/__MACOSX/caltech-101/._Annotations.tar  


# FEATURE EXTRACTION

In [6]:
root_dir = '/content/caltech101/caltech-101/101_ObjectCategories'

filenames = sorted(get_file_paths(root_dir))
print(f'There are {len(filenames)} files in the dataset.')

There are 8677 files in the dataset.


In [7]:
train_samples = 8677
num_classes = 101
img_width, img_height = 224, 224
batch_size = 128

In [8]:
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                                   rotation_range=25,
                                   width_shift_range=0.15,
                                   height_shift_range=0.15,
                                   zoom_range=0.3)

In [9]:
train_generator = train_datagen.flow_from_directory(root_dir,
                                                    target_size=(img_width, img_height),
                                                    shuffle=True,
                                                    seed=10000,
                                                    class_mode='categorical')

Found 8677 images belonging to 101 classes.


In [10]:
num_images = len(train_generator.filenames)
steps_per_epochs = int(math.ceil(num_images / batch_size))
print(f'Number of images: {num_images}')
print(f'Number of steps per epochs: {steps_per_epochs}')

Number of images: 8677
Number of steps per epochs: 68


In [11]:
filenames = [root_dir + '/' + s for s in train_generator.filenames]

In [12]:
model_finetuned, feature_extractor = define_model()
model_finetuned.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(0.001),
    metrics=['acc']
)

model_finetuned.fit(
    train_generator,
    steps_per_epoch=steps_per_epochs,
    epochs=10
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 399ms/step - acc: 0.2058 - loss: 4.0095
Epoch 2/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 458ms/step - acc: 0.4136 - loss: 2.6702
Epoch 3/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 415ms/step - acc: 0.5276 - loss: 2.0453
Epoch 4/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 404ms/step - acc: 0.5785 - loss: 1.7031
Epoch 5/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 408ms/step - acc: 0.6426 - loss: 1.4548
Epoch 6/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 398ms/step - acc: 0.6665 - loss: 1.2592
Epoch 7/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 401ms/step - acc: 0.6950 - loss: 1.1590
Epoch 8/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 409ms/step - acc: 0.7066 - loss: 1.1007
Epoch 9/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s

<keras.src.callbacks.history.History at 0x7e9bf566c250>

In [None]:
all_features = np.array([extract_features(img_path, feature_extractor) for img_path in filenames])
np.save("features.npy", all_features)
np.save("filenames.npy", filenames)

In [13]:
class_ids = train_generator.classes
np.save("class_ids.npy", class_ids)