In [1]:
import tensorflow as tf
print(tf.__version__)

2.9.1


In [26]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import numpy as np
from numpy.linalg import norm
import pickle

from tqdm import tqdm, tqdm_notebook
import os
import time
import math

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [3]:
INPUT_SHAPE = (224, 224, 3)

model = ResNet50(weights='imagenet', 
                 include_top=False, 
                 input_shape=INPUT_SHAPE,
                 pooling='max')

2022-08-30 12:31:41.234494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def extract_features(file_path, model):
    img = image.load_img(file_path, target_size=(INPUT_SHAPE[0], INPUT_SHAPE[1]))
    img_array = image.img_to_array(img) # add channels
    img_array = img_array[np.newaxis, ...] # add batch dimension
    img_preprocessed = preprocess_input(img_array)
    
    img_features = model.predict(img_preprocessed, verbose=0)
#     print(img_features.shape) # (1, 7, 7, 2048)
    img_features = img_features.flatten()
    img_features_norm = img_features / norm(img_features)
    
    return img_features_norm

In [5]:
# !ls data/caltech-101/caltech101/cellphone/

In [6]:
def get_file_list(root_dir):
    file_list = []
    
    for root, directories, filenames in os.walk(root_dir):
        for filename in filenames:
            if 'jpg' in filename:
                file_list.append(os.path.join(root, filename))
                
    return file_list

In [7]:
file_list = get_file_list('data/caltech101')

In [9]:
# ! tree caltech101 | grep '.jpg' | wc -l   # 8677
print("Total number of files:", len(file_list))

Total number of files: 8677


In [13]:
# print the first 5 files
filenames = sorted(file_list)
filenames[:2]

['data/caltech101/Faces/image_0001.jpg',
 'data/caltech101/Faces/image_0002.jpg',
 'data/caltech101/Faces/image_0003.jpg',
 'data/caltech101/Faces/image_0004.jpg',
 'data/caltech101/Faces/image_0005.jpg',
 'data/caltech101/Faces/image_0006.jpg',
 'data/caltech101/Faces/image_0007.jpg',
 'data/caltech101/Faces/image_0008.jpg',
 'data/caltech101/Faces/image_0009.jpg',
 'data/caltech101/Faces/image_0010.jpg',
 'data/caltech101/Faces/image_0011.jpg',
 'data/caltech101/Faces/image_0012.jpg',
 'data/caltech101/Faces/image_0013.jpg',
 'data/caltech101/Faces/image_0014.jpg',
 'data/caltech101/Faces/image_0015.jpg',
 'data/caltech101/Faces/image_0016.jpg',
 'data/caltech101/Faces/image_0017.jpg',
 'data/caltech101/Faces/image_0018.jpg',
 'data/caltech101/Faces/image_0019.jpg',
 'data/caltech101/Faces/image_0020.jpg']

In [12]:
for root, directories, filenames in os.walk('data/caltech101'):
    print(f'ROOT: {root}')
    print(f'DIRECTORIES: {directories}')
    print(f'FILE:{filenames}')

ROOT: data/caltech101
DIRECTORIES: ['gerenuk', 'hawksbill', 'headphone', 'ant', 'butterfly', 'lamp', 'strawberry', 'water_lilly', 'chandelier', 'dragonfly', 'crab', 'pagoda', 'dollar_bill', 'emu', 'inline_skate', 'platypus', 'dalmatian', 'cup', 'airplanes', 'joshua_tree', 'cougar_body', 'grand_piano', 'trilobite', 'brontosaurus', 'wild_cat', 'pigeon', 'dolphin', 'soccer_ball', 'wrench', 'scorpion', 'flamingo_head', 'nautilus', 'accordion', 'cougar_face', 'pyramid', 'camera', 'barrel', 'schooner', 'cellphone', 'panda', 'revolver', 'lobster', 'menorah', 'lotus', 'stapler', 'crocodile', 'chair', 'helicopter', 'minaret', 'starfish', 'ceiling_fan', 'ketch', 'mayfly', 'wheelchair', 'bass', 'yin_yang', 'crocodile_head', 'saxophone', 'beaver', 'mandolin', 'bonsai', 'Leopards', 'car_side', 'ibis', 'electric_guitar', 'kangaroo', 'stegosaurus', 'ferry', 'snoopy', 'umbrella', 'rhino', 'okapi', 'watch', 'brain', 'gramophone', 'scissors', 'rooster', 'cannon', 'binocular', 'anchor', 'octopus', 'buddh

In [36]:
# extract all the features from all the files
feature_list = []
num_files = len(filenames)

start_time = time.time()
for i in range(num_files):
    print(f'{np.round((i+1)/num_files * 100, 3)}%     ', end='\r')
    feature_list.append(extract_features(filenames[i], model))
end_time = time.time()

100.0%      

In [45]:
end_time - start_time
(end_time - start_time)/60

27.93273154894511

In [40]:
print(len(feature_list))
print(feature_list[0].shape)


8677
(2048,)


In [41]:
img_data = ImageDataGenerator(preprocessing_function=preprocess_input)
img_generator = img_data.flow_from_directory('data/caltech101',
                                            target_size=(INPUT_SHAPE[0], INPUT_SHAPE[1]),
                                            class_mode=None,
                                            shuffle=False)

Found 8677 images belonging to 101 classes.


In [42]:
batch_size = 128
num_files = len(file_list)
num_epochs = int(np.ceil(num_files / batch_size))
print(f'Number of epochs: {num_epochs}')

Number of epochs: 68


In [46]:
num_epochs * batch_size

8704

In [43]:
# feature_list4 = model.predict(img_generator, steps=num_epochs)
feature_list4 = model.predict(img_generator)



In [44]:
feature_list4.shape

(8677, 2048)

In [52]:
batch_size = 128
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

generator = datagen.flow_from_directory('data/caltech101',
                                        target_size=(224, 224),
                                        class_mode=None,
                                        shuffle=False)

num_images = len(generator.filenames)
num_epochs = int(math.ceil(num_images / batch_size))

print('Batch size:', batch_size)
print('Number of images:', num_images)
print('Number of epochs:', num_epochs)

start_time = time.time()
feature_list2 = []
feature_list2 = model.predict(generator, batch_size=batch_size, steps=num_epochs)
end_time = time.time()

Found 8677 images belonging to 101 classes.
Batch size: 128
Number of iamges: 8677
Number of epochs: 68


In [51]:
print((end_time - start_time)/60)
print(feature_list2.shape)
len(generator.filenames)

4.664108395576477
(2176, 2048)


8677

In [54]:
directories_list = []

for root, directories, filenames in os.walk('data/caltech101'):
    for directory in directories:
        directories_list.append(directory)
        

sorted(directories_list)

['Faces',
 'Faces_easy',
 'Leopards',
 'Motorbikes',
 'accordion',
 'airplanes',
 'anchor',
 'ant',
 'barrel',
 'bass',
 'beaver',
 'binocular',
 'bonsai',
 'brain',
 'brontosaurus',
 'buddha',
 'butterfly',
 'camera',
 'cannon',
 'car_side',
 'ceiling_fan',
 'cellphone',
 'chair',
 'chandelier',
 'cougar_body',
 'cougar_face',
 'crab',
 'crayfish',
 'crocodile',
 'crocodile_head',
 'cup',
 'dalmatian',
 'dollar_bill',
 'dolphin',
 'dragonfly',
 'electric_guitar',
 'elephant',
 'emu',
 'euphonium',
 'ewer',
 'ferry',
 'flamingo',
 'flamingo_head',
 'garfield',
 'gerenuk',
 'gramophone',
 'grand_piano',
 'hawksbill',
 'headphone',
 'hedgehog',
 'helicopter',
 'ibis',
 'inline_skate',
 'joshua_tree',
 'kangaroo',
 'ketch',
 'lamp',
 'laptop',
 'llama',
 'lobster',
 'lotus',
 'mandolin',
 'mayfly',
 'menorah',
 'metronome',
 'minaret',
 'nautilus',
 'octopus',
 'okapi',
 'pagoda',
 'panda',
 'pigeon',
 'pizza',
 'platypus',
 'pyramid',
 'revolver',
 'rhino',
 'rooster',
 'saxophone',
 'sc

In [32]:
features_file = 'data/features.pickle'
filenames_file = 'data/filenames.pickle'

# pickle.dump(feature_list, open(features_file, 'wb'))
# pickle.dump(filenames, open(filenames_file, 'wb'))
# pickle.dump()

In [33]:
# load the files
feature_list3 = pickle.load(open(features_file, 'rb'))
filenames = pickle.load(open(filenames_file, 'rb'))

In [35]:
len(feature_list3)

8677

In [None]:
# traina neares neighbors model 
# to find the neares 5 neighbors basd on Euclidean distance
from sklearn.neighbors import NearestNeighbors

neighbors_model = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')
neighbors_fitted = neighbors_model.fit(feature_list)


In [None]:
# get the 5 closest neighbors of the first image
distances, indices = neighbors_fitted.kneighbors([feature_list[0]])

In [None]:
print(distances)

In [None]:
print(indices)
print(type(indices))

In [None]:
plt.imshow(mpimg.imread(filenames[0]))

In [None]:
# unsurprisingly, the nearest neighbor is the image itself
plt.imshow(mpimg.imread(filenames[indices[0,0]]))

In [None]:
def plot_figures(indices):
    plt.figure(figsize=(20,25))

    for i in range(5):
        ax = plt.subplot(1,5, i+1)
        index = indices[i]
        ax.imshow(mpimg.imread(filenames[index]))
        
plot_figures(indices[0])

In [None]:
idx = np.random.randint(0, num_files)
distances, indices = neighbors_fitted.kneighbors([feature_list[idx]])

plot_figures(indices[0])

In [None]:
idx = np.random.randint(0, num_files)
distances, indices = neighbors_fitted.kneighbors([feature_list[idx]])

plot_figures(indices[0])

In [None]:
# repeat it 5 times
for i in range(5):
    idx = np.random.randint(0, num_files)
    distances, indices = neighbors_fitted.kneighbors([feature_list[idx]])
    
    plot_figures(indices[0])

In [None]:
 # we are going to use PCA to reduce the dimensionlity of the data
# then, we will use tsne to create a 2D representation of the clusters
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca_fitted = pca.fit(feature_list)
feature_list_compressed = pca.transform(feature_list)

In [None]:
feature_list_compressed.shape

In [None]:
# take the first 3000
feature_compressed_3000 = feature_list_compressed[:3000, :]
filenames_3000 = filenames[:3000]
