# FAISS (Facebook AI Similarity Search) Method

In [1]:
pip install faiss-cpu

Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/1d/84/9de38703486d9f00b1a63590887a318d08c52f10f768968bd7626aee75da/faiss_cpu-1.6.3-cp36-cp36m-manylinux2010_x86_64.whl (7.2MB)
[K     |████████████████████████████████| 7.2MB 2.8MB/s 
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.6.3


Importing All Essential Libraries

In [2]:
import numpy as np 
import faiss 

In [3]:
import pandas as pd

In [4]:
from numpy.linalg import norm
import pickle
from tqdm import tqdm, tqdm_notebook
import os
import time
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

Define the ResNet-50 model and a function that uses it to extract a normalized feature vector from an image.

In [5]:
# Single source of truth for the image size: used both to build the network
# and to resize every image loaded in extract_features.
INPUT_SHAPE = (180, 180, 3)

# ResNet50 with ImageNet weights and without its classification head.
model = ResNet50(weights='imagenet', include_top=False,
                 input_shape=INPUT_SHAPE)


def extract_features(img_path, model):
    """Return the flattened, L2-normalised output of `model` for one image.

    Parameters
    ----------
    img_path : path of the image file; passed to `image.load_img`.
    model : object whose `predict` accepts a preprocessed batch holding one
        image of size INPUT_SHAPE, e.g. the module-level `model`.

    Returns
    -------
    1-D array: the prediction flattened and divided by its Euclidean norm.
    When the norm is zero the (all-zero) vector is returned unchanged,
    because dividing by zero would fill it with NaN.
    """
    img = image.load_img(img_path, target_size=INPUT_SHAPE[:2])
    img_array = image.img_to_array(img)
    # Add a leading batch axis of size one before preprocessing.
    expanded_img_array = np.expand_dims(img_array, axis=0)
    preprocessed_img = preprocess_input(expanded_img_array)
    features = model.predict(preprocessed_img)
    flattened_features = features.flatten()
    feature_norm = norm(flattened_features)
    if feature_norm == 0:
        return flattened_features
    return flattened_features / feature_norm

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Function to recursively get all the image files under a root directory.

In [6]:
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']


def get_file_list(root_dir):
    """Recursively collect the paths of the image files under `root_dir`.

    A file counts as an image when its name *ends* with one of `extensions`,
    compared case-insensitively. Names that merely contain an extension
    somewhere in the middle (e.g. ``notes.jpg.txt``) are not matched.

    Parameters
    ----------
    root_dir : directory to walk, including all of its sub-directories.

    Returns
    -------
    list of str: ``os.path.join(directory, filename)`` for every match, in
    the order produced by `os.walk`. Empty when nothing matches.
    """
    # str.endswith accepts a tuple, so build the lower-cased suffixes once.
    suffixes = tuple(ext.lower() for ext in extensions)
    file_list = []
    for root, _directories, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.lower().endswith(suffixes):
                file_list.append(os.path.join(root, filename))
    return file_list

Now, let's collect the image files and run the feature extraction over the entire dataset, using a progress bar to see how long it takes.

In [10]:
# Scan the current working directory (sub-directories included) for image
# files and keep their paths in sorted order.
root_dir = './'
files = get_file_list(root_dir)
files.sort()

In [11]:
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement.
from tqdm.notebook import tqdm as notebook_tqdm

# One feature vector per image, in the same order as `files`.
feature_list = [extract_features(path, model)
                for path in notebook_tqdm(files)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=417.0), HTML(value='')))




In [12]:
feature_list

[array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00529767], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00685702], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00291607], dtype=float32),
 array([0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.0037624], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00325159], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00389967], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00476411], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00596248], dtype=float32),
 array([0.        

In [13]:
# Stack the per-image vectors into one 2-D matrix (one row per image).
# FAISS works on C-contiguous float32 matrices, so make that explicit here
# instead of relying on the dtype the feature extractor happened to return.
feature_list = np.ascontiguousarray(feature_list, dtype=np.float32)

In [14]:
feature_list.shape

(417, 73728)

In [15]:
# Database vectors: one row of features per image. The random matrix used
# while prototyping is kept in the trailing comment for reference.
db_vec = feature_list  # np.random.random((n, dimension)).astype('float32')
dimension = db_vec.shape[1]  # dimensions of each vector, read from the data
n = len(files)  # number of vectors
np.random.seed(1)  # only relevant to the commented-out random data above
# Every file must have produced exactly one feature row.
assert db_vec.shape[0] == n, (db_vec.shape, n)

In [16]:
db_vec.shape

(417, 73728)

In [17]:
# Build the inverted-file (IVF) index over L2 distance. It starts out
# untrained and empty. The flat L2 index is passed in as its quantiser.
quantiser = faiss.IndexFlatL2(dimension)
nlist = 1  # number of clusters
index = faiss.IndexIVFFlat(
    quantiser,
    dimension,
    nlist,
    faiss.METRIC_L2,
)

In [18]:
# Train the index on the database vectors, then add them, printing the
# trained flag and the stored-vector count before and after.
print(index.is_trained)   # False: not trained yet
index.train(db_vec)  # train on the database vectors
print(index.ntotal)   # 0: nothing has been added yet
index.add(db_vec)   # add the vectors and update the index
print(index.is_trained)  # True
print(index.ntotal)   # 417 in the recorded run: one entry per database vector

False
0
True
417


In [19]:
nprobe = 1  # number of clusters to visit for each query
k = 10  # number of nearest neighbours to return for each query
# Query with the database vectors themselves, so each row's first hit is
# expected to be the image itself at distance 0.
query_vectors = feature_list  # np.random.random((n_query, dimension)).astype('float32')
n_query = query_vectors.shape[0]  # number of query vectors
np.random.seed(0)  # only relevant to the commented-out random queries above
# Apply the setting to the index; without this line `nprobe` is an unused variable.
index.nprobe = nprobe
# Both results have shape (n_query, k): distances and matching database row ids.
distances, indices = index.search(query_vectors, k)

In [20]:
distances

array([[0.        , 0.46701914, 0.46701914, ..., 0.6877724 , 0.695462  ,
        0.6985197 ],
       [0.        , 0.43002647, 0.50288963, ..., 1.0575259 , 1.0718026 ,
        1.0826042 ],
       [0.        , 0.7956074 , 0.82200146, ..., 0.8908269 , 0.90117794,
        0.90334296],
       ...,
       [0.        , 1.3720865 , 1.4678246 , ..., 1.4987099 , 1.5029275 ,
        1.5029275 ],
       [0.        , 0.5195815 , 0.5829005 , ..., 0.6215944 , 0.66547275,
        0.6662417 ],
       [0.        , 0.5484388 , 0.5607537 , ..., 0.70025676, 0.7117644 ,
        0.72001064]], dtype=float32)

In [21]:
indices

array([[  0, 248, 249, ..., 276,  21,  54],
       [  1, 353, 133, ...,  53,  25,  89],
       [  2,  18, 216, ..., 320, 389, 198],
       ...,
       [414, 270, 299, ..., 413, 378, 337],
       [415,   0, 184, ..., 248, 254, 147],
       [416,  53,  62, ..., 131,  24, 139]])

In [22]:
# Persist the populated index to the file "vectors.index".
faiss.write_index(index,"vectors.index")  # save the index to disk
# To load it back later: faiss.read_index("vectors.index")

In [23]:
diskindex = faiss.read_index("vectors.index")

In [24]:
# Persist the feature matrix and the matching list of image paths.
# `with` closes each file, so the data is flushed before it is read back.
with open('features-caltech101-resnet.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)
# `files` is the path list the features were extracted from; no variable
# named `filenames` exists at notebook scope.
with open('filenames-caltech101.pickle', 'wb') as filenames_file:
    pickle.dump(files, filenames_file)

In [25]:
# Reload the image paths and the feature matrix from disk.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load pickles that were written by this notebook.
with open('filenames-caltech101.pickle', 'rb') as filenames_file:
    files = pickle.load(filenames_file)
with open('features-caltech101-resnet.pickle', 'rb') as features_file:
    feature_list = pickle.load(features_file)

In [26]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [27]:
files[1]

'./1025295_0.jpg'

In [28]:
files

['./1024602_0.jpg',
 './1025295_0.jpg',
 './1033540_0.jpg',
 './1034992_0.jpg',
 './1034992_1.jpg',
 './1034992_2.jpg',
 './1036386_0.jpg',
 './107871_0.jpg',
 './107871_1.jpg',
 './1079989_0.jpg',
 './1079989_1.jpg',
 './1079989_2.jpg',
 './1095279_0.jpg',
 './1095279_1.jpg',
 './1108752_0.jpg',
 './1123046_0.jpg',
 './1134177_0.jpg',
 './113836_0.jpg',
 './1148936_0.jpg',
 './1153842_0.jpg',
 './1157890_0.jpg',
 './1158718_0.jpg',
 './1158718_1.jpg',
 './1159293_0.jpg',
 './1162481_0.jpg',
 './1167245_0.jpg',
 './1170602_0.jpg',
 './1170602_1.jpg',
 './1170602_2.jpg',
 './1170602_3.jpg',
 './1186833_0.jpg',
 './1186833_1.jpg',
 './1193695_0.jpg',
 './1193695_1.jpg',
 './1193695_2.jpg',
 './1193695_3.jpg',
 './1198492_0.jpg',
 './1199966_0.jpg',
 './1199966_1.jpg',
 './1199966_2.jpg',
 './1199966_3.jpg',
 './1204417_0.jpg',
 './1204417_1.jpg',
 './1204417_2.jpg',
 './1204417_3.jpg',
 './1209465_0.jpg',
 './1223775_0.jpg',
 './1239688_0.jpg',
 './1239688_1.jpg',
 './1239688_2.jpg',
 '.

In [29]:
# Keep only the file name of each path. os.path.basename also handles images
# found in sub-directories, where splitting on '/' and taking element 1
# would return the directory name instead of the file name.
filenamenew = [os.path.basename(path) for path in files]

In [30]:
filenamenew

['1024602_0.jpg',
 '1025295_0.jpg',
 '1033540_0.jpg',
 '1034992_0.jpg',
 '1034992_1.jpg',
 '1034992_2.jpg',
 '1036386_0.jpg',
 '107871_0.jpg',
 '107871_1.jpg',
 '1079989_0.jpg',
 '1079989_1.jpg',
 '1079989_2.jpg',
 '1095279_0.jpg',
 '1095279_1.jpg',
 '1108752_0.jpg',
 '1123046_0.jpg',
 '1134177_0.jpg',
 '113836_0.jpg',
 '1148936_0.jpg',
 '1153842_0.jpg',
 '1157890_0.jpg',
 '1158718_0.jpg',
 '1158718_1.jpg',
 '1159293_0.jpg',
 '1162481_0.jpg',
 '1167245_0.jpg',
 '1170602_0.jpg',
 '1170602_1.jpg',
 '1170602_2.jpg',
 '1170602_3.jpg',
 '1186833_0.jpg',
 '1186833_1.jpg',
 '1193695_0.jpg',
 '1193695_1.jpg',
 '1193695_2.jpg',
 '1193695_3.jpg',
 '1198492_0.jpg',
 '1199966_0.jpg',
 '1199966_1.jpg',
 '1199966_2.jpg',
 '1199966_3.jpg',
 '1204417_0.jpg',
 '1204417_1.jpg',
 '1204417_2.jpg',
 '1204417_3.jpg',
 '1209465_0.jpg',
 '1223775_0.jpg',
 '1239688_0.jpg',
 '1239688_1.jpg',
 '1239688_2.jpg',
 '1243979_0.jpg',
 '1246621_0.jpg',
 '1248979_0.jpg',
 '1252280_0.jpg',
 '1258869_0.jpg',
 '1285337_0.j

In [32]:
# Positional ids 0 .. len(filenamenew) - 1, one per image.
# NOTE(review): this rebinds `index`, which until this cell held the FAISS index.
index = list(range(len(filenamenew)))

In [33]:
index

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [34]:
# One-column table of image file names; the default integer row labels are
# the positions of the names in `filenamenew`.
df = pd.DataFrame({'images': filenamenew})

In [35]:
df

Unnamed: 0,images
0,1024602_0.jpg
1,1025295_0.jpg
2,1033540_0.jpg
3,1034992_0.jpg
4,1034992_1.jpg
...,...
412,962607_3.jpg
413,962746_0.jpg
414,962746_1.jpg
415,971996_0.jpg


In [36]:
indi_df = pd.DataFrame(indices)
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,248,249,415,281,147,254,276,21,54
1,1,353,133,96,217,317,15,53,25,89
2,2,18,216,374,53,24,78,320,389,198
3,3,4,5,195,84,331,61,112,98,359
4,4,3,5,195,84,331,359,112,98,61
...,...,...,...,...,...,...,...,...,...,...
412,412,409,134,289,149,389,137,296,140,210
413,413,283,30,231,414,271,348,252,284,31
414,414,270,299,271,120,272,30,413,378,337
415,415,0,184,351,321,54,249,248,254,147


In [None]:
# # for i in indices_df.items:
#   if i = df['']

In [37]:
df

Unnamed: 0,images
0,1024602_0.jpg
1,1025295_0.jpg
2,1033540_0.jpg
3,1034992_0.jpg
4,1034992_1.jpg
...,...
412,962607_3.jpg
413,962746_0.jpg
414,962746_1.jpg
415,971996_0.jpg


In [38]:
# Lookup dictionary {row label: file name} built from the 'images' column.
dfnew = df['images'].to_dict()

In [39]:
dfnew

{0: '1024602_0.jpg',
 1: '1025295_0.jpg',
 2: '1033540_0.jpg',
 3: '1034992_0.jpg',
 4: '1034992_1.jpg',
 5: '1034992_2.jpg',
 6: '1036386_0.jpg',
 7: '107871_0.jpg',
 8: '107871_1.jpg',
 9: '1079989_0.jpg',
 10: '1079989_1.jpg',
 11: '1079989_2.jpg',
 12: '1095279_0.jpg',
 13: '1095279_1.jpg',
 14: '1108752_0.jpg',
 15: '1123046_0.jpg',
 16: '1134177_0.jpg',
 17: '113836_0.jpg',
 18: '1148936_0.jpg',
 19: '1153842_0.jpg',
 20: '1157890_0.jpg',
 21: '1158718_0.jpg',
 22: '1158718_1.jpg',
 23: '1159293_0.jpg',
 24: '1162481_0.jpg',
 25: '1167245_0.jpg',
 26: '1170602_0.jpg',
 27: '1170602_1.jpg',
 28: '1170602_2.jpg',
 29: '1170602_3.jpg',
 30: '1186833_0.jpg',
 31: '1186833_1.jpg',
 32: '1193695_0.jpg',
 33: '1193695_1.jpg',
 34: '1193695_2.jpg',
 35: '1193695_3.jpg',
 36: '1198492_0.jpg',
 37: '1199966_0.jpg',
 38: '1199966_1.jpg',
 39: '1199966_2.jpg',
 40: '1199966_3.jpg',
 41: '1204417_0.jpg',
 42: '1204417_1.jpg',
 43: '1204417_2.jpg',
 44: '1204417_3.jpg',
 45: '1209465_0.jpg',
 

In [41]:
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,248,249,415,281,147,254,276,21,54
1,1,353,133,96,217,317,15,53,25,89
2,2,18,216,374,53,24,78,320,389,198
3,3,4,5,195,84,331,61,112,98,359
4,4,3,5,195,84,331,359,112,98,61
...,...,...,...,...,...,...,...,...,...,...
412,412,409,134,289,149,389,137,296,140,210
413,413,283,30,231,414,271,348,252,284,31
414,414,270,299,271,120,272,30,413,378,337
415,415,0,184,351,321,54,249,248,254,147


In [42]:
# Translate every neighbour id in the table into its image file name, using
# the {row label: file name} dictionary `dfnew`.
# NOTE(review): this overwrites `indi_df`, so the numeric ids are no longer
# available from this name afterwards; `indices` still holds them.
indi_df = indi_df.replace(dfnew)
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1024602_0.jpg,328944_0.jpg,328944_1.jpg,971996_0.jpg,464681_0.jpg,1833356_0.jpg,359078_0.jpg,460054_0.jpg,1158718_0.jpg,1258869_0.jpg
1,1025295_0.jpg,806458_0.jpg,1723947_0.jpg,1528196_0.jpg,236641_0.jpg,635622_0.jpg,1123046_0.jpg,1252280_0.jpg,1167245_0.jpg,1507382_0.jpg
2,1033540_0.jpg,1148936_0.jpg,229094_0.jpg,861491_0.jpg,1252280_0.jpg,1162481_0.jpg,1412285_0.jpg,674913_0.jpg,917121_0.jpg,2150029_0.jpg
3,1034992_0.jpg,1034992_1.jpg,1034992_2.jpg,2101215_0.jpg,1465250_0.jpg,7242_0.jpg,1323996_0.jpg,1629568_3.jpg,1537812_1.jpg,819483_1.jpg
4,1034992_1.jpg,1034992_0.jpg,1034992_2.jpg,2101215_0.jpg,1465250_0.jpg,7242_0.jpg,819483_1.jpg,1629568_3.jpg,1537812_1.jpg,1323996_0.jpg
...,...,...,...,...,...,...,...,...,...,...
412,962607_3.jpg,962607_0.jpg,1726783_0.jpg,475727_3.jpg,1838581_0.jpg,917121_0.jpg,1772322_0.jpg,528276_0.jpg,1822046_0.jpg,2220877_0.jpg
413,962746_0.jpg,473558_0.jpg,1186833_0.jpg,268568_3.jpg,962746_1.jpg,438049_1.jpg,782863_0.jpg,353002_0.jpg,473558_1.jpg,1186833_1.jpg
414,962746_1.jpg,438049_0.jpg,5514_0.jpg,438049_1.jpg,1684612_1.jpg,443627_0.jpg,1186833_0.jpg,962746_0.jpg,865103_3.jpg,728946_3.jpg
415,971996_0.jpg,1024602_0.jpg,2034696_1.jpg,805537_1.jpg,698597_0.jpg,1258869_0.jpg,328944_1.jpg,328944_0.jpg,359078_0.jpg,1833356_0.jpg


In [44]:
# This cell is Only for Google Colab
# NOTE(review): this import rebinds the name `files`, which earlier cells use
# for the list of image paths. Cells that read that list (for example the one
# building `filenamenew`) will not see it if re-run after this cell.
from google.colab import files
# Write the table of nearest-neighbour file names to faiss.csv.
indi_df.to_csv('faiss.csv')


In [45]:
files.download('faiss.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>