# FAISS (Facebook AI Similarity Search) Method

In [1]:
pip install faiss-cpu

Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/98/e9/ea9196f67f7a4c8b3805d5e09d186aba002ece16738fb8af203025fefa59/faiss_cpu-1.6.4.post2-cp36-cp36m-manylinux2014_x86_64.whl (7.9MB)
[K     |████████████████████████████████| 7.9MB 2.6MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.6.4.post2


Importing All Essential Libraries

In [2]:
import numpy as np
import faiss 
import time

In [3]:
import pandas as pd

In [4]:
from numpy.linalg import norm
import pickle
from tqdm import tqdm, tqdm_notebook
import os
import time
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

Define the ResNet-50 model and a function that extracts a normalised feature vector from an image with it.

In [5]:
# (height, width, channels) shared by the network definition and the image
# loader, so the two cannot drift apart.
INPUT_SHAPE = (180, 180, 3)

model = ResNet50(weights='imagenet', include_top=False,
                 input_shape=INPUT_SHAPE)


def extract_features(img_path, model, input_shape=INPUT_SHAPE):
    """Return a flattened, L2-normalised feature vector for one image.

    Args:
        img_path: Path of the image file to load.
        model: Model whose ``predict`` output is used as the raw features.
        input_shape: ``(height, width, channels)``. Only height and width are
            used: the image is resized to that size when it is loaded. It
            should agree with the shape ``model`` was built with.

    Returns:
        1-D array holding the flattened model output divided by its L2 norm.
        If the norm is zero the all-zero vector is returned unchanged, so the
        result never contains NaNs caused by a division by zero.
    """
    img = image.load_img(img_path, target_size=(
        input_shape[0], input_shape[1]))
    img_array = image.img_to_array(img)
    # The model predicts on batches, so add a leading batch axis of size 1.
    expanded_img_array = np.expand_dims(img_array, axis=0)
    preprocessed_img = preprocess_input(expanded_img_array)
    features = model.predict(preprocessed_img)
    flattened_features = features.flatten()
    magnitude = norm(flattened_features)
    if magnitude == 0:
        # Nothing to scale; dividing would turn every entry into NaN.
        return flattened_features
    return flattened_features / magnitude

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Function to recursively get all the image files under a root directory.

In [6]:
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']


def get_file_list(root_dir):
    """Recursively collect the image files found under ``root_dir``.

    A file counts as an image when its name *ends with* one of the entries in
    the module-level ``extensions`` list. Names that merely contain such a
    string (for example ``photo.jpg.bak``) are not included.

    Args:
        root_dir: Directory to walk.

    Returns:
        List of paths (``root_dir`` joined with the relative location of each
        match) in the order ``os.walk`` yields them. Empty if nothing matches
        or ``root_dir`` does not exist.
    """
    # str.endswith accepts a tuple and tests every suffix in one call.
    suffixes = tuple(extensions)
    file_list = []
    for root, _directories, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith(suffixes):
                file_list.append(os.path.join(root, filename))
    return file_list

Now, let's run the extraction over the entire dataset and time it.

In [7]:
# Directory that is searched recursively for images (the current working directory).
root_dir = './'
# Sort the paths so the order of the images does not depend on the order in
# which the directory walk happens to return them.
files = sorted(get_file_list(root_dir))

In [8]:
# One feature vector per image, in the same order as `files`, so that
# feature_list[i] always belongs to files[i].
# `tqdm` replaces the deprecated `tqdm_notebook` wrapper for the progress bar.
feature_list = [extract_features(path, model)
                for path in tqdm(files, desc='Extracting features')]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=780.0), HTML(value='')))




In [9]:
# Inspect the extracted feature vectors (a list with one 1-D array per image).
# NOTE(review): this displays the whole list; feature_list[:3] would be enough.
feature_list

[array([0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.0096454], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01078847], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00303162], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00891707], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
       

In [10]:
# Stack the per-image vectors into a single 2-D array (one row per image).
feature_list=np.array(feature_list)

In [11]:
# (number of images, length of each feature vector)
feature_list.shape

(780, 73728)

In [12]:
# Database vectors for FAISS: the real image features, forced into the
# C-contiguous float32 layout that the FAISS Python bindings work with.
# (Random vectors such as np.random.random((n, dimension)).astype('float32')
# can be substituted here for experiments.)
db_vec = np.ascontiguousarray(feature_list, dtype='float32')
# Read the sizes from the data instead of hard-coding them, so this cell keeps
# working when the image set or the feature extractor changes.
n, dimension = db_vec.shape  # number of vectors, dimensions of each vector
np.random.seed(1)  # only matters when random vectors are generated instead

In [13]:
# Sanity check: (number of database vectors, vector dimension)
db_vec.shape

(780, 73728)

In [14]:
# Build an inverted-file (IVF) index that compares vectors with the L2 metric.
# NOTE(review): with a single cluster there is nothing for the IVF structure
# to prune; presumably this behaves like a flat search - confirm whether a
# larger nlist was intended.
nlist = 1  # number of clusters the database vectors are partitioned into
quantiser = faiss.IndexFlatL2(dimension)  # flat L2 index handed to the IVF index as its quantiser
index = faiss.IndexIVFFlat(quantiser, dimension, nlist,   faiss.METRIC_L2)

In [15]:
# Train the index on the database vectors, then add them. The trailing
# comments give the values printed in the recorded output of this cell.
print(index.is_trained)   # False
index.train(db_vec)  # train on the database vectors
print(index.ntotal)   # 0
index.add(db_vec)   # add the vectors and update the index
print(index.is_trained)  # True
print(index.ntotal)   # 780

False
0
True
780


In [16]:
nprobe = 1  # number of clusters visited per query (at most nlist is useful)
index.nprobe = nprobe  # apply the setting to the index; assigning the variable alone has no effect
k = 10  # number of nearest neighbours returned per query
np.random.seed(0)  # only matters when random query vectors are generated instead
# Every database vector is also used as a query, so the closest hit of each
# row is the query image itself (distance 0).
query_vectors = feature_list
n_query = len(query_vectors)  # number of query vectors
# Both results have one row per query and k columns.
distances, indices = index.search(query_vectors, k)

In [17]:
# Distances returned by index.search: one row per query, one column per
# neighbour. The first column is 0 in the output below because each query
# vector is itself stored in the index.
distances

array([[0.        , 0.48963004, 0.48963004, ..., 0.53567934, 0.55061907,
        0.5957324 ],
       [0.        , 0.8765255 , 0.88154906, ..., 0.913577  , 0.913577  ,
        0.913577  ],
       [0.        , 1.0187123 , 1.0458761 , ..., 1.1246294 , 1.1604613 ,
        1.1685632 ],
       ...,
       [0.        , 1.0838999 , 1.1121213 , ..., 1.18502   , 1.1939578 ,
        1.200575  ],
       [0.        , 0.85835785, 0.8804574 , ..., 1.0066422 , 1.0141807 ,
        1.0467707 ],
       [0.        , 0.5098868 , 0.86722827, ..., 1.0148221 , 1.0148499 ,
        1.0183281 ]], dtype=float32)

In [18]:
# Row positions (into the database vectors) of the neighbours found for each query.
indices

array([[  0, 342, 197, ..., 562, 716, 218],
       [  1,   0, 252, ..., 657, 342, 243],
       [  2, 228,  18, ..., 625,  63, 555],
       ...,
       [777, 393, 447, ..., 392, 752, 445],
       [778, 759,  74, ..., 246, 668, 221],
       [779, 776, 425, ..., 559, 322, 396]])

In [19]:
faiss.write_index(index,"vectors.index")  # save the index to disk
# The saved file can be loaded back with faiss.read_index("vectors.index").

In [20]:
# Load the index file written above back from disk into a separate variable.
diskindex = faiss.read_index("vectors.index")

In [21]:
# Persist the feature matrix and the matching list of file paths. The `with`
# blocks make sure each file is flushed and closed as soon as it is written.
with open('features-cdiscount-resnet.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)
with open('filenames-cdiscount.pickle', 'wb') as filenames_file:
    pickle.dump(files, filenames_file)

In [22]:
# Reload the file paths and features saved above, closing each file after use.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by this notebook or come from a trusted source.
with open('filenames-cdiscount.pickle', 'rb') as filenames_file:
    files = pickle.load(filenames_file)
with open('features-cdiscount-resnet.pickle', 'rb') as features_file:
    feature_list = pickle.load(features_file)

In [23]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [24]:
# Spot-check a single entry of the reloaded path list.
files[1]

'./1000_1.jpg'

In [25]:
# Show the reloaded list of image paths.
# NOTE(review): this prints every entry; files[:10] would keep the output short.
files

['./1000_0.jpg',
 './1000_1.jpg',
 './1000_2.jpg',
 './1000_3.jpg',
 './1002_0.jpg',
 './1008_0.jpg',
 './1008_1.jpg',
 './1009_0.jpg',
 './1013_0.jpg',
 './1013_1.jpg',
 './1015_0.jpg',
 './1015_1.jpg',
 './1016_0.jpg',
 './1028_0.jpg',
 './1028_1.jpg',
 './1028_2.jpg',
 './102_0.jpg',
 './1045_0.jpg',
 './1048_0.jpg',
 './104_0.jpg',
 './1052_0.jpg',
 './105_0.jpg',
 './1070_0.jpg',
 './1070_1.jpg',
 './1070_2.jpg',
 './1072_0.jpg',
 './1077_0.jpg',
 './1077_1.jpg',
 './1088_0.jpg',
 './1088_1.jpg',
 './1088_2.jpg',
 './1088_3.jpg',
 './1091_0.jpg',
 './1095_0.jpg',
 './1096_0.jpg',
 './1102_0.jpg',
 './1102_1.jpg',
 './1102_2.jpg',
 './1102_3.jpg',
 './1106_0.jpg',
 './1106_1.jpg',
 './1107_0.jpg',
 './1109_0.jpg',
 './1111_0.jpg',
 './1117_0.jpg',
 './1118_0.jpg',
 './1118_1.jpg',
 './1119_0.jpg',
 './1121_0.jpg',
 './1122_0.jpg',
 './1125_0.jpg',
 './1125_1.jpg',
 './1125_2.jpg',
 './1125_3.jpg',
 './112_0.jpg',
 './112_1.jpg',
 './1141_0.jpg',
 './1141_1.jpg',
 './1141_2.jpg',
 '

In [26]:
# Keep only the file name of every path. os.path.basename also works for
# images found in subdirectories (get_file_list walks them recursively),
# where taking the second '/'-separated component would return the directory
# name instead of the file name.
# NOTE(review): images in different folders that share a file name become
# indistinguishable here - confirm that names are unique across the dataset.
filenamenew = [os.path.basename(path) for path in files]

In [27]:
# Show the bare file names.
# NOTE(review): this prints every entry; filenamenew[:10] would keep the output short.
filenamenew

['1000_0.jpg',
 '1000_1.jpg',
 '1000_2.jpg',
 '1000_3.jpg',
 '1002_0.jpg',
 '1008_0.jpg',
 '1008_1.jpg',
 '1009_0.jpg',
 '1013_0.jpg',
 '1013_1.jpg',
 '1015_0.jpg',
 '1015_1.jpg',
 '1016_0.jpg',
 '1028_0.jpg',
 '1028_1.jpg',
 '1028_2.jpg',
 '102_0.jpg',
 '1045_0.jpg',
 '1048_0.jpg',
 '104_0.jpg',
 '1052_0.jpg',
 '105_0.jpg',
 '1070_0.jpg',
 '1070_1.jpg',
 '1070_2.jpg',
 '1072_0.jpg',
 '1077_0.jpg',
 '1077_1.jpg',
 '1088_0.jpg',
 '1088_1.jpg',
 '1088_2.jpg',
 '1088_3.jpg',
 '1091_0.jpg',
 '1095_0.jpg',
 '1096_0.jpg',
 '1102_0.jpg',
 '1102_1.jpg',
 '1102_2.jpg',
 '1102_3.jpg',
 '1106_0.jpg',
 '1106_1.jpg',
 '1107_0.jpg',
 '1109_0.jpg',
 '1111_0.jpg',
 '1117_0.jpg',
 '1118_0.jpg',
 '1118_1.jpg',
 '1119_0.jpg',
 '1121_0.jpg',
 '1122_0.jpg',
 '1125_0.jpg',
 '1125_1.jpg',
 '1125_2.jpg',
 '1125_3.jpg',
 '112_0.jpg',
 '112_1.jpg',
 '1141_0.jpg',
 '1141_1.jpg',
 '1141_2.jpg',
 '1141_3.jpg',
 '1142_0.jpg',
 '1142_1.jpg',
 '1147_0.jpg',
 '1149_0.jpg',
 '114_0.jpg',
 '114_1.jpg',
 '114_2.jpg',
 '1

In [28]:
# Row positions 0 .. len(filenamenew) - 1, one per image file name.
# NOTE(review): this rebinds `index`, which held the FAISS index up to this
# point; the copy loaded from disk is still available as `diskindex`.
index = list(range(len(filenamenew)))

In [29]:
# Show the list of row positions built in the previous cell.
# NOTE(review): this prints every entry; index[:10] would keep the output short.
index

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [30]:
# Single-column table of image file names; the default integer row labels
# are the row positions of the images.
df = pd.DataFrame({'images': filenamenew})

In [31]:
# Display the file-name table (long frames are shown truncated, as in the output below).
df

Unnamed: 0,images
0,1000_0.jpg
1,1000_1.jpg
2,1000_2.jpg
3,1000_3.jpg
4,1002_0.jpg
...,...
775,993_0.jpg
776,997_0.jpg
777,997_1.jpg
778,997_2.jpg


In [32]:
# Put the neighbour indices returned by index.search into a DataFrame:
# one row per query image, one column per returned neighbour.
indi_df = pd.DataFrame(indices)
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,342,197,657,243,68,625,562,716,218
1,1,0,252,167,716,218,197,657,342,243
2,2,228,18,624,0,716,68,625,63,555
3,3,292,354,500,599,619,763,58,721,31
4,4,675,266,595,651,22,728,212,430,482
...,...,...,...,...,...,...,...,...,...,...
775,775,471,677,213,441,729,464,7,704,776
776,776,779,637,322,425,321,597,390,720,670
777,777,393,447,321,114,82,86,392,752,445
778,778,759,74,761,164,118,177,246,668,221


In [33]:
# TODO: for every neighbour index stored in `indi_df`, look up the matching
#       file name in `df['images']`.

In [34]:
# Display the file-name table again before it is converted to a dict.
df

Unnamed: 0,images
0,1000_0.jpg
1,1000_1.jpg
2,1000_2.jpg
3,1000_3.jpg
4,1002_0.jpg
...,...
775,993_0.jpg
776,997_0.jpg
777,997_1.jpg
778,997_2.jpg


In [35]:
# Lookup table {row position: image file name} taken from the 'images' column.
dfnew = df['images'].to_dict()

In [36]:
# Show the {row position: file name} mapping.
# NOTE(review): this prints the whole dict; a small slice would be enough.
dfnew

{0: '1000_0.jpg',
 1: '1000_1.jpg',
 2: '1000_2.jpg',
 3: '1000_3.jpg',
 4: '1002_0.jpg',
 5: '1008_0.jpg',
 6: '1008_1.jpg',
 7: '1009_0.jpg',
 8: '1013_0.jpg',
 9: '1013_1.jpg',
 10: '1015_0.jpg',
 11: '1015_1.jpg',
 12: '1016_0.jpg',
 13: '1028_0.jpg',
 14: '1028_1.jpg',
 15: '1028_2.jpg',
 16: '102_0.jpg',
 17: '1045_0.jpg',
 18: '1048_0.jpg',
 19: '104_0.jpg',
 20: '1052_0.jpg',
 21: '105_0.jpg',
 22: '1070_0.jpg',
 23: '1070_1.jpg',
 24: '1070_2.jpg',
 25: '1072_0.jpg',
 26: '1077_0.jpg',
 27: '1077_1.jpg',
 28: '1088_0.jpg',
 29: '1088_1.jpg',
 30: '1088_2.jpg',
 31: '1088_3.jpg',
 32: '1091_0.jpg',
 33: '1095_0.jpg',
 34: '1096_0.jpg',
 35: '1102_0.jpg',
 36: '1102_1.jpg',
 37: '1102_2.jpg',
 38: '1102_3.jpg',
 39: '1106_0.jpg',
 40: '1106_1.jpg',
 41: '1107_0.jpg',
 42: '1109_0.jpg',
 43: '1111_0.jpg',
 44: '1117_0.jpg',
 45: '1118_0.jpg',
 46: '1118_1.jpg',
 47: '1119_0.jpg',
 48: '1121_0.jpg',
 49: '1122_0.jpg',
 50: '1125_0.jpg',
 51: '1125_1.jpg',
 52: '1125_2.jpg',
 53: '

In [37]:
# Neighbour table while it still holds integer row positions.
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,342,197,657,243,68,625,562,716,218
1,1,0,252,167,716,218,197,657,342,243
2,2,228,18,624,0,716,68,625,63,555
3,3,292,354,500,599,619,763,58,721,31
4,4,675,266,595,651,22,728,212,430,482
...,...,...,...,...,...,...,...,...,...,...
775,775,471,677,213,441,729,464,7,704,776
776,776,779,637,322,425,321,597,390,720,670
777,777,393,447,321,114,82,86,392,752,445
778,778,759,74,761,164,118,177,246,668,221


In [38]:
# Translate every neighbour row position into its image file name using the
# {row position: file name} dict, then display the result.
indi_df = indi_df.replace(dfnew)
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1000_0.jpg,203_0.jpg,1399_0.jpg,767_0.jpg,1488_0.jpg,1159_0.jpg,709_0.jpg,575_0.jpg,893_0.jpg,1439_0.jpg
1,1000_1.jpg,1000_0.jpg,1504_0.jpg,1322_0.jpg,893_0.jpg,1439_0.jpg,1399_0.jpg,767_0.jpg,203_0.jpg,1488_0.jpg
2,1000_2.jpg,1467_0.jpg,1048_0.jpg,708_0.jpg,1000_0.jpg,893_0.jpg,1159_0.jpg,709_0.jpg,1149_0.jpg,569_1.jpg
3,1000_3.jpg,1572_2.jpg,219_0.jpg,465_3.jpg,651_0.jpg,685_3.jpg,978_0.jpg,1141_2.jpg,898_2.jpg,1088_3.jpg
4,1002_0.jpg,812_0.jpg,1519_0.jpg,647_0.jpg,752_2.jpg,1070_0.jpg,907_2.jpg,1425_0.jpg,317_0.jpg,440_0.jpg
...,...,...,...,...,...,...,...,...,...,...
775,993_0.jpg,430_0.jpg,814_0.jpg,1426_0.jpg,35_0.jpg,907_3.jpg,410_0.jpg,1009_0.jpg,869_1.jpg,997_0.jpg
776,997_0.jpg,997_3.jpg,733_1.jpg,178_0.jpg,312_0.jpg,177_0.jpg,64_1.jpg,273_0.jpg,898_1.jpg,80_0.jpg
777,997_1.jpg,273_3.jpg,371_3.jpg,177_0.jpg,1242_1.jpg,1170_1.jpg,1180_2.jpg,273_2.jpg,95_0.jpg,371_1.jpg
778,997_2.jpg,973_0.jpg,1166_3.jpg,973_2.jpg,131_0.jpg,124_0.jpg,133_1.jpg,148_2.jpg,79_0.jpg,144_0.jpg


In [39]:
# This cell is Only for Google Colab
# NOTE(review): this import rebinds `files`, which until now held the list of
# image paths; cells that use that list cannot be re-run after this one
# without reloading it.
from google.colab import files
# Write the table of neighbour file names to a CSV file in the working directory.
indi_df.to_csv('faiss.csv')


In [40]:
# Colab only: send the CSV to the browser as a download. `files` is the
# google.colab module imported in the previous cell, not the list of paths.
files.download('faiss.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>