In [22]:
import numpy as np 
from collections import Counter 
import os 
import faiss
import time
class FaissKNNImpl:
    """k-nearest-neighbour classifier built on a FAISS flat (exact) L2 index.

    The FAISS module is injected through the constructor, so the class only
    relies on the object exposing ``IndexFlatL2(dim)`` whose result supports
    ``add(vectors)`` and ``search(queries, k)``.

    Labels must be numeric: predictions are stored in a float array.
    """

    def __init__(self,k,faiss):
        """Store the neighbour count and the FAISS module to build indexes with.

        Args:
            k: number of nearest neighbours that vote on each prediction.
            faiss: the ``faiss`` module (or an object with the same
                ``IndexFlatL2`` factory).
        """
        self.k = k  # k nearest neighbor value
        self.faissIns = faiss  # FAISS instance
        self.index = None  # FAISS index; created by fitModel
        self.train_labels = []  # labels aligned with the rows added to the index
        self.test_label_faiss_output = []  # predictions from the latest predict() call

    def fitModel(self,train_features,train_labels):
        """Build the L2 index from the training vectors and remember their labels.

        Args:
            train_features: 2-D array-like, one row per training vector.
            train_labels: 1-D array-like with one label per training row.

        Raises:
            ValueError: if the number of labels differs from the number of rows.
        """
        # FAISS works on C-contiguous float32 matrices; normalise once here so
        # lists or float64 arrays are accepted as well.
        train_features = np.ascontiguousarray(train_features, dtype='float32')
        # Labels must be an ndarray: predict() indexes them with an id array.
        train_labels = np.asarray(train_labels)
        if train_labels.shape[0] != train_features.shape[0]:
            raise ValueError(
                "got %d labels for %d training vectors"
                % (train_labels.shape[0], train_features.shape[0])
            )
        self.train_labels = train_labels
        self.index = self.faissIns.IndexFlatL2(train_features.shape[1])   # build the index
        self.index.add(train_features)       # add vectors to the index

    def predict(self,test_features):
        """Predict a label for every query row by majority vote of its neighbours.

        Args:
            test_features: 2-D array-like, one row per query vector.

        Returns:
            1-D float array with one predicted label per query row. The same
            array is kept on the instance for getAccuracy(). Ties are resolved
            by ``Counter.most_common(1)``.

        Raises:
            RuntimeError: if fitModel() has not been called yet.
            ValueError: if the index returns no valid neighbour for a query.
        """
        if self.index is None:
            raise RuntimeError("fitModel must be called before predict")
        test_features = np.ascontiguousarray(test_features, dtype='float32')
        _, neighbour_ids = self.index.search(test_features, self.k)
        predictions = np.zeros(test_features.shape[0])
        for row, ids in enumerate(neighbour_ids):
            # FAISS pads the result with -1 when fewer than k neighbours exist.
            # Indexing the labels with -1 would silently count the LAST training
            # label as a vote, so drop the padding first.
            ids = ids[ids >= 0]
            if ids.size == 0:
                raise ValueError("no neighbours found for query row %d" % row)
            votes = Counter(self.train_labels[ids].tolist())
            predictions[row] = votes.most_common(1)[0][0]
        self.test_label_faiss_output = predictions
        return self.test_label_faiss_output

    def getAccuracy(self,test_labels):
        """Return the accuracy of the latest predict() call as a rounded percentage.

        Args:
            test_labels: 1-D array-like of true labels, aligned with the rows
                passed to the latest predict() call.

        Returns:
            Percentage of matching predictions, rounded to a whole number.

        Raises:
            ValueError: if there are no predictions or their shape does not
                match ``test_labels``.
        """
        test_labels = np.asarray(test_labels)
        predictions = np.asarray(self.test_label_faiss_output)
        # Without this check a shape mismatch would either broadcast into a
        # meaningless score or fail with an unrelated error.
        if predictions.size == 0 or predictions.shape != test_labels.shape:
            raise ValueError(
                "predictions have shape %s but test_labels have shape %s; "
                "call predict() on the matching test set first"
                % (predictions.shape, test_labels.shape)
            )
        accuracy = (predictions == test_labels).mean() * 100
        return round(accuracy)

In [9]:
# Build the path of the gzipped, comma-separated data file under the local "raw" directory.
data_dir = "raw"
raw_data_file = os.path.join(data_dir, "covtype.data.gz")

print("Reading raw data from {}".format(raw_data_file))

# np.loadtxt decompresses ".gz" files itself and parses every field as a float.
raw_data = np.loadtxt(fname=raw_data_file, delimiter=",")

Reading raw data from raw/covtype.data.gz


In [10]:
# Sanity check: (rows, columns) of the array that was just loaded.
raw_data.shape

(581012, 55)

In [11]:
# Use the first 90% of the rows for training and the remainder for testing.
# The last column holds the label; every other column is a feature.
# NOTE(review): the split follows file order (no shuffling) — confirm this is intended.
train_size = int(0.9 * raw_data.shape[0])
# Slice the loaded array `raw_data`; the name `raw` is not defined anywhere in
# this notebook and raised NameError on a fresh kernel.
train_features = raw_data[:train_size, :-1].astype('float32')
train_labels = raw_data[:train_size, -1].astype('float32')
test_features = raw_data[train_size:, :-1].astype('float32')
test_labels = raw_data[train_size:, -1].astype('float32')

In [13]:
# Dimensions of the training feature matrix produced by the split.
train_features.shape

(522910, 54)

In [14]:
# Dimensions of the held-out test feature matrix produced by the split.
test_features.shape

(58102, 54)

In [19]:
import faiss_knn as fbknn

In [34]:
k = 5  # number of neighbours handed to the classifier
# NOTE(review): fbknn is the external faiss_knn module; presumably it provides the
# same FaissKNNImpl class that is defined at the top of this notebook — confirm.
faissobj = fbknn.FaissKNNImpl(k,faiss)
# Fit on the training split before any prediction is requested.
faissobj.fitModel(train_features,train_labels)

In [35]:
# Time the prediction pass over the whole test set.
# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
predictions = faissobj.predict(test_features)
run_time = time.perf_counter() - start_time
print('time required for predicting %d data point at k = %d: %.2f seconds' % (test_features.shape[0], k, run_time))

time required for predicting 58102 data point at k = 5: 34.72 seconds


In [36]:
# Score the predictions made by the preceding predict() call against the true test labels.
accuracy = faissobj.getAccuracy(test_labels) 
# The literal '%' is passed as a separate print argument, so it is printed after a space.
print('Accuracy for K = %d : %d ' % (k, accuracy),'%')

Accuracy for K = 5 : 64  %


In [37]:
# Repeat the experiment with k = 3: rebuild the classifier, then time the
# prediction pass over the whole test set.
k = 3
faissobj = fbknn.FaissKNNImpl(k,faiss)
faissobj.fitModel(train_features,train_labels)
# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
predictions = faissobj.predict(test_features)
run_time = time.perf_counter() - start_time
print('time required for predicting %d data point at k = %d: %.2f seconds' % (test_features.shape[0], k, run_time))

time required for predicting 58102 data point at k = 3: 35.33 seconds


In [38]:
# Score the predictions made by the preceding predict() call against the true test labels.
accuracy = faissobj.getAccuracy(test_labels) 
# The literal '%' is passed as a separate print argument, so it is printed after a space.
print('Accuracy for K = %d : %d ' % (k, accuracy),'%')

Accuracy for K = 3 : 66  %
