## KNN Model Implementation in Python using the FAISS library
### source code reference
https://github.com/facebookresearch/faiss

### KNN Wrapper Class for Predicting K Nearest Neighbors

In [2]:
import os
import time
from collections import Counter

import faiss
import numpy as np
from scipy import stats
 
class FaissKNNImpl:
    """k-nearest-neighbour classifier built on a FAISS ``IndexFlatL2`` index.

    The training vectors are stored in a flat L2 index; each query is
    labelled with the most frequent label among its ``k`` nearest training
    vectors.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction.
    faiss : module or compatible object
        Object exposing ``IndexFlatL2(dim)``; the returned index must
        provide ``add(vectors)`` and ``search(queries, k)``.
    """

    def __init__(self, k, faiss):
        self.k = k  # k nearest neighbor value
        self.faissIns = faiss  # FAISS instance
        self.index = None  # created by fitModel
        self.train_labels = []  # labels aligned row-for-row with the indexed vectors
        self.test_label_faiss_output = []  # predictions from the last predict() call

    def fitModel(self, train_features, train_labels):
        """Index the training vectors and remember their labels.

        Parameters
        ----------
        train_features : array-like, shape (n_samples, n_features)
            Converted to a C-contiguous float32 array before being indexed.
        train_labels : array-like with n_samples entries along its first axis
            Stored as a NumPy array so it can be indexed with the neighbour
            indices returned by the search.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of vectors.
        """
        train_features = np.ascontiguousarray(train_features, dtype='float32')
        labels = np.asarray(train_labels)
        if labels.shape[:1] != train_features.shape[:1]:
            raise ValueError(
                "train_labels must have one entry per row of train_features "
                "(got %s labels for %d rows)" % (labels.shape[:1], train_features.shape[0])
            )
        self.train_labels = labels
        self.index = self.faissIns.IndexFlatL2(train_features.shape[1])  # build the index
        self.index.add(train_features)  # add vectors to the index

    def predict(self, test_features):
        """Return the majority label of the ``k`` nearest neighbours of each query.

        Parameters
        ----------
        test_features : array-like, shape (n_queries, n_features)
            Converted to a C-contiguous float32 array before searching.

        Returns
        -------
        numpy.ndarray, shape (n_queries,)
            Predicted labels; also stored on ``self.test_label_faiss_output``
            for use by ``getAccuracy``. Ties are resolved by ``scipy.stats.mode``.

        Raises
        ------
        RuntimeError
            If ``fitModel`` has not been called.
        ValueError
            If the search reports missing neighbours (negative indices).
        """
        if self.index is None:
            raise RuntimeError("fitModel must be called before predict")
        test_features = np.ascontiguousarray(test_features, dtype='float32')
        _, neighbour_idx = self.index.search(test_features, self.k)
        neighbour_idx = np.asarray(neighbour_idx)
        # A negative index would silently select the *last* training label via
        # NumPy's negative indexing. FAISS is documented to pad results with -1
        # when fewer than k neighbours exist, so fail loudly instead.
        if (neighbour_idx < 0).any():
            raise ValueError(
                "search returned fewer than k=%d neighbours for some queries" % self.k
            )
        neighbour_labels = self.train_labels[neighbour_idx]
        # [0] selects the modal values; ravel() flattens both the (n, 1) result
        # of older SciPy releases and the (n,) result of newer ones.
        majority = stats.mode(neighbour_labels, axis=1)[0]
        self.test_label_faiss_output = np.array(np.asarray(majority).ravel())
        return self.test_label_faiss_output

    def getAccuracy(self, test_labels):
        """Return the percentage of stored predictions that match ``test_labels``.

        Compares against the output of the most recent ``predict`` call and
        returns the accuracy rounded to the nearest whole percent.

        Raises
        ------
        ValueError
            If ``predict`` has not produced any predictions yet, or if the
            number of labels differs from the number of predictions.
        """
        predicted = np.asarray(self.test_label_faiss_output)
        # Flatten so that (n, 1) label columns do not broadcast into an
        # (n, n) comparison against the (n,) prediction vector.
        expected = np.asarray(test_labels).ravel()
        if predicted.size == 0:
            raise ValueError("no predictions available; call predict first")
        if predicted.shape != expected.shape:
            raise ValueError(
                "test_labels has %d entries but %d predictions are stored"
                % (expected.shape[0], predicted.shape[0])
            )
        accuracy = (predicted == expected).mean() * 100
        return round(accuracy)

## Data can be downloaded from here
https://drive.google.com/file/d/1txt-EhdUPXC7w28s4scKVXco9Xla36O2/view?usp=sharing

In [4]:
# Read the gzipped, comma-separated dataset from the local "raw" directory
# into a single 2-D float array (np.loadtxt parses every field as a number).
data_dir = "raw" 
raw_data_file = os.path.join(data_dir, "covtype.data.gz")  
print("Reading raw data from {}".format(raw_data_file))
raw_data = np.loadtxt(raw_data_file, delimiter=',')

Reading raw data from raw/covtype.data.gz


In [5]:
# Show (rows, columns) of the loaded array.
raw_data.shape

(581012, 55)

## Prepare Training and Test Data (90% / 10% split) from the 581k datapoints and 54 features

In [8]:
# Shuffle a *copy* of the loaded rows with a fixed seed, then take the first
# 90% as training data and the rest as test data. Working on a copy keeps
# `raw_data` untouched, so re-running this cell reproduces the same split
# instead of reshuffling already-shuffled rows (at the cost of one extra
# in-memory copy of the array).
np.random.seed(0)
shuffled_data = raw_data.copy()
np.random.shuffle(shuffled_data)
train_size = int(0.9 * shuffled_data.shape[0])
# The last column holds the label; all preceding columns are features.
train_features = shuffled_data[:train_size, :-1].astype('float32')
train_labels = shuffled_data[:train_size, -1].astype('float32')
test_features = shuffled_data[train_size:, :-1].astype('float32')
test_labels = shuffled_data[train_size:, -1].astype('float32')
# Every row must land in exactly one of the two partitions.
assert train_features.shape[0] + test_features.shape[0] == raw_data.shape[0]

In [9]:
# Show (rows, features) of the training partition.
train_features.shape

(522910, 54)

In [10]:
# Show (rows, features) of the test partition.
test_features.shape

(58102, 54)

In [11]:
import faiss_knn as fbknn

## Train FAISS KNN model with k = 5

In [12]:
# Build the FAISS-backed KNN model for k = 5 and time construction plus
# fitModel (index creation and adding the training vectors).
k = 5
start_time = time.time()
faissobj = fbknn.FaissKNNImpl(k,faiss)
faissobj.fitModel(train_features,train_labels)
run_time = time.time() - start_time
print('time required for training %d data points at k = %d: %.2f seconds' % (train_features.shape[0], k , run_time))

time required for training 522910 data points at k = 5: 0.10 seconds


In [13]:
# Reset the timer in this cell so the reported duration covers only the
# predict() call, not the time elapsed since the training cell started.
start_time = time.time()
predictions = faissobj.predict(test_features)
run_time = time.time() - start_time
print('time required for predicting %d data point at k = %d: %.2f seconds' % (test_features.shape[0], k, run_time))

time required for predicting 58102 data point at k = 5: 49.89 seconds


In [14]:
# Compare the predictions stored on the model by the last predict() call
# with the held-out labels and print the result as a percentage.
accuracy = faissobj.getAccuracy(test_labels) 
print('Accuracy for K = %d : %d ' % (k, accuracy),'%')

Accuracy for K = 5 : 97  %


### <font color=red>Accuracy for K = 5 : 97  % </font>  

In [15]:
# Print per-class precision / recall / F1 for the current predictions.
from sklearn.metrics import classification_report
y_true = test_labels
y_pred = predictions
# Display names for the seven classes; presumably listed in ascending label
# order to line up with the report rows -- TODO confirm against the label values.
target_names = ['class 1', 'class 2', 'class 3','class 4', 'class 5', 'class 6','class 7']
print(classification_report(y_true, y_pred, target_names=target_names)) 

              precision    recall  f1-score   support

     class 1       0.97      0.97      0.97     20980
     class 2       0.97      0.98      0.97     28463
     class 3       0.97      0.97      0.97      3589
     class 4       0.94      0.83      0.88       279
     class 5       0.92      0.90      0.91       999
     class 6       0.95      0.94      0.94      1768
     class 7       0.98      0.97      0.97      2024

   micro avg       0.97      0.97      0.97     58102
   macro avg       0.95      0.93      0.94     58102
weighted avg       0.97      0.97      0.97     58102



## Train FAISS KNN model with k = 10

In [19]:
# Rebuild the model with k = 10 (rebinding `k` and `faissobj`) and time
# construction plus fitModel.
k = 10
start_time = time.time()
faissobj = fbknn.FaissKNNImpl(k,faiss)
faissobj.fitModel(train_features,train_labels)
run_time = time.time() - start_time
print('time required for training %d data point at k = %d: %.2f seconds' % (train_features.shape[0], k, run_time))

time required for training 522910 data point at k = 10: 0.05 seconds


In [20]:
# Time only the predict() call for the k = 10 model; `predictions` is
# overwritten with the new results.
start_time = time.time()
predictions = faissobj.predict(test_features)
run_time = time.time() - start_time
print('time required for predicting %d data point at k = %d: %.2f seconds' % (test_features.shape[0], k, run_time))

time required for predicting 58102 data point at k = 10: 42.15 seconds


In [21]:
# Compare the k = 10 predictions stored on the model with the held-out
# labels and print the result as a percentage.
accuracy = faissobj.getAccuracy(test_labels) 
print('Accuracy for K = %d : %d ' % (k, accuracy),'%')

Accuracy for K = 10 : 96  %


### <font color=red>Accuracy for K = 10 : 96  % </font>  