## Approximate Nearest Neighbor vs K Nearest Neighbor

## Import Library

In [25]:
# core data libraries (joblib is used below to load the pickled arrays)
import numpy as np
import pandas as pd
import joblib

# add the parent directory (relative to the working directory) to the
# import path; this must run before the `src` imports below
import sys
sys.path.append("../")

# the two nearest-neighbor implementations compared in this notebook
from src.approximate_nn import ApproximateNearestNeighbor
from src.approximate_nn import KNearestNeighbor

# load the saved word-embedding model that is used to embed query text
import tensorflow as tf
embedding_nnlm = tf.keras.models.load_model("../../model/nnlm-id-dim50/")



## A. Data Preparation

In [26]:
# Load the pre-computed features (X) and their labels (y) from pickle files.
# NOTE(review): joblib.load unpickles its input and can execute arbitrary
# code, so these files must come from a trusted source.
X_nnlm = joblib.load("../../data/processed/X_nnlm.pkl")
y_nnlm = joblib.load("../../data/processed/y_nnlm.pkl")

# Sanity check: both objects should report the same number of rows.
print(f" The shape of the feature file is : {X_nnlm.shape}, the shape of the label file is : {y_nnlm.shape}")

 The shape of the feature file is : (93566, 50), the shape of the label file is : (93566,)


In [27]:
# Preview the feature array (NumPy abbreviates the middle rows/columns with "...").
X_nnlm

array([[ 0.10433888,  0.17144816, -0.22230932, ..., -0.13045134,
         0.04030436,  0.07846453],
       [ 0.05992552, -0.09452719, -0.15411599, ..., -0.26243606,
         0.05033754,  0.09142387],
       [ 0.02256165,  0.10901815, -0.23604634, ..., -0.09470864,
        -0.01644576, -0.02838291],
       ...,
       [-0.07383361, -0.3003279 , -0.00087884, ..., -0.02625119,
         0.09066369,  0.17905125],
       [ 0.06527498, -0.002171  ,  0.00582   , ..., -0.17048995,
         0.3254559 , -0.03375199],
       [-0.16227451,  0.03748287, -0.02612683, ...,  0.03958465,
         0.26742777, -0.07449891]])

In [28]:
# Preview the label array (NumPy abbreviates the middle entries with "...").
y_nnlm

array(['teupah selatan', 'simeulue timur', 'teupah barat', ..., 'zurafah',
       'zuriah', 'zus'], dtype='<U44')

## B. Model Training / Registering Data Into Model

### B.1. ANN Model

In [29]:
# Configure the approximate model (cosine similarity, one tree,
# random_state fixed -- presumably for reproducibility) ...
ann_model_nnlm = ApproximateNearestNeighbor(
    min_size_split=100,
    distance_type="cosine-similarity",
    n_tree=1,
    random_state=123,
)
# ... and register the features together with their labels.
ann_model_nnlm.fit(X=X_nnlm, y=y_nnlm)

### B.2. KNN Model

In [30]:
# Configure the K-nearest-neighbor model with the same distance type as the
# ANN model, then register the same features and labels.
knn_model_nnlm = KNearestNeighbor(
    distance_type="cosine-similarity",
)
knn_model_nnlm.fit(X=X_nnlm, y=y_nnlm)

## C. Model Testing / Searching Similar Items

In [24]:
def search_similar_item(embedding_model: object,
                        neighbors_model: object,
                        text_input: str):
    """Return the items that ``neighbors_model`` considers similar to ``text_input``.

    The text is embedded with ``embedding_model`` and the resulting vector is
    passed to ``neighbors_model.find_similar_items`` as a single-row query.

    Parameters
    ----------
    embedding_model : object
        Callable invoked as ``embedding_model([text_input])``. The value it
        returns must provide a ``.numpy()`` method that yields a NumPy array.
        (It is a model object, not a string.)
    neighbors_model : object
        Object exposing ``find_similar_items(query)``.
    text_input : str
        Text to look up.

    Returns
    -------
    Whatever ``neighbors_model.find_similar_items`` returns for the query.
    """
    # Embed a batch containing only `text_input`, then drop the size-1 axes
    # so that a flat vector remains.
    embedding = embedding_model([text_input]).numpy().squeeze()

    # Rebuild a single-row query array. Going through a plain Python list
    # means the query ends up with NumPy's default dtype for Python scalars
    # rather than the dtype produced by the embedding model.
    query = np.array([embedding.tolist()])

    return neighbors_model.find_similar_items(query)

### C.1. ANN Model

In [31]:
# Look up "kucing" with the approximate model and display the result.
sim_item = search_similar_item(
    embedding_model=embedding_nnlm,
    neighbors_model=ann_model_nnlm,
    text_input="kucing",
)
sim_item

array(['kucing', 'monyet', 'binatang', 'tikus', 'hewan', 'burung',
       'kelelawar', 'kera', 'kumbang', 'serangga'], dtype='<U44')

### C.2. KNN Model

In [13]:
# Look up "kucing" with the K-nearest-neighbor model and display the result.
sim_item_ = search_similar_item(
    embedding_model=embedding_nnlm,
    neighbors_model=knn_model_nnlm,
    text_input="kucing",
)
sim_item_

array(['kucing', 'angsa', 'ular', 'anjing', 'batu kucing', 'monyet',
       'kelinci', 'binatang', 'beruang', 'sumur kucing'], dtype='<U44')

## D. Searching Time Comparison

In [14]:
%%timeit -r 1 -n 1000
# -r 1 -n 1000: a single run of 1,000 loops, so the reported std. dev. is 0 ns.
# The timed statement includes the text-embedding call as well as the ANN search.
sim_item = search_similar_item(embedding_model=embedding_nnlm, neighbors_model=ann_model_nnlm, text_input="kucing")

16.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1,000 loops each)


In [15]:
%%timeit -r 1 -n 1000
# -r 1 -n 1000: a single run of 1,000 loops, so the reported std. dev. is 0 ns.
# The timed statement includes the text-embedding call as well as the KNN search.
sim_item_ = search_similar_item(embedding_model=embedding_nnlm, neighbors_model=knn_model_nnlm, text_input="kucing")

53.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1,000 loops each)
