In [2]:
from annoy import AnnoyIndex
import pandas as pd 
import numpy as np
import gc

In [3]:
def load_data_np(features_path):
    """Load pickled image features, ordered by image name.

    Reads a pickled pandas DataFrame from ``features_path``, sorts its rows by
    the ``"img"`` column (ascending) and extracts the feature vectors from the
    ``"transfer layer"`` column or, when that one is absent, from the
    ``"output layer"`` column.

    Parameters
    ----------
    features_path : str or path-like
        Location of the pickled DataFrame.

    Returns
    -------
    data : list of np.ndarray
        One array per row, in the same order as ``img_lst``.
    img_lst : np.ndarray
        Values of the ``"img"`` column, sorted ascending.

    Raises
    ------
    KeyError
        If the frame has neither a ``"transfer layer"`` nor an
        ``"output layer"`` column. (Previously this case fell through and the
        function silently returned the column *names* wrapped in arrays.)
    """
    print("Loading data..\n\n")

    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    frame = pd.read_pickle(features_path)

    # Sort first so that the vectors and the image names stay aligned.
    frame = frame.sort_values("img")
    img_lst = frame["img"].values

    # "transfer layer" takes precedence over "output layer" when both exist.
    feature_column = next(
        (name for name in ("transfer layer", "output layer") if name in frame.columns),
        None,
    )
    if feature_column is None:
        raise KeyError(
            "expected a 'transfer layer' or 'output layer' column, found: "
            + ", ".join(str(name) for name in frame.columns)
        )

    data = [np.array(row) for row in frame[feature_column].values]

    print("LOADED\n")

    # Kept from the original: run a collection pass once the list is built.
    gc.collect()

    return data, img_lst

In [4]:
# Load the feature vectors and the matching (sorted) image names once; the
# later cells index and query `data`.
data, img_lst = load_data_np("../data/transfer_layer")

Loading data..


LOADED



In [38]:
def build_Index(data, vector_size, metric="euclidean",trees=10):
    """Create and build an Annoy index over ``data``.

    Each vector is added under its position in ``data`` (0, 1, 2, ...), so an
    item id handed back by the index can be used directly as an offset into
    ``data``.

    Parameters
    ----------
    data : iterable of vectors
        Vectors to index; each is passed unchanged to ``add_item``.
    vector_size : int
        Length of every item vector.
    metric : str
        Distance metric forwarded to ``AnnoyIndex``.
    trees : int
        Number of trees forwarded to ``build``.

    Returns
    -------
    The built ``AnnoyIndex`` instance.
    """
    index = AnnoyIndex(vector_size, metric=metric)

    # Item ids are simply the enumeration order of the input.
    for item_id, vector in enumerate(data):
        index.add_item(item_id, vector)

    # The index object is printed before and after the build step.
    print(index)
    index.build(trees)
    print(index)

    print("Indexs has been built!\n\n")
    return index

In [85]:
def nhood_query(item_index,annoy_index, nn=51):
    """Look up the nearest neighbours of one already-indexed item.

    Parameters
    ----------
    item_index : int
        Id of the item inside ``annoy_index``.
    annoy_index
        Object exposing ``get_nns_by_item(item, n, include_distances=...)``.
    nn : int
        Number of neighbours to request.
        NOTE(review): the default of 51 is presumably 50 neighbours plus the
        query item itself — confirm.

    Returns
    -------
    tuple
        ``(neighbour_ids, distances)`` — the first and second element of what
        ``get_nns_by_item`` returned.
    """
    result = annoy_index.get_nns_by_item(item_index, nn, include_distances=True)
    neighbour_ids = result[0]
    neighbour_distances = result[1]
    return neighbour_ids, neighbour_distances

In [81]:
# Example (not run): reload a previously saved index instead of rebuilding it.
# u = AnnoyIndex(len(instance))
# u.load('test.ann') # super fast, will just mmap the file
# print(u.get_nns_by_item(0, 51)) # will find the 51 nearest neighbors

In [82]:
def get_indices_distances(data, metric="euclidean", nn=51, trees=20):
    """Compute approximate nearest neighbours for every vector in ``data``.

    Builds one index over ``data`` with ``build_Index`` and then queries it
    once per item with ``nhood_query``.

    Parameters
    ----------
    data : sequence of vectors
        Must support ``len()`` and indexing; ``len(data[0])`` is used as the
        vector dimensionality.
    metric : str
        Distance metric passed through to ``build_Index``.
    nn : int
        Number of neighbours requested per item.
    trees : int
        Number of trees passed through to ``build_Index``.

    Returns
    -------
    indices : np.ndarray
        Row ``i`` holds the neighbour ids returned for item ``i``.
    distances : np.ndarray
        Row ``i`` holds the matching distances.
        For empty ``data`` both arrays have shape ``(0, nn)``.
    """
    # Nothing to index: return empty results instead of failing on data[0].
    if len(data) == 0:
        return np.empty((0, nn), dtype=int), np.empty((0, nn), dtype=float)

    index = build_Index(data, len(data[0]), metric=metric, trees=trees)

    indices_lst = []
    distances_lst = []
    # Items were added under their position in `data`, so iterate positions.
    for item_id in range(len(data)):
        indices, distances = nhood_query(item_id, index, nn=nn)
        indices_lst.append(np.array(indices))
        distances_lst.append(np.array(distances))

    return np.array(indices_lst), np.array(distances_lst)

In [77]:
# NOTE(review): %timeit executes the statement repeatedly (the output below
# shows the index being built 8 times), and names assigned inside a %timeit
# statement are not kept in the notebook namespace. Use %time or a plain call
# if `indices` / `distances` are needed afterwards.
%timeit indices, distances = get_indices_distances(data, metric="angular")

<annoy.Annoy object at 0x7fc85e0ede50>
<annoy.Annoy object at 0x7fc85e0ede50>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0ed9b0>
<annoy.Annoy object at 0x7fc85e0ed9b0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0edfb0>
<annoy.Annoy object at 0x7fc85e0edfb0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e5260d0>
<annoy.Annoy object at 0x7fc85e5260d0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0ed830>
<annoy.Annoy object at 0x7fc85e0ed830>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0ed9d0>
<annoy.Annoy object at 0x7fc85e0ed9d0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e5260d0>
<annoy.Annoy object at 0x7fc85e5260d0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e5260d0>
<annoy.Annoy object at 0x7fc85e5260d0>
Indexs has been built!


1min 28s ± 2.58 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [78]:
# Same benchmark with the euclidean metric. NOTE(review): the assignment inside
# %timeit does not leave `indices` / `distances` bound in the notebook.
%timeit indices, distances = get_indices_distances(data, metric="euclidean")

<annoy.Annoy object at 0x7fc85e0edf70>
<annoy.Annoy object at 0x7fc85e0edf70>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e5260d0>
<annoy.Annoy object at 0x7fc85e5260d0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0edaf0>
<annoy.Annoy object at 0x7fc85e0edaf0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0ed810>
<annoy.Annoy object at 0x7fc85e0ed810>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0edc30>
<annoy.Annoy object at 0x7fc85e0edc30>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0eda10>
<annoy.Annoy object at 0x7fc85e0eda10>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0edbf0>
<annoy.Annoy object at 0x7fc85e0edbf0>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0eddf0>
<annoy.Annoy object at 0x7fc85e0eddf0>
Indexs has been built!


1min 12s ± 721 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [87]:
# Same benchmark with the manhattan metric. NOTE(review): the assignment inside
# %timeit does not leave `indices` / `distances` bound in the notebook.
%timeit indices, distances = get_indices_distances(data, metric="manhattan")

<annoy.Annoy object at 0x7fc85e3af150>
<annoy.Annoy object at 0x7fc85e3af150>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0eda90>
<annoy.Annoy object at 0x7fc85e0eda90>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e2efb50>
<annoy.Annoy object at 0x7fc85e2efb50>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0eda90>
<annoy.Annoy object at 0x7fc85e0eda90>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e2efb50>
<annoy.Annoy object at 0x7fc85e2efb50>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0eda90>
<annoy.Annoy object at 0x7fc85e0eda90>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0eda90>
<annoy.Annoy object at 0x7fc85e0eda90>
Indexs has been built!


<annoy.Annoy object at 0x7fc85e0ed9b0>
<annoy.Annoy object at 0x7fc85e0ed9b0>
Indexs has been built!


1min 17s ± 778 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
