In [280]:
# %load_ext autoreload
%autoreload 2
import torch
import numpy as np
import pandas as pd

#### Checks the KNN's rankings of datapoints is correct. i.e. anchor-i dist < anchor-j dist for i < j

In [308]:
def dist(x, y):
    return (x-y).pow(2).sum()

In [304]:
from datasets import TripletAudio
train_data = torch.from_numpy(np.loadtxt('data/trainData.txt', dtype=np.float32))
train_KNN = pd.read_csv('data/trainKNN.csv', index_col=0)
# check num elems are same in DF and dataset
assert(train_data.shape[0] == train_KNN.shape[0])
# #randomly pick 10 rows to check
selected_indicies = np.random.choice(list(range(0,train_KNN.shape[0])), size=100, replace=False)
# ensure that the ordering of neighbours is correct for that row
for index in selected_indicies:
    row = train_KNN.iloc[index]
    anchor = train_data[index]
    for prev, cur in zip(row, row[1:]):
        prev_dist = dist(anchor, train_data[prev])
        cur_dist = dist(anchor, train_data[cur])
        assert(prev_dist < cur_dist)
print("All Good!")

AssertionError: 

#### Generates triplets and ensures that the anchor-pos distance < anchor-neg distance

In [314]:
from datasets import TripletAudio
dataset = TripletAudio(True, 5, 5, 5)
for i, ((anchor, pos, neg), label, index) in enumerate(dataset):
    assert((np.linalg.norm(anchor-pos) - np.linalg.norm(anchor-neg)) < 0)
print("All Good!")

All Good!


#### Runs two checks for the embedding process:

In [268]:
from networks import AnchorNet
#test batch size 1
train_data = torch.tensor([[[2.],[3.]]])
anchor = AnchorNet([], 2, 1, True)
expected_output = torch.tensor([[5**(1/2) - 1]])
output = anchor.forward(train_data)
assert(torch.equal(output, expected_output))
# test batch size 2
train_data = torch.tensor([[[12.],[8.]],[[6.],[9.]]])
anchor = AnchorNet(train_data, 2, 1, True)
expected_output = torch.tensor([[170**(1/2) - 1], [89**(1/2) - 1]])
output = anchor.forward(train_data)
assert(torch.equal(output, expected_output))
print("All Good!")

All Good!


#### Checks similarity generator 

In [309]:
from utils import similarity_calculator 
similarity = similarity_calculator(train_KNN, 5)

In [None]:
selected_indicies = np.random.choice(list(range(0,train_KNN.shape[0])), size=100, replace=False)
for index in selected_indicies:
    similarity_indicies = np.where(similarity[index])[0] #indicies from similarity matrix
    expected_indicies = np.append(train_KNN.iloc[index][:5], index) #get true KNN's and add index itself
    np.testing.assert_array_equal(np.sort(similarity_indicies), np.sort(expected_indicies))
print("All Good!")

#### Checks the negative triplet selector

In [70]:
#mock df to use as the knn dataset
mock_df = pd.DataFrame(columns=list(range(0,5)))
mock_df.loc[0] = [1,2,5,6,7] #reciprocal neighbours with 1,2. One way with 3
mock_df.loc[1] = [0,6,7,8,9] #reciprocal neighbours with 0
mock_df.loc[2] = [0,10,11,12,13] #reciprocal neighbours with 0
mock_df.loc[3] = [0,15,16,17,18] #one way with 0
mock_df.loc[4] = [18,19,20,21,22] #no NN
# need to mock remaining for similaritycalc
for i in range(5,23):
    mock_df.loc[i] = [18,19,20,21,22]

In [71]:
from utils import SemihardNegativeTripletSelector
triplet_selector = SemihardNegativeTripletSelector(2.5, mock_df)
indicies = np.array([0,1,2,3,4])
embeddings = torch.tensor([[5.,5.],[5.,4.], [5.,3.],[7.,6.], [4.,2.5]])

generating similarity matrix
done


In [72]:
output = triplet_selector.get_triplets(embeddings, indicies)
expected_output = np.array([[1,0,4],[0,2,3]])
np.testing.assert_array_equal(output, expected_output)
print('negative triplet selector works!')

negative triplet selector works!


#### Bucket hash tests

In [235]:
from recall import BucketHash
buckets = np.array([[1,0,1], [1,1,0],[0,0,0]])
bucket_hash = BucketHash()
for i,x in enumerate(buckets):
    bucket_hash.add(x,i)
expected_output = {(1,0,1): [0], (1,1,0): [1], (0,0,0): [2]}
print(bucket_hash.hash)
for k, val in bucket_hash.hash.items():
    np.testing.assert_equal(val, expected_output[k])
for i in range(0,len(buckets)):
    np.testing.assert_equal(bucket_hash.get(buckets[i]), [i])
np.testing.assert_equal(bucket_hash.keys(), buckets)
print('All good!')

defaultdict(<class 'list'>, {(1, 0, 1): [0], (1, 1, 0): [1], (0, 0, 0): [2]})
All good!


#### Recall

In [121]:
from recall import Recall
r = Recall(5,3)
query = np.array([0.6,0.2,-0.4])
sorted_proj, mapping = r.sorted_to_proj_map(query)
assert(mapping == {0:1, 1:2, 2:0})
np.testing.assert_equal(sorted_proj,[0.2, 0.4, 0.6])
print("sorted_to_proj_map works!")

sorted_to_proj_map works!


In [247]:
data = torch.tensor([
    [-1.,2.,3.],
    [3.,-4.,-1.],
    [-6.,-2.,3.],
    [4.,1.,-8.],
    [1.,-1.,-1.],
    [-1.,2.,.3],
    [-2.,1.,4.],
    [0.,0.,0.],
    [5.,5.,5.],
])
model = lambda x: torch.mm(x,torch.tensor([[1.,1.], [2.,-1], [0,1.]]))
query = torch.tensor([[3.,3.,3.], [3.,3.,3.], [3.,3.,3.]]) #tests same query multiple times
mock_true_knns = pd.DataFrame(columns=list(range(0,3)))
#test that recall gets all true KNNs %
mock_true_knns.loc[0] = [8,0,5] #100% 
mock_true_knns.loc[1] = [8,0,4] #66%
mock_true_knns.loc[2] = [1,2,3] #0%
naive = r.calculate(data, model, query, mock_true_knns) #naive slow method
generation = r.calculate(data, model, query, mock_true_knns, False) #faster generate to probe method
for results in [naive, generation]:
    np.testing.assert_equal(results, [1.0, 2/3, 0.])
print('Recall works!')

done embedding data
done quantizing into 4 buckets
done embedding data
done quantizing into 4 buckets
Recall works!
