In [112]:
#imports and constants
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
import csv

In [2]:
def fvecs_read(filename, c_contiguous=True):
    """Read a ``.fvecs`` file into a 2-D float32 array.

    The file is interpreted as a sequence of fixed-size records, each made of
    one int32 header holding the vector dimension ``d`` followed by ``d``
    float32 components. The header column is stripped from the result.

    Parameters
    ----------
    filename : str or path-like
        File to read (anything ``np.fromfile`` accepts as a path).
    c_contiguous : bool, default True
        When True, return an independent C-contiguous copy. When False,
        return a strided view into the raw buffer that skips the header
        column.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_vectors, d)``; an empty file yields an
        empty float32 array of shape ``(0, 0)``.

    Raises
    ------
    IOError
        If the records do not all declare the same dimension.
    AssertionError
        If the first header is not a positive dimension.
    ValueError
        Raised by ``reshape`` when the file length is not a whole number of
        ``1 + d`` sized records.
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Keep the dtype consistent with the non-empty path (float32).
        return np.zeros((0, 0), dtype=np.float32)
    # The first 4 bytes are an int32 header, so reinterpret rather than cast.
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # Vectorised check that every record carries the same header value.
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        # str() so that path-like filenames do not break the message itself.
        raise IOError("Non-uniform vector sizes in " + str(filename))
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv

def fvecs_write(data, filename):
    """Write a 2-D array to ``filename`` in ``.fvecs`` layout.

    Every row is stored as one int32 header holding the row length, followed
    by the row's values converted to float32.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array; its ``shape`` is unpacked as
        ``(n_vectors, dim)``.
    filename : str or path-like
        Destination passed straight to ``ndarray.tofile``.
    """
    num_vectors, dim = data.shape
    records = np.empty((num_vectors, 1 + dim), dtype=np.float32)
    # Payload goes into columns 1..dim; assignment converts it to float32.
    records[:, 1:] = data
    # Column 0 is the header: store the int32 bit pattern of `dim` by
    # writing through an int32 view of the same buffer.
    records.view(np.int32)[:, 0] = dim
    records.tofile(filename)

In [129]:
# Load the vectors from the .fvecs file (path is relative to the working
# directory); fvecs_read strips the per-record header and returns a 2-D
# float32 array with one vector per row.
data = fvecs_read("data/audio_base.fvecs")
# Bare expression so the notebook renders a preview of the loaded array.
data

array([[54343., 29776., 61789., ..., 48916., 51402., 58398.],
       [61316., 65420., 56087., ..., 64560., 52845., 54574.],
       [40361., 62775., 60291., ..., 46604., 59174., 54336.],
       ...,
       [43378., 31941., 63146., ..., 49708., 56622., 58992.],
       [55945., 66552., 53190., ..., 55152., 58037., 55757.],
       [65975., 64302., 53143., ..., 58365., 57474., 53535.]],
      dtype=float32)

In [4]:
# Hold out 10% of the vectors as query points and keep the rest as the
# training (indexed) set. The seed is fixed because the neighbour indices
# saved later refer to row positions in trainData: without it every re-run
# produces a different split and the saved files stop matching each other.
RANDOM_SEED = 42
trainData, queryData = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)

In [126]:
%%time
df = pd.DataFrame({'KNN': []})
neigh = NearestNeighbors(n_neighbors=trainData.shape[0])
neigh.fit(trainData) 
NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean')
for index, queryPoint in enumerate(queryData):
    #store the indicies of the KNN's in KNN field. Need to to use tolist for file writing purposes
    df.loc[index] = [neigh.kneighbors([queryPoint])[1][0].tolist()] 

CPU times: user 10min 10s, sys: 12.8 s, total: 10min 22s
Wall time: 2min 41s


In [127]:
# Display the neighbour table; each row's 'KNN' cell is a list of indices.
df

Unnamed: 0,KNN
0,"[12379, 20762, 37127, 29536, 26691, 20450, 321..."
1,"[36186, 13707, 27977, 7651, 15371, 48023, 4427..."
2,"[20075, 15298, 46749, 20323, 6327, 43882, 3898..."
3,"[11085, 13095, 7412, 46427, 42195, 25022, 2122..."
4,"[24496, 2565, 40489, 30146, 23337, 4842, 26401..."
5,"[25234, 28879, 23574, 17736, 32681, 22838, 373..."
6,"[35131, 39046, 18915, 32901, 35380, 18558, 278..."
7,"[4676, 10314, 7892, 5432, 29156, 7285, 20817, ..."
8,"[33394, 3860, 24119, 37141, 24443, 10656, 1327..."
9,"[29769, 6854, 33376, 1879, 21668, 22781, 29428..."


In [123]:
# Preview the training split returned by train_test_split.
trainData

array([[47076., 78785., 59149., ..., 61660., 53675., 55468.],
       [54428., 45544., 68372., ..., 56565., 59825., 56958.],
       [55879., 86189., 57446., ..., 55092., 57768., 54300.],
       ...,
       [63002., 59673., 55873., ..., 59051., 49862., 55535.],
       [52009., 53559., 62644., ..., 58814., 57613., 52045.],
       [67357., 55986., 60821., ..., 55956., 58749., 59763.]],
      dtype=float32)

In [132]:
# Persist the results. The two splits are written as plain text with
# np.savetxt using fixed-point '%f' formatting; they are not pickled.
np.savetxt('data/trainData.txt', trainData, fmt='%f')
np.savetxt('data/queryData.txt', queryData, fmt='%f')
# Only the neighbour table is pickled.
df.to_pickle("./data/KNN.pkl")

# Alternative CSV export of the neighbour table, currently disabled:
# df.to_csv('data/KNN.csv', quoting=csv.QUOTE_ALL)