In [112]:
#imports and constants
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
import csv

In [2]:
def fvecs_read(filename, c_contiguous=True):
    """Load an ``.fvecs`` file into a 2-D float32 array.

    The file is read as a flat run of 32-bit words. Each record is one
    int32 holding the vector dimension followed by that many float32 values.

    Parameters
    ----------
    filename : str or path-like
        File to read.
    c_contiguous : bool, default True
        If true, return a fresh contiguous copy. Otherwise the result is a
        strided view into the raw buffer, which still holds the dimension
        column.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_vectors, dim)``. An empty file gives an
        empty float32 array of shape ``(0, 0)``.

    Raises
    ------
    AssertionError
        If the first dimension field is not positive.
    ValueError
        If the file length is not a whole number of ``dim + 1`` word records
        (raised by ``reshape``).
    IOError
        If the records do not all declare the same dimension.
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Keep the dtype consistent with the non-empty path.
        return np.zeros((0, 0), dtype=np.float32)
    # Reinterpret the first word as int32 to recover the dimension.
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # Every record must carry the same dimension header.
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        raise IOError("Non-uniform vector sizes in " + str(filename))
    # Drop the header column, leaving only the vector components.
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv

def fvecs_write(data, filename):
    """Write a 2-D collection of vectors to ``filename`` in ``.fvecs`` layout.

    Each row is written as one int32 holding the vector dimension followed by
    the row's values as float32.

    Parameters
    ----------
    data : array-like of shape (n, d)
        Vectors to store. Anything ``np.asarray`` accepts (ndarray, nested
        lists, ...) is allowed; values are converted to float32.
    filename : str or path-like
        Destination file; existing content is overwritten.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    data = np.asarray(data, dtype=np.float32)
    if data.ndim != 2:
        raise ValueError(
            "fvecs_write expects a 2-D array, got %d dimension(s)" % data.ndim
        )
    n, d = data.shape
    data_to_file = np.empty((n, d + 1), np.float32)
    # Store the dimension header through an int32 view so its bit pattern is
    # an integer, not a float.
    data_to_file.view(np.int32)[:, 0] = d
    data_to_file[:, 1:] = data
    data_to_file.tofile(filename)

In [139]:
# Load the base vectors from disk.
raw_vectors = fvecs_read("data/audio_base.fvecs")
# Standardise with one global mean and one global standard deviation, both
# taken over every element of the array (not per feature column).
global_mean = np.mean(raw_vectors)
global_std = np.std(raw_vectors)
data = (raw_vectors - global_mean) / global_std

In [146]:
# Hold out 10% of the vectors as queries; the rest form the training set.
# A fixed random_state makes the shuffle repeatable, so the saved files and the
# neighbour indices refer to the same split after a kernel restart or a re-run
# of this cell.
trainData, queryData = train_test_split(data, test_size=0.1, random_state=42)

In [141]:
%%time
df = pd.DataFrame({'KNN': []})
neigh = NearestNeighbors(n_neighbors=trainData.shape[0])
neigh.fit(trainData) 
NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean')
for index, queryPoint in enumerate(queryData):
    #store the indicies of the KNN's in KNN field. Need to to use tolist for file writing purposes
    df.loc[index] = [neigh.kneighbors([queryPoint])[1][0].tolist()] 

CPU times: user 9min 44s, sys: 10.3 s, total: 9min 54s
Wall time: 2min 32s


In [151]:
# Preview only the first rows; each cell holds a long list of indices, so
# displaying the whole frame adds bulk without adding information.
df.head(10)

Unnamed: 0,KNN
0,"[21642, 18446, 32729, 9211, 28478, 31073, 1605..."
1,"[25930, 16134, 32646, 25540, 39855, 43307, 178..."
2,"[12044, 23103, 30481, 36962, 41786, 9532, 5620..."
3,"[26062, 44476, 11888, 47067, 28251, 46430, 374..."
4,"[42914, 23076, 36025, 10302, 14014, 40032, 725..."
5,"[16011, 41902, 17163, 33672, 20569, 22880, 144..."
6,"[13844, 38447, 27417, 34815, 33845, 18009, 395..."
7,"[33962, 33450, 4111, 11805, 34165, 21895, 3695..."
8,"[1857, 8785, 12731, 11191, 15351, 32474, 18790..."
9,"[14509, 42122, 8990, 25375, 31377, 40824, 4179..."


In [144]:
# Persist the split and the neighbour table.
# The vectors are written as plain text. '%.9g' keeps nine significant digits,
# which is enough for a float32 value to be read back unchanged; '%f' would
# round to six decimals and the saved vectors would no longer match the ones
# used to build the neighbour table.
# NOTE(review): assumes trainData/queryData are float32 - confirm.
np.savetxt('data/trainData.txt', trainData, fmt='%.9g')
np.savetxt('data/queryData.txt', queryData, fmt='%.9g')

# The neighbour lists are pickled so each row's Python list survives intact.
df.to_pickle("./data/KNN.pkl")

# Optional text export, currently disabled:
# df.to_csv('data/KNN.csv', quoting=csv.QUOTE_ALL)