In [3]:
#imports and constants
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
import csv

In [4]:
def fvecs_read(filename, c_contiguous=True):
    """Read an ``.fvecs`` file into a 2-D float32 array.

    The file is interpreted as a sequence of fixed-size records: one int32
    holding the vector dimension, followed by that many float32 components.

    Parameters
    ----------
    filename : str or path-like
        File to read (passed straight to ``np.fromfile``).
    c_contiguous : bool, default True
        When true, return a copy that owns its data; otherwise return a
        view into the raw buffer that skips the per-record header column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_vectors, dim)`` and dtype float32. An empty file
        yields an empty ``(0, 0)`` float32 array.

    Raises
    ------
    AssertionError
        If the dimension stored in the first record is not positive.
    IOError
        If the records do not all declare the same dimension.
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Keep the dtype consistent with the non-empty path (float32).
        return np.zeros((0, 0), dtype=np.float32)
    # The header is stored as int32; reinterpret the same bytes to read it.
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # Vectorised check: every record must declare the same dimension.
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        # str() so that path-like filenames do not break the message.
        raise IOError("Non-uniform vector sizes in " + str(filename))
    # Drop the header column; this is a strided view into the raw buffer.
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv

def fvecs_write(data, filename):
    """Write a 2-D array to ``filename`` in ``.fvecs`` layout.

    Each row is stored as one int32 holding the row length followed by the
    row's values as float32.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional array; ``data.shape`` is unpacked as
        ``(n_vectors, dim)``.
    filename : str or path-like
        Destination passed to ``ndarray.tofile``.
    """
    num_vectors, dim = data.shape
    # One extra leading column per row carries the dimension header.
    records = np.empty((num_vectors, dim + 1), np.float32)
    # Store the header as an int32 bit pattern inside the float32 buffer.
    header_view = records.view(np.int32)
    header_view[:, 0] = dim
    records[:, 1:] = data
    records.tofile(filename)

In [5]:
# Load the base vectors from disk.
data = fvecs_read("data/audio_base.fvecs")
# Standardise with a single global mean and standard deviation taken over
# every element of the array (not per feature).
data = (data - data.mean()) / data.std()

In [6]:
# Hold out 10% of the vectors as queries; the rest form the searchable set.
# A fixed seed keeps the split identical across kernel restarts, so the row
# indices stored in the KNN results stay meaningful when the notebook is
# re-run.
SPLIT_SEED = 42
trainData, queryData = train_test_split(data, test_size=0.1, random_state=SPLIT_SEED)

In [7]:
%%time
df = pd.DataFrame({'KNN': []})
neigh = NearestNeighbors(n_neighbors=trainData.shape[0])
neigh.fit(trainData) 
NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean')
for index, queryPoint in enumerate(queryData):
    #store the indicies of the KNN's in KNN field. Need to to use tolist for file writing purposes
    df.loc[index] = [neigh.kneighbors([queryPoint])[1][0].tolist()] 

CPU times: user 9min 53s, sys: 13.4 s, total: 10min 6s
Wall time: 2min 39s


In [8]:
# Preview only the first rows; every cell holds a full neighbour ranking,
# so rendering the whole frame is needlessly heavy.
df.head(10)

Unnamed: 0,KNN
0,"[6382, 19789, 25156, 20783, 39322, 18259, 2195..."
1,"[46102, 15729, 25080, 20253, 46317, 34741, 253..."
2,"[8606, 10530, 35916, 11270, 37726, 37231, 4616..."
3,"[3507, 42250, 40199, 41903, 36696, 15026, 3859..."
4,"[36563, 23443, 47010, 29099, 1271, 30634, 2271..."
5,"[25694, 40815, 34372, 15026, 29863, 38491, 356..."
6,"[27401, 44599, 16714, 5356, 24670, 9387, 750, ..."
7,"[10215, 12807, 29307, 18041, 37303, 7233, 3579..."
8,"[12939, 43375, 24552, 42812, 29945, 37488, 374..."
9,"[39280, 24652, 29801, 9172, 5443, 42265, 20421..."


In [9]:
# Persist the two splits as plain-text matrices (fixed-point '%f' format).
for out_path, split in (('data/trainData.txt', trainData),
                        ('data/queryData.txt', queryData)):
    np.savetxt(out_path, split, fmt='%f')

# The neighbour rankings are lists inside a DataFrame column, so they are
# pickled rather than written as text.
df.to_pickle("./data/KNN.pkl")

# Alternative CSV export, currently disabled:
# df.to_csv('data/KNN.csv', quoting=csv.QUOTE_ALL)