In [3]:
#imports and constants
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
import csv

In [4]:
def fvecs_read(filename, c_contiguous=True):
    """Read a ``.fvecs`` file into a 2-D float32 array.

    The file is parsed as a sequence of fixed-size records, each made of one
    int32 header holding the vector dimension ``d`` followed by ``d`` float32
    components. Every record must carry the same ``d``.

    Parameters
    ----------
    filename : str or path-like
        File to read; passed straight to ``np.fromfile``.
    c_contiguous : bool, default True
        When True, return a fresh C-contiguous copy. When False, return a
        strided view into the raw buffer that skips the header column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_vectors, d)`` and dtype float32. An empty file
        yields an array of shape ``(0, 0)``.

    Raises
    ------
    IOError
        If the per-record dimension headers are not all equal.
    AssertionError
        If the first dimension header is not positive.
    ValueError
        Raised by ``reshape`` when the file length is not a whole number of
        ``d + 1`` sized records.
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Keep the dtype consistent with what a non-empty file produces.
        return np.zeros((0, 0), dtype=np.float32)
    # The first 4 bytes are the int32 dimension of the first vector.
    dim = int(fv.view(np.int32)[0])
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # Vectorised check: the builtin all() would loop over rows in Python.
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        # f-string so that path-like filenames do not raise TypeError here.
        raise IOError(f"Non-uniform vector sizes in {filename}")
    # Drop the header column, leaving only the vector components.
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv

def fvecs_write(data, filename):
    """Write a 2-D array to ``filename`` in ``.fvecs`` layout.

    Each row is stored as one int32 holding the row length, followed by the
    row's values converted to float32.

    Parameters
    ----------
    data : array with a two-element ``shape``
        The vectors to store, one per row.
    filename : str or path-like
        Destination, passed to ``ndarray.tofile``.
    """
    num_vectors, dim = data.shape
    # One extra leading slot per row is reserved for the dimension header.
    records = np.empty((num_vectors, dim + 1), dtype=np.float32)
    records[:, 1:] = data
    # Reinterpret the first column as int32 so the header is stored as an
    # integer bit pattern rather than as a float value.
    header_column = records[:, :1].view(np.int32)
    header_column[...] = dim
    records.tofile(filename)

In [5]:
# Load the base vectors, then standardise them with one global mean and one
# global standard deviation taken over every element (not per dimension).
data = fvecs_read("data/audio_base.fvecs")
data = (data - data.mean()) / data.std()

In [6]:
# Hold out 10% of the vectors as query points. The fixed random_state makes
# the split reproducible across kernel restarts, so neighbour indices computed
# against trainData refer to the same rows on every run.
trainData, queryData = train_test_split(data, test_size=0.1, random_state=42)

In [7]:
%%time
df = pd.DataFrame({'KNN': []})
neigh = NearestNeighbors(n_neighbors=trainData.shape[0])
neigh.fit(trainData) 
NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean')
for index, queryPoint in enumerate(queryData):
    #store the indicies of the KNN's in KNN field. Need to to use tolist for file writing purposes
    df.loc[index] = [neigh.kneighbors([queryPoint])[1][0].tolist()] 

CPU times: user 10min 22s, sys: 14.1 s, total: 10min 36s
Wall time: 2min 47s


In [8]:
# Show the table of neighbour-index lists (one row per query point) as the cell output.
df

Unnamed: 0,KNN
0,"[33049, 27643, 24297, 45048, 2359, 13760, 1638..."
1,"[14920, 34724, 3343, 43295, 12475, 43421, 1315..."
2,"[43457, 30138, 5808, 27450, 25286, 20906, 4134..."
3,"[33955, 8951, 44909, 12198, 40851, 21598, 3552..."
4,"[7806, 40623, 3095, 36342, 42460, 15194, 8450,..."
5,"[13050, 10275, 13129, 5705, 8882, 10586, 19852..."
6,"[45722, 47129, 35386, 32134, 27026, 41672, 443..."
7,"[9908, 15682, 42151, 23916, 30482, 38554, 2281..."
8,"[81, 6288, 10471, 3136, 18513, 27300, 20106, 2..."
9,"[18577, 47992, 35029, 29951, 40310, 13644, 897..."


In [9]:
# Persist the split as plain text and the neighbour table as a pickle.
# '%.9g' writes 9 significant digits, which is enough to round-trip float32
# values exactly. The previous '%f' kept only 6 decimal places, so the saved
# vectors could differ from the ones the neighbour ranking was computed on.
np.savetxt('data/trainData.txt', trainData, fmt='%.9g')
np.savetxt('data/queryData.txt', queryData, fmt='%.9g')
df.to_pickle("./data/KNN.pkl")

# Alternative CSV export of the neighbour table (disabled):
# df.to_csv('data/KNN.csv', quoting=csv.QUOTE_ALL)