In [4]:
#imports and constants
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
import csv

In [139]:
def fvecs_read(filename, c_contiguous=True):
    """Load an ``.fvecs`` file into a 2-D float32 array, one vector per row.

    The file is parsed as a flat run of 4-byte values. Each record is one
    int32 holding the vector dimension ``d`` followed by ``d`` float32
    components; every record must declare the same ``d``.

    Parameters
    ----------
    filename : str or path-like
        File to read; handed directly to ``np.fromfile``.
    c_contiguous : bool, default True
        If True, return an independent copy without the header column.
        If False, return a strided view into the buffer that still holds
        the header column.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_vectors, d)``; shape ``(0, 0)`` when the
        file is empty.

    Raises
    ------
    IOError
        If the records do not all declare the same dimension.
    AssertionError
        If the first declared dimension is not positive.

    A file whose length is not a whole number of records fails inside
    ``reshape`` (NumPy raises ``ValueError``).
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Keep the dtype consistent with the non-empty case.
        return np.zeros((0, 0), dtype=np.float32)
    # The header is an int32 stored in a float32-sized slot: reinterpret the
    # bytes instead of converting the value.
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # np.all keeps the check vectorised (builtin all() walks the array in Python).
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        # format() rather than "+" so a pathlib.Path filename still yields IOError.
        raise IOError("Non-uniform vector sizes in {}".format(filename))
    fv = fv[:, 1:]  # drop the per-record dimension header
    if c_contiguous:
        fv = fv.copy()
    return fv

def fvecs_write(data, filename):
    """Write a 2-D array to ``filename`` in ``.fvecs`` layout.

    Each row becomes one record: the row length stored as an int32, followed
    by the row's values stored as float32.

    Parameters
    ----------
    data : 2-D array-like exposing ``.shape``
        Vectors to store, one per row; values are cast to float32.
    filename : str or path-like
        Destination, handed directly to ``ndarray.tofile``.
    """
    num_vectors, dim = data.shape
    # One extra leading slot per row carries the dimension header.
    records = np.empty((num_vectors, dim + 1), dtype=np.float32)
    records[:, 1:] = data
    # Write the header through an int32 view so the integer's raw bytes land
    # in the float32-sized slot unconverted.
    header_view = records.view(np.int32)
    header_view[:, 0] = dim
    records.tofile(filename)

In [140]:
# Read the base vectors from disk.
data = fvecs_read("data/audio_base.fvecs")
# Standardise with a single mean and standard deviation taken over every
# element of the array (not per feature column).
data = (data - data.mean()) / data.std()

In [277]:
def KNN(dataset, numPos, numNeg):
    """Tabulate each point's nearest and farthest neighbours by row index.

    A ``NearestNeighbors`` model (default metric) is fitted on ``dataset`` and
    queried with ``n_neighbors = len(dataset)``, so every query returns a full
    ranking of the dataset by distance. The query point itself is removed from
    that ranking before the two ends are sliced off.

    Parameters
    ----------
    dataset : 2-D array
        Points to index, one per row.
    numPos : int
        Number of closest neighbours to keep for each point.
    numNeg : int
        Number of farthest neighbours to keep for each point.

    Returns
    -------
    pandas.DataFrame
        One row per point, in dataset order. Columns ``0 .. numPos-1`` hold
        the row indices of the closest points (closest first); columns
        ``-numNeg .. -1`` hold the row indices of the farthest points
        (farthest last).

    Raises
    ------
    ValueError
        If ``numPos`` or ``numNeg`` exceeds the number of other points.
    """
    numPoints = dataset.shape[0]
    if numPos > numPoints - 1 or numNeg > numPoints - 1:
        raise ValueError(
            "numPos and numNeg must each be at most len(dataset) - 1 "
            "(got numPos={}, numNeg={}, len(dataset)={})".format(numPos, numNeg, numPoints)
        )

    columns = list(range(0, numPos)) + list(range(-numNeg, 0))
    nearest = NearestNeighbors(n_neighbors=numPoints)
    nearest.fit(dataset)

    rows = []
    for index, point in enumerate(dataset):
        # kneighbors returns (distances, indices); keep the index ranking of
        # the single query point.
        ranked = nearest.kneighbors([point])[1][0]
        # Remove the query point by identity rather than assuming it sits at
        # rank 0, which does not hold when the dataset contains duplicates.
        others = ranked[ranked != index]
        positives = others[:numPos].tolist()
        # Explicit start offset: a plain [-numNeg:] would return everything
        # when numNeg is 0.
        negatives = others[others.size - numNeg:].tolist()
        rows.append(positives + negatives)

        if index % 100 == 0:
            print(index, 'done')

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns)

In [272]:
%%time
trainData, valData = train_test_split(data, test_size=0.1) # split the data into training and test
valDF = KNN(valData, 50, 50)
print('validation done.')
trainDF = KNN(trainData, 50, 50)
print('training done.')

0 done
100 done
200 done
300 done
400 done
500 done
600 done
700 done
800 done
900 done
1000 done
1100 done
1200 done
1300 done
1400 done
1500 done
1600 done
1700 done
1800 done
1900 done
2000 done
2100 done
2200 done
2300 done
2400 done
2500 done
2600 done
2700 done
2800 done
2900 done
3000 done
3100 done
3200 done
3300 done
3400 done
3500 done
3600 done
3700 done
3800 done
3900 done
4000 done
4100 done
4200 done
4300 done
4400 done
4500 done
4600 done
4700 done
4800 done
4900 done
5000 done
5100 done
5200 done
5300 done
validation done.
0 done
100 done
200 done
300 done
400 done
500 done
600 done
700 done
800 done
900 done
1000 done
1100 done
1200 done
1300 done
1400 done
1500 done
1600 done
1700 done
1800 done
1900 done
2000 done
2100 done
2200 done
2300 done
2400 done
2500 done
2600 done
2700 done
2800 done
2900 done
3000 done
3100 done
3200 done
3300 done
3400 done
3500 done
3600 done
3700 done
3800 done
3900 done
4000 done
4100 done
4200 done
4300 done
4400 done
4500 done
4600 do

In [273]:
%%time
#pickle data
np.savetxt('data/trainData.txt', trainDF, fmt='%f')
np.savetxt('data/valData.txt', valDF, fmt='%f')
trainDF.to_csv('data/trainKNN.csv')
valDF.to_csv('data/valKNN.csv')

CPU times: user 3.1 s, sys: 176 ms, total: 3.27 s
Wall time: 3.32 s
