In [12]:
!pip install pandas tensorflow keras tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm

np.random.seed(42)



## Load data

In [120]:
mfcc_train = pd.read_csv('data/TIMIT/train.mfcccsv').values
mfcc_test = pd.read_csv('data/TIMIT/test.mfcccsv').values
y_train = pd.read_csv('data/TIMIT/train.targcsv').values
y_test = pd.read_csv('data/TIMIT/test.targcsv').values

In [121]:
# we only use a random sample of the data
sample_idx = np.random.choice(np.arange(mfcc_train.shape[0]), 3000, replace=False)
mfcc_train = mfcc_train[sample_idx]
y_train = y_train[sample_idx]

sample_idx = np.random.choice(np.arange(mfcc_train.shape[0]), 800, replace=False)
mfcc_test = mfcc_test[sample_idx]
y_test = y_test[sample_idx]
print("train on {} samples, test on {} samples".format(mfcc_train.shape[0], mfcc_test.shape[0]))

train on 3000 samples, test on 800 samples


In [122]:
mfcc_train.shape, y_train.shape, mfcc_test.shape, y_test.shape

((3000, 13), (3000, 40), (800, 13), (800, 40))

In [157]:
def euclidian_distance(vec1, vec2):
    """ return euclidian distance between two vectors """
    return np.sqrt( np.sum( (vec1 - vec2) **2 , axis=0) )
    
class KNearestNeighbor:
    # contains indices of 200 nearest neighbors for each test sample
    nearest_neighbors = [] # np array shape (num_test_samples, 200)
    
    def store_nn(self, X_train, X_test, y_train, y_test):
        # store 200 nearest neighbors for each item in X_test
        distances = [] # list of tuples (index)
        for _test_sample in tqdm(X_test):
            
            _distances_row = [euclidian_distance(_test_sample, _train_sample) for _train_sample in X_train]
            _distances_row = np.array(_distances_row)
            
            _nn_train_indices = np.argsort(_distances_row)[0:200]
            _nn_labels = np.argmax( y_train[ _nn_train_indices ], axis=1)
            
            self.nearest_neighbors.append(_nn_labels)
        
        self.nearest_neighbors = np.vstack ( self.nearest_neighbors)
        print(self.nearest_neighbors.shape)
                
    def predict(self, X, k=3):
        preds = []
        
        for i in range(X.shape[0]):
            neighbors = self.nearest_neighbors[i, 0:k] # get relevant neighbors
            counts = np.bincount(neighbors)
            prediction = np.argmax(counts) # set prediction to most common label value
            preds.append(prediction)
            
        return preds

In [None]:
knn = KNearestNeighbor()
knn.store_nn(mfcc_train, mfcc_test, y_train, y_test)

In [143]:
def get_prediction_rate(y_pred, y_test):
    return np.sum(y_pred == y_test) / len(y_test)

k_values_to_try = [1, 3, 5, 7, 9, 11, 13, 15]

for _k in k_values_to_try:
    preds = knn.predict(mfcc_test, k=_k)
    prediction_rate = get_prediction_rate(np.array(preds), np.argmax(y_test, axis=1))
    print("k={}: prediction rate = {:.2f}".format(_k, prediction_rate*100))

k=1: prediction rate = 41.62
k=3: prediction rate = 44.38
k=5: prediction rate = 48.88
k=7: prediction rate = 50.88
k=9: prediction rate = 51.38
k=11: prediction rate = 51.75
k=13: prediction rate = 51.50
k=15: prediction rate = 52.00


## b) Normalized feature vectors

In [161]:
mfcc_train_norm = ( mfcc_train - np.mean(mfcc_train, axis=0) ) / np.std(mfcc_train, axis=0)
mfcc_test_norm = ( mfcc_test - np.mean(mfcc_test, axis=0) ) / np.std(mfcc_test, axis=0)

knn_norm = KNearestNeighbor()
knn_norm.store_nn(mfcc_train_norm, mfcc_test_norm, y_train, y_test)

100%|█████████████████████████████████████████████████████████████| 800/800 [00:35<00:00, 22.47it/s]

(800, 200)





In [164]:
for _k in k_values_to_try:
    preds = knn_norm.predict(mfcc_test, k=_k)
    prediction_rate = get_prediction_rate(np.array(preds), np.argmax(y_test, axis=1))
    print("k={}: prediction rate = {:.2f}".format(_k, prediction_rate*100))

k=1: prediction rate = 43.50
k=3: prediction rate = 46.12
k=5: prediction rate = 49.50
k=7: prediction rate = 50.62
k=9: prediction rate = 52.00
k=11: prediction rate = 53.62
k=13: prediction rate = 53.50
k=15: prediction rate = 54.62
