In [40]:
##### LOAD RAW SPEAKER FILE #####

import time
import pickle
import numpy as np
from os import listdir
import scipy.io.wavfile as wav

# Read the test recording from disk and report its sample rate and load time.
wav_path = 'Custom Speaker/speaker5test.wav'

start_time = time.time()
rate, data = wav.read(wav_path)  # unpacked as (sample rate, samples)
print(rate)

load_seconds = time.time() - start_time
print("Loaded - %s seconds" % load_seconds)

25000
Loaded - 0.031249523162841797 seconds


In [41]:
##### APPLY TRANSFORMATIONS #####
from scipy.signal import stft
from scipy.cluster.vq import whiten

# Build the feature matrix `spectral` from the raw samples in `data`:
# short-time Fourier transform -> magnitude -> natural log -> whitening.

fs = 25000       # audio is in 25,000 samples/sec
nperseg = 1250   # 1250 samples per segment (50ms)
noverlap = 250   # 250 sample overlap (10ms overlap)

start_time = time.time()
freqs, times, Zxx = stft(x=data, fs=fs, nperseg=nperseg, noverlap=noverlap, boundary=None)

magnitude = np.absolute(Zxx)
# A bin whose magnitude is exactly zero (e.g. digital silence) would become
# log(0) = -inf, and whiten() cannot work with non-finite values. Replace only
# exact zeros with the smallest positive normal float of the array's dtype;
# every other value is passed through unchanged.
magnitude_floor = np.finfo(magnitude.dtype).tiny
log_magnitude = np.log(np.where(magnitude == 0, magnitude_floor, magnitude))

# Transpose before whitening so the second STFT axis becomes the row
# (observation) axis that whiten() and the later vq() call operate on.
spectral = whiten(log_magnitude.T)

print("Spectral created - %s seconds" % (time.time() - start_time))


Spectral created - 0.12495899200439453 seconds


In [43]:
# Load every speaker's code book from disk into a list.
# codeBooks[speaker]: files are numbered from 1, so codeBooks[0] comes from
# codeBook1.npy and codeBooks[33] from codeBook34.npy.
start_time = time.time()

codeBooks = [
    np.load('codeBooks/codeBook' + str(speaker_number) + '.npy')
    for speaker_number in range(1, 35)
]

print("CodeBooks loaded")

##### Determine speaker from test data #####
from scipy.cluster.vq import vq

# distances[speaker]: summed distortion between the test features in
# `spectral` and that speaker's code book.
distances = []
for book in codeBooks:
    # vq() returns two values; only the second one (dist) is summed here.
    _codes, dist = vq(obs=spectral, code_book=book)
    distances.append(np.sum(dist))

# Compute the minimum once instead of once per printed line.
smallest_distortion = min(distances)

# Speakers are reported 1-based. The equality test is kept so that every
# speaker tied for the minimum is flagged.
for speaker, d in enumerate(distances, start=1):
    line = 'Distortion from speaker ' + str(speaker) + ': ' + str(d)
    if d == smallest_distortion:
        line += ' <-- Closest Speaker ' + str(speaker)
    print(line)
print("Predictions Complete - %s seconds" % (time.time() - start_time))

CodeBooks loaded
Distortion from speaker 1: 47398.81859273765
Distortion from speaker 2: 46714.450860835015
Distortion from speaker 3: 45984.44156207888
Distortion from speaker 4: 47311.54904708019
Distortion from speaker 5: 45376.484803955645
Distortion from speaker 6: 47912.52992306366
Distortion from speaker 7: 48845.69502749767
Distortion from speaker 8: 47541.59169895419
Distortion from speaker 9: 45691.192996857644
Distortion from speaker 10: 46914.06100462526
Distortion from speaker 11: 48171.981787500044
Distortion from speaker 12: 46457.690847845995
Distortion from speaker 13: 48752.75447395648
Distortion from speaker 14: 46149.304156226244
Distortion from speaker 15: 47355.48899510072
Distortion from speaker 16: 47189.16303275903
Distortion from speaker 17: 46161.09918779887
Distortion from speaker 18: 49317.01429898189
Distortion from speaker 19: 47430.405858471975
Distortion from speaker 20: 47526.92290078746
Distortion from speaker 21: 47197.85320267746
Distortion from spe