In [1]:
import numpy as np
import discrete_signal
import matplotlib.pyplot as plt
from idft import idft
from dft import dft
from scipy.io.wavfile import write
import cmath
import sounddevice as sd
import time

In [2]:
class recordsound():
    """Record a fixed-length, single-channel clip with ``sounddevice``.

    Attributes:
        T: recording time; multiplied by ``fs`` to obtain the frame count
            (presumably seconds -- confirm with callers).
        fs: sampling frequency handed to ``sd.rec`` and to the WAV writer.
    """

    def __init__(self, T, fs):
        """Store the recording time ``T`` and the sampling frequency ``fs``."""
        self.fs = fs
        self.T = T

    def solve(self):
        """Record ``int(T * fs)`` frames, save them to disk and return them.

        Side effects:
            Prints a start and an end marker, and (over)writes
            ``myvoice.wav`` in the working directory on every call.

        Returns:
            The array produced by ``sd.rec`` for one channel.
        """
        n_frames = int(self.T * self.fs)

        print('start recording')
        clip = sd.rec(n_frames, self.fs, 1)
        sd.wait()  # wait until the recording is finished
        print('end recording')

        write('myvoice.wav', self.fs, clip)
        return clip

In [3]:
# --- Record the training set: num_recs clips for each digit -----------------
T = 1  # recording time
fs = 8000  # sampling frequency
num_recs = 10  # number of recordings per digit
# The prompted digits must be the ones the classifier distinguishes: the
# comparison cells label their predictions as argmax + 1, i.e. 1 or 2.
digits = [1, 2]
digit_recs = []

n_samples = int(T * fs)  # samples per recording
# solve() only reads T and fs, so a single recorder can serve every take.
digit_recorder = recordsound(T, fs)

for digit in digits:
    # One row per recording of the current digit.
    partial_recs = np.zeros((num_recs, n_samples))
    print('When prompted to speak, say ' + str(digit) + '. \n')
    for i in range(num_recs):
        time.sleep(2)  # pause before each take
        # solve() returns one column per channel; flatten the single channel.
        spoken_digit = digit_recorder.solve().reshape(n_samples)
        partial_recs[i, :] = spoken_digit
    digit_recs.append(partial_recs)


# All per-digit arrays share one shape, so this is stored as a single array
# indexed as [digit, recording, sample].
np.save("recorded_digits.npy", digit_recs)

# --- Compute the DFT of every (energy-normalized) training recording --------
digit_recs = np.load("recorded_digits.npy")
digits = [1, 2]
num_digits = len(digit_recs)
num_recs, N = digit_recs[0].shape  # recordings per digit, samples per recording
fs = 8000
DFTs = []
DFTs_c = []

for digit_rec in digit_recs:
    # np.complex128 is the portable spelling: the np.complex_ alias was
    # removed in NumPy 2.0.
    DFTs_aux = np.zeros((num_recs, N), dtype=np.complex128)
    DFTs_c_aux = np.zeros((num_recs, N), dtype=np.complex128)
    for i in range(num_recs):
        rec_i = digit_rec[i, :]
        energy_rec_i = np.linalg.norm(rec_i)
        # Normalize to unit norm without mutating the loaded array. An
        # all-zero (silent) recording is left as zeros: dividing by its zero
        # norm would put NaNs into the stored DFTs.
        if energy_rec_i > 0:
            rec_i = rec_i / energy_rec_i
        DFT_rec_i = dft(rec_i, fs)
        # solve() returns four values; the 2nd and 4th are kept.
        # NOTE(review): presumably the DFT and its centred version -- confirm
        # against the dft module.
        [_, X, _, X_c] = DFT_rec_i.solve()
        DFTs_aux[i, :] = X
        DFTs_c_aux[i, :] = X_c
    DFTs.append(DFTs_aux)
    DFTs_c.append(DFTs_c_aux)

# Each file holds one array indexed as [digit, recording, frequency bin].
np.save("spoken_digits_DFTs.npy", DFTs)
np.save("spoken_digits_DFTs_c.npy", DFTs_c)

When prompted to speak, say 1. 

start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
When prompted to speak, say 2. 

start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording


In [4]:
# --- Record the test set and export it as one concatenated audio file -------
T = 1  # recording time
fs = 8000  # sampling frequency
num_recs = 10  # number of recordings for the test set
digit_recs = []

n_samples = int(T * fs)  # samples per recording
# solve() only reads T and fs, so a single recorder can serve every take.
digit_recorder = recordsound(T, fs)

partial_recs = np.zeros((num_recs, n_samples))
print('When prompted to speak, say 1 or 2' + '. \n')
for i in range(num_recs):
    time.sleep(2)  # pause before each take
    # solve() returns one column per channel; flatten the single channel.
    spoken_digit = digit_recorder.solve().reshape(n_samples)
    partial_recs[i, :] = spoken_digit
digit_recs.append(partial_recs)

# Storing recorded voices
np.save("test_set.npy", partial_recs)

# Creating an audio file with the spoken digits.
# reshape(-1) concatenates the rows whatever their length; the explicit
# T*fs*num_recs size disagreed with int(T*fs) above and fails when T*fs is
# not an integer.
test_set_audio = partial_recs.reshape(-1)
file_name = 'test_set_audio_rec.wav'
write(file_name, fs, test_set_audio.astype(np.float32))


When prompted to speak, say 1 or 2. 

start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording


Average comparison

In [6]:
# --- Classify the test set against the average spectrum of each digit -------
T = 1  # recording time
fs = 8000  # sampling frequency

test_set = np.load("test_set.npy")

# loads (DFTs of) training set
training_set_DFTs = np.load("spoken_digits_DFTs.npy")

# Average spectra
num_digits = len(training_set_DFTs)
_, N = training_set_DFTs[0].shape
# np.complex128 is the portable spelling: the np.complex_ alias was removed
# in NumPy 2.0.
average_spectra = np.zeros((num_digits, N), dtype=np.complex128)
average_signal = np.zeros((num_digits, N), dtype=np.complex128)

for i in range(num_digits):
    # Average of modulus of spectra
    average_spectra[i, :] = np.mean(np.absolute(training_set_DFTs[i]), axis=0)
    # Time-domain signal associated with the average spectrum of digit i.
    iDFT = idft(average_spectra[i, :], fs, N)
    y_demod, Treal = iDFT.solve_ifft()
    average_signal[i, :] = y_demod

num_recs, N = test_set.shape
predicted_labels = np.zeros(num_recs)

for i in range(num_recs):
    rec_i = test_set[i, :]
    # Normalize the ith signal to unit norm so loudness does not affect the
    # comparison. Done out of place so test_set is not modified; an all-zero
    # recording is left as zeros instead of being divided by a zero norm.
    energy_rec_i = np.linalg.norm(rec_i)
    if energy_rec_i > 0:
        rec_i = rec_i / energy_rec_i

    # Comparisons: energy of the recording convolved with each digit's signal.
    # NOTE(review): mode 'same' truncates the convolution, whereas the online
    # prediction cell uses the default 'full' mode -- confirm which is intended.
    inner_prods = np.zeros(num_digits)

    for j in range(num_digits):
        inner_prods[j] = np.linalg.norm(np.convolve(rec_i, average_signal[j, :], 'same'))**2

    # Row j of the training set corresponds to digit j + 1.
    predicted_labels[i] = np.argmax(inner_prods) + 1

print("Average spectrum comparison --- predicted labels: \n")
print(predicted_labels)

# Storing predicted labels
np.save("predicted_labels_avg.npy", predicted_labels)

# Ground truth for the recorded test set, in recording order.
true_labels = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2])
if true_labels.shape != predicted_labels.shape:
    # Without this check a length-1 prediction array would broadcast silently.
    raise ValueError('true_labels has %d entries but %d recordings were classified'
                     % (true_labels.size, predicted_labels.size))
# Fraction of exact matches; unlike 1 - sum(|true - predicted|) / n this stays
# correct when the labels differ by more than 1.
accuracy = np.mean(true_labels == predicted_labels) * 100
print('The accuracy is:', accuracy)


Average spectrum comparison --- predicted labels: 

The accuracy is: 50.0


Prediction

In [8]:
# --- Online prediction: record one slot at a time and classify it -----------
T = 1  # recording time
fs = 8000  # sampling frequency
time_slots=20 # time of online recording

# loads (DFTs of) training set
training_set_DFTs = np.load("spoken_digits_DFTs.npy")

# Average spectra
num_digits = len(training_set_DFTs)
_, N = training_set_DFTs[0].shape
# np.complex128 is the portable spelling: the np.complex_ alias was removed
# in NumPy 2.0.
average_spectra = np.zeros((num_digits, N), dtype=np.complex128)
average_signal = np.zeros((num_digits, N), dtype=np.complex128)

for i in range(num_digits):
    # Average of modulus of spectra
    average_spectra[i, :] = np.mean(np.absolute(training_set_DFTs[i]), axis=0)
    # Time-domain signal associated with the average spectrum of digit i.
    iDFT = idft(average_spectra[i, :], fs, N)
    y_demod, Treal = iDFT.solve_ifft()
    average_signal[i, :] = y_demod


for t in range(time_slots):
    voicerecording = sd.rec(int(T * fs), fs, 1)
    sd.wait()  # Wait until recording is finished
    rec_i = voicerecording.astype(np.float32)
    rec_i=rec_i[:,0]  # keep the single recorded channel as a 1-D signal

    # Normalize the recorded signal to unit norm so loudness does not affect
    # the comparison.
    energy_rec_i = np.linalg.norm(rec_i)
    if energy_rec_i == 0:
        # An all-zero slot cannot be normalized: dividing by zero would turn
        # every score into NaN and argmax would then silently report digit 1.
        print('No signal captured in this slot; skipping.')
        continue
    rec_i /= energy_rec_i

    # Comparisons: energy of the recording convolved with each digit's signal.
    inner_prods = np.zeros(num_digits)

    for j in range(num_digits):
        inner_prods[j] = np.linalg.norm(np.convolve(rec_i , average_signal[j, :]))**2

    # Row j of the training set corresponds to digit j + 1.
    print('The number said is:', np.argmax(inner_prods) + 1)


The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 1
The number said is: 2
The number said is: 2
The number said is: 1
The number said is: 2
The number said is: 2
The number said is: 2


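In this lab we classify using the time-domain signal rather than just the spectrum, so that the classifier can run online. To do this, we carry out the computations on the time representation of the signals as we record. In the code, we repeatedly record 1 s of audio, compute the norm of its convolution with $h_y$ and $h_z$ (the time-domain signals obtained from the average spectra of the two digits), and predict the digit that gives the larger norm.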