In [1]:
import numpy as np
import discrete_signal
import matplotlib.pyplot as plt
from idft import idft
from dft import dft
from scipy.io.wavfile import write
import cmath
import sounddevice as sd
import time

In [2]:
class recordsound():
    """Record a fixed-length, single-channel clip through ``sounddevice``.

    Attributes:
        T: recording time; multiplied by ``fs`` to obtain the frame count.
        fs: sampling frequency passed to ``sd.rec`` and to the WAV writer.
    """

    def __init__(self, T, fs):
        """Store the recording time ``T`` and the sampling frequency ``fs``."""
        self.T, self.fs = T, fs

    def solve(self):
        """Record ``int(T * fs)`` frames, save them to disk, and return them.

        The clip is also written to ``myvoice.wav``; the same file name is
        used on every call, so an earlier clip is overwritten.

        Returns:
            The array produced by ``sd.rec`` once recording has finished.
        """
        n_frames = int(self.T * self.fs)

        print('start recording')
        clip = sd.rec(n_frames, samplerate=self.fs, channels=1)
        sd.wait()  # block until the whole clip has been captured
        print('end recording')

        # Keep a copy of the latest clip on disk as a WAV file.
        write('myvoice.wav', self.fs, clip)
        return clip

In [6]:
# Training-set acquisition: record each digit `num_recs` times and store
# the raw clips, one (num_recs, samples) array per digit.
T = 1
fs = 8000
num_recs = 10
digits = [1, 2]

samples_per_rec = int(T * fs)  # frames in a single clip
digit_recs = []

for digit in digits:
    print('When prompted to speak, say ' + str(digit) + '. \n')
    partial_recs = np.zeros((num_recs, samples_per_rec))
    for i in range(num_recs):
        time.sleep(2)  # pause before each take
        digit_recorder = recordsound(T, fs)
        # solve() returns the recorded clip; flatten it to a 1-D row.
        spoken_digit = digit_recorder.solve().reshape(samples_per_rec)
        partial_recs[i] = spoken_digit
    digit_recs.append(partial_recs)

# Persist the raw recordings so the DFT step can be re-run without re-recording.
np.save("recorded_digits.npy", digit_recs)

# Compute the DFT of every stored training recording and save the results.
# Each recording is normalised to unit energy first so that loudness does not
# influence the later spectrum comparisons.
digit_recs = np.load("recorded_digits.npy")
digits = [1, 2]
num_digits = len(digit_recs)
num_recs, N = digit_recs[0].shape
fs = 8000
DFTs = []
DFTs_c = []

for digit_rec in digit_recs:
    # np.complex128 is the canonical dtype name; the np.complex_ alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    DFTs_aux = np.zeros((num_recs, N), dtype=np.complex128)
    DFTs_c_aux = np.zeros((num_recs, N), dtype=np.complex128)
    for i in range(num_recs):
        rec_i = digit_rec[i, :]  # view: the in-place division below also updates digit_recs
        energy_rec_i = np.linalg.norm(rec_i)
        # A silent clip has zero energy; dividing would fill the row with NaN
        # and poison every statistic later derived from this digit's spectra.
        if energy_rec_i > 0:
            rec_i /= energy_rec_i
        DFT_rec_i = dft(rec_i, fs)
        # NOTE(review): presumably X is the DFT and X_c its centred version;
        # confirm against dft.solve().
        [_, X, _, X_c] = DFT_rec_i.solve()
        DFTs_aux[i, :] = X
        DFTs_c_aux[i, :] = X_c
    DFTs.append(DFTs_aux)
    DFTs_c.append(DFTs_c_aux)

np.save("spoken_digits_DFTs.npy", DFTs)
np.save("spoken_digits_DFTs_c.npy", DFTs_c)

When prompted to speak, say 1. 

start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
When prompted to speak, say 2. 

start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording
start recording
end recording


In [None]:
# Test-set acquisition: record `num_recs` clips in which the speaker says
# either digit, then store them both as an array and as one audio file.
T = 1  # recording time
fs = 8000  # sampling frequency
num_recs = 10  # number of recordings for the test set
digit_recs = []

partial_recs = np.zeros((num_recs, int(T*fs)))
print('When prompted to speak, say 1 or 2' + '. \n')
for i in range(num_recs):
    time.sleep(2)  # pause before each take
    digit_recorder = recordsound(T, fs)
    spoken_digit = digit_recorder.solve().reshape(int(T*fs))
    partial_recs[i, :] = spoken_digit
digit_recs.append(partial_recs)

# Storing recorded voices
np.save("test_set.npy", partial_recs)

# Creating an audio file with the spoken digits
# reshape(-1) concatenates the rows whatever their length; the previous
# T*fs*num_recs shape became a float (rejected by reshape) for fractional T
# and did not match the int(T*fs) row length used above.
test_set_audio = partial_recs.reshape(-1)
file_name = 'test_set_audio_rec.wav'
write(file_name, fs, test_set_audio.astype(np.float32))


In [4]:
# Average-spectrum classifier: label each test recording with the digit whose
# mean training magnitude spectrum has the largest inner product with the
# recording's own magnitude spectrum.
T = 1
fs = 8000
test_set = np.load("test_set.npy")
training_set_DFTs = np.abs(np.load("spoken_digits_DFTs.npy"))

num_digits = len(training_set_DFTs)
_, N = training_set_DFTs[0].shape
# np.complex128 replaces the np.complex_ alias removed in NumPy 2.0. The
# stored values are real magnitudes; the complex dtype is kept only so the
# array is unchanged for anything that reads it afterwards.
average_spectra = np.zeros((num_digits, N), dtype=np.complex128)

for i in range(num_digits):
    average_spectra[i, :] = np.mean(training_set_DFTs[i], axis=0)

num_recs, N = test_set.shape
predicted_labels = np.zeros(num_recs)

for i in range(num_recs):
    rec_i = test_set[i, :]  # view: the in-place division below also updates test_set
    energy_rec_i = np.linalg.norm(rec_i)
    # Skip normalisation for a silent clip to avoid 0/0 = NaN.
    if energy_rec_i > 0:
        rec_i /= energy_rec_i
    DFT_rec_i = dft(rec_i, fs)
    [_, X, _, X_c] = DFT_rec_i.solve()

    X_mag = np.abs(X)  # computed once per recording rather than once per digit
    inner_prods = np.zeros(num_digits)
    for j in range(num_digits):
        inner_prods[j] = np.inner(X_mag, np.abs(average_spectra[j, :]))
    # Labels are 1-based: row 0 of the training set corresponds to digit 1.
    predicted_labels[i] = np.argmax(inner_prods) + 1

print("Average spectrum comparison --- predicted labels: \n")
# A 2-D ndarray prints exactly like the former np.matrix wrapper, without
# relying on the no-longer-recommended matrix class.
print(predicted_labels[:, None])

Average spectrum comparison --- predicted labels: 

[[1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]]


In [7]:

# Nearest-neighbour classifier: label each test recording with the digit of
# the single training recording whose magnitude spectrum has the largest
# inner product with the test recording's magnitude spectrum.
T = 1
fs = 8000
test_set = np.load("test_set.npy")

training_set_DFTs = np.load("spoken_digits_DFTs.npy")
num_digits = len(training_set_DFTs)

num_recs, N = test_set.shape
predicted_labels = np.zeros(num_recs)
training_set_size, _ = training_set_DFTs[0].shape

# Training magnitudes do not depend on the test recording, so compute them
# once instead of once per (test recording, digit, training sample).
training_set_mags = np.abs(training_set_DFTs)

for i in range(num_recs):
    rec_i = test_set[i, :]  # view: the in-place division below also updates test_set
    energy_rec_i = np.linalg.norm(rec_i)
    # Skip normalisation for a silent clip to avoid 0/0 = NaN.
    if energy_rec_i > 0:
        rec_i /= energy_rec_i
    DFT_rec_i = dft(rec_i, fs)
    [_, X, _, X_c] = DFT_rec_i.solve()

    X_mag = np.abs(X)
    inner_prods = np.zeros((num_digits, training_set_size))
    for j in range(num_digits):
        for k in range(training_set_size):
            sample_mag = (training_set_mags[j])[k, :]
            inner_prods[j, k] = np.inner(X_mag, sample_mag)
    # Row index of the overall maximum identifies the digit (1-based label).
    max_position = np.unravel_index(np.argmax(inner_prods), inner_prods.shape)
    predicted_labels[i] = max_position[0] + 1

print("Nearest neighbor comparison --- predicted labels: \n")
# A 2-D ndarray prints exactly like the former np.matrix wrapper.
print(predicted_labels[:, None])

Nearest neighbor comparison --- predicted labels: 

[[1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [1.]]


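In our case, the outputs of the average spectrum comparison and the nearest neighbor comparison coincide, and both classifiers reach a 100% accuracy level on this test set. Note that agreement between the two methods does not by itself establish accuracy; that requires comparing the predicted labels with the digits actually spoken. Of course, in a real-world setting, such a result is not to be expected, given the large amount of noise in the data and the immense amount of data presented to digital systems.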