In [2]:
from utils.utils import Utils
from preprocessing.AudioPreprocessor import AudioPreprocessor
from feature_extraction.LPCExtractor import LPCExtractor
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [20]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]
        
def get_data_set(count, speakers):
    nn_input_chunks_per_speaker = int(count/speakers)
    X = np.zeros((count, 12*20))
    y = np.zeros(count, dtype='uint8')
    
    frames_per_speaker = nn_input_chunks_per_speaker * 20
    
    all_speakers = []
    
    for i in range(0, speakers):
        all_speakers.append([])
        index = 0
        while (len(all_speakers[i]) < frames_per_speaker):
            print(index, end="\r")
            y_, sr = Utils.load_file(f"/home/henry/Downloads/archive/50_speakers_audio_data/Speaker_{10+i:04}/Speaker_{10+i:04}_{index:05}.wav")
        
            y_ = AudioPreprocessor.remove_noise(y=y_, sr=sr)
            y_ = AudioPreprocessor.remove_silence(y=y_)
            frames = AudioPreprocessor.create_frames(y=y_, frame_size=500, overlap=100)
            frames = AudioPreprocessor.window_frames(frames=frames)
            
            lpcc = LPCExtractor.lpc(frames=frames, order=12)
            # lpcc = LPCExtractor.lpcc(lpc_list=lpc, order=12)
            
            all_speakers[i] += lpcc
            
            index += 1
        print()
    
    for i in range(0, speakers):
        for j in range(0, nn_input_chunks_per_speaker):
            X[i*nn_input_chunks_per_speaker + j] = np.concatenate((all_speakers[i][20*j][1:13], 
                                           all_speakers[i][20*j+1][1:13], 
                                           all_speakers[i][20*j+2][1:13],
                                           all_speakers[i][20*j+3][1:13],
                                           all_speakers[i][20*j+4][1:13],
                                           all_speakers[i][20*j+5][1:13],
                                           all_speakers[i][20*j+6][1:13],
                                           all_speakers[i][20*j+7][1:13],
                                           all_speakers[i][20*j+8][1:13],
                                           all_speakers[i][20*j+9][1:13],
                                           all_speakers[i][20*j+10][1:13], 
                                           all_speakers[i][20*j+11][1:13], 
                                           all_speakers[i][20*j+12][1:13],
                                           all_speakers[i][20*j+13][1:13],
                                           all_speakers[i][20*j+14][1:13],
                                           all_speakers[i][20*j+15][1:13],
                                           all_speakers[i][20*j+16][1:13],
                                           all_speakers[i][20*j+17][1:13],
                                           all_speakers[i][20*j+18][1:13],
                                           all_speakers[i][20*j+19][1:13]
                                         ))
            y[i*nn_input_chunks_per_speaker + j] = i
            
    return X, y

In [21]:
count = 5500
speakers = 5
X, y = get_data_set(count=count, speakers=speakers)

8
9
9
10
9


In [28]:
def main(X, y, speakers):
    print(y)
    X, y = unison_shuffled_copies(X, y)
    print(y)
    # model takes 10 frames a 12 coefficients
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=[12*20]),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(speakers, activation=tf.nn.softmax)
    ])
    
    model.compile(optimizer=tf.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    model.fit(X[int(5*count/6):], y[int(5*count/6):], epochs=1000, verbose=0)
    
    test_loss, test_acc = model.evaluate(X[-int(count/6):], y[-int(count/6):])
    
    print(f"Test accuracy: {test_acc}")
    print(f"Test loss: {test_loss}")
    
    
    y_, sr = Utils.load_file(f"/home/henry/Downloads/archive/50_speakers_audio_data/Speaker_0014/Speaker_0014_00020.wav")
        
    y_ = AudioPreprocessor.remove_noise(y=y_, sr=sr)
    y_ = AudioPreprocessor.remove_silence(y=y_)
    frames = AudioPreprocessor.create_frames(y=y_, frame_size=500, overlap=100)
    frames = AudioPreprocessor.window_frames(frames=frames)
    
    lpcc = LPCExtractor.lpc(frames=frames, order=12)
    # lpcc = LPCExtractor.lpcc(lpc_list=lpc, order=12)
    
    X = np.zeros((int(len(lpcc)/20), 12*20))
    
    for j in range(0, int(len(lpcc)/20)):
        X[j] = np.concatenate((lpcc[20*j][1:13], 
                                lpcc[20*j+1][1:13], 
                                lpcc[20*j+2][1:13],
                                lpcc[20*j+3][1:13],
                                lpcc[20*j+4][1:13],
                                lpcc[20*j+5][1:13],
                                lpcc[20*j+6][1:13],
                                lpcc[20*j+7][1:13],
                                lpcc[20*j+8][1:13],
                                lpcc[20*j+9][1:13],
                                lpcc[20*j+10][1:13], 
                                lpcc[20*j+11][1:13], 
                                lpcc[20*j+12][1:13],
                                lpcc[20*j+13][1:13],
                                lpcc[20*j+14][1:13],
                                lpcc[20*j+15][1:13],
                                lpcc[20*j+16][1:13],
                                lpcc[20*j+17][1:13],
                                lpcc[20*j+18][1:13],
                                lpcc[20*j+19][1:13]
                                ))
    if X.shape[0] > 100:
        X_2 = X[-100:]
    pred = model.predict(X)
    print(np.argmax(pred, axis=1))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 0))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 1))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 2))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 3))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 4))
    # print(y[-100:])
    
    
if __name__ == "__main__":
    main(X, y, speakers)

[0 0 0 ... 4 4 4]
[2 3 2 ... 0 4 2]
Test accuracy: 0.6484715938568115
Test loss: 0.8353310227394104
[2 4 2 4 4 4 4 4 2 4 0 2 2 2 1 2 2 4 2 3 4 4 3 4 2 2 3 3 2 2 2 4 4 2 3 4 4
 0 4 0 2 4 2 4 4 4 4 4 2 3 0 2 2 4 2 2 4 2 0 2 4 2 2 4 4 2 0 4 2 2 4 4 2 2
 3 0 2 4 3 2 2 2 4 2 2 0 4 0 3 4 3 2 2 0 4 2 0 2 2 4 3 2 2 4 2 0 2 2 2 4 4
 2 2 2 2 2 2 4]
12
1
54
11
40
