In [1]:
import os
from os.path import isfile, join
from pathlib import Path
import shutil
import subprocess

import numpy as np
import tensorflow as tf
from tensorflow import keras
from IPython.display import display, Audio

In [2]:
voicefile_names = os.listdir("C:/Users/Himagnya/Downloads/16000_pcm_speeches/audio") 

In [3]:
noisefile_names = os.listdir("C:/Users/Himagnya/Downloads/16000_pcm_speeches/noise") 

In [4]:
valid_split = 0.1
shuffle_seed = 43
sample_rate = 16000
scale = 0.5
batch_size = 128
epochs = 15 

In [5]:
noise_path = "C:/Users/Himagnya/Downloads/16000_pcm_speeches/noise"
audio_path = "C:/Users/Himagnya/Downloads/16000_pcm_speeches/audio"

In [6]:
noise_paths = []

for subdir in os.listdir(noise_path):
    subdir_path= Path (noise_path) / subdir
    if os.path.isdir (subdir_path):
        noise_paths += [
            os.path.join(subdir_path, filepath)
            for filepath in os.listdir(subdir_path)
            if filepath.endswith(".wav")
        ]

In [7]:
noise_paths

['C:\\Users\\Himagnya\\Downloads\\16000_pcm_speeches\\noise\\other\\exercise_bike.wav',
 'C:\\Users\\Himagnya\\Downloads\\16000_pcm_speeches\\noise\\other\\pink_noise.wav',
 'C:\\Users\\Himagnya\\Downloads\\16000_pcm_speeches\\noise\\_background_noise_\\10convert.com_Audience-Claps_daSG5fwdA7o.wav',
 'C:\\Users\\Himagnya\\Downloads\\16000_pcm_speeches\\noise\\_background_noise_\\doing_the_dishes.wav',
 'C:\\Users\\Himagnya\\Downloads\\16000_pcm_speeches\\noise\\_background_noise_\\dude_miaowing.wav',
 'C:\\Users\\Himagnya\\Downloads\\16000_pcm_speeches\\noise\\_background_noise_\\running_tap.wav']

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display

In [15]:
#os.system (command)
def load_noise_sample(path):
    sample, sampling_rate = tf.audio.decode_wav(
        tf.io.read_file(path), desired_channels=1
    )
    print("Sampling rate of original audio", sampling_rate)
    if sampling_rate >= sample_rate:
        print("shape", sample.shape [0])
        slices = int(sample.shape [0] / sample_rate)
        print (slices)
        sample= tf.split(sample[: slices* sample_rate], slices)
        return sample
    else:
        print("Sampling rate for", path, "is incorrect")
        return None
noises = []
for path in noise_paths: 
    sample = load_noise_sample(path)
    if sample:
        noises.extend (sample)
noises = tf.stack(noises)

Sampling rate of original audio tf.Tensor(22050, shape=(), dtype=int32)
shape 1350648
84
Sampling rate of original audio tf.Tensor(22050, shape=(), dtype=int32)
shape 1323000
82
Sampling rate of original audio tf.Tensor(44100, shape=(), dtype=int32)
shape 718514
44
Sampling rate of original audio tf.Tensor(22050, shape=(), dtype=int32)
shape 2098788
131
Sampling rate of original audio tf.Tensor(22050, shape=(), dtype=int32)
shape 1362816
85
Sampling rate of original audio tf.Tensor(22050, shape=(), dtype=int32)
shape 1348479
84


In [17]:
def paths_and_labels_to_dataset (audio_paths, labels):
    path_ds = tf.data. Dataset.from_tensor_slices (audio_paths)
    audio_ds = path_ds.map (lambda x: path_to_audio (x))
    label_ds = tf.data. Dataset.from_tensor_slices (labels)
    return tf.data. Dataset.zip((audio_ds, label_ds))

In [18]:
def path_to_audio (path):
    audio = tf.io.read_file (path)
    audio, _ = tf.audio.decode_wav (audio, 1, sample_rate)
    return audio

In [19]:
def add_noise (audio, noises=None, scale=0.5):
    if noises is not None:
        tf_rnd = tf.random.uniform(
            (tf.shape (audio) [0],), 0, noises.shape [0], dtype=tf.int32
        )
        noise = tf.gather (noises, tf_rnd, axis=0)
        
        prop= tf.math.reduce_max (audio, axis=1) / tf.math.reduce_max (noise, axis=1)
        prop= tf.repeat (tf.expand_dims (prop, axis=1), tf.shape (audio)[1], axis=1)
        
        audio = audio + noise * prop * scale
    return audio

In [20]:
def audio_to_fft (audio):
    audio = tf.squeeze (audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex (real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims (fft, axis=-1)
    return tf.math.abs (fft[:,: (audio.shape [1] // 2), :])

In [21]:
class_names = os.listdir (audio_path)
print (class_names,)
audio_paths = []
labels = []
for label, name in enumerate (class_names):
    print("Speaker: ", (name))
    dir_path = Path (audio_path) / name
    speaker_sample_paths = [
        os.path.join (dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label]* len (speaker_sample_paths)

['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']
Speaker:  Benjamin_Netanyau
Speaker:  Jens_Stoltenberg
Speaker:  Julia_Gillard
Speaker:  Magaret_Tarcher
Speaker:  Nelson_Mandela


In [22]:
# Shuffle to generate random data
rng = np. random. RandomState (shuffle_seed)
rng.shuffle (audio_paths)
rng = np. random. RandomState(shuffle_seed)
rng.shuffle (labels) 

In [23]:
# Split into training and validation
num_val_samples = int (valid_split * len(audio_paths))
train_audio_paths = audio_paths[:-num_val_samples]
train_labels = labels[:-num_val_samples]
valid_audio_paths = audio_paths [-num_val_samples:]
valid_labels = labels [-num_val_samples:]

In [25]:
num_val_samples = int(valid_split * len(audio_paths))
print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:-num_val_samples]
train_labels = labels[:-num_val_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[-num_val_samples:]
valid_labels = labels[-num_val_samples:]

Using 6751 files for training.
Using 750 files for validation.


In [26]:
# Create datasets, one for training and the other for validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=batch_size * 8, seed=shuffle_seed).batch(
    batch_size
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=shuffle_seed).batch(32)

In [27]:
# Add noise to the training set
train_ds = train_ds.map(
    lambda x, y: (add_noise (x, noises, scale=scale), y),
    num_parallel_calls = tf.data.experimental.AUTOTUNE,
)

# Transform audio wave to the frequency domain using 'audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft (x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
)

train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft (x), y), num_parallel_calls=tf.data.experimental.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.experimental.AUTOTUNE)

In [28]:
from tensorflow.keras.layers import Conv1D

In [29]:
def residual_block(x, filters, conv_num= 3, activation = "relu"):
    s = keras.layers. Conv1D (filters, 1, padding = "same") (x)
    
    for i in range (conv_num - 1):
        x = keras.layers. Conv1D (filters, 3, padding = "same") (x)
        x = keras.layers. Activation (activation) (x)
    x = keras.layers. Conv1D (filters, 3, padding = "same") (x)
    x = keras.layers.Add() ([x, s])
    x = keras.layers. Activation (activation) (x)
    return keras.layers. MaxPool1D (pool_size = 2, strides= 2)(x)

def build_model (input_shape, num_classes):
    inputs = keras. layers. Input (shape = input_shape, name = "input")

    x = residual_block (inputs, 16, 2)
    x = residual_block (inputs, 32, 2)
    x = residual_block (inputs, 64, 3)
    x = residual_block (inputs, 128, 3)
    x = residual_block (inputs, 128, 3)
    x = keras.layers. AveragePooling1D (pool_size=3, strides=3)(x)
    x = keras.layers. Flatten() (x)
    x = keras.layers.Dense (256, activation="relu") (x)
    x = keras.layers. Dense (128, activation="relu") (x)
    outputs = keras.layers.Dense (num_classes, activation = "softmax", name = "output") (x)
    return keras.models. Model(inputs = inputs, outputs = outputs)

model = build_model((sample_rate // 2, 1), len(class_names))
model.summary()
model.compile (optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model_save_filename= "model.h5"
earlystopping_cb = keras.callbacks. EarlyStopping (patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks. ModelCheckpoint (
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

In [30]:
epochs = 50

Training 

In [31]:
history = model.fit(
    train_ds,
    epochs = epochs,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.3951 - loss: 75.7275 



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m565s[0m 10s/step - accuracy: 0.3984 - loss: 74.8623 - val_accuracy: 0.7747 - val_loss: 0.5503
Epoch 2/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.8752 - loss: 0.3411



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m538s[0m 10s/step - accuracy: 0.8757 - loss: 0.3398 - val_accuracy: 0.9267 - val_loss: 0.1893
Epoch 3/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.9462 - loss: 0.1415 



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m548s[0m 10s/step - accuracy: 0.9464 - loss: 0.1413 - val_accuracy: 0.9613 - val_loss: 0.1030
Epoch 4/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.9754 - loss: 0.0782



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 10s/step - accuracy: 0.9753 - loss: 0.0785 - val_accuracy: 0.9680 - val_loss: 0.0932
Epoch 5/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.9684 - loss: 0.0913 



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m545s[0m 10s/step - accuracy: 0.9684 - loss: 0.0912 - val_accuracy: 0.9760 - val_loss: 0.0698
Epoch 6/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m529s[0m 10s/step - accuracy: 0.9784 - loss: 0.0560 - val_accuracy: 0.9720 - val_loss: 0.0744
Epoch 7/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 10s/step - accuracy: 0.9729 - loss: 0.0737 - val_accuracy: 0.9600 - val_loss: 0.1289
Epoch 8/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.9813 - loss: 0.0538



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 10s/step - accuracy: 0.9813 - loss: 0.0537 - val_accuracy: 0.9800 - val_loss: 0.0806
Epoch 9/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 10s/step - accuracy: 0.9783 - loss: 0.0632 - val_accuracy: 0.9747 - val_loss: 0.0855
Epoch 10/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 10s/step - accuracy: 0.9874 - loss: 0.0420 - val_accuracy: 0.9800 - val_loss: 0.0600
Epoch 11/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m528s[0m 10s/step - accuracy: 0.9890 - loss: 0.0317 - val_accuracy: 0.9800 - val_loss: 0.0623
Epoch 12/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m521s[0m 10s/step - accuracy: 0.9842 - loss: 0.0446 - val_accuracy: 0.9587 - val_loss: 0.0899
Epoch 13/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m524s[0m 10s/step - accuracy: 0.9806 - loss: 0.0486 - val_accuracy: 0.9747 - val_loss: 0.0710
Epoch 14/50
[1m53/53[0m [32m━━━━



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 10s/step - accuracy: 0.9920 - loss: 0.0220 - val_accuracy: 0.9827 - val_loss: 0.0793
Epoch 15/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 10s/step - accuracy: 0.9809 - loss: 0.0596 - val_accuracy: 0.9827 - val_loss: 0.0531
Epoch 16/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 10s/step - accuracy: 0.9929 - loss: 0.0272 - val_accuracy: 0.9773 - val_loss: 0.0736
Epoch 17/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 10s/step - accuracy: 0.9909 - loss: 0.0215 - val_accuracy: 0.9813 - val_loss: 0.0682
Epoch 18/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m515s[0m 10s/step - accuracy: 0.9907 - loss: 0.0308 - val_accuracy: 0.9747 - val_loss: 0.1080
Epoch 19/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.9944 - loss: 0.0183



[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m520s[0m 10s/step - accuracy: 0.9944 - loss: 0.0182 - val_accuracy: 0.9880 - val_loss: 0.0503
Epoch 20/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m528s[0m 10s/step - accuracy: 0.9953 - loss: 0.0125 - val_accuracy: 0.9840 - val_loss: 0.0591
Epoch 21/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 10s/step - accuracy: 0.9941 - loss: 0.0161 - val_accuracy: 0.9867 - val_loss: 0.0504
Epoch 22/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 10s/step - accuracy: 0.9940 - loss: 0.0191 - val_accuracy: 0.9827 - val_loss: 0.0840
Epoch 23/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m522s[0m 10s/step - accuracy: 0.9881 - loss: 0.0437 - val_accuracy: 0.9813 - val_loss: 0.0692
Epoch 24/50
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 10s/step - accuracy: 0.9923 - loss: 0.0288 - val_accuracy: 0.9747 - val_loss: 0.0927
Epoch 25/50
[1m53/53[0m [32m━━━

In [32]:
print ("Accuracy of model: ", model.evaluate(valid_ds))

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 360ms/step - accuracy: 0.9893 - loss: 0.0474
Accuracy of model:  [0.05026037618517876, 0.9879999756813049]


In [33]:
SAMPLES_TO_DISPLAY = 10
test_ds= paths_and_labels_to_dataset (valid_audio_paths, valid_labels)
test_ds = test_ds.shuffle (buffer_size = batch_size * 8, seed=shuffle_seed).batch(
    batch_size
)

test_ds = test_ds.map (lambda x, y: (add_noise (x, noises, scale=scale), y))

for audios, labels in test_ds.take (1):
    ffts = audio_to_fft (audios)
    y_pred = model.predict (ffts)
    rnd = np. random. randint(0, batch_size, SAMPLES_TO_DISPLAY)
    audios = audios.numpy () [rnd, :, :]
    labels = labels.numpy ( ) [rnd]
    y_pred = np.argmax (y_pred, axis=-1) [rnd]

    for index in range (SAMPLES_TO_DISPLAY):
        print( 
            "Speaker:\33{} {}\33 \tPredicted:\33{} {}\33 ".format(
                "[92m" if labels[index] == y_pred[index] else "[91m",
                class_names[labels [index]],
                "[92m" if labels[index] == y_pred [index] else "[91m",
                class_names[y_pred [index]],
            )
        )
        if labels [index] ==y_pred [index]:
            print("Welcome")
        else:
            print("Sorry")
        print("The speaker is" if labels[index]== y_pred [index] else "", class_names [y_pred [index]])

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 375ms/step
Speaker:[92m Magaret_Tarcher 	Predicted:[92m Magaret_Tarcher 
Welcome
The speaker is Magaret_Tarcher
Speaker:[92m Julia_Gillard 	Predicted:[92m Julia_Gillard 
Welcome
The speaker is Julia_Gillard
Speaker:[92m Julia_Gillard 	Predicted:[92m Julia_Gillard 
Welcome
The speaker is Julia_Gillard
Speaker:[92m Julia_Gillard 	Predicted:[92m Julia_Gillard 
Welcome
The speaker is Julia_Gillard
Speaker:[92m Magaret_Tarcher 	Predicted:[92m Magaret_Tarcher 
Welcome
The speaker is Magaret_Tarcher
Speaker:[92m Benjamin_Netanyau 	Predicted:[92m Benjamin_Netanyau 
Welcome
The speaker is Benjamin_Netanyau
Speaker:[92m Nelson_Mandela 	Predicted:[92m Nelson_Mandela 
Welcome
The speaker is Nelson_Mandela
Speaker:[91m Jens_Stoltenberg 	Predicted:[91m Benjamin_Netanyau 
Sorry
 Benjamin_Netanyau
Speaker:[92m Nelson_Mandela 	Predicted:[92m Nelson_Mandela 
Welcome
The speaker is Nelson_Mandela
Speaker:[9

In [37]:
def paths_to_dataset (audio_paths):
    path_ds = tf.data. Dataset.from_tensor_slices (audio_paths)
    return tf.data. Dataset.zip((path_ds))

def predict (path, labels):
    test = paths_and_labels_to_dataset(path, labels)
    
    test = test.shuffle(buffer_size=batch_size* 8, seed=shuffle_seed). batch (
        batch_size 
    )
    test = test.prefetch(tf.data.experimental.AUTOTUNE)

    test = test.map (lambda x, y: (add_noise (x, noises, scale=scale), y))
    
    for audios, labels in test.take (1):
        ffts = audio_to_fft (audios)
        y_pred = model.predict (ffts)
        rnd = np.random.randint(0, 1, 1)
        audios= audios.numpy () [rnd, :]
        labels = labels.numpy () [rnd]
        y_pred = np.argmax(y_pred, axis=-1)[rnd]

        for index in range (1):
                print (
                "Speaker: \33{} {}\33 \tPredicted: \33{} {}\33 " .format(
                "[92m",y_pred[index],
                    "[92m", y_pred[index]
                    )
                )

                print("Speaker Predicted: ", class_names [y_pred[index]])
                

In [40]:
path = ["C:/Users/Himagnya/Downloads/16000_pcm_speeches/Julia_Gillard/963.wav"]
labels = ["unknown"]

predict (path, labels)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
Speaker: [92m 2 	Predicted: [92m 2 
Speaker Predicted:  Julia_Gillard
