In [1]:
#%pip install tensorflow-io
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.io as tfio
from tensorflow.io import gfile 
from tensorflow.python.ops import gen_audio_ops as audio_ops
from tqdm.notebook import tqdm 

In [2]:
#naming constants
SOUNDS = "Sound Recording"
#epsilon figures out the audio in the sample
EPS = 0.1                   #0.1 is like the constant stuff yunooo
REAL_LEN = 20

In [3]:
#shit ton of shit
words = [
    'backward',
    'bed',
    'bird',
    'cat',
    'dog',
    'down',
    'eight',
    'five',
    'follow',
    'forward',
    'four',
    'go',
    'happy',
    'house',
    'learn',
    'left',
    'apeke',
    'nine',
    'no',
    'off',
    'on',
    'one',
    'right',
    'seven',
    'six',
    'stop',
    'three',
    'tree',
    'two',
    'up',
    'yes',
    'zero',
    '_background',
]

In [4]:
#getting the sound file and converts it to a list
from glob import glob
def get_file(word):
    return gfile.glob(SOUNDS + '/' + word + '/*.wav')
#trying to find where the audio begins and end
#burnaboy & dave: Location of the audio
def get_audio_position(audio, epsilon):
    audio = audio - np.mean(audio)  #makes for a consistent audio
    audio = audio /np.max(np.abs(audio))  #normalizing audio
    return tfio.audio_trim(audio, axis = 0, epsilon = ESP)

def get_lenght_of_audio(audio, epsilon):
    position = get_lenght_of_audio(audio, epsilon)
    lenght = (position[1] - position[0]) #figuring out the proper lenght of the audio
    return lenght.numpy()

#is there enough voice?
def is_enough_voice(audio,epsilon, required_lenght):
    voice_lenght = get_lenght_of_audio(au dio, epsilon)
    return voice_lenght >= required_lenght           #returns a boolean

#is the audio of appropriate size
def appropriate_lenght(audio, expected_lenght):
    return audio[0] == expected_lenght               #returns a boolean

def valid_file(file_name):
    #load file
    audio_tensor = tfio.AudioIOTensor(file_name)
    if not appropriate_lenght(audio_tensor, REAL_LEN):
        return False
    #normalizing once more form -1 to 1
    audio = tf.cast(audio.tensor[:], tf.float32)
    audio = audio - np.mean(audio)
    audio = audio / np.max(np.abs(audio))
    #checking if there is voice in the audio
    if not is_enough_voice(audio,epsilon, required_lenght):
        return False
    return True

In [5]:
def get_spectrogram(audio):
    # normalizng the audio again
    audio = audio - np.mean(audio)
    audio = audio / np.max(np.abs(audio))
    # create the spectrogram
    spectrogram = audio_ops.audio_spectrogram(audio,
                                              window_size=320,  #values are subject to change
                                              stride=160,
                                              magnitude_squared=True).numpy()
    # Average pooling
    spectrogram = tf.nn.pool(
        input=tf.expand_dims(spectrogram, -1), #adding a anew axis making it 3D
        window_shape=[1, 6],
        strides=[1, 6],
        pooling_type='AVG', #specifying the pooling type
        padding='SAME')     #ensuring the output is the same as the imput
    spectrogram = tf.squeeze(spectrogram, axis=0)       #axis 0 removed the axis that was there before reducing it to a 2D shape
    spectrogram = np.log10(spectrogram + 1e-6)
    return spectrogram

In [7]:
# putting the file in the specrogram we mad above
def processing_file(path):
    # loading the audio data
    audio_tensor = tfio.audio.AudioIOTensor(file_path)
    
    # normalize the audio from values -1 and 1
    audio = tf.cast(audio_tensor[:], tf.float32)
    audio = audio - np.mean(audio)
    audio = audio / np.max(np.abs(audio))
    
    # randomly reposition the audio in the sample
    voice_begin, voice_end = get_voice_position(audio, ESP)
    end_gap=len(audio) - voice_end
    random_offset = np.random.uniform(0, voice_begin+end_gap)
    audio = np.roll(audio,-random_offset+end_gap)      #i still dont properly understand this
    
    
    #I will not introduce background noise in the beta testing of the model cause it will distort the data availbale.
    #at this point im tired
    # add some random background noise
    background_volume = np.random.uniform(0, 0.1)      #ESP is 0.1
    
    # filtering in background noise files
    background_noise = get_files('_background_noise_')
    background_file = np.random.choice(background_noise)
    background_tensor = tfio.audio.AudioIOTensor(background_file)
    background_start = np.random.randint(0, (len(background_tensor) - REAL_LEN))  # I havent completely understood this
    
    # normalise the background noise
    background = tf.cast(background_tensor[background_start:background_start+ REAL_LEN], tf.float32)
    background = background - np.mean(background)
    background = background / np.max(np.abs(background))
    # mix the audio with the scaled background
    audio = audio + background_volume * background
    # get the spectrogram
    return get_spectrogram(audio) 

In [10]:
#Splitting data
train = []
validate = []
test = []

TRAIN_SIZE = 0.7
VALIDATE = 0.15
TEST_SIZE = 0.15

In [16]:
#lots of refractoring to be done whalai
def process_files(file_names, label, repeat=1):
    file_names = tf.repeat(file_names, repeat).numpy()   #tqdm creates a progress bar
    return [(process_file(file_names), label) for file_name in tqdm(file_names, desc=f"{word} ({label})", leave=False)]


# process the files for a word into the spectrogram and one hot encoding word value
def process_word(word, repeat=1):
    # the index of the word word we are processing
    label = words.index(word)
    # get a list of files names for the word
    file_names = [file_name for file_name in tqdm(get_files(word), desc="Checking", leave=False) if is_valid_file(file_name)]
    # randomly shuffle the filenames
    np.random.shuffle(file_names)
    # split the files into train, validate and test buckets
    train_size=int(TRAIN_SIZE*len(file_names))
    validation_size=int(VALIDATION_SIZE*len(file_names))
    test_size=int(TEST_SIZE*len(file_names))
    
    
    # get the training samples
    train.extend(
        process_files(
            file_names[:train_size],
            label,
            repeat=repeat
        )
    )
    # and the validation samples
    validate.extend(
        process_files(
            file_names[train_size:train_size+validation_size],
            label,
            repeat=repeat
        )
    )
    # and the test samples
    test.extend(
        process_files(
            file_names[train_size+validation_size:],
            label,
            repeat=repeat
        )
    )   #i dont know what i am doing 

# process all the words and all the files
for word in tqdm(word, desc="Processing words"):
    if '_' not in word:
        # add more examples of Apeke to balance our training set
        repeat = 70 if word == 'apeke' else 1  #change word to KW which is apeke
        #process_word(word, repeat=repeat)
    
print(len(train), len(test), len(validate))

Processing words:   0%|          | 0/1 [00:00<?, ?it/s]

0 0 0


*The block of code below hasnt been looked at because the initial testing of the code will require no background noise. Robustness of data isnt a cateria in our current build.*

In [22]:
#this block of code hasnt been looked at because the initial testing of the code will require no background noise.
#robustness of data isnt a cateria in our current build
# process the background noise files
def process_background(file_name, label):
    # load the audio file
    audio_tensor = tfio.audio.AudioIOTensor(file_name)
    audio = tf.cast(audio_tensor[:], tf.float32)
    audio_length = len(audio)
    samples = []
    for section_start in tqdm(range(0, audio_length-EXPECTED_SAMPLES, 8000), desc=file_name, leave=False):
        section_end = section_start + EXPECTED_SAMPLES
        section = audio[section_start:section_end]
        # get the spectrogram
        spectrogram = get_spectrogram(section)
        samples.append((spectrogram, label))

    # simulate random utterances
    for section_index in tqdm(range(1000), desc="Simulated Words", leave=False):
        section_start = np.random.randint(0, audio_length - EXPECTED_SAMPLES)
        section_end = section_start + EXPECTED_SAMPLES
        section = np.reshape(audio[section_start:section_end], (EXPECTED_SAMPLES))

        result = np.zeros((EXPECTED_SAMPLES))
        # create a pseudo bit of voice
        voice_length = np.random.randint(MINIMUM_VOICE_LENGTH/2, EXPECTED_SAMPLES)
        voice_start = np.random.randint(0, EXPECTED_SAMPLES - voice_length)
        hamming = np.hamming(voice_length)
        # amplify the voice section
        result[voice_start:voice_start+voice_length] = hamming * section[voice_start:voice_start+voice_length]
        # get the spectrogram
        spectrogram = get_spectrogram(np.reshape(section, (16000, 1)))
        samples.append((spectrogram, label))
        
    
    np.random.shuffle(samples)
    
    train_size=int(TRAIN_SIZE*len(samples))
    validation_size=int(VALIDATION_SIZE*len(samples))
    test_size=int(TEST_SIZE*len(samples))
    
    train.extend(samples[:train_size])

    validate.extend(samples[train_size:train_size+validation_size])

    test.extend(samples[train_size+validation_size:])

        
for file_name in tqdm(get_file('_background_noise_'), desc="Processing Background Noise"):
    process_background(file_name, words.index("_background"))
    
print(len(train), len(test), len(validate))

Processing Background Noise: 0it [00:00, ?it/s]

0 0 0


In [23]:
#the data was given by that guy did dyi alexa
#i still dont know if i should add this
def process_problem_noise(file_name, label):
    samples = []
    # load the audio file
    audio_tensor = tfio.audio.AudioIOTensor(file_name)
    audio = tf.cast(audio_tensor[:], tf.float32)
    audio_length = len(audio)
    samples = []
    for section_start in tqdm(range(0, audio_length-EXPECTED_SAMPLES, 400), desc=file_name, leave=False):
        section_end = section_start + EXPECTED_SAMPLES
        section = audio[section_start:section_end]
        # get the spectrogram
        spectrogram = get_spectrogram(section)
        samples.append((spectrogram, label))
        
    np.random.shuffle(samples)
    
    train_size=int(TRAIN_SIZE*len(samples))
    validation_size=int(VALIDATION_SIZE*len(samples))
    test_size=int(TEST_SIZE*len(samples))
    
    train.extend(samples[:train_size])
    validate.extend(samples[train_size:train_size+validation_size])
    test.extend(samples[train_size+validation_size:])


for file_name in tqdm(get_file("_problem_noise_"), desc="Processing problem noise"):
    process_problem_noise(file_name, words.index("_background"))

Processing problem noise: 0it [00:00, ?it/s]

In [25]:
print(len(train), len(test), len(validate))

0 0 0


In [26]:
# randomise the training samples
np.random.shuffle(train)
X_train, Y_train = zip(*train)
X_validate, Y_validate = zip(*validate)
X_test, Y_test = zip(*test)
# save the computed data
np.savez_compressed(
    "training_spectrogram.npz",
    X=X_train, Y=Y_train)
print("Saved training data")
np.savez_compressed(
    "validation_spectrogram.npz",
    X=X_validate, Y=Y_validate)
print("Saved validation data")
np.savez_compressed(
    "test_spectrogram.npz",
    X=X_test, Y=Y_test)
print("Saved test data")

ValueError: not enough values to unpack (expected 2, got 0)

In [21]:
# get the width and height of the spectrogram "image"
IMG_WIDTH=X_train[0].shape[0]
IMG_HEIGHT=X_train[0].shape[1]
def plot_images2(images_arr, imageWidth, imageHeight):
    fig, axes = plt.subplots(5, 5, figsize=(10, 20))
    axes = axes.flatten()
    for img, ax in zip(images_arr, axes):
        ax.imshow(np.reshape(img, (imageWidth, imageHeight)))
        ax.axis("off")
    plt.tight_layout()
    plt.show()
    

NameError: name 'X_train' is not defined