# Creating with TinyML - 04a. Generate a Voice Dataset
This Colab will download the [speech commands dataset](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html) (published by Google) and let you select a subset of words / speech commands to extract. It will then generate a balanced dataset with classes for your selected words. 

In addition, the dataset will contain a "silence" (ambient noise) class and an "unknown" class (random words not in your list).

## Set up the environment

In [None]:
import os
import sys
import tarfile
import urllib.request
import numpy as np
import shutil
import math
import soundfile as sf

## Constants, do not change

In [None]:
# Fixed settings for this notebook -- DO NOT CHANGE.

# Location of the speech commands archive that gets downloaded.
DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'
# Folder that receives the generated dataset.
OUT_DIR = 'dataset'
# Folder that holds the downloaded archive and its extracted contents.
DOWNLOAD_DIR = 'download/'

# Words that SELECTED_WORDS is validated against; every word that is not
# selected is used as a source for the "unknown" class.
# NOTE(review): the published v0.02 word list also has "backward", which is
# not listed here -- confirm whether the omission is intentional.
ALL_WORDS = [
  "bed", "bird", "cat", "dog", "down", "eight", "five", "follow",
  "forward", "four", "go", "happy", "house", "learn", "left", "marvin",
  "nine", "no", "off", "on", "one", "right", "seven", "sheila", "six",
  "stop", "three", "tree", "two", "up", "visual", "wow", "yes", "zero",
]



# Configure

Select one or more of the following words:

* bed
* bird
* cat
* dog
* down
* eight
* five
* follow
* forward
* four
* go
* happy
* house
* learn
* left
* marvin
* nine
* no
* off
* on
* one
* right
* seven
* sheila
* six
* stop
* three
* tree
* two
* up
* visual
* wow
* yes
* zero




In [None]:

# Pick the words you want a class for
SELECTED_WORDS = ['cat','dog']
# Cap on the number of files generated; generate() only applies it when > 0
MAX_FILES = 3000

# Stop right away when the selection contains a word the dataset does not have
bad_word = next((word for word in SELECTED_WORDS if word not in ALL_WORDS), None)
if bad_word is not None:
  raise Exception(f"{bad_word} is not a word in the dataset")


## Function definitions
Functions to download and process our data

In [None]:
# Adapted from: https://github.com/tensorflow/tensorflow/blob/de034a2911bdf0547a92e79b0c9858f8c4625fe0/tensorflow/examples/speech_commands/input_data.py
def maybe_download_and_extract_dataset(data_url, dest_directory):
  """Download and extract data set tar file.

  If the archive named by `data_url` is not yet present in `dest_directory`,
  it is downloaded there and unpacked into the same directory. When the
  archive file already exists nothing is downloaded and nothing is extracted.
  If `data_url` is empty or None, the function returns immediately and the
  data directory is expected to contain the correct files already.

  The download is written to a temporary `<archive>.part` file and only
  renamed to its final name once it has completed, so an interrupted download
  is retried on the next call instead of being mistaken for a finished one.

  Args:
    data_url: Web location of the tar file containing the data set.
    dest_directory: File path to extract data to.

  Raises:
    Exception: whatever the download raised, re-raised after a hint has been
      printed.
  """
  if not data_url:
    return
  os.makedirs(dest_directory, exist_ok=True)
  file_name = data_url.split('/')[-1]
  file_path = os.path.join(dest_directory, file_name)

  if os.path.exists(file_path):
    return

  def _progress(count, block_size, total_size):
    # total_size is not positive when the server reports no length; show the
    # byte count instead of dividing by it.
    if total_size > 0:
      status = '%.1f%%' % (float(count * block_size) / float(total_size) * 100.0)
    else:
      status = '%d bytes' % (count * block_size)
    sys.stdout.write('\r>> Downloading %s %s' % (file_name, status))
    sys.stdout.flush()

  partial_path = file_path + '.part'
  try:
    urllib.request.urlretrieve(data_url, partial_path, _progress)
  except Exception:
    print(
      'Failed to download URL: {0} to folder: {1}. Please make sure you '
      'have enough free space and an internet connection'.format(
        data_url, file_path))
    # Do not leave a truncated download behind.
    if os.path.exists(partial_path):
      os.remove(partial_path)
    raise
  os.replace(partial_path, file_path)
  print()
  statinfo = os.stat(file_path)
  print('Successfully downloaded {0} ({1} bytes)'.format(
    file_name, statinfo.st_size))

  # The context manager closes the archive even when extraction fails.
  with tarfile.open(file_path, 'r:gz') as archive:
    if hasattr(tarfile, 'data_filter'):
      # Refuse members that would be written outside dest_directory.
      archive.extractall(dest_directory, filter='data')
    else:
      archive.extractall(dest_directory)


In [None]:
def _generate_silence(file, num):
  """Write `num` randomly placed, randomly attenuated clips of a noise file.

  The audio in `file` is read once. For every clip a whole-second offset is
  drawn at random, `sr` samples (one second) are taken from there and scaled
  by a random gain in [0, 1). The clips are written to OUT_DIR/silence as
  `<base name of file>_<index>.wav`.

  Each clip is a new array: scaling a slice of `data` in place would also
  change the source audio and every clip previously cut from the same region.

  Args:
    file: Path of the audio file to cut clips from.
    num: Number of clips to write.
  """
  data, sr = sf.read(file)
  duration = int(np.ceil(len(data) / sr))

  out_path = os.path.join(OUT_DIR, 'silence')
  os.makedirs(out_path, exist_ok=True)

  base_name = os.path.splitext(os.path.basename(file))[0]
  # The check runs on the whole path, so a file inside a folder named
  # `_background_noise_` matches as well.
  # NOTE(review): confirm whether only the file name was meant to be tested.
  extra_quiet = '_noise_' in file

  for i in range(num):
    start = math.floor(np.random.uniform(0, duration-1)) * sr
    # Multiplying into a new array leaves `data` untouched.
    clip = data[start: start + sr] * np.random.uniform(0, 1)
    if extra_quiet:
      clip = clip * 0.1
    sf.write(os.path.join(out_path, f'{base_name}_{i}.wav'), clip, sr)



def _write_files(labeled_files, unknown_files, noise_files):
  """Create OUT_DIR from scratch and fill it with one folder per class.

  An existing OUT_DIR is deleted first. Then:
    * the files of every selected word are copied to OUT_DIR/<word>,
    * `unknown_files` are copied to OUT_DIR/unknown,
    * silence clips are generated from `noise_files`, an equal number per
      noise file, adding up to at most len(unknown_files).

  Args:
    labeled_files: One sequence of file paths per entry of SELECTED_WORDS,
      in the same order.
    unknown_files: File paths for the "unknown" class.
    noise_files: File paths of the recordings the silence clips are cut from.

  Raises:
    ValueError: if `noise_files` is empty. This is checked before anything
      is deleted.
  """
  if len(noise_files) == 0:
    raise ValueError('no noise files found, cannot generate the silence class')

  if os.path.isdir(OUT_DIR):
    shutil.rmtree(OUT_DIR)
  os.mkdir(OUT_DIR)

  for label, files in zip(SELECTED_WORDS, labeled_files):
    label_path = os.path.join(OUT_DIR, label)
    os.mkdir(label_path)
    for file_path in files:
      shutil.copy2(file_path, label_path)

  unknown_path = os.path.join(OUT_DIR, 'unknown')
  os.mkdir(unknown_path)

  for file_path in unknown_files:
    # These files come from several word folders and may share a base name;
    # prefixing the source folder keeps one copy from overwriting another.
    source_word = os.path.basename(os.path.dirname(file_path))
    target_name = f'{source_word}_{os.path.basename(file_path)}'
    shutil.copy2(file_path, os.path.join(unknown_path, target_name))

  num_slices_per_noise_file = math.floor(len(unknown_files) / len(noise_files))
  print('num_slices_per_noise_file',num_slices_per_noise_file)
  for noise_file in noise_files:
    _generate_silence(noise_file, num_slices_per_noise_file)

  
def _get_wav_files(label):
  dir = os.path.join(DOWNLOAD_DIR, label)
  files = []
  for file in os.listdir(dir):
    if file.endswith(".wav"):
        files.append(os.path.join(DOWNLOAD_DIR, label, file))
  return files

def _check_label(label):
  if not label in ALL_WORDS:
    raise Exception(f"{label} is not a word in the dataset")

def generate():
  """Build the dataset in OUT_DIR from the extracted recordings.

  Steps:
    1. Validate SELECTED_WORDS.
    2. Seed NumPy's random generator with 0.
    3. Collect the files of every selected word. When MAX_FILES > 0, keep a
       random subset of at most MAX_FILES / (len(SELECTED_WORDS) + 2) files
       per word (the + 2 presumably reserves room for the "unknown" and
       "silence" classes).
    4. Draw as many files from the words that were not selected as the first
       selected word has, to form the "unknown" class.
    5. Hand everything, plus the files in `_background_noise_`, to
       _write_files.

  Raises:
    ValueError: if SELECTED_WORDS is empty.
    Exception: if a selected word is not in ALL_WORDS.
  """
  if not SELECTED_WORDS:
    raise ValueError('SELECTED_WORDS must contain at least one word')
  for label in SELECTED_WORDS:
    _check_label(label)

  np.random.seed(0)

  list_of_files = [_get_wav_files(label) for label in SELECTED_WORDS]
  if MAX_FILES > 0:
    max_per_label = math.floor(MAX_FILES / (len(SELECTED_WORDS)+2))
    # Sampling without replacement cannot return more files than a word has,
    # so the request is capped at what is available.
    list_of_files = [
      np.random.choice(files, min(max_per_label, len(files)), replace=False)
      for files in list_of_files
    ]

  num_word_files = len(list_of_files[0])

  # Every recording of a word that was not selected is a candidate for the
  # "unknown" class.
  list_of_non_word_files = []
  for word in ALL_WORDS:
    if word not in SELECTED_WORDS:
      list_of_non_word_files.extend(_get_wav_files(word))

  num_unknown = min(num_word_files, len(list_of_non_word_files))
  if num_unknown > 0:
    random_selection = np.random.choice(list_of_non_word_files, num_unknown, replace=False)
  else:
    # Nothing to draw from, e.g. when every word has been selected.
    random_selection = []

  noise_files = _get_wav_files('_background_noise_')

  _write_files(list_of_files, random_selection, noise_files)

  

# Download dataset and process

In [None]:
# Fetch the archive into DOWNLOAD_DIR and unpack it there; both steps are
# skipped when the archive file is already present.
maybe_download_and_extract_dataset(DATA_URL, DOWNLOAD_DIR)
# Build the dataset folders under OUT_DIR; an existing OUT_DIR is replaced.
generate()

In [None]:
# Zip the generated dataset folder (shell command run by the notebook).
!zip -r /content/my_voice_dataset.zip $OUT_DIR
# Hand the zip file to Colab's file helper so it can be downloaded.
from google.colab import files
files.download("/content/my_voice_dataset.zip")
