In [1]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd

import re
import hashlib
from tensorflow.python.util import compat
import pickle
import random
import io
random.seed(59185)

%matplotlib inline

In [2]:
# Directory holding the background-noise .wav recordings that noise_pool() scans.
noise_dir = '/home/maikfangogoair/tmp/label_data/_background_noise_/'
def noise_pool(noise_dir):
    """Cut every wav file in ``noise_dir`` into one-second clips.

    Each file is loaded at its native sample rate (``sr=None``) and split into
    consecutive, non-overlapping windows of ``sr`` samples. A trailing
    remainder shorter than one second is dropped.

    Args:
        noise_dir: Directory containing the noise recordings (with or without
            a trailing path separator).

    Returns:
        ``(noise_list, white_list)``: two lists of 1-D sample arrays. Clips cut
        from ``white_noise.wav`` go to ``white_list``; clips from every other
        wav file go to ``noise_list``.
    """
    noise_list = []
    white_list = []
    # os.listdir() order is arbitrary; sorting keeps the pool order (and thus
    # every seeded random pick from it) reproducible across machines.
    for name in sorted(os.listdir(noise_dir)):
        if not name.endswith((".wav", ".WAV")):
            continue
        y, sr = librosa.load(os.path.join(noise_dir, name), sr=None)
        target = white_list if name == "white_noise.wav" else noise_list
        # The "+ 1" keeps the final window when the file length is an exact
        # multiple of the sample rate.
        for start in range(0, y.shape[0] - sr + 1, sr):
            target.append(y[start:start + sr])

    print("There are %d one second noise." % len(noise_list))
    print("There are %d one second white." % len(white_list))
    return noise_list, white_list
# Build the clip pools once at module level; get_spect() draws its background
# noise from them.
NOISE_LIST, WHITE_LIST = noise_pool(noise_dir)
NOISE_POOL_SIZE = len(NOISE_LIST)
WHITE_POOL_SIZE = len(WHITE_LIST)

There are 337 one second noise.
There are 59 one second white.


In [3]:
#https://github.com/adiyoss/GCommandsPytorch/blob/master/gcommand_loader.py
def get_spect(path, normalize=True, window_size=.02, window_stride=.01, window='hamming', max_len=101,
              add_noise=False, use_white=False, background_loud=0.3, foreground_loud=1.0):
    """Load an audio file and return its log-magnitude STFT spectrogram.

    Args:
        path: Audio file to load; its native sample rate is kept.
        normalize: If true, standardize the result to zero mean and unit
            standard deviation (skipped when the standard deviation is 0).
        window_size: STFT window length in seconds.
        window_stride: STFT hop length in seconds.
        window: Window specification passed to ``librosa.stft``.
        max_len: Number of time frames in the output. Shorter spectrograms are
            zero-padded on the right, longer ones are truncated.
        add_noise: If true, mix one randomly chosen clip from the module-level
            noise pools into the signal before the STFT.
        use_white: With ``add_noise``, draw from NOISE_LIST and WHITE_LIST
            together instead of NOISE_LIST only.
        background_loud: Gain applied to the noise clip.
        foreground_loud: Gain applied to the loaded signal.

    Returns:
        Array of shape ``(1, n_freq_bins, max_len)``.
    """
    y, sr = librosa.load(path, sr=None)
    if add_noise:
        # Index both pools as if they were concatenated, without building a
        # new combined list on every call.
        pool_size = NOISE_POOL_SIZE + WHITE_POOL_SIZE if use_white else NOISE_POOL_SIZE
        idx = random.randint(0, pool_size - 1)
        noise = NOISE_LIST[idx] if idx < NOISE_POOL_SIZE else WHITE_LIST[idx - NOISE_POOL_SIZE]
        # np.resize truncates the clip for shorter signals and repeats it for
        # longer ones, so the mix never fails on a length mismatch.
        noise = np.resize(noise, y.shape[0])
        y = foreground_loud * y + background_loud * noise
    n_fft = int(sr * window_size)
    win_length = n_fft
    hop_length = int(sr * window_stride)

    # STFT
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                     win_length=win_length, window=window)
    spect, _ = librosa.magphase(D)
    # S = log(S+1)
    spect = np.log1p(spect)

    # Give every spectrogram the same number of time frames (axis 1).
    n_frames = spect.shape[1]
    if n_frames < max_len:
        pad = np.zeros((spect.shape[0], max_len - n_frames))
        spect = np.hstack((spect, pad))
    elif n_frames > max_len:
        # Truncate along time, not along the frequency axis.
        spect = spect[:, :max_len]
    # Add the leading singleton axis.
    spect = spect[np.newaxis, :, :]

    if normalize:
        mean = spect.mean()
        std = spect.std()
        if std != 0:
            spect = (spect - mean) / std

    return spect

In [4]:
# def extract_background(path,func):
#     S_full = func(path=path, normalize=False)
    
#     S_filter = librosa.decompose.nn_filter(S_full,
#                                            aggregate=np.median,
#                                            metric='cosine',
#                                            width=int(librosa.time_to_frames(1, sr=16000)))


#     S_filter = np.minimum(S_full, S_filter)

#     margin_i, margin_v = 2, 10
#     power = 2

#     mask_i = librosa.util.softmask(S_filter,
#                                    margin_i * (S_full - S_filter),
#                                    power=power)

#     # mask_v = librosa.util.softmask(S_full - S_filter,
#     #                                margin_v * S_filter,
#     #                                power=power)

#     # S_foreground = mask_v * S_full
#     S_background = mask_i * S_full
#     return S_background
    

In [5]:
#copy from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/speech_commands/input_data.py#L61
def which_set(filename, validation_percentage, testing_percentage):
  """Determines which data partition the file should belong to.

  The assignment is a pure function of the file's base name and the two
  percentages: the name is hashed and the hash is mapped onto a 0-100 scale,
  so a file stays in the same partition no matter which other files are added
  later. Anything from '_nohash_' onwards is ignored, which keeps e.g.
  'bobby_nohash_0.wav' and 'bobby_nohash_1.wav' in the same partition.

  Args:
    filename: File path of the data sample (str).
    validation_percentage: How much of the data set to use for validation.
    testing_percentage: How much of the data set to use for testing.

  Returns:
    String, one of 'training', 'validation', or 'testing'.
  """
  base_name = os.path.basename(filename)
  # Drop the '_nohash_...' suffix so close variations of one recording hash
  # to the same value.
  hash_name = re.sub(r'_nohash_.*$', '', base_name)
  # Plain UTF-8 encoding is all that is needed to feed the name to sha1; no
  # TensorFlow helper is required for that.
  hash_name_hashed = hashlib.sha1(hash_name.encode('utf-8')).hexdigest()
  MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
  # Reduce the hash to [0, MAX_NUM_WAVS_PER_CLASS] and rescale to [0, 100].
  percentage_hash = ((int(hash_name_hashed, 16) %
                      (MAX_NUM_WAVS_PER_CLASS + 1)) *
                     (100.0 / MAX_NUM_WAVS_PER_CLASS))
  if percentage_hash < validation_percentage:
    return 'validation'
  if percentage_hash < (testing_percentage + validation_percentage):
    return 'testing'
  return 'training'

In [6]:
def save_feature(all_data , feature_name , save_dir):
    """Pickle ``all_data`` into the next free ``<feature_name>_v<N>.pkl`` file.

    Versions start at 1; the first N whose file name is not yet present in
    ``save_dir`` is used, so earlier dumps are not overwritten.

    Args:
        all_data: Any picklable object.
        feature_name: Prefix of the output file name.
        save_dir: Existing directory to write into (with or without a
            trailing path separator).
    """
    # List the directory once instead of once per candidate version.
    existing = set(os.listdir(save_dir))
    version = 1
    while feature_name + "_v" + str(version) + ".pkl" in existing:
        version += 1
    save_file = feature_name + "_v" + str(version) + ".pkl"

    # os.path.join keeps the file inside save_dir even without a trailing slash.
    save_path = os.path.join(save_dir, save_file)
    with open(save_path, 'wb') as fp:
        pickle.dump(all_data, fp)
    print("saved file is: " + save_path)

In [7]:
# Build the labelled feature set: one record per wav file, plus randomly
# generated "silence" records, then shuffle and pickle everything.
validation_percentage = 10
testing_percentage = 5
# silence_percentage = 100
silence_percentage = 50    # chance (in %) that a file also yields one "silence" record
unknown_percentage = 100   # chance (in %) that a file from a non-wanted folder is kept as "unknown"
all_data = []
feature_name = "spect"
feature_func = {"spect": get_spect}[feature_name]
train_audio_path = '/home/maikfangogoair/tmp/label_data/'
SLIENCE_LABEL = "silence"
UNKNOWN_LABEL = 'unknown'
# os.listdir() order is filesystem dependent; sorting makes the sequence of
# random draws below reproducible for a given random seed.
word_list = sorted(x for x in os.listdir(train_audio_path)
                   if os.path.isdir(os.path.join(train_audio_path, x)) and x != '_background_noise_')
# wanted_list=word_list.copy()
# wanted_list.append(SLIENCE_LABEL)
wanted_list = ['silence', 'unknown', 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
silence_idx = wanted_list.index(SLIENCE_LABEL)


cnt = 0
for folder in word_list:
    folder_path = os.path.join(train_audio_path, folder)
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(('.wav', '.WAV')):
            continue
        cnt += 1
        # Report progress up front so that a file skipped further down still
        # counts towards the progress output.
        if cnt % 1000 == 0:
            print("executing ... %d" % cnt)
#         if cnt > 1000: break

        group = which_set(filename, validation_percentage, testing_percentage)
        path = os.path.join(folder_path, filename)

        # Optional "silence" record: the speech is muted (foreground_loud=0.0)
        # and only a random noise clip at a random gain remains.
        if random.randint(1, 100) <= silence_percentage:
            feature = feature_func(path=path, add_noise = True, background_loud=random.uniform(0.01, 0.99), foreground_loud = 0.0, use_white=True)
            all_data.append({"label": SLIENCE_LABEL, "label_idx": silence_idx, "filename": filename,
                             "speaker": SLIENCE_LABEL + str(cnt),
                             "feature": feature,
                             "group": group})

        if folder in wanted_list:
            label = folder
        else:
            label = UNKNOWN_LABEL
            # Sub-sample the non-wanted words.
            if random.randint(1, 100) > unknown_percentage:
                continue
        # The part of the file name before the first '_' is used as speaker id.
        speaker = filename.split('_')[0]
        label_idx = wanted_list.index(label)
        feature = feature_func(path=path, add_noise = True, background_loud=0 , foreground_loud=random.uniform(0.50, 0.99), use_white=False )
        all_data.append({"label": label, "label_idx": label_idx, "filename": filename,
                         "speaker": speaker,
                         "feature": feature,
                         "group": group})


random.shuffle(all_data)
save_dir = "/home/maikfangogoair/tmp/save/"
save_feature(all_data , feature_name , save_dir)

wanted_list_path = os.path.join(save_dir, "wanted_list.pkl")
with io.open(wanted_list_path, 'wb') as f:
    pickle.dump(wanted_list, f)
print("wanted_list saved to " + wanted_list_path)

executing ... 1000
executing ... 2000
executing ... 3000
executing ... 4000
executing ... 5000
executing ... 6000
executing ... 7000
executing ... 8000
executing ... 9000
executing ... 10000
executing ... 11000
executing ... 12000
executing ... 13000
executing ... 14000
executing ... 15000
executing ... 16000
executing ... 17000
executing ... 18000
executing ... 19000
executing ... 20000
executing ... 21000
executing ... 22000
executing ... 23000
executing ... 24000
executing ... 25000
executing ... 26000
executing ... 27000
executing ... 28000
executing ... 29000
executing ... 30000
executing ... 31000
executing ... 32000
executing ... 33000
executing ... 34000
executing ... 35000
executing ... 36000
executing ... 37000
executing ... 38000
executing ... 39000
executing ... 40000
executing ... 41000
executing ... 42000
executing ... 43000
executing ... 44000
executing ... 45000
executing ... 46000
executing ... 47000
executing ... 48000
executing ... 49000
executing ... 50000
executing

In [8]:
# Count the records per (group, label) pair.
data_statistics = {group_name: {} for group_name in ("validation", "testing", "training")}
for record in all_data:
    label_counts = data_statistics[record["group"]]
    label_counts[record["label"]] = label_counts.get(record["label"], 0) + 1
data_statistics

{'testing': {'down': 166,
  'go': 157,
  'left': 160,
  'no': 155,
  'off': 147,
  'on': 161,
  'right': 163,
  'silence': 2037,
  'stop': 157,
  'unknown': 2519,
  'up': 169,
  'yes': 164},
 'training': {'down': 1929,
  'go': 1955,
  'left': 1946,
  'no': 1950,
  'off': 1954,
  'on': 1949,
  'right': 1948,
  'silence': 26951,
  'stop': 1977,
  'unknown': 34299,
  'up': 1946,
  'yes': 1952},
 'validation': {'down': 264,
  'go': 260,
  'left': 247,
  'no': 270,
  'off': 256,
  'on': 257,
  'right': 256,
  'silence': 3334,
  'stop': 246,
  'unknown': 4221,
  'up': 260,
  'yes': 261}}

In [9]:
# Check whether any speaker ended up in more than one group.
# speaker_list maps speaker -> {group: number of records}.
speaker_list = {}
for record in all_data:
    group_counts = speaker_list.setdefault(record["speaker"], {})
    group_counts[record["group"]] = group_counts.get(record["group"], 0) + 1
print("There are %d different speakers." % len(speaker_list))
# Count the offenders directly. Collecting {speaker, counts} sets would raise
# TypeError here, because the per-speaker count dicts are unhashable.
multi_group_speakers = sum(1 for group_counts in speaker_list.values() if len(group_counts) > 1)
print("There are %d speakers have more than one group." % multi_group_speakers)
# Checksum: the per-speaker counts must add up to the number of records.
summary = 0
for group_counts in speaker_list.values():
    summary = summary + group_counts.get("training",0) + group_counts.get("testing",0) + group_counts.get("validation",0)
assert summary == len(all_data)
print("Total wave count is: %d." %  summary  )

There are 34203 different speakers.
There are 0 speakers have more than one group.
Total wave count is: 97043.


In [None]:
def show_spec(one_line):
    """Plot the feature array of one dataset record, titled with its label.

    Args:
        one_line: Record dict. Its "feature" entry must have a leading axis of
            size 1, which is squeezed away before plotting; its "label" entry
            becomes the plot title.
    """
    figure = plt.figure(figsize=(14, 8))
    # 211: first cell of a 2-row, 1-column grid.
    axes = figure.add_subplot(211)
    image = np.squeeze(one_line["feature"], axis=0)
    axes.imshow(image, aspect='auto', origin='lower')
    axes.set_title(one_line["label"])

In [10]:
# Pick one random "silence" record and plot it.
silence_records = [x for x in all_data if x["label"] == "silence"]
# random.choice covers the whole pool and works for any non-empty pool size,
# whereas a fixed randint(0, 1000) index needs at least 1001 records.
one_line = random.choice(silence_records)
show_spec(one_line)

NameError: name 'show_spec' is not defined

In [53]:
# Play the wav file behind the selected record.
# NOTE(review): the path is built from the record's label, but "silence" and
# "unknown" records were read from their original word folder, so
# <label>/<filename> may not exist for them. Confirm before relying on this.
relative_wav_path = one_line["label"] + "/" + one_line["filename"]
ipd.Audio(join(train_audio_path, relative_wav_path))