In [None]:
# Automatically reload imported modules that are changed outside this notebook
%load_ext autoreload
%autoreload 2

# More pixels in figures
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.dpi"] = 200

# Init PRNG with fixed seed for reproducibility
# np_rng is the single NumPy generator shared by the sampling and augmentation code in later cells
import numpy as np
np_rng = np.random.default_rng(1)

# Derive the TensorFlow global seed from np_rng, so that the one fixed seed above controls both libraries
# NOTE: this draws one number from np_rng, so it must run before any other use of np_rng
import tensorflow as tf
tf.random.set_seed(np_rng.integers(0, tf.int64.max))

# Audio augmentation by random speed changes and random filtering

**2020-11-10**


This example expands on `common-voice-small`, in which we talked about different ways of augmenting the dataset.
Instead of simply copying samples, we can resample them randomly to make them a bit [faster or slower](https://www.isca-speech.org/archive/interspeech_2015/i15_3586.html).
In addition, by applying random [finite impulse response](https://en.wikipedia.org/wiki/Finite_impulse_response) (FIR) filters on the signals, we can try to [simulate microphone differences](https://www.isca-speech.org/archive/Interspeech_2018/abstracts/1047.html).
We'll apply these two augmentation techniques in this example and see if it is possible to improve on our previous results.

`tf.data.Dataset` makes it easy to cache all raw audio samples into a single file, from which we can reload the whole dataset at each epoch.
This means that we can reapply both random augmentation techniques at every epoch, hopefully with different output at each epoch.

## Data

This example uses the same data as in the `common-voice-small` example.

In [None]:
import urllib.parse
from IPython.display import display, Markdown


# Language codes used in this example, one per line
languages = """
    et
    mn
    ta
    tr
"""
languages = [code.strip() for code in languages.split()]
languages.sort()

# Render the selected codes as a Markdown bullet list
display(Markdown("### Languages"))
language_bullets = ["* `{}`".format(code) for code in languages]
display(Markdown('\n'.join(language_bullets)))

# Link to an external validator, with all codes URL-quoted into the query string
bcp47_validator_url = 'https://schneegans.de/lv/?tags='
bcp47_query = urllib.parse.quote('\n'.join(languages))
bcp47_note = "See [this tool]({}) for a description of the BCP-47 language codes."
display(Markdown(bcp47_note.format(bcp47_validator_url + bcp47_query)))

## Loading the metadata

In [None]:
import os


# Output directory of this experiment and the read-only data source
workdir = "/data/exp/cv4-augment"
datadir = "/mnt/data/speech/common-voice/downloads/2020/cv-corpus"

print("work dir:", workdir)
print("data source dir:", datadir)
print()

os.makedirs(workdir, exist_ok=True)
assert os.path.isdir(datadir), datadir + " does not exist"

# All sub-directories of the data source, ordered by name
dirs = [entry for entry in os.scandir(datadir) if entry.is_dir()]
dirs.sort(key=lambda entry: entry.name)

# Print a two-level listing, restricted to the selected languages
print(datadir)
for d in dirs:
    if d.name not in languages:
        continue
    print(' ', d.name)
    for f in os.scandir(d):
        print('   ', f.name)

# Every selected language must have a directory in the data source
available_languages = {d.name for d in dirs}
missing_languages = set(languages) - available_languages
assert missing_languages == set(), "missing languages: {}".format(missing_languages)

In [None]:
from lidbox.meta import common_voice, generate_label2target


# Load the metadata of the selected languages and build the label-to-target mapping
meta, lang2target = generate_label2target(common_voice.load_all(datadir, languages))

print("lang2target")
for lang, target in lang2target.items():
    print("  {}: {}".format(lang, target))

# Show one table per split
for split in meta["split"].unique():
    split_rows = meta[meta["split"] == split]
    display(Markdown("### " + split))
    display(split_rows)

### Checking the metadata is valid

In [None]:
from lidbox.meta import verify_integrity


# Report the metadata size before and after discarding rows that contain any missing value
print("size of all metadata", meta.shape)
meta = meta.dropna(axis=0, how="any")
print("after dropping NaN rows", meta.shape)

# verify_integrity is expected to raise if the metadata is inconsistent, in which case "ok" is never printed
print("verifying integrity")
verify_integrity(meta)
print("ok")

## Balancing the language distribution

We'll repeat the same random oversampling by audio sample length procedure as we did in `common-voice-small`.
This time, we add a flag `is_copy == True` to each oversampled copy, which allows us to easily filter all copies when we do random speed changes on the audio signals.

In [None]:
import pandas as pd
import seaborn as sns
from lidbox.meta import read_audio_durations, random_oversampling
from lidbox.visualize import plot_duration_distribution


# Attach the clip durations, and a flag for distinguishing original rows from copies
# produced by oversampling. The flag is also used later for random resampling of signals
meta = meta.assign(duration=read_audio_durations(meta), is_copy=False)

# Only the training split is oversampled; all other rows are kept as they are
is_train = meta["split"] == "train"
train, rest = meta[is_train], meta[~is_train]
augmented_train = random_oversampling(
    train,
    copy_flag="is_copy",
    random_state=np_rng.bit_generator)
meta = pd.concat([augmented_train, rest], verify_integrity=True).sort_index()
verify_integrity(meta)

sns.set(rc={})
plot_duration_distribution(meta)
for split in meta["split"].unique():
    split_rows = meta[meta["split"] == split]
    display(Markdown("### " + split))
    display(split_rows)

## Inspecting the audio

In [None]:
# Draw two random training rows per label for plotting and listening
train_rows_by_label = meta.loc[meta["split"] == "train"].groupby("label")
samples = train_rows_by_label.sample(n=2, random_state=np_rng.bit_generator)
samples

In [None]:
from lidbox.features import audio
from lidbox.visualize import plot_signal
from IPython.display import display, Audio, HTML


def read_mp3(path):
    """Load one MP3 file and prepare it for inspection.

    The signal is resampled to 16 kHz, peak-normalized to -3 dBFS and passed
    through `audio.remove_silence`.

    Returns:
        Tuple (signal, sample_rate), where sample_rate is always 16000.
    """
    target_rate = 16000
    raw_signal, raw_rate = audio.read_mp3(path)
    resampled = audio.resample(raw_signal, raw_rate, target_rate)
    normalized = audio.peak_normalize(resampled, dBFS=-3.0)
    return audio.remove_silence(normalized, target_rate), target_rate

def embed_audio(signal, rate):
    """Display an audio player for `signal`, embedded in the notebook without renormalization."""
    player = Audio(data=signal, rate=rate, embed=True, normalize=False)
    display(player)

def plot_separator():
    """Display a horizontal rule that visually separates consecutive outputs."""
    rule = HTML(data="<hr style='border: 2px solid'>")
    display(rule)

    
# Plot, describe and play every sampled clip
sample_rows = samples[["sentence", "label", "path"]].to_numpy()
for sentence, lang, clip_path in sample_rows:
    signal_tensor, rate = read_mp3(clip_path)
    signal = signal_tensor.numpy()
    plot_signal(signal)
    duration_sec = signal.size / rate
    print("length: {} sec".format(duration_sec))
    print("lang:", lang)
    print("sentence:", sentence)
    embed_audio(signal, rate)
    plot_separator()

## Random filtering


In [None]:
import scipy.signal


def random_filter(s, N=10):
    """Filter `s` with a random FIR filter.

    The N filter coefficients are drawn from a standard normal distribution
    using the notebook-wide generator `np_rng`.

    Args:
        s: Input signal.
        N: Number of filter coefficients.

    Returns:
        Tuple (filtered signal as float32, filter coefficients).
    """
    taps = np_rng.normal(loc=0, scale=1, size=N)
    # A denominator of 1.0 makes lfilter a pure FIR filter
    filtered = scipy.signal.lfilter(taps, 1.0, s)
    return filtered.astype(np.float32), taps

def display_signal(s, r, l):
    """Plot signal `s`, print its length in seconds and its language `l`, and embed an audio player.

    Args:
        s: Signal as a NumPy array.
        r: Sample rate of `s`.
        l: Language label to print.
    """
    plot_signal(s)
    duration_sec = s.size / r
    print("length: {} sec".format(duration_sec))
    print("lang:", l)
    embed_audio(s, r)
    plot_separator()

    
# Use the third sampled clip as the running example
example_row = samples[["sentence", "label", "path"]].to_numpy()[2]
sentence, lang, path = example_row
signal_tensor, rate = read_mp3(path)
signal = audio.remove_silence(signal_tensor, rate).numpy()

print("original")
display_signal(signal, rate, lang)

# Print filter coefficients with one decimal
np.set_printoptions(precision=1)

# Each iteration draws a new random filter and applies it to the same original signal
num_filter_examples = 5
for _ in range(num_filter_examples):
    filtered, b = random_filter(signal)
    print("filter:", b)
    s = audio.peak_normalize(filtered, dBFS=-3.0).numpy()
    display_signal(s, rate, lang)

## Random speed change

In [None]:
def random_speed_change(s, r, lo=0.9, hi=1.1):
    """Change the speed of `s` by a random ratio drawn uniformly from [lo, hi).

    The signal is resampled to len(s) / ratio samples, so that a ratio above 1
    makes it shorter (faster) and a ratio below 1 makes it longer (slower)
    when played back at the original sample rate.

    Args:
        s: Input signal.
        r: Sample rate of `s`. Kept for interface compatibility; the output
            length depends only on len(s) and the drawn ratio.
        lo: Lower bound of the speed ratio.
        hi: Upper bound of the speed ratio.

    Returns:
        Tuple (resampled signal as float32, speed ratio that was used).
    """
    ratio = np_rng.uniform(lo, hi)
    # The sample rate cancels out of len(s) * r / (ratio * r). Dividing by the
    # ratio directly avoids the intermediate product len(s) * r, which can
    # overflow when r is a 32-bit NumPy integer (e.g. an int32 sample rate
    # passed in through tf.numpy_function) and the clip is longer than a few seconds.
    new_len = int(len(s) / ratio)
    return scipy.signal.resample(s, new_len).astype(np.float32), ratio
    

print("original")
display_signal(signal, rate, lang)

# Setting lo == hi pins the "random" ratio to each listed value
for fixed_ratio in (0.9, 0.95, 1, 1.05, 1.1):
    s, ratio = random_speed_change(signal, rate, lo=fixed_ratio, hi=fixed_ratio)
    print("speed ratio: {:.3f}".format(ratio))
    display_signal(s, rate, lang)

## Loading all data

In [None]:
from lidbox.features import audio, cmvn


# Lets tf.data choose the level of parallelism in the map calls below
TF_AUTOTUNE = tf.data.experimental.AUTOTUNE


def metadata_to_dataset_input(meta):
    """Convert metadata into a dict of constant tensors, one tensor per field.

    The index becomes the string tensor "id"; the columns path, label, target,
    split and is_copy are converted under their own names.
    """
    column_dtypes = (
        ("path", tf.string),
        ("label", tf.string),
        ("target", tf.int32),
        ("split", tf.string),
        ("is_copy", tf.bool),
    )
    tensors = {"id": tf.constant(meta.index, tf.string)}
    for column, dtype in column_dtypes:
        tensors[column] = tf.constant(getattr(meta, column), dtype)
    return tensors


def read_mp3(x):
    """Load the MP3 file at x["path"] and add it to the dataset element.

    The signal is resampled to 16 kHz, peak-normalized to -3 dBFS and passed
    through `audio.remove_silence`.

    NOTE: this redefines the `read_mp3(path)` helper from the audio inspection
    cell. After this cell has run, `read_mp3` expects a dataset element (dict)
    instead of a path.

    Returns:
        Copy of `x` with the keys "signal" and "sample_rate" (always 16000) added.
    """
    target_rate = 16000
    raw_signal, raw_rate = audio.read_mp3(x["path"])
    signal = audio.resample(raw_signal, raw_rate, target_rate)
    signal = audio.peak_normalize(signal, dBFS=-3.0)
    signal = audio.remove_silence(signal, target_rate)
    return dict(x, signal=signal, sample_rate=target_rate)


def random_speed_change_wrapper(x):
    """Apply `random_speed_change` to the signal of an oversampled copy.

    Args:
        x: Dataset element with the keys "is_copy", "signal" and "sample_rate".

    Returns:
        `x` unchanged if it is not a copy, otherwise a copy of `x` whose
        "signal" has been replaced by the randomly resampled signal.
    """
    # Original (non-copy) samples keep their speed
    # NOTE(review): inside Dataset.map this is a Python `if` on a tensor, which
    # presumably relies on AutoGraph converting it to a graph conditional - confirm
    if not x["is_copy"]:
        return x
    # Run the NumPy implementation as a TensorFlow op; its second output
    # (the speed ratio, float64) is discarded
    s, _ = tf.numpy_function(
        random_speed_change,
        [x["signal"], x["sample_rate"]],
        [tf.float32, tf.float64],
        name="np_random_speed_change")
    return dict(x, signal=s)


def random_filter_wrapper(x):
    """Apply `random_filter` to x["signal"] and peak-normalize the result to -3 dBFS.

    Args:
        x: Dataset element with the key "signal".

    Returns:
        Copy of `x` whose "signal" has been replaced by the filtered, normalized signal.
    """
    # Run the NumPy implementation as a TensorFlow op; only its first output
    # (the filtered signal) is used, the filter coefficients are discarded
    outputs = tf.numpy_function(
        random_filter,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    filtered = tf.cast(outputs[0], tf.float32)
    normalized = audio.peak_normalize(filtered, dBFS=-3.0)
    return dict(x, signal=normalized)


def batch_extract_features(x):
    """Compute mean-normalized log-scale Mel spectrograms for a batch of signals.

    The sample rate of the first element is used for the whole batch.

    Args:
        x: Batched dataset element with the keys "signal" and "sample_rate".

    Returns:
        Copy of `x` with the features added under the key "logmelspec".
    """
    with tf.device("GPU"):
        sample_rate = x["sample_rate"][0]
        spectrogram = audio.spectrograms(x["signal"], sample_rate)
        melspec = audio.linear_to_mel(spectrogram, sample_rate)
        # The small offset keeps the logarithm finite where the spectrogram is zero
        logmelspec = tf.math.log(melspec + 1e-6)
        features = cmvn(logmelspec, normalize_variance=False)
    return dict(x, logmelspec=features)


def signal_is_not_empty(x):
    """Return a boolean tensor that is True when x["signal"] contains at least one element."""
    num_samples = tf.size(x["signal"])
    return tf.math.greater(num_samples, 0)


def pipeline_from_metadata(data, split):
    """Build the tf.data pipeline that turns the metadata of one split into features.

    Signals are loaded from the MP3 files, empty signals are dropped, and the
    remaining signals are cached into a single file under `cachedir`. For the
    training split, the rows are shuffled first and the random augmentations
    are applied after the cache, so they are re-applied each time the cached
    dataset is iterated.

    Args:
        data: Metadata rows of one split, as accepted by `metadata_to_dataset_input`.
        split: Name of the split. Used as the cache file name; the value
            "train" additionally enables shuffling and augmentation.

    Returns:
        Dataset of elements that contain the features under the key "logmelspec".
    """
    if split == "train":
        # Shuffle with the notebook's seeded generator, like all other sampling
        # in this notebook, so that the row order is reproducible between runs
        data = data.sample(frac=1, random_state=np_rng.bit_generator)
    ds = (
        tf.data.Dataset.from_tensor_slices(metadata_to_dataset_input(data))
        .map(read_mp3, num_parallel_calls=TF_AUTOTUNE)
        .filter(signal_is_not_empty)
        # Try to keep 1000 signals prefetched in an in-memory buffer to reduce downstream latency
        .prefetch(1000)
        # Cache signals to a single file
        .cache(os.path.join(cachedir, "data", split))
        # In-memory buffer when reading from the cache
        .prefetch(1000))
    if split == "train":
        ds = (ds
              # Randomly change speed of all oversampled copies
              .map(random_speed_change_wrapper, num_parallel_calls=TF_AUTOTUNE)
              # Apply random filter for every training sample
              .map(random_filter_wrapper, num_parallel_calls=TF_AUTOTUNE))
    # batch_extract_features expects batched input, so every signal is wrapped
    # into a batch of one and unwrapped again afterwards
    return (ds
        .batch(1)
        .map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
        .unbatch())


cachedir = os.path.join(workdir, "cache")
# exist_ok=True keeps this cell re-runnable: without it, os.makedirs raises
# FileExistsError as soon as the cache directory exists from a previous run
os.makedirs(os.path.join(cachedir, "data"), exist_ok=True)

# One pipeline per split, built from the metadata rows of that split
split2ds = {
    split: pipeline_from_metadata(meta[meta["split"]==split], split)
    for split in meta.split.unique()
}

## Exhaust iterators to collect all audio into binary files

**NOTE** that this creates 7.2 GiB of additional data on disk.

In [None]:
import lidbox.data.steps as ds_steps


# Iterating once over each pipeline writes its signals into the cache file of that split
for split in split2ds:
    print("filling", split, "cache")
    ds = split2ds[split]
    _ = ds_steps.consume(ds, log_interval=2000)

## Inspect dataset contents in TensorBoard

In [None]:
for split, ds in split2ds.items():
    # consume_to_tensorboard is given the features under the key "input"
    ds_with_input = ds.map(lambda x: dict(x, input=x["logmelspec"]))
    tensorboard_dir = os.path.join(cachedir, "tensorboard", "data", split)
    tensorboard_config = {
        "batch_size": 1,
        "image_size_multiplier": 2,
        "num_batches": 100,
    }
    _ = ds_steps.consume_to_tensorboard(ds_with_input, tensorboard_dir, tensorboard_config)

## Train a supervised, neural network language classifier

In [None]:
import lidbox.models.xvector as xvector


def create_model(num_freq_bins, num_labels):
    """Create and compile an x-vector model.

    Args:
        num_freq_bins: Size of the frequency axis of the input features;
            the other input axis is left unspecified.
        num_labels: Number of output labels.

    Returns:
        Model compiled with sparse categorical cross-entropy on logits and
        the Adam optimizer with learning rate 5e-5.
    """
    input_shape = [None, num_freq_bins]
    model = xvector.create(input_shape, num_labels, channel_dropout_rate=0.8)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(loss=loss, optimizer=optimizer)
    return model


# One output label per language
# NOTE(review): 40 frequency bins presumably matches the output of audio.linear_to_mel - confirm
num_languages = len(lang2target)
model = create_model(num_freq_bins=40, num_labels=num_languages)
model.summary()

In [None]:
def as_model_input(x):
    """Convert a dataset element into the (features, target) pair passed to `model.fit`."""
    features = x["logmelspec"]
    target = x["target"]
    return features, target


tensorboard_logdir = os.path.join(cachedir, "tensorboard", model.name)
checkpoint_path = os.path.join(cachedir, "model", model.name)

# Write scalar metrics and network weights to TensorBoard
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=tensorboard_logdir,
    update_freq="epoch",
    write_images=True,
    profile_batch=0)

# Stop training if validation loss has not improved from the global minimum in 10 epochs
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10)

# Write model weights to cache every time we get a new global minimum loss value
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    verbose=1)

callbacks = [tensorboard_callback, early_stopping_callback, checkpoint_callback]

# Only the training set is shuffled, using a buffer of 1000 samples
train_ds = split2ds["train"].map(as_model_input).shuffle(1000)
dev_ds = split2ds["dev"].map(as_model_input)

history = model.fit(
    train_ds.batch(1),
    validation_data=dev_ds.batch(1),
    callbacks=callbacks,
    verbose=2,
    epochs=100)

## Evaluate the classifier

In [None]:
from lidbox.util import evaluate_testset_with_model
from lidbox.visualize import draw_confusion_matrix


# Restore the weights written by the ModelCheckpoint callback (best validation loss)
best_weights_path = os.path.join(cachedir, "model", model.name)
_ = model.load_weights(best_weights_path)

# The evaluation function is given the features under the key "input"
test_ds = split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(1)
test_meta = meta[meta["split"] == "test"]
report = evaluate_testset_with_model(
    model=model,
    test_ds=test_ds,
    test_meta=test_meta,
    lang2target=lang2target)

for m in ("avg_detection_cost", "avg_equal_error_rate", "accuracy"):
    print("{}: {:.3f}".format(m, report[m]))

# Per-language metrics, one column per language plus their mean
per_language_report = {k: v for k, v in report.items() if k in lang2target}
lang_metrics = pd.DataFrame.from_dict(per_language_report)
lang_metrics = lang_metrics.assign(mean=lang_metrics.mean(axis=1))
display(lang_metrics.T)

fig, ax = draw_confusion_matrix(report["confusion_matrix"], lang2target)

## Conclusions

Compared to our previous example with the same dataset of 4 different languages (`common-voice-small`), the $\text{C}_\text{avg}$ value improved from 0.112 to 0.091 and accuracy from 0.803 to 0.846.

Even though it is tempting to conclude that our augmentation approach was the cause of this improvement, we should probably perform hundreds of experiments with carefully chosen configuration settings to get a reliable answer on whether augmentation is useful or not.