In [None]:
# Automatically reload imported modules that are changed outside this notebook
%load_ext autoreload
%autoreload 2

# More pixels in figures
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.dpi"] = 200

# Init PRNG with fixed seed for reproducibility.
# np_rng is the NumPy generator that later cells draw their random numbers from.
import numpy as np
np_rng = np.random.default_rng(1)

# Seed TensorFlow's global random state with an integer drawn from np_rng,
# so that TensorFlow's randomness is also derived from the fixed seed above.
import tensorflow as tf
tf.random.set_seed(np_rng.integers(0, tf.int64.max))

# Language vectors, recurrent neural networks, and an angular proximity loss function

**2020-11-21**

In this example, we take a different approach for training language vectors (embeddings) compared to `common-voice-embeddings`.
Previously, we trained a neural network on a classification task and used one of its layers as the representation for different classes.
In this example, we train a neural network directly on the language vector task by maximizing the angular distance between vectors of different classes.
We'll be using the approach described by [G. Gelly and J.L. Gauvain](https://www.isca-speech.org/archive/Interspeech_2017/abstracts/1334.html).



## Data

We will continue with the same 4-language Common Voice data as in all previous examples.

In [None]:
import urllib.parse
from IPython.display import display, Markdown


# BCP-47 codes of the languages used in this example, one code per line
languages = """
    et
    mn
    ta
    tr
"""
# Split on whitespace and keep the codes in sorted order
languages = sorted(code.strip() for code in languages.split())

# Render the chosen languages as a Markdown bullet list
language_bullets = ["* `{}`".format(code) for code in languages]
display(Markdown("### Languages"))
display(Markdown('\n'.join(language_bullets)))

# Link to an online validator, with all language codes URL-quoted into the query string
bcp47_validator_url = 'https://schneegans.de/lv/?tags='
quoted_tags = urllib.parse.quote('\n'.join(languages))
tool_link_text = "See [this tool]({}) for a description of the BCP-47 language codes."
display(Markdown(tool_link_text.format(bcp47_validator_url + quoted_tags)))

## Loading and preparing the metadata


In [None]:
import os

from lidbox.meta import (
    common_voice,
    generate_label2target,
    verify_integrity,
    read_audio_durations,
    random_oversampling_on_split
)


# Directory for all output of this notebook, and the root directory of the
# Common Voice corpus (expected to contain one subdirectory per language).
workdir = "/data/exp/cv4-angular-lstm"
datadir = "/mnt/data/speech/common-voice/downloads/2020/cv-corpus"

print("work dir:", workdir)
print("data source dir:", datadir)
print()

os.makedirs(workdir, exist_ok=True)
assert os.path.isdir(datadir), datadir + " does not exist"

# All subdirectories of datadir, sorted by name
dirs = sorted((f for f in os.scandir(datadir) if f.is_dir()), key=lambda f: f.name)

# Print the contents of every directory that matches one of the chosen languages
print(datadir)
for d in dirs:
    if d.name in languages:
        print(' ', d.name)
        for f in os.scandir(d):
            print('   ', f.name)

# Fail early if some chosen language has no directory under datadir
missing_languages = set(languages) - set(d.name for d in dirs)
assert missing_languages == set(), "missing languages: {}".format(missing_languages)

# Load the metadata of the chosen languages and generate the label-to-target mapping
meta = common_voice.load_all(datadir, languages)
meta, lang2target = generate_label2target(meta)

print("\nsize of all metadata", meta.shape)
meta = meta.dropna()
print("after dropping NaN rows", meta.shape)

print("verifying integrity")
verify_integrity(meta)
print("ok\n")

print("reading audio durations")
meta["duration"] = read_audio_durations(meta)
print("balancing the label distributions")
# NOTE(review): the oversampling step presumably adds the "is_copy" column
# that later cells read from meta -- confirm against lidbox.meta.
meta = random_oversampling_on_split(meta, "train")

## Preparing the feature extraction pipeline

Most of the preprocessing will be as in `common-voice-embeddings`, but this time we will not be training on samples with varying length.

We will make these changes:
* Signals will be divided into 3.2 second chunks, with 75% overlap, as suggested in the [paper](https://www.isca-speech.org/archive/Interspeech_2017/abstracts/1334.html).
* Every signal that is shorter than 3.2 seconds will be repeatedly appended to itself until it is at least 3.2 seconds long.
* Random speed changes are applied only once, before caching the training set signals to disk. This is because `tf.keras.Model.fit` assumes the training set length does not change. This could probably be fixed by writing a custom training loop but we won't be doing that here.

In [None]:
import scipy.signal

from lidbox.features import audio, cmvn
import lidbox.data.steps as ds_steps


# Passed as num_parallel_calls to the dataset map calls in this notebook
TF_AUTOTUNE = tf.data.experimental.AUTOTUNE


def metadata_to_dataset_input(meta):
    """Convert a metadata table into a dict of constant tensors, one per field.

    The "id" entry is built from the index of `meta`, every other entry from
    the attribute of `meta` that has the same name as its key.
    """
    fields = {"id": tf.constant(meta.index, tf.string)}
    column_dtypes = (
        ("path", tf.string),
        ("label", tf.string),
        ("target", tf.int32),
        ("split", tf.string),
        ("is_copy", tf.bool),
    )
    for column, dtype in column_dtypes:
        fields[column] = tf.constant(getattr(meta, column), dtype)
    return fields


def read_mp3(x):
    """Read the MP3 file at x["path"] and attach the preprocessed signal to `x`.

    The signal is resampled to a rate of 16000, peak-normalized to -3 dBFS and
    passed through silence removal. Returns a copy of `x` extended with the
    keys "signal" and "sample_rate".
    """
    target_rate = 16000
    signal, source_rate = audio.read_mp3(x["path"])
    signal = audio.resample(signal, source_rate, target_rate)
    signal = audio.peak_normalize(signal, dBFS=-3.0)
    signal = audio.remove_silence(signal, target_rate)
    return {**x, "signal": signal, "sample_rate": target_rate}


def random_filter(x):
    """Filter x["signal"] with random FIR coefficients and peak-normalize the result.

    The coefficients are drawn from a standard normal distribution with the
    notebook-level generator `np_rng`. Filtering runs in NumPy/SciPy through
    `tf.numpy_function`. Returns a copy of `x` with "signal" replaced.
    """
    def apply_random_fir(signal, num_taps=10):
        taps = np_rng.normal(0, 1, num_taps)
        # Denominator 1.0 means only the random numerator coefficients shape the output
        filtered = scipy.signal.lfilter(taps, 1.0, signal)
        return filtered.astype(np.float32), taps

    outputs = tf.numpy_function(
        apply_random_fir,
        [x["signal"]],
        [tf.float32, tf.float64],
        name="np_random_filter")
    # Only the filtered signal is used, the coefficients are discarded
    filtered_signal = tf.cast(outputs[0], tf.float32)
    filtered_signal = audio.peak_normalize(filtered_signal, dBFS=-3.0)
    return dict(x, signal=filtered_signal)


def random_speed_change(ds):
    """Apply lidbox's random signal speed change step to dataset `ds`.

    The step is called with min=0.9, max=1.1 and "is_copy" as the flag key.
    """
    speed_options = dict(min=0.9, max=1.1, flag="is_copy")
    return ds_steps.random_signal_speed_change(ds, **speed_options)


def create_signal_chunks(ds):
    """Repeat too short signals and divide all signals of `ds` into chunks.

    Both steps are delegated to lidbox. The values 3200 and 800 are presumably
    the chunk length and chunk step in milliseconds (3.2 second chunks with
    75% overlap) -- TODO confirm against lidbox.data.steps.
    """
    chunk_length = 3200
    chunk_step = 800
    long_enough = ds_steps.repeat_too_short_signals(ds, chunk_length)
    return ds_steps.create_signal_chunks(long_enough, chunk_length, chunk_step)


def batch_extract_features(x):
    """Compute mean-normalized log-scale Mel spectra for a batch of signals.

    Returns a copy of `x` extended with the key "logmelspec".
    """
    with tf.device("GPU"):
        # The sample rate of the first batch element is used for the whole batch
        sample_rate = x["sample_rate"][0]
        spectra = audio.spectrograms(x["signal"], sample_rate)
        mel_spectra = audio.linear_to_mel(spectra, sample_rate)
        # Small offset keeps the logarithm finite for zero-valued bins
        log_mel_spectra = tf.math.log(mel_spectra + 1e-6)
        log_mel_spectra = cmvn(log_mel_spectra, normalize_variance=False)
    return {**x, "logmelspec": log_mel_spectra}


def pipeline_from_meta(data, split):
    """Build the feature extraction pipeline for the metadata rows of one split.

    For the "train" split, rows are shuffled first, signals are cached to disk
    after the random speed change, and random filtering, chunking and feature
    extraction are applied after the cache. For every other split, chunks with
    extracted features are cached to disk as the last step.

    The cache path is built from the notebook-level variable `cachedir`.
    """
    is_train = split == "train"
    cache_path = os.path.join(cachedir, "data", split)

    if is_train:
        data = data.sample(frac=1, random_state=np_rng.bit_generator)

    signals = tf.data.Dataset.from_tensor_slices(metadata_to_dataset_input(data))
    signals = signals.map(read_mp3, num_parallel_calls=TF_AUTOTUNE)

    def chunked_features(ds):
        # Chunk the signals and extract features in batches of 100 chunks
        ds = ds.apply(create_signal_chunks).batch(100)
        ds = ds.map(batch_extract_features, num_parallel_calls=TF_AUTOTUNE)
        return ds.unbatch()

    if not is_train:
        return chunked_features(signals).cache(cache_path).prefetch(100)

    # Training set: everything after the cache is recomputed on every iteration
    augmented = signals.apply(random_speed_change).cache(cache_path).prefetch(100)
    augmented = augmented.map(random_filter, num_parallel_calls=TF_AUTOTUNE)
    return chunked_features(augmented)


# All caches, TensorBoard logs and model checkpoints of this notebook go under cachedir
cachedir = os.path.join(workdir, "cache")
# exist_ok=True keeps this cell re-runnable after a kernel restart, like the
# creation of workdir; without it the second run fails with FileExistsError.
# NOTE(review): tf.data is expected to reuse cache files that already exist in
# this directory -- delete it to force the caches to be regenerated.
os.makedirs(os.path.join(cachedir, "data"), exist_ok=True)

# One feature extraction pipeline for every split found in the metadata
split2ds = {split: pipeline_from_meta(meta[meta["split"]==split], split)
            for split in meta.split.unique()}

### Filling the caches

In [None]:
# Consume every dataset once, which fills the caches configured in the pipelines
for split, ds in split2ds.items():
    print("filling", split, "cache")
    _ = ds_steps.consume(ds, log_interval=5000) 

## Training the LSTM model with angular proximity loss

`lidbox` implements both the model and the angular proximity loss function used in the reference paper.
The loss function aims to maximize the cosine distance between language vectors of different languages and to minimize it between vectors of the same language.
Reference vectors will be generated for each class such that all reference vectors are orthogonal to each other.


In addition, we'll add [random channel dropout](https://dl.acm.org/doi/abs/10.1016/j.patrec.2017.09.023) to avoid overfitting on noise, as in the `common-voice-small` example.

In [None]:
from lidbox.models import ap_lstm
from lidbox.losses import SparseAngularProximity


def create_model(num_freq_bins=40, num_labels=len(lang2target)):
    """Create the lidbox AP-LSTM model and compile it with the angular proximity loss.

    Args:
        num_freq_bins: Size of the last input dimension.
        num_labels: Number of labels, passed to both the model and the loss.

    Returns:
        The compiled Keras model.
    """
    lstm_model = ap_lstm.create(
        input_shape=[None, num_freq_bins],
        num_outputs=num_labels,
        num_lstm_units=200,
        channel_dropout_rate=0.8)
    # The loss is constructed with the size of the model output's second dimension
    loss_fn = SparseAngularProximity(num_labels, lstm_model.output.shape[1])
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    lstm_model.compile(loss=loss_fn, optimizer=optimizer)
    return lstm_model


# Build the model with default arguments and print its layer summary
model = create_model()
model.summary()

In [None]:
# Callbacks for model.fit; all of them write below cachedir, into directories named after the model
callbacks = [
    # Write TensorBoard logs once per epoch, with batch profiling turned off
    tf.keras.callbacks.TensorBoard(
        log_dir=os.path.join(cachedir, "tensorboard", model.name),
        update_freq="epoch",
        write_images=True,
        profile_batch=0,
    ),
    # Stop training when the validation loss has not improved for 10 epochs
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
    ),
    # Keep only the weights of the epoch with the best validation loss so far
    tf.keras.callbacks.ModelCheckpoint(
        os.path.join(cachedir, "model", model.name),
        monitor='val_loss',
        save_weights_only=True,
        save_best_only=True,
        verbose=1,
    ),
]


def as_model_input(x):
    """Reduce a dataset element to the (features, target) pair used for training."""
    features = x["logmelspec"]
    target = x["target"]
    return features, target


# Training elements are shuffled with a buffer of 5000 elements,
# the development set is used in its original order
train_ds = split2ds["train"].map(as_model_input).shuffle(5000)
dev_ds = split2ds["dev"].map(as_model_input)

# Train for at most 100 epochs on batches of 32; the EarlyStopping callback may stop earlier
history = model.fit(
    train_ds.batch(32),
    validation_data=dev_ds.batch(32),
    callbacks=callbacks,
    verbose=2,
    epochs=100)

## Evaluating as an end-to-end classifier

The angular proximity loss function uses one reference direction for each language, chosen such that all reference directions are mutually orthogonal.
By selecting the closest reference direction for every predicted language vector, the model can be used as an end-to-end classifier.

In [None]:
import pandas as pd

from lidbox.util import predict_with_model, classification_report
from lidbox.visualize import draw_confusion_matrix


def load_trained_model():
    """Create a new model and load weights from the model directory under `cachedir`."""
    trained = create_model()
    weights_path = os.path.join(cachedir, "model", trained.name)
    trained.load_weights(weights_path)
    return trained


def display_classification_report(report):
    """Print the summary metrics of `report` and display its per-language metrics.

    Per-language metrics are the entries of `report` whose key is found in
    `lang2target`. They are shown as a table with an added "mean" row, followed
    by the confusion matrix drawn from report["confusion_matrix"].
    """
    summary_metrics = ("avg_detection_cost", "avg_equal_error_rate", "accuracy")
    for metric_name in summary_metrics:
        print("{}: {:.3f}".format(metric_name, report[metric_name]))

    per_language = {}
    for key, value in report.items():
        if key in lang2target:
            per_language[key] = value

    lang_metrics = pd.DataFrame.from_dict(per_language)
    # Mean over languages for each metric; becomes a row after the transpose
    lang_metrics["mean"] = lang_metrics.mean(axis=1)
    display(lang_metrics.T)

    _fig, _ax = draw_confusion_matrix(report["confusion_matrix"], lang2target)


# Replace the model with a new instance that has its weights loaded from disk
model = load_trained_model()


def predict_with_ap_loss(x):
    """Predict languages for a batch with the notebook-level `model` and its loss.

    Returns a pair of x["id"] and the predictions that the loss object of the
    model computes from the language vectors of x["input"].
    """
    ids = x["id"]
    with tf.device("GPU"):
        # Generate language vectors for the input spectra
        language_vectors = model(x["input"], training=False)
        # Predict languages by computing distances to reference directions
        predictions = model.loss.predict(language_vectors)
    return ids, predictions


# Predict on batches of 128 test set elements; the log-Mel spectra are copied
# to the "input" key, which is what predict_with_ap_loss reads
chunk2pred = predict_with_model(
    model=model,
    ds=split2ds["test"].map(lambda x: dict(x, input=x["logmelspec"])).batch(128),
    predict_fn=predict_with_ap_loss)

### Merging chunk predictions

We divided all samples into 3.2 second chunks, so all predictions are still for these chunks.
Let's merge the chunk predictions of each sample by taking the average over all of its chunks.

In [None]:
# Show the chunk-level predictions before merging
chunk2pred

In [None]:
from lidbox.util import merge_chunk_predictions


# Merge the chunk-level predictions and show the result
utt2pred = merge_chunk_predictions(chunk2pred)
utt2pred

### Evaluate test set predictions

In [None]:
# Join merged predictions to the test set metadata. The outer join keeps rows
# that have no match on either side, which the assert below then detects as NaNs.
test_meta = meta[meta["split"]=="test"].join(utt2pred, how="outer")
assert not test_meta.isna().any(axis=None), "failed to join predictions"

# Integer targets and the stacked prediction vectors, in the same row order
true_sparse = test_meta.target.to_numpy(np.int32)
pred_dense = np.stack(test_meta.prediction)

report = classification_report(true_sparse, pred_dense, lang2target)
display_classification_report(report)

## Extracting all data as language vectors

In [None]:
from lidbox.util import model2function


# Wrap a newly loaded trained model with lidbox's model2function;
# the result is called on log-Mel spectra to produce language vectors
extractor = model2function(load_trained_model())
print("extractor:", str(extractor))

In [None]:
from lidbox.visualize import plot_embedding_vector


def is_not_copy(x):
    """Dataset filter predicate that is true when the "is_copy" flag of `x` is false.

    An explicit TensorFlow op is used because Python's `not` cannot be applied
    to a symbolic tensor unless AutoGraph rewrites the expression.
    """
    return tf.math.logical_not(x["is_copy"])

def batch_extract_embeddings(x):
    """Return a copy of batch `x` extended with "embedding", computed by `extractor`."""
    with tf.device("GPU"):
        embeddings = extractor(x["logmelspec"])
        return {**x, "embedding": embeddings}


# Demo: language vectors for the first 12 training set elements that pass the
# is_not_copy filter, extracted one element at a time
embedding_demo_ds = (split2ds["train"]
                     .filter(is_not_copy)
                     .take(12)
                     .batch(1)
                     .map(batch_extract_embeddings)
                     .unbatch())

# Print the id and vector shape of each element and plot its vector
for x in embedding_demo_ds.as_numpy_iterator():
    print(x["id"].decode("utf-8"), x["embedding"].shape)
    plot_embedding_vector(x["embedding"], figsize=(10, 0.2))

### Constructing a language vector extractor pipeline

We'll now extend the existing feature extraction pipeline by adding a step where we extract language vectors with the trained model.
In addition, we merge all chunk vectors of each sample into a single vector by summing them component-wise.
The vector is then L2-normalized.

In [None]:
from sklearn.preprocessing import normalize
from lidbox.util import predictions_to_dataframe


# Merge chunk vectors by taking the sum over each component and L2-normalizing the result
def sum_and_normalize(pred):
    """Sum the vectors in `pred` component-wise and L2-normalize the sum."""
    summed = np.sum(np.stack(pred), axis=0)
    # normalize expects a 2-dimensional array, so the sum is passed as a single row
    unit_row = normalize(summed.reshape((1, -1)), axis=1)
    return np.squeeze(unit_row)


def ds_to_embeddings(ds):
    """Extract embeddings for all elements of `ds` and merge them with `sum_and_normalize`.

    Embeddings are extracted in batches of 128 and collected into memory with
    their ids. The collected rows are converted to a dataframe and merged with
    lidbox's `merge_chunk_predictions`, whose result is returned.
    """
    def id_and_embedding(x):
        return x["id"], x["embedding"]

    embedded = ds.batch(128).map(batch_extract_embeddings, num_parallel_calls=TF_AUTOTUNE)
    pairs = embedded.unbatch().map(id_and_embedding, num_parallel_calls=TF_AUTOTUNE)

    ids, embeddings = [], []
    for raw_id, vector in pairs.as_numpy_iterator():
        # Ids come out of the dataset as bytes
        ids.append(raw_id.decode("utf-8"))
        embeddings.append(vector.astype(np.float32))

    chunk_df = predictions_to_dataframe(ids, embeddings)
    return merge_chunk_predictions(chunk_df, merge_rows_fn=sum_and_normalize)


# Generator expression: the embeddings of each split are extracted when pd.concat consumes it
embeddings_by_split = (ds_to_embeddings(ds) for ds in split2ds.values())
# verify_integrity=True makes pd.concat raise if the concatenated index contains duplicates
m = meta.join(pd.concat(embeddings_by_split, verify_integrity=True), how="outer")
assert not m.prediction.isna().any(axis=None), "Missing embeddings, some rows contained NaN values"

# The joined vectors are in the column "prediction"; rename it to "embedding"
meta = m.rename(columns={"prediction": "embedding"})

### Preprocessing the language vectors for back-end training

Now, let's extract all embeddings and integer targets into NumPy arrays and preprocess them with scikit-learn.

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from lidbox.embed.sklearn_utils import PLDA


def embeddings_as_numpy_data(df):
    """Extract embeddings and targets from `df` as a pair of NumPy arrays.

    Returns:
        The values of the "embedding" column stacked into one float32 array,
        and the "target" column as an int32 array.
    """
    vectors = np.stack(df.embedding.to_numpy()).astype(np.float32)
    targets = df.target.to_numpy(dtype=np.int32)
    return vectors, targets


def random_sample(X, y, sample_size_ratio):
    """Select the same random rows, without replacement, from both `X` and `y`.

    The number of selected rows is `sample_size_ratio` times the number of rows
    in `X`, truncated to an integer. Rows are chosen with the notebook-level
    generator `np_rng`.
    """
    num_rows = X.shape[0]
    num_selected = int(sample_size_ratio*num_rows)
    chosen = np_rng.choice(np.arange(num_rows), size=num_selected, replace=False)
    return X[chosen], y[chosen]


def pca_3d_scatterplot_by_label(data, targets, split_name):
    """Draw a 3D scatter plot of the first three columns of `data`, one series per language.

    Args:
        data: Array with at least three columns, used as x, y and z coordinates.
        targets: Integer targets of the rows of `data`, mapped to language
            labels with the inverse of `lang2target`.
        split_name: Name of the data split, shown in the plot title.
    """
    target2lang = {target: lang for lang, target in lang2target.items()}

    columns = {axis: data[:, i] for i, axis in enumerate("xyz")}
    columns["lang"] = [target2lang[t] for t in targets]
    points = pd.DataFrame(columns)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')

    for lang, group in points.groupby("lang"):
        ax.scatter(group["x"], group["y"], group["z"], label=lang)

    ax.legend()
    title = "3D PCA scatter plot of {} set language vectors".format(split_name)
    ax.set_title(title)
    plt.show()


# Language vectors and integer targets of the training and test sets as NumPy arrays
train_X, train_y = embeddings_as_numpy_data(meta[meta["split"]=="train"])
print("training vectors", train_X.shape, train_y.shape)
test_X, test_y = embeddings_as_numpy_data(meta[meta["split"]=="test"])
print("test vectors", test_X.shape, test_y.shape)

# Standardize all vectors using training set statistics
scaler = StandardScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)

# Reduce dimensions with PLDA, fitted on the training set vectors and targets only
pre_shape = train_X.shape
plda = PLDA()
plda.fit(train_X, train_y)
train_X = plda.transform(train_X)
test_X = plda.transform(test_X)
print("PLDA reduced dimensions from {} to {}".format(pre_shape, train_X.shape))

# L2-normalize vectors to surface of a unit sphere
train_X = normalize(train_X)
test_X = normalize(test_X)

# Map vectors to 3D with PCA, select 10% samples, plot vectors.
# The PCA is fitted on the training set and used only for the plots below;
# train_X and test_X themselves are not replaced by the 3D projection.
pca = PCA(n_components=3, whiten=False)
pca.fit(train_X)

X, y = random_sample(pca.transform(train_X), train_y, 0.1)
pca_3d_scatterplot_by_label(X, y, "training")

X, y = random_sample(pca.transform(test_X), test_y, 0.1)
pca_3d_scatterplot_by_label(X, y, "test")

## Fit classifier on training set vectors and evaluate on test set vectors

In [None]:
from sklearn.naive_bayes import GaussianNB
from lidbox.util import classification_report


# Fit a Gaussian naive Bayes classifier on the preprocessed training set vectors
clf = GaussianNB()
clf.fit(train_X, train_y)

# Predict scores on test set with classifier and compute metrics
test_pred = clf.predict_log_proba(test_X)
# Clamp -infs to -100 (every log probability below -100 becomes -100)
test_pred = np.maximum(-100, test_pred)
report = classification_report(test_y, test_pred, lang2target)
display_classification_report(report)

## Conclusions

Compared to the results from our previous examples, we were unable to get better results by training an RNN-based model with the angular proximity loss function.
However, the PCA scatter plots suggest that language vectors of the same class are much closer to each other compared to what we extracted from the x-vector model.

In any case, we might need much larger datasets before we can reliably compare the x-vector model and the LSTM model we used here.