In [34]:

import os
from glob import glob
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import soundfile as sf
import pandas as pd
from scipy import signal
import csv
import utils

# Data extraction

In [36]:
#Download training data
#!wget https://www.openslr.org/resources/12/train-clean-100.tar.gz
#Download dev data
#!wget https://www.openslr.org/resources/12/dev-clean.tar.gz

In [35]:
#unzip
#!tar -xzvf  train-clean-100.tar.gz 2> /dev/null
#!tar -xzvf  dev-clean.tar.gz 2> /dev/null

In [28]:
!mkdir dev_data
!mkdir train_data

mkdir: cannot create directory ‘dev_data’: File exists
mkdir: cannot create directory ‘train_data’: File exists


In [29]:
preprocessing_para   = {
    "window_size" : 20,
    "step_size" : 10,
    "data_dir": "./LibriSpeech/",
    "data_train_dir": "./train_data/",
    "data_dev_dir": "./dev_data/"
}

In [30]:
directory = "LibriSpeech/train-clean-100/"
dir_walk = list(os.walk(directory))

In [39]:
character_map = utils.character_mapping()

In [40]:
def log_linear_specgram(audio, sample_rate, window_size=20,
                        step_size=10, eps=1e-10):

    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))

    _, _, spec = signal.spectrogram(audio, fs=sample_rate,
                                    window='hann', nperseg=nperseg, noverlap=noverlap,
                                    detrend=False)

    return np.log(spec.T.astype(np.float32) + eps)


In [41]:

def data_processing(directory, ds = "train"):
  num_hours = 0
  dir_walk = list(os.walk(directory))
  filename = []
  spec_length = []
  labels_length  = []
  labels = []

  for root, dirs, files in tqdm(dir_walk):
    for file in files:
      if file[-4:] == '.txt':
        with open(os.path.join(root, file), 'r') as f:
          for line in f.readlines():
            sections = line.split(' ')
            audio, sr = sf.read(os.path.join(root, sections[0] + '.flac'))
            num_hours += (len(audio) / sr) / 3600
            spec = log_linear_specgram(audio, sr, window_size = preprocessing_para['window_size'],step_size=preprocessing_para['step_size'])
            if ds =="train":
              np.save(os.path.join(preprocessing_para["data_train_dir"], sections[0] + '.npy'), spec)
            else:
              np.save(os.path.join(preprocessing_para["data_dev_dir"], sections[0] + '.npy'), spec)
            ids = [character_map[c] for c in ' '.join(sections[1:]).lower() if c in character_map]

            filename.append(sections[0])
            spec_length.append(spec.shape[0])
            labels_length.append(len(ids))
            labels.append(' '.join([str(i) for i in ids]))
  df = pd.DataFrame({
      "filename" : filename,
      "spec_length": spec_length,
      "labels_length": labels_length,
      "labels": labels
  })
  if ds == "train":
    df.to_csv(preprocessing_para["data_train_dir"] + ds + ".csv", index=False)
  elif ds == "dev":
    df.to_csv(preprocessing_para["data_dev_dir"] + ds + ".csv", index=False)
  print(f"Done")
  print(f"Hours pre-processed: {str(num_hours)}")

In [42]:
data_processing("LibriSpeech/train-clean-100/")

100%|███████████████████████████████████████████████████████████████████████████████| 837/837 [02:08<00:00,  6.50it/s]

Done
Hours pre-processed: 100.5908796527777





In [43]:
data_processing("./LibriSpeech/dev-clean/", "dev")

100%|███████████████████████████████████████████████████████████████████████████████| 138/138 [00:08<00:00, 16.95it/s]


Done
Hours pre-processed: 5.387811319444446


# Data processing

In [12]:
train_df = pd.read_csv("./train_data/train.csv")
max_spec_length = train_df["spec_length"].max()
max_label_length = train_df["labels_length"].max()
print(f"max_spec_length: {max_spec_length}")
print(f"max_label_length: {max_label_length}")
print(f"number of sample: {train_df.shape[0]}")

max_spec_length: 2451
max_label_length: 398
number of sample: 28539


In [14]:
def create_data_generator(directory, max_input_length, max_label_length, batch_size=8):
    x, y, input_lengths, label_lengths = [], [], [], []
    with open(os.path.join(directory, "train.csv"), 'r') as metadata:
        metadata_reader = csv.DictReader(metadata, fieldnames=['filename', 'spec_length', 'labels_length', 'labels'])
        next(metadata_reader)
        for row in metadata_reader:
            audio = np.load(os.path.join(directory, row['filename'] + '.npy'))
            x.append(audio)
            y.append([int(i) for i in row['labels'].split(' ')])
            input_lengths.append(int(row['spec_length']))
            label_lengths.append(int(row['labels_length']))
            if len(x) == batch_size:
                yield {
                    'inputs': tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=max_input_length, padding='post'),
                    'labels': tf.keras.preprocessing.sequence.pad_sequences(y, maxlen=max_label_length, padding='post'),
                    'input_lengths': np.asarray(input_lengths),
                    'label_lengths': np.asarray(label_lengths)
                }, {
                    'ctc': np.zeros([batch_size])
                }
                x, y, input_lengths, label_lengths = [], [], [], []

In [15]:
training_para = {
    "batch_size": 32,
    "vocal_size": len(character_map)
}

In [16]:
data_generator = create_data_generator(directory="./train_data/",
                                                 max_input_length=max_spec_length,
                                                 max_label_length=max_label_length,
                                                 batch_size=training_para['batch_size'])

# Model

In [17]:
def clipped_relu(x):
    return tf.keras.activations.relu(x, max_value=20)

def ctc_lambda_func(args):
    labels, y_pred, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

def ctc(y_true, y_pred):
    return y_pred

In [18]:
hparams = {

    'verbose': 1,

    'conv_channels': [100],
    'conv_filters': [5],
    'conv_strides': [2],

    'rnn_units': [64],
    'bidirectional_rnn': True,

    'future_context': 2,

    'use_bn': True,

    'learning_rate': 0.001,
    "max_input_length": max_spec_length,
    'vocab_size': len(character_map)

}

In [19]:
def ds2_gru_model(input_dim=161, fc_size=1024, rnn_size=512, output_dim=29, initialization='glorot_uniform',
                  conv_layers=1, gru_layers=5, use_conv=True):
    """ DeepSpeech 2 implementation
    Architecture:
        Input Spectrogram TIMEx161
        1 Batch Normalisation layer on input
        1-3 Convolutional Layers
        1 Batch Normalisation layer
        1-7 BiDirectional GRU Layers
        1 Batch Normalisation layer
        1 Fully connected Dense
        1 Softmax output
    Details:
       - Uses Spectrogram as input rather than MFCC
       - Did not use BN on the first input
       - Network does not dynamically adapt to maximum audio size in the first convolutional layer. Max conv
          length padded at 2048 chars, otherwise use_conv=False
    Reference:
        https://arxiv.org/abs/1512.02595
    """

    K.set_learning_phase(1)

    input_data = Input(shape=(None, input_dim), name='inputs')
    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(input_data)

    if use_conv:
        conv = ZeroPadding1D(padding=(0, 2048))(x)
        for l in range(conv_layers):
            x = Conv1D(filters=fc_size, name='conv_{}'.format(l+1), kernel_size=11, padding='valid', activation='relu', strides=2)(conv)
    else:
        for l in range(conv_layers):
            x = TimeDistributed(Dense(fc_size, name='fc_{}'.format(l + 1), activation='relu'))(x)  # >>(?, time, fc_size)

    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)

    for l in range(gru_layers):
        x = Bidirectional(GRU(rnn_size, name='fc_{}'.format(l + 1), return_sequences=True, activation='relu', kernel_initializer=initialization),
                      merge_mode='sum')(x)

    x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)

    # Last Layer 5+6 Time Dist Dense Layer & Softmax
    x = TimeDistributed(Dense(fc_size, activation=clipped_relu))(x)
    y_pred = TimeDistributed(Dense(output_dim, name="y_pred", activation="softmax"))(x)

    # labels = K.placeholder(name='the_labels', ndim=1, dtype='int32')
    labels = Input(name='labels', shape=[None,], dtype='int32')
    input_length = Input(name='input_lengths', shape=[1], dtype='int32')
    label_length = Input(name='label_lengths', shape=[1], dtype='int32')

    # Keras doesn't currently support loss funcs with extra parameters
    # so CTC loss is implemented in a lambda layer
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([labels,
                                                                       y_pred,
                                                                       input_length,
                                                                       label_length])

    model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

    return model

In [20]:
model = ds2_gru_model()

2021-09-17 11:47:39.030340: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.034480: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.034999: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.035827: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags



o
2021-09-17 11:47:39.037445: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.038037: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.299321: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.299809: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-17 11:47:39.300262: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from



In [21]:
optimizer = tf.keras.optimizers.Adam(learning_rate=hparams['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=1e-8, clipnorm=5)
model.compile(optimizer=optimizer, loss=ctc)
model.fit_generator(data_generator, epochs=2,steps_per_epoch=10)

2021-09-17 11:47:40.241969: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/2


2021-09-17 11:47:45.980844: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8200
2021-09-17 11:47:46.508904: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-09-17 11:47:46.509323: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-09-17 11:47:46.509334: W tensorflow/stream_executor/gpu/asm_compiler.cc:77] Couldn't get ptxas version string: Internal: Couldn't invoke ptxas --version
2021-09-17 11:47:46.509801: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2021-09-17 11:47:46.509830: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2021-09-17 11:47:47.011707: I tensorflow/stream_executor/cuda/c

Epoch 2/2


<keras.callbacks.History at 0x7f9f4d31f850>

In [22]:
y_pred = model.get_layer('ctc').input[1]
y_pred

<KerasTensor: shape=(None, None, 29) dtype=float32 (created by layer 'time_distributed_1')>

In [38]:
input_data = tf.keras.layers.Input(name='inputs', shape=[hparams['max_input_length'], 161])
x = input_data
if hparams['use_bn']:
          x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.ZeroPadding1D(padding=(0, hparams['max_input_length']))(x)
for i in range(len(hparams['conv_channels'])):
  x = tf.keras.layers.Conv1D(hparams['conv_channels'][i], hparams['conv_filters'][i], strides=hparams['conv_strides'][i], activation='relu', padding='same')(x)
if hparams['use_bn']:
  x = tf.keras.layers.BatchNormalization()(x)
for h_units in hparams['rnn_units']:
  if hparams['bidirectional_rnn']:
    h_units = int(h_units / 2)
  gru = tf.keras.layers.GRU(h_units, activation='relu', return_sequences=True)
  if hparams['bidirectional_rnn']:
    gru = tf.keras.layers.Bidirectional(gru, merge_mode='sum')
  x = gru(x)
if hparams['use_bn']:
  x = tf.keras.layers.BatchNormalization()(x)
if hparams['future_context'] > 0:
  if hparams['future_context'] > 1:
    x = tf.keras.layers.ZeroPadding1D(padding=(0, hparams['future_context'] - 1))(x)
  x = tf.keras.layers.Conv1D(100, hparams['future_context'], activation='relu')(x)
y_pred = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(hparams['vocab_size'] + 1, activation='sigmoid'))(x)

labels = tf.keras.layers.Input(name='labels', shape=[None], dtype='float32')
input_length = tf.keras.layers.Input(name='input_lengths', shape=[1], dtype='float32')
label_length = tf.keras.layers.Input(name='label_lengths', shape=[1], dtype='float32')

loss_out = Lambda(ctc_lambda_func, name='ctc')([labels, y_pred, input_length, label_length])
model = tf.keras.Model(inputs=[input_data, labels, input_length, label_length], outputs=[loss_out])



In [33]:
model.fit(data_generator)

CancelledError: ignored