In [1]:
import tensorflow as tf
# tf.debugging.set_log_device_placement(True)
import tensorflow_addons as tfa
import sys
import os
# Put the parent directory on the import path (only once) so that packages
# located there can be imported by the cells below.
_parent_dir = os.path.abspath('../')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)


import automatic_speech_recognition as asr
import time
from datetime import datetime
import argparse
import pickle

In [2]:
import numpy as np
import pandas as pd

#np.seterr(all='raise')

In [3]:
from tensorflow import keras
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.mixed_precision import experimental as mixed_precision

In [4]:
from automatic_speech_recognition.model.quartznet import QUARTZNET_LAYERS

In [5]:
import horovod.tensorflow.keras as hvd

In [6]:
#%load_ext tensorboard
#%tensorboard --logdir=./models/ --port=32779

In [7]:
%pdb off

Automatic pdb calling has been turned OFF


In [8]:
def get_pipeline(model, optimizer=None):
    """Assemble a CTC pipeline around ``model``.

    The pipeline combines an English alphabet, a 64-feature filter-bank
    extractor (per-feature standardisation, Hanning window), the given model,
    an optimizer and a greedy decoder.

    Args:
        model: Acoustic model handed to ``asr.pipeline.CTCPipeline``.
        optimizer: Optimizer handed to the pipeline. When ``None``, an Adam
            optimizer with learning rate 1e-3, beta_1=0.9 and beta_2=0.999 is
            created.

    Returns:
        The constructed ``asr.pipeline.CTCPipeline``.
    """
    alphabet = asr.text.Alphabet(lang='en')
    features_extractor = asr.features.FilterBanks(
        features_num=64,
        standardize="per_feature",
        winlen=0.02,
        winstep=0.01,
        window=np.hanning
    )
    # Test for None explicitly: an optimizer object must never be replaced just
    # because it happens to evaluate as falsy.
    if optimizer is None:
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer = tf.optimizers.Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.999)
    decoder = asr.decoder.GreedyDecoder()
    pipeline = asr.pipeline.CTCPipeline(
        alphabet, features_extractor, model, optimizer, decoder
    )
    return pipeline

## Train

In [9]:
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [10]:
# Initialize Horovod
# Horovod must be initialised before any rank information is queried.
hvd.init()

# Enable memory growth on every physical GPU, then make only the GPU whose
# index equals this process's local rank visible (one GPU per process).
tf_devices = tf.config.experimental
gpus = tf_devices.list_physical_devices('GPU')
for device in gpus:
    tf_devices.set_memory_growth(device, True)
if len(gpus) > 0:
    tf_devices.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

In [11]:
def train_model(filename, dataset_idx, val_dataset_idx=None, initial_lr=0.001, batch_size=10, epochs=25, tensorboard=False, restart_filename=None,
               is_mixed_precision=False, n_blocks=2, decay=1.0):
    """Train a QuartzNet CTC model with Horovod and return it.

    A short warm-up fit (3 epochs of 2 steps) is followed by the main fit, whose
    learning rate is driven by a cosine-decay-with-restarts schedule. On rank 0
    the best weights (by the monitored loss) are written to
    ``<dirname(filename)>/<basename>_train/<timestamp>_best.h5``.

    Args:
        filename: Model path; only its directory and the part of its basename
            before the first '.' are used, to derive the output directory.
        dataset_idx: CSV index of the training data, read with
            ``asr.dataset.Audio.from_csv``.
        val_dataset_idx: Optional CSV index of validation data. It is loaded
            and its batch count printed, but it is NOT used for validation
            (see the note on ``dev_dataset`` below).
        initial_lr: Base learning rate; multiplied by ``hvd.size()``.
        batch_size: Batch size for both datasets.
        epochs: Number of epochs of the main fit; also the length of the first
            cosine-decay cycle.
        tensorboard: Currently unused (the TensorBoard callback is commented
            out below).
        restart_filename: Optional weights file loaded before training.
        is_mixed_precision: Forwarded to ``asr.model.get_quartznet``.
        n_blocks: Forwarded to ``asr.model.get_quartznet`` as
            ``num_b_block_repeats``.
        decay: Currently unused.

    Returns:
        The trained model.
    """
    basename = os.path.basename(filename).split('.')[0]
    model_dir = os.path.join(os.path.dirname(filename), basename + '_train')
    os.makedirs(model_dir, exist_ok=True)

    # presumably 64 input features and 29 output symbols - TODO confirm
    # against asr.model.get_quartznet.
    model = asr.model.get_quartznet(64, 29, is_mixed_precision=is_mixed_precision, num_b_block_repeats=n_blocks)

    if restart_filename:
        model.load_weights(restart_filename)

    # Scale the learning rate with the number of Horovod workers.
    initial_lr_global = initial_lr * hvd.size()

    dataset = asr.dataset.Audio.from_csv(dataset_idx, batch_size=batch_size,
                                         max_filesize=750000,
                                         use_filesizes=True, group_size=hvd.size(), rank=hvd.rank())
    dataset.sort_by_length()
    # NOTE(review): development shortcut - after sorting, only the last
    # 2 * batch_size entries of `_references` are kept (2 batches in the
    # recorded run). Remove these lines to train on the full dataset.
    dataset._references = dataset._references[-batch_size*2:]
    dataset._indices = np.arange(len(dataset))
    dataset.shuffle_indices()

    print(f'Group size: {hvd.size()} rank {hvd.rank()} got {len(dataset)} batches')

    if val_dataset_idx:
        val_dataset = asr.dataset.Audio.from_csv(val_dataset_idx, batch_size=batch_size, use_filesizes=True,
                                                group_size=hvd.size(), rank=hvd.rank())

        print(f'Group size: {hvd.size()} rank {hvd.rank()} got {len(val_dataset)} val batches')

    # Validation is deliberately not passed to fit (the original note says
    # val_loss is "wrong and broken"). To re-enable it, set this to
    # `val_dataset` when a validation index was supplied; the checkpoint
    # metric below follows automatically.
    dev_dataset = None

    # The optimizer starts at 2% of the global rate; this is the rate used by
    # the warm-up fit, before the scheduler callback is attached.
    opt_instance = tfa.optimizers.NovoGrad(0.02 * initial_lr_global, beta_1=0.8, beta_2=0.5, weight_decay=0.001)

    opt = hvd.DistributedOptimizer(opt_instance)
    pipeline = get_pipeline(model, opt)

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
    ]

    augmentation = asr.augmentation.Cutout(
        F=6,
        T=6,
        n=2,
        fill_value=0,
        seed=5
    )

    # The reported elapsed time covers both the warm-up and the main fit.
    time_start = time.time()

    # pretrain - warmup
    pipeline.fit(dataset, dev_dataset=None,
                 augmentation=augmentation,
                 epochs=3,
                 steps_per_epoch=2,
                 callbacks=callbacks,
                 verbose=1 if hvd.rank() == 0 else 0)
    # training
    # alpha is a fraction of the initial rate, so the schedule's floor is
    # 1e-5 * hvd.size().
    cosine_schedule = tf.keras.experimental.CosineDecayRestarts(
        initial_lr_global, epochs, t_mul=2.0,
        m_mul=2, alpha=1e-5 * hvd.size() /initial_lr_global,
    )

    callbacks.append(LearningRateScheduler(cosine_schedule))

    # Only rank 0 writes checkpoints, so workers do not overwrite each other.
    if hvd.rank() == 0:
        prefix = datetime.now().strftime("%Y%m%d-%H%M%S")
        print('Will save to: {}'.format(os.path.join(model_dir, prefix)))
        # Monitor a metric that fit actually produces: 'val_loss' exists only
        # when validation data is passed. Previously 'val_loss' was selected
        # whenever val_dataset_idx was given although fit received no
        # validation data, so save_best_only would never write a checkpoint.
        monitor_metric_name = 'loss' if dev_dataset is None else 'val_loss'
        callbacks.append(
            keras.callbacks.ModelCheckpoint(
                os.path.join(model_dir, prefix + '_best.h5'),
                monitor=monitor_metric_name, save_weights_only=True,
                save_best_only=True))
        # TensorBoard logging is disabled; the `tensorboard` argument has no
        # effect until this is restored.
        #if tensorboard:
        #    logdir = os.path.join(model_dir, 'tb', prefix)
        #    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
        #    callbacks.append(tensorboard_callback)

    hist = pipeline.fit(dataset, dev_dataset=dev_dataset,
                        augmentation=augmentation,
                        epochs=epochs,
                        #steps_per_epoch=270,
                        callbacks=callbacks,
                        verbose=1 if hvd.rank() == 0 else 0)

    elapsed = time.time() - time_start

    if hvd.rank() == 0:
        print(f'Elapsed time: {elapsed}')
        #np.save(os.path.join(model_dir, prefix + '_hist.p'), np.array(hist))
    return model

In [13]:
# Development run: every value below is forwarded unchanged to train_model.
train_kwargs = dict(
    filename='./models/quartznet_5x5_mp_dev.h5',
    dataset_idx='./data/train-other-960-index.csv',
    val_dataset_idx='./data/dev-clean-index.csv',
    is_mixed_precision=True,
    batch_size=30,
    initial_lr=0.001,
    epochs=3,
    tensorboard=False,
    restart_filename=None,
    n_blocks=2,
)
model = train_model(**train_kwargs)

Group size: 1 rank 0 got 2 batches
Group size: 1 rank 0 got 54 val batches
Epoch 1/3


ResourceExhaustedError: OOM when allocating tensor with shape[50,586,512] and type half on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Mul]

def parse_bool_flag(value):
    """Convert a command-line string to a bool.

    ``type=bool`` cannot be used with argparse because ``bool('False')`` is
    True (any non-empty string is truthy). Accepted spellings, case-insensitive:
    true/t/yes/y/1 and false/f/no/n/0 (an empty string also means False).

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (true/false), got {!r}'.format(value))


def build_arg_parser():
    """Create the command-line parser for the training script."""
    parser = argparse.ArgumentParser(
        description='Train ctc asr model')
    # filename and dataset are mandatory: train_model cannot run without them.
    parser.add_argument('--filename', type=str, required=True,
                        help='filename of the model')
    parser.add_argument('--dataset', type=str, required=True,
                        help='path to the dataset index',)
    parser.add_argument('--val_dataset', type=str,
                        help='path to the validation dataset index (optional)',
                        default=None)
    parser.add_argument('--batch_size', type=int,
                       help='batch size for training and validation',
                       default=10)
    parser.add_argument('--lr', type=float,
                       help='initial learning rate',
                       default=0.005)
    parser.add_argument('--decay', type=float,
                       help='learning rate decay',
                       default=1.0)
    parser.add_argument('--epochs', type=int,
                       help='number of epochs to use for training',
                       default=25)
    parser.add_argument('--tensorboard', type=parse_bool_flag,
                       help='if tensorboard log will be written',
                       default=False)
    parser.add_argument('--restart_filename', type=str,
                       help='filename of the checkpoint to restart from',
                       default=None)
    parser.add_argument('--mp', type=parse_bool_flag,
                       help='if mixed precision training is requested',
                       default=False)
    return parser


if __name__ == '__main__':
    args = build_arg_parser().parse_args()

    # batch_size is forwarded as well; previously the flag was parsed but
    # ignored, so training always used train_model's default.
    train_model(filename=args.filename, dataset_idx=args.dataset,
                val_dataset_idx=args.val_dataset, batch_size=args.batch_size,
                epochs=args.epochs,
                tensorboard=args.tensorboard, restart_filename=args.restart_filename,
                is_mixed_precision=args.mp, initial_lr=args.lr, decay=args.decay)

Training times (25 epochs, 10 batches per epoch):
 - mixed precision, GPU: 9427 s (batch = 1)
 - mixed precision, CPU: 29900 s (batch = 1)
 - full precision, CPU: 14050 s (batch = 1)

## Test

In [9]:
# Evaluation data shared by all the test cells below.
dev_index_path = './data/dev-clean-index.csv'
dataset = asr.dataset.Audio.from_csv(
    dev_index_path,
    batch_size=32,
    use_filesizes=True,
)

In [10]:
####

In [13]:
# Candidate 1: rebuild the network, restore its checkpoint and wrap it in a
# pipeline (dev loss was left unrecorded in the original note).
weights_path1 = 'models/quartznet_5x5_train/311392-25-43.77.h5'
model1 = asr.model.get_quartznet(
    64, 29,
    is_mixed_precision=False,
    num_b_block_repeats=1,
)
model1.load_weights(weights_path1)
pipeline1 = get_pipeline(model1)

In [14]:
# Time the dev-set evaluation of pipeline1 and report both error rates.
eval_started = time.time()
error_rates = asr.evaluate.calculate_error_rates(pipeline1, dataset)
wer, cer = error_rates
print(f'wer: {wer}, cer: {cer}')
print(f'Elapsed: {time.time() - eval_started}')

wer: 0.3065190669997668, cer: 0.10590696100817153
Elapsed: 383.51615619659424


In [16]:
####

In [15]:
# Candidate 2: rebuild the network, restore its checkpoint and wrap it in a pipeline.
weights_path2 = 'models/quartznet_5x5_train/311392_best.h5'
model2 = asr.model.get_quartznet(
    64, 29,
    is_mixed_precision=False,
    num_b_block_repeats=1,
)
model2.load_weights(weights_path2)
pipeline2 = get_pipeline(model2)

In [16]:
# Time the dev-set evaluation of pipeline2 and report both error rates.
eval_started = time.time()
error_rates = asr.evaluate.calculate_error_rates(pipeline2, dataset)
wer, cer = error_rates
print(f'wer: {wer}, cer: {cer}')
print(f'Elapsed: {time.time() - eval_started}')

wer: 0.3089154097474338, cer: 0.10428136521928719
Elapsed: 368.7724950313568


In [19]:
####

In [22]:
# Candidate 3: rebuild the network, restore its checkpoint and wrap it in a pipeline.
weights_path3 = './models/quartznet_5x5_mp_train/20200623-020711-8.12.h5'
model3 = asr.model.get_quartznet(
    64, 29,
    is_mixed_precision=False,
    num_b_block_repeats=1,
)
model3.load_weights(weights_path3)
pipeline3 = get_pipeline(model3)

In [23]:
# Time the dev-set evaluation of pipeline3 and report both error rates.
eval_started = time.time()
error_rates = asr.evaluate.calculate_error_rates(pipeline3, dataset)
wer, cer = error_rates
print(f'wer: {wer}, cer: {cer}')
print(f'Elapsed: {time.time() - eval_started}')

wer: 0.1774903358866388, cer: 0.059797317259752056
Elapsed: 364.98943161964417


In [24]:
#### reference

In [50]:
# Reference candidate: rebuild the network, restore its checkpoint and wrap it in a pipeline.
weights_path4 = './models/quartznet_5x5_mp_train/20200628-015425_best.h5'
model4 = asr.model.get_quartznet(
    64, 29,
    is_mixed_precision=False,
    num_b_block_repeats=1,
)
model4.load_weights(weights_path4)
pipeline4 = get_pipeline(model4)

In [51]:
# Evaluate pipeline4 on the dev set; the (wer, cer) pair is shown as the cell output.
asr.evaluate.calculate_error_rates(pipeline4, dataset)



(0.16518824948910635, 0.0783160187089817)

In [17]:
####

In [18]:
# Candidate 5: rebuild the network, restore its checkpoint and wrap it in a pipeline.
weights_path5 = './models/quartznet_5x5_train/311397_best.h5'
model5 = asr.model.get_quartznet(
    64, 29,
    is_mixed_precision=False,
    num_b_block_repeats=1,
)
model5.load_weights(weights_path5)
pipeline5 = get_pipeline(model5)

In [19]:
# Evaluate pipeline5 on the dev set; the (wer, cer) pair is shown as the cell output.
asr.evaluate.calculate_error_rates(pipeline5, dataset)



(0.2697527800274432, 0.08946972913907525)

In [21]:
# Scratch schedule for inspecting cosine decay with restarts: it starts at
# 0.01 and its first cycle spans 100 steps.
initial_rate, first_cycle_steps = 0.01, 100
schedule = tf.keras.experimental.CosineDecayRestarts(
    initial_rate,
    first_cycle_steps,
    t_mul=2.0,
    m_mul=1e-3,
    alpha=0.0,
)

In [28]:
# Spot-check the schedule value at step 1.
schedule(1)

<tf.Tensor: shape=(), dtype=float32, numpy=0.009997532>

In [26]:
# Spot-check how a Python float is converted to a tensor.
tf.convert_to_tensor(0.01)

<tf.Tensor: shape=(), dtype=float32, numpy=0.01>

In [12]:
# Scratch arithmetic: the product (748000) lies just under the
# max_filesize=750000 passed to Audio.from_csv in train_model.
# NOTE(review): the meaning of the two factors is not recorded - presumably a
# per-second size times a duration in seconds; confirm.
44000* 17

748000

In [None]:
# Scratch value; equals the max_filesize=750000 used in train_model.
750000