In [1]:
%pdb on

Automatic pdb calling has been turned ON


In [1]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import sys
import os
if os.path.abspath('../') not in sys.path:
    sys.path.append(os.path.abspath('../'))
if os.path.abspath('../../tt_keras') not in sys.path:
    sys.path.append(os.path.abspath('../../tt_keras'))

if os.path.abspath('../../t3f') not in sys.path:
    sys.path.append(os.path.abspath('../../t3f'))

import automatic_speech_recognition as asr
from tensorflow.keras.callbacks import LearningRateScheduler
import time
from tensorflow import keras
import horovod.tensorflow.keras as hvd
from datetime import datetime

# physical_devices = tf.config.list_physical_devices('GPU')
# try:
#       tf.config.experimental.set_memory_growth(physical_devices[0], True)
# except:
#       pass
from pathlib import Path


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm.notebook import tqdm

# Train/Eval the model

In [3]:
def get_pipeline(model, optimizer=None):
    alphabet = asr.text.Alphabet(lang='en')
    features_extractor = asr.features.TfMFCC(
        features_num=26,
        winlen=0.032,
        winstep=0.02,
    )
    
    if optimizer is None:
        optimizer = tf.optimizers.Adam(lr=1e-3, beta_1=0.9, beta_2=0.999)

    decoder = asr.decoder.RNNTGreedyDecoder(model, alphabet.blank_token)
    pipeline = asr.pipeline.RNNTPipeline(
        alphabet, features_extractor, model, optimizer, decoder
    )
    callbacks = []
    return pipeline

In [4]:
dev_dataset = asr.dataset.Audio.from_csv('./data/dev-clean-index.csv', batch_size=2, use_filesizes=True, librosa_read=False)

In [5]:
alphabet = asr.text.Alphabet(lang='en')
model = asr.model.get_rnnt(26, 
                           num_layers_encoder=8, units_encoder=2048, projection_encoder=640, encoder_reduction_indexes=[1],
                           units_prediction=2048, projection_prediction=640, num_layers_prediction=2, 
                           vocab_size=alphabet.size, 
                           blank_label=alphabet.blank_token)

model.load_weights('./RNNT_experiments/rnnt_joint_nohidden_sum/rnnt_joint_nohidden_sum_train/rnnt_best.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1554545cb668>

In [6]:
pipeline = get_pipeline(model)
pipeline.compile_model()



In [7]:
# folder = './rnnt_joint_nohidden_sum_train'
# callbacks = []
# # schedule = tf.keras.experimental.CosineDecayRestarts(
# #     1e-3, 10, t_mul=2.0, m_mul=1.0, alpha=0.0)
# # callbacks.append(LearningRateScheduler(schedule))

# callbacks.append(tf.keras.callbacks.ModelCheckpoint(
#                     os.path.join(folder, 'rnnt_best.ckpt'),
#                     monitor='loss', save_weights_only=True,
#                     save_best_only=True))

# pipeline.fit(dev_dataset, epochs=10, callbacks=callbacks)
# pipeline.model.save_weights('./rnnt_weights.h5')

In [8]:
decoder = asr.decoder.RNNTGreedyDecoder(model, alphabet.blank_token)

eval_dataset = asr.dataset.Audio.from_csv('./data/dev-clean-index.csv', batch_size=5, use_filesizes=True, librosa_read=False)
X, y = eval_dataset[3]
print(pipeline.predict(X))
print(y)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

['the lady and the guitar certainly passed the night', 'the lady and the guitar certainly passed the night', 'the lady and the guitar certainly passed the night', 'the lady and the guitar certainly passed the night', 'the lady and the guitar certainly passed the night']
['these intruders are very peculiar people remarked a man in the crowd', 'they seem very ignorant poor things said another in reply', 'the people must wait outside for there is no room for them in the palace', 'so they followed her through the low archway and in a room beyond very simply furnished sat a young girl engaged in darning a pair of pink stockings', 'she was a beautiful girl of about seventeen years of age not fat 

In [9]:
alphabet = asr.text.Alphabet(lang='en')
model = asr.model.get_rnnt(26, 
                           num_layers_encoder=8, units_encoder=2048, projection_encoder=640, encoder_reduction_indexes=[1],
                           units_prediction=2048, projection_prediction=640, num_layers_prediction=2, 
                           joint_additional_size=640, joint_aggregation_type='sum',
                           vocab_size=alphabet.size, 
                           blank_label=alphabet.blank_token)

model.load_weights('./RNNT_experiments/rnnt_joint_hidden_sum/rnnt_joint_nohidden_sum_train/rnnt_best.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1554096845c0>

In [10]:
pipeline = get_pipeline(model)
pipeline.compile_model()



In [11]:
decoder = asr.decoder.RNNTGreedyDecoder(model, alphabet.blank_token)

eval_dataset = asr.dataset.Audio.from_csv('./data/dev-clean-index.csv', batch_size=5, use_filesizes=True, librosa_read=False)
X, y = eval_dataset[3]
print(pipeline.predict(X))
print(y)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

['the', 'the', 'the', 'the', 'the']
['these intruders are very peculiar people remarked a man in the crowd', 'they seem very ignorant poor things said another in reply', 'the people must wait outside for there is no room for them in the palace', 'so they followed her through the low archway and in a room beyond very simply furnished sat a young girl engaged in darning a pair of pink stockings', 'she was a beautiful girl of about seventeen years of age not fat like all the rest of the pinkies but slender and well formed according to our own ideas of beauty']


In [12]:
alphabet = asr.text.Alphabet(lang='en')
model = asr.model.get_rnnt(26, 
                           num_layers_encoder=8, units_encoder=2048, projection_encoder=640, encoder_reduction_indexes=[1],
                           units_prediction=2048, projection_prediction=640, num_layers_prediction=2, 
                           joint_aggregation_type='concat',
                           vocab_size=alphabet.size, 
                           blank_label=alphabet.blank_token)
model.load_weights('./RNNT_experiments/rnnt_joint_nohidden_concat/rnnt_joint_nohidden_sum_train/rnnt_best.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1553f4fec048>

In [13]:
decoder = asr.decoder.RNNTGreedyDecoder(model, alphabet.blank_token)

eval_dataset = asr.dataset.Audio.from_csv('./data/dev-clean-index.csv', batch_size=5, use_filesizes=True, librosa_read=False)
X, y = eval_dataset[3]
print(pipeline.predict(X))
print(y)

['the', 'the', 'the', 'the', 'the']
['these intruders are very peculiar people remarked a man in the crowd', 'they seem very ignorant poor things said another in reply', 'the people must wait outside for there is no room for them in the palace', 'so they followed her through the low archway and in a room beyond very simply furnished sat a young girl engaged in darning a pair of pink stockings', 'she was a beautiful girl of about seventeen years of age not fat like all the rest of the pinkies but slender and well formed according to our own ideas of beauty']


In [14]:
alphabet = asr.text.Alphabet(lang='en')
model = asr.model.get_rnnt(26, 
                           num_layers_encoder=8, units_encoder=2048, projection_encoder=640, encoder_reduction_indexes=[1],
                           units_prediction=2048, projection_prediction=640, num_layers_prediction=2, 
                            joint_additional_size=640, joint_aggregation_type='concat',
                           vocab_size=alphabet.size, 
                           blank_label=alphabet.blank_token)
model.load_weights('./RNNT_experiments/rnnt_joint_hidden_concat/rnnt_joint_nohidden_sum_train/rnnt_best.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1551546b2208>

In [15]:
decoder = asr.decoder.RNNTGreedyDecoder(model, alphabet.blank_token)

eval_dataset = asr.dataset.Audio.from_csv('./data/dev-clean-index.csv', batch_size=5, use_filesizes=True, librosa_read=False)
X, y = eval_dataset[3]
print(pipeline.predict(X))
print(y)

['the', 'the', 'the', 'the', 'the']
['these intruders are very peculiar people remarked a man in the crowd', 'they seem very ignorant poor things said another in reply', 'the people must wait outside for there is no room for them in the palace', 'so they followed her through the low archway and in a room beyond very simply furnished sat a young girl engaged in darning a pair of pink stockings', 'she was a beautiful girl of about seventeen years of age not fat like all the rest of the pinkies but slender and well formed according to our own ideas of beauty']


In [6]:
def run_sbatch_scripts(script_paths, environ_vars=None):
    """
    Runs sbatch {script_name}.sh for every script in script_paths. 
    Before running changes working directory to script dir.
    """
    # We will set variables using "A=B C=D sbatch script.sh" syntax
    var_init_str = ' '.join([f'{name}={value}' for name, value in environ_vars.items()])
    var_init_str += ' '
    
    for path in script_paths:
        command_string = f"cd {path.parent}; {var_init_str} sbatch ../../subm_python_small.sh train.py"
        print(os.popen(command_string).read())

exp_names = [
    'rnnt_joint_nohidden_sum',
    'rnnt_joint_hidden_sum',
    'rnnt_joint_nohidden_concat',
    'rnnt_joint_hidden_concat'
]
script_paths = [Path(f'./RNNT_experiments/{name}/{name}.py') for name in exp_names]

In [7]:
run_sbatch_scripts(
    script_paths, 
    {
        'PYTHONPATH': os.environ['PYTHONPATH']+':/trinity/home/g.leleitner/lab/Horovod/Automatic-Speech-Recognition:\
/trinity/home/g.leleitner/lab/Horovod/tt_keras:/trinity/home/g.leleitner/lab/Horovod/t3f:\
/trinity/home/g.leleitner/lab/Horovod/tf2-gradient-checkpointing'
    }
)

Submitted batch job 428194

Submitted batch job 428195

Submitted batch job 428196

Submitted batch job 428197



In [22]:
# def train_model(model,
#                 model_name,
#                 dataset_idx, 
#                 val_dataset_idx=None, 
#                 batch_size=2, 
#                 epochs=25, 
#                 tensorboard=False, 
#                 restart_filename=None):
#     model_dir = os.path.join(model_name + '_train')
#     os.makedirs(model_dir, exist_ok=True)

#     if restart_filename:
#         model.load_weights(restart_filename)
#     dataset = asr.dataset.Audio.from_csv(dataset_idx, batch_size=batch_size, use_filesizes=True, librosa_read=False)
#     dataset.sort_by_length()
#     dataset.shuffle_indices()
#     if val_dataset_idx:
#         val_dataset = asr.dataset.Audio.from_csv(val_dataset_idx, batch_size=batch_size, use_filesizes=True, librosa_read=False)

#     opt = tf.optimizers.Adam(lr=1e-3, beta_1=0.9, beta_2=0.999)
#     opt = hvd.DistributedOptimizer(opt)
    
#     pipeline = get_pipeline(model, opt)
#     callbacks = [
#         hvd.callbacks.BroadcastGlobalVariablesCallback(0),
#         hvd.callbacks.MetricAverageCallback(),
#     ]
    
#     if hvd.rank() == 0:
#         prefix = datetime.now().strftime("%Y%m%d-%H%M%S")
#         monitor_metric_name = 'loss' # if not val_dataset_idx else 'val_loss'  # val_loss is wrong and broken
#         callbacks.append(
#             keras.callbacks.ModelCheckpoint(
#                 os.path.join(model_dir, prefix + '_best.ckpt'),
#                 monitor=monitor_metric_name, save_weights_only=True,
#                 save_best_only=True))
#         if tensorboard:
#             logdir = os.path.join(model_dir, 'tb', prefix)
#             tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir, profile_batch=1)
#             callbacks.append(tensorboard_callback)
#     pipeline.compile_model(experimental_run_tf_function=False)
#     pipeline.fit(dataset, epochs=epochs, dev_dataset=val_dataset,
#                         callbacks=callbacks,
#                         verbose=1 if hvd.rank() == 0 else 0,
#                         validation_steps=10)

# alphabet = asr.text.Alphabet(lang='en')
# model = asr.model.get_rnnt(26, 
#                            num_layers_encoder=8, units_encoder=2048, projection_encoder=640, encoder_reduction_indexes=[1],
#                            units_prediction=2048, projection_prediction=640, num_layers_prediction=2, 
#                            vocab_size=alphabet.size, 
#                            blank_label=alphabet.blank_token)

# hvd.init()
# train_model(model, 'rnnt_joint_nohidden_sum', './data/dev-clean-index.csv', './data/dev-clean-index.csv')