Commit f5f36fa, 1 parent 1259529
Showing 2 changed files with 620 additions and 0 deletions.
314 changes: 314 additions & 0 deletions
2019-asr-resnet-lace-cnn/LACE/DataAug/lace_specaug_logmel+d+dd.config
@@ -0,0 +1,314 @@
#!crnn/rnn.py
# kate: syntax python;
# see also file:///u/zeyer/setups/quaero-en/training/quaero-train11/50h/ann/2015-07-29--lstm-gt50/config-train/dropout01.3l.n500.custom_lstm.adam.lr1e_3.config

import os
import numpy
from subprocess import check_output

# task
use_tensorflow = True
task = "train"
device = "gpu"
multiprocessing = True
update_on_device = True

_cf_cache = {}

def cf(filename):
    """Cache manager: resolve `filename` via the cluster cache tool `cf`, memoized."""
    if filename in _cf_cache:
        return _cf_cache[filename]
    if check_output(["hostname"]).strip().decode("utf8") in ["sulfid", "zink", "cobalt", "niob"]:
        print("use local file: %s" % filename)
        return filename  # for debugging
    cached_fn = check_output(["cf", filename]).strip().decode("utf8")
    assert os.path.exists(cached_fn)
    _cf_cache[filename] = cached_fn
    return cached_fn

# data
context_window = 1

window = 1
feature_dim = 64  # LogMel 64-dim
channel_num = 3  # LogMel + delta + double-delta ("+d+dd" in the file name)
num_inputs = feature_dim * channel_num * window
num_outputs = 9001  # CART labels
EpochSplit = 6

def get_sprint_dataset(data):
    assert data in ["train", "cv"]
    epochSplit = {"train": EpochSplit, "cv": 1}

    # see /u/tuske/work/ASR/switchboard/corpus/readme
    # and Zoltan's mail https://mail.google.com/mail/u/0/#inbox/152891802cbb2b40
    files = {}
    files["config"] = "config/training.config"
    files["corpus"] = "/u/corpora/speech/switchboard-1/xml/swb1-all/swb1-all.corpus.gz"
    files["segments"] = "dependencies/seg_%s" % {"train": "train", "cv": "cv_head3000"}[data]
    files["features"] = "/u/bozheniuk/setups/switchboard/feature_extraction/cluster_setup/logmel64_30/data/logmel.train.bundle"
    files["lexicon"] = "/u/tuske/work/ASR/switchboard/corpus/train.lex.v1_0_3.ci.gz"
    files["alignment"] = "dependencies/tuske__2016_01_28__align.combined.train"
    files["cart"] = "/u/tuske/work/ASR/switchboard/initalign/data/%s" % {9001: "cart-9000"}[num_outputs]
    for k, v in sorted(files.items()):
        assert os.path.exists(v), "%s %r does not exist" % (k, v)
    estimated_num_seqs = {"train": 227047, "cv": 3000}  # wc -l segment-file

    # features: /u/tuske/work/ASR/switchboard/feature.extraction/gt40_40/data/gt.train.*
    # (the lambdas delay the cf() cache lookups until the dataset is actually created)
    args = [
        "--config=" + files["config"],
        lambda: "--*.corpus.file=" + cf(files["corpus"]),
        lambda: "--*.corpus.segments.file=" + cf(files["segments"]),
        {"train": "--*.corpus.segment-order-shuffle=true", "cv": "--*.corpus.segment-order-sort-by-time-length=true"}[data],
        "--*.state-tying.type=cart",
        lambda: "--*.state-tying.file=" + cf(files["cart"]),
        "--*.trainer-output-dimension=%i" % num_outputs,
        lambda: "--*.lexicon.file=" + cf(files["lexicon"]),
        lambda: "--*.alignment-cache-path=" + cf(files["alignment"]),
        lambda: "--*.feature-cache-path=" + cf(files["features"]),
        #"--*.mean-normalization.file=dependencies/setup-base/step254-hybrid-mlp-ibm-cmllr/mlp.4/train_mlp/normalize_layer1/mean",
        #"--*.variance-normalization.file=dependencies/setup-base/step254-hybrid-mlp-ibm-cmllr/mlp.4/train_mlp/normalize_layer1/std",
        "--*.log-channel.file=log/crnn.sprint.train-dataset.xml",
        "--*.window-size=1",
    ]
    return {
        "class": "ExternSprintDataset", "sprintTrainerExecPath": "sprint-executables/nn-trainer",
        "sprintConfigStr": args,
        "partitionEpoch": epochSplit[data],
        "estimated_num_seqs": estimated_num_seqs[data] // (epochSplit[data] or 1),
    }

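# ExternSprintDataset spawns the RASR/Sprint nn-trainer executable with the
# args above and streams features and CART-state alignments from it.
# partitionEpoch = 6 splits one pass over the training corpus into 6 RETURNN
# epochs, so all epoch-based settings below count in sixths of the corpus.
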
cache_size = "0"

# network
# (also defined by num_inputs & num_outputs)
dropout = 0.05
ldropout = 0.05
L2 = 0.1

# bn params
masked_time = True
fused = True
axes = ["f"]

bn_momentum = 0.997
bn_epsilon = 1e-5

cur_feat_dim = feature_dim
network = {}
_last = "data"

def add_sequential_layer(name, d, from_=None):
    """Add layer dict `d` to the global network, wired to the previous layer unless `from_` is given."""
    global _last, network
    assert "from" not in d
    if from_ is not None:
        d["from"] = from_
    else:
        d["from"] = [_last]
    assert name not in network
    network[name] = d
    _last = name

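# For illustration (hypothetical layer, not part of this network):
#   add_sequential_layer("ff1", {"class": "linear", "activation": "relu", "n_out": 512})
# would add network["ff1"] with "from": ["data"] and make "ff1" the default
# predecessor of the next add_sequential_layer() call.
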
# data augmentation
def summary(name, x):
    """
    Dump image/scalar/histogram summaries of a feature tensor to TensorBoard.

    :param str name:
    :param tf.Tensor x: (batch,time,feature)
    """
    import tensorflow as tf
    # tf.summary.image wants [batch_size, height, width, channels],
    # we have (batch, time, feature).
    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
    tf.summary.image(name, img, max_outputs=10)
    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
    mean = tf.reduce_mean(x)
    tf.summary.scalar("%s_mean" % name, mean)
    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
    tf.summary.scalar("%s_stddev" % name, stddev)
    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))


def _mask(x, axis, pos, max_amount):
    """
    Zero out a random-length block along `axis`, starting at `pos`, per batch entry.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param tf.Tensor pos: (batch,)
    :param int max_amount: inclusive
    """
    import tensorflow as tf
    ndim = x.get_shape().ndims
    n_batch = tf.shape(x)[0]
    dim = tf.shape(x)[axis]
    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
    pos2 = tf.minimum(pos + amount, dim)
    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)])
    from TFUtil import where_bc
    x = where_bc(cond, 0.0, x)
    return x

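# Worked example for _mask (values assumed for illustration): with dim=6,
# pos=[1, 3] and sampled amount=[2, 1], the masked index ranges are [1,3) and
# [3,4), so cond per batch entry is [F,T,T,F,F,F] and [F,F,F,T,F,F]; where_bc
# broadcasts cond over the remaining axes and writes 0.0 at the True positions.
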
def random_mask(x, axis, min_num, max_num, max_dims):
    """
    Apply between `min_num` and `max_num` random masks of up to `max_dims` each along `axis`.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param int|tf.Tensor min_num:
    :param int|tf.Tensor max_num: inclusive
    :param int max_dims: inclusive
    """
    import tensorflow as tf
    n_batch = tf.shape(x)[0]
    num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
    _, indices = tf.nn.top_k(z, tf.reduce_max(num))
    _, x = tf.while_loop(
        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
        body=lambda i, x: (
            i + 1,
            tf.where(
                tf.less(i, num),
                _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims),
                x)),
        loop_vars=(0, x))
    return x

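# The two links above explain the trick used in random_mask: sampling i.i.d.
# Gumbel noise z = -log(-log(U)) and taking top_k of z draws mask start
# positions uniformly without replacement, vectorized over the batch. The
# while_loop then applies _mask once per drawn position, and the tf.where
# leaves batch entries whose own `num` masks are already placed unchanged.
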
def random_warp(x, std, scale):
    """
    Random spectro-temporal warping via a dense 2D warp flow.
    (Defined here but not used by transform() below.)

    :param tf.Tensor x: (batch,time,dim)
    :param (float,float) std:
    :param (float,float) scale:
    :rtype: tf.Tensor
    :return: x transformed
    """
    import tensorflow as tf
    from TFUtil import create_random_warp_flow_2d, dense_image_warp
    x = tf.expand_dims(x, axis=-1)
    flow = create_random_warp_flow_2d(tf.shape(x)[:-1], std=std, scale=scale)
    x = dense_image_warp(x, flow=flow)
    x = tf.squeeze(x, axis=-1)
    return x


def transform(x, network):
    """SpecAugment-style masking, applied only in training (see cond_on_train)."""
    import tensorflow as tf

    def get_masked():
        x_masked = x
        x_masked = random_mask(x_masked, axis=1, min_num=1, max_num=3, max_dims=40)  # time masks
        x_masked = random_mask(x_masked, axis=2, min_num=1, max_num=2, max_dims=30)  # frequency masks
        return x_masked

    x = network.cond_on_train(get_masked, lambda: x)
    return x

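# Net effect per training sequence: 1-3 time masks of up to 40 frames (axis=1)
# and 1-2 frequency masks of up to 30 of the 64 mel bins (axis=2, the mel axis
# after the split/swap below); at evaluation time cond_on_train passes the
# input through unchanged. random_warp is available but not enabled here.
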
def jump_net(prefix, inputs, filters, dilation_rate, data_format, conv_time_dim=False):
    NCHW = (data_format == 'channels_first')
    filter_size = (3, 3)
    strides = (1, 1)
    padding = "SAME"
    add_sequential_layer("%s_c1" % prefix, {
        "class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
        "strides": strides, "dilation_rate": (dilation_rate, 1), "padding": padding, "activation": None,
        "with_bias": False, "dropout": dropout, "forward_weights_init": "xavier", "L2": L2})
    add_sequential_layer("%s_bn1" % prefix, {
        "class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused,
        "momentum": bn_momentum, "epsilon": bn_epsilon})
    add_sequential_layer("%s_y1" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False})
    add_sequential_layer("%s_c2" % prefix, {
        "class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
        "strides": strides, "dilation_rate": (dilation_rate, 1), "padding": padding, "activation": None,
        "with_bias": False, "dropout": dropout, "forward_weights_init": "xavier", "L2": L2})

    # residual connection: add the block input to the second conv's output
    add_sequential_layer("%s_p" % prefix, {"class": "combine", "kind": "add"}, from_=["%s_c2" % prefix, inputs])
    add_sequential_layer("%s_bn2" % prefix, {
        "class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused,
        "momentum": bn_momentum, "epsilon": bn_epsilon})
    add_sequential_layer("%s_o" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False})
    return "%s_o" % prefix

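# Each jump_net is a residual unit:
#   conv(3x3) -> BN -> ReLU -> conv(3x3) -> add skip input -> BN -> ReLU,
# with dilation applied along the time axis only, i.e. (dilation_rate, 1).
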
def jump_block(prefix, inputs, jump_net_num, filters, data_format, dilation_rate, attention=False, conv_time_dim=False):
    NCHW = (data_format == 'channels_first')
    filter_size = (3, 3)
    strides = (1, 1)
    padding = "SAME"
    add_sequential_layer("%s_c" % prefix, {
        "class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
        "strides": strides, "dilation_rate": dilation_rate, "padding": padding, "activation": None,
        "with_bias": False, "dropout": dropout, "forward_weights_init": "xavier", "L2": L2})
    # stride-2 downsampling: keep every 2nd position along the s:1 axis (instead of a strided conv)
    add_sequential_layer("%s_c_strides" % prefix, {"class": "slice", "axis": "s:1", "slice_step": 2})

    dilation_rate *= 2
    # print("dilation_rate: ", dilation_rate)
    inputs = "%s_c_strides" % prefix
    for i in range(1, jump_net_num + 1):
        inputs = jump_net("%s_net%i" % (prefix, i + 1), inputs, filters, dilation_rate, data_format=data_format, conv_time_dim=conv_time_dim)
    return inputs

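# After its entry conv, each jump_block halves the s:1 axis (the mel axis
# after the split/swap below) with the stride-2 slice and doubles the time
# dilation of the residual units it stacks, so deeper blocks see exponentially
# larger time contexts on progressively smaller feature maps.
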
def lacea(inputs):
    block_sizes = [2, 2, 2, 2]
    filters = 128
    data_format = 'channels_last'
    conv_time_dim = True
    dilation_rate = 1
    attention = False

    for i, num_blocks in enumerate(block_sizes):
        inputs = jump_block("block%i" % (i + 1), inputs, num_blocks, filters, data_format, dilation_rate, attention=attention, conv_time_dim=conv_time_dim)
        filters *= 2
        dilation_rate *= 2
    return

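# LACE stack: 4 jump_blocks with 2 residual units each; filters grow
# 128 -> 256 -> 512 -> 1024 while the dilation rate doubles per block.
# lacea() only mutates the global `network` via add_sequential_layer, so the
# bare return is fine: the last layer added becomes the implicit input of the
# out_pool layer below.
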
add_sequential_layer("split", {"class": "split_dims", "axis": "f", "dims": (channel_num, feature_dim)}) # output: (batch, time, window = 61, feature = 40, channel = 1) | ||
add_sequential_layer("swap_axes", {"class": "swap_axes", "axis1": "s:1", "axis2": "f"}) | ||
add_sequential_layer("data_aug", {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"}) | ||
lacea(inputs="data_aug") | ||
add_sequential_layer("out_pool", {"class": "reduce", "mode": "avg", "axes": "s:1", "keep_dims": False}) | ||
add_sequential_layer("output", {"class": "softmax", "loss": "ce", "L2": L2, "n_out": num_outputs, "loss_opts": {"focal_loss_factor": 2.0}, "dropout": ldropout}) | ||
|
||
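# Pipeline summary: the 192-dim input (3 x 64 LogMel) is unpacked to
# (batch, time, mel, channel), SpecAugment-masked during training, run through
# the LACE conv stack, average-pooled over the remaining mel positions, and
# classified into the 9001 CART states with focal-loss-weighted cross-entropy.
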
train = get_sprint_dataset("train")
dev = get_sprint_dataset("cv")

############## debug stuff
debug_print_layer_output_template = True  # useful for debugging
#debug_print_layer_output_sizes = True
#debug_print_layer_output_shape = True  # might be useful for debugging
#debug_shell_in_runner = True
log_batch_size = True
tf_log_memory_usage = True
############## debug stuff

# trainer
batching = "random"
batch_size = 24 * 150  # 24 chunks of 150 frames each (see chunking below)
max_seqs = 500
chunking = "150:150"
truncation = -1
num_epochs = 120
gradient_clip = 0
gradient_noise = 0.1
momentum = 0.99
learning_rate = 5e-6
learning_rate_file = "newbob.data"
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_relative_error_relative_lr = True
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1
model = "net-model/network"

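# Note: newbob_multi_num_epochs = 6 matches EpochSplit above, so (presumably)
# the newbob control evaluates its relative-error criterion over one full pass
# of the training corpus; num_epochs = 120 sub-epochs is then 20 full passes.
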
cleanup_old_models = True
store_metadata_mod_step = None

# log
log = "log/crnn.train.log"
log_verbosity = 5