diff --git a/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_gt.config b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_gt.config
new file mode 100644
index 00000000..10328d77
--- /dev/null
+++ b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_gt.config
@@ -0,0 +1,415 @@
+#!crnn/rnn.py
+# kate: syntax python;
+# see also file:///u/zeyer/setups/quaero-en/training/quaero-train11/50h/ann/2015-07-29--lstm-gt50/config-train/dropout01.3l.n500.custom_lstm.adam.lr1e_3.config
+
+import os
+import numpy
+from subprocess import check_output
+
+# task
+use_tensorflow = True
+task = "train"
+device = "gpu"
+multiprocessing = True
+update_on_device = True
+
+_cf_cache = {}
+
+def cf(filename):
+	"""Cache manager"""
+	if filename in _cf_cache:
+		return _cf_cache[filename]
+	if check_output(["hostname"]).strip().decode("utf8") in ["osmium", "sulfid", "zink", "cobalt", "niob"]:
+		print("use local file: %s" % filename)
+		return filename  # for debugging
+	cached_fn = check_output(["cf", filename]).strip().decode("utf8")
+	assert os.path.exists(cached_fn)
+	_cf_cache[filename] = cached_fn
+	return cached_fn
+
+# data
+context_window = 1
+window = 1
+feature_dim = 40  # GT 40-dim
+channel_num = 1
+num_inputs = feature_dim * channel_num * window
+num_outputs = 9001  # CART labels
+EpochSplit = 6
+
+def get_sprint_dataset(data):
+    assert context_window > 1
+    assert data in ["train", "cv"]
+    epochSplit = {"train": EpochSplit, "cv": 1}
+    # see /u/tuske/work/ASR/switchboard/corpus/readme
+    # and zoltans mail https://mail.google.com/mail/u/0/#inbox/152891802cbb2b40
+    files = {}
+    files["config"] = "config/training.config"
+    files["corpus"] = "/u/corpora/speech/switchboard-1/xml/swb1-all/swb1-all.corpus.gz"
+    files["segments"] = "dependencies/seg_%s" % {"train":"train", "cv":"cv_head3000"}[data]
+    files["features"] = "/u/tuske/work/ASR/switchboard/feature.extraction/gt40_40/data/gt.train.bundle"    
+    files["lexicon"] = "/u/tuske/work/ASR/switchboard/corpus/train.lex.v1_0_3.ci.gz"
+    files["alignment"] = "dependencies/tuske__2016_01_28__align.combined.train"
+    files["cart"] = "/u/tuske/work/ASR/switchboard/initalign/data/%s" % {9001: "cart-9000"}[num_outputs]
+    for k, v in sorted(files.items()):
+        assert os.path.exists(v), "%s %r does not exist" % (k, v)
+    estimated_num_seqs = {"train": 227047, "cv": 3000}  # wc -l segment-file
+    args = [
+    "--config=" + files["config"],
+    lambda: "--*.corpus.file=" + cf(files["corpus"]),
+    lambda: "--*.corpus.segments.file=" + cf(files["segments"]),
+    {"train": "--*.corpus.segment-order-shuffle=true", "cv": "--*.corpus.segment-order-sort-by-time-length=true"}[data],
+    "--*.state-tying.type=cart",
+    lambda: "--*.state-tying.file=" + cf(files["cart"]),
+    "--*.trainer-output-dimension=%i" % num_outputs,
+    lambda: "--*.lexicon.file=" + cf(files["lexicon"]),
+    lambda: "--*.alignment-cache-path=" + cf(files["alignment"]),
+    lambda: "--*.feature-cache-path=" + cf(files["features"]),
+    "--*.log-channel.file=log/crnn.sprint.train-dataset.xml",
+    "--*.window-size=1",
+    "--*.trainer-output-dimension=%i" % num_outputs
+    ]
+    return {
+    "class": "ExternSprintDataset", "sprintTrainerExecPath": "sprint-executables/nn-trainer",
+    "sprintConfigStr": args,
+    "partitionEpoch": epochSplit[data],
+    "estimated_num_seqs": estimated_num_seqs[data] // (epochSplit[data] or 1),
+    "context_window": context_window}
+cache_size = "0"
+
+# network
+# (also defined by num_inputs & num_outputs)
+dropout = 0.05
+L2 = 0.1
+filter_size = (3, 3)  # for 2D conv on (window, feature) axes
+
+# bn params
+masked_time = False
+fused = True
+axes = ["f"]
+
+bn_momentum = 0.997
+bn_epsilon = 1e-5
+
+cur_feat_dim = feature_dim
+network = {}
+_last = "data"
+def add_sequential_layer(name, d, from_=None):
+    global _last, network
+    assert "from" not in d
+    if from_ is not None:
+        d["from"] = from_
+    else:
+        d["from"] = [_last]
+    assert name not in network
+    network[name] = d
+    _last = name
+    return name
+
+# data augmentation
+def summary(name, x):
+    """
+    :param str name:
+    :param tf.Tensor x: (batch,time,feature)
+    """
+    import tensorflow as tf
+    # tf.summary.image wants [batch_size, height,  width, channels],
+    # we have (batch, time, feature).
+    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
+    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
+    tf.summary.image(name, img, max_outputs=10)
+    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
+    mean = tf.reduce_mean(x)
+    tf.summary.scalar("%s_mean" % name, mean)
+    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
+    tf.summary.scalar("%s_stddev" % name, stddev)
+    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))
+
+
+def _mask(x, axis, pos, max_amount):
+    """
+    :param tf.Tensor x: (batch,time,feature)
+    :param int axis:
+    :param tf.Tensor pos: (batch,)
+    :param int max_amount: inclusive
+    """
+    import tensorflow as tf
+    ndim = x.get_shape().ndims
+    n_batch = tf.shape(x)[0]
+    dim = tf.shape(x)[axis]
+    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
+    pos2 = tf.minimum(pos + amount, dim)
+    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
+    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
+    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
+    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
+    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)])
+    from TFUtil import where_bc
+    x = where_bc(cond, 0.0, x)
+    return x
+
+
+def random_mask(x, axis, min_num, max_num, max_dims):
+    """
+    :param tf.Tensor x: (batch,time,feature)
+    :param int axis:
+    :param int|tf.Tensor min_num:
+    :param int|tf.Tensor max_num: inclusive
+    :param int max_dims: inclusive
+    """
+    import tensorflow as tf
+    n_batch = tf.shape(x)[0]
+    num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
+    # https://github.com/tensorflow/tensorflow/issues/9260
+    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
+    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
+    _, indices = tf.nn.top_k(z, tf.reduce_max(num))
+    _, x = tf.while_loop(
+        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
+        body=lambda i, x: (
+            i + 1, 
+            tf.where(
+                tf.less(i, num),
+                _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims),
+                x)),
+        loop_vars=(0, x))
+    return x
+
+
+def random_warp(x, std, scale):
+    """
+    :param tf.Tensor x: (batch,time,dim)
+    :param (float,float) std:
+    :param (float,float) scale:
+    :rtype: tf.Tensor
+    :return: x transformed
+    """
+    import tensorflow as tf
+    from TFUtil import create_random_warp_flow_2d, dense_image_warp
+    x = tf.expand_dims(x, axis=-1)
+    flow = create_random_warp_flow_2d(tf.shape(x)[:-1], std=std, scale=scale)
+    x = dense_image_warp(x, flow=flow)
+    x = tf.squeeze(x, axis=-1)
+    return x
+
+
+def transform(x, network):
+    import tensorflow as tf
+    def get_masked():
+        x_masked = x
+        x_masked = random_mask(x_masked, axis=1, min_num=0, max_num=4, max_dims=40)
+        x_masked = random_mask(x_masked, axis=2, min_num=0, max_num=2, max_dims=20)
+        return x_masked
+    x = network.cond_on_train(get_masked, lambda: x)
+    return x
+
+
+def fixed_padding(prefix, inputs, kernel_size, data_format, conv_time_dim):
+    pad_total = kernel_size - 1
+    feature_pad_beg = pad_total // 2
+    feature_pad_end = pad_total - feature_pad_beg
+
+    time_pad_beg = 0
+    time_pad_end = 0
+ 
+    return add_sequential_layer("%s_pad" % prefix, {"class": "pad", "axes": ("s:0", "s:1"), "padding": [(time_pad_beg, time_pad_end), (feature_pad_end, feature_pad_end)]}, from_=inputs)
+      
+
+def conv2d_fixed_padding(prefix, filters, kernel_size, strides, dilation_rate, 
+                         data_format, conv_time_dim, inputs=None):
+    """Strided 2-D convolution with explicit padding."""
+    # The padding is consistent and is based only on `kernel_size`, not on the
+    # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+    pad_out = fixed_padding("%s_pad" % prefix, inputs, kernel_size, data_format, conv_time_dim)    
+        
+    padding = 'VALID'
+    strides = (1, strides) if conv_time_dim else strides
+    filter_size = (kernel_size, kernel_size)
+    dilation_rate = (dilation_rate, 1) if conv_time_dim else (1, 1)
+
+    if data_format == 'channels_first':         
+        NCHW = True
+    else:
+        NCHW = False
+    return add_sequential_layer("%s_conv" % prefix, {"class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
+                                "strides": strides, "dilation_rate": dilation_rate, "padding": padding, "activation": None, "with_bias": False, "dropout": 0, 
+                                "forward_weights_init": "xavier", "L2": L2},
+                                from_=pad_out)
+
+
+def _building_block_v2(prefix, inputs, filters, projection_shortcut, strides, 
+                       dilation_rate, kernel_size, data_format, conv_time_dim):
+    bn1 = add_sequential_layer("%s_bn1" % prefix, {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=inputs)
+    relu1 = add_sequential_layer("%s_relu1" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False}, from_=bn1)
+    if strides > 1 and conv_time_dim:
+        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters, 
+                                    kernel_size=kernel_size, strides=1,
+                                    dilation_rate=dilation_rate, 
+                                    data_format=data_format, conv_time_dim=conv_time_dim)
+        conv1 = add_sequential_layer("%s_stride" % prefix, {"class": "slice", "axis": "s:1", "slice_step": strides}, from_=conv1)
+        dilation_rate *= 2
+    else:
+        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters, 
+                                     kernel_size=kernel_size, strides=strides,
+                                     dilation_rate=dilation_rate, 
+                                     data_format=data_format, conv_time_dim=conv_time_dim)
+                
+    bn2 = add_sequential_layer("%s_bn2" % prefix, {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=conv1)
+    relu2 = add_sequential_layer("%s_relu2" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False}, from_=bn2)
+
+    conv2 = conv2d_fixed_padding(prefix=("%s_conv_2" % prefix), inputs=relu2, filters=filters, 
+                                 kernel_size=kernel_size, strides=1,
+                                 dilation_rate=dilation_rate,
+                                 data_format=data_format, conv_time_dim=conv_time_dim)
+    # The projection shortcut should come after the first batch norm and ReLU
+    # since it performs a 1x1 convolution.
+    crop_lr = filter_size[0] - 1
+    crop_left = crop_lr // 2
+    crop_right = crop_lr - crop_left
+
+    if conv_time_dim:
+      if strides > 1:
+        crop = int(crop_left * (dilation_rate/2 + dilation_rate))
+      else:
+        crop = int(crop_left * 2 * dilation_rate)
+      shortcut = add_sequential_layer("%s_crop" % prefix, {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)
+      # shortcut = "%s_crop" % prefix
+      if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs=shortcut)
+    else:
+      crop = crop_left
+      shortcut = add_sequential_layer("%s_crop_1" % prefix, {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)
+      
+      if projection_shortcut is not None:
+          shortcut = projection_shortcut(inputs=shortcut)
+
+      shortcut = add_sequential_layer("%s_crop_2" % prefix, {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=shortcut)
+    return add_sequential_layer("%s_out" % prefix, {"class": "combine", "kind": "add"}, from_=[conv2, shortcut])
+    
+
+def block_layer(prefix, inputs, filters, bottleneck, block_fn, blocks, strides,
+                dilation_rate, kernel_size, data_format, conv_time_dim):
+    filters_out = filters * 4 if bottleneck else filters
+    def projection_shortcut(inputs=None):
+        return conv2d_fixed_padding(
+            prefix=("%s_sc" % prefix), filters=filters_out, kernel_size=1, strides=strides,
+            dilation_rate=1, data_format=data_format, conv_time_dim=conv_time_dim, inputs=inputs)
+    inputs = block_fn("%s_0" % prefix, inputs, filters, projection_shortcut, strides,
+                      dilation_rate, kernel_size, data_format, conv_time_dim)
+    if strides > 1:
+        dilation_rate *= strides
+    for i in range(1, blocks):
+        inputs = block_fn("%s_%i" % (prefix, i), inputs, filters, None, 1, dilation_rate, 
+                          kernel_size, data_format, conv_time_dim)
+    return inputs
+
+
+def build_resnet(inputs):
+    resnet_version = 2
+    conv_time_dim = True
+    bottleneck = False
+    num_filters = 64
+    first_kernel_size = 5
+    kernel_size = 3
+    conv_stride = 2
+    first_pool_size = (2, 1)
+    first_pool_stride = (1, 1)
+    block_sizes = [3, 3, 3, 3]
+    block_strides = [1, 2, 2, 2]
+    block_fn = _building_block_v2
+    data_format = 'channels_last'
+    pre_activation = resnet_version == 2
+
+    if data_format == 'channels_first':         
+        NCHW = True
+    else:
+        NCHW = False
+
+    if conv_time_dim:
+        multiplier = 1 if bottleneck else 2    
+        building_block_reduction = multiplier * 2 * (kernel_size // 2)
+        total_reduction = 2 * (first_kernel_size // 2)
+
+        dilation_rate_multiplier = 2
+        
+        for i, bs in enumerate(block_sizes):
+            total_reduction += building_block_reduction/multiplier * dilation_rate_multiplier
+            dilation_rate_multiplier *= block_strides[i]
+            total_reduction += building_block_reduction/multiplier * dilation_rate_multiplier
+            total_reduction += building_block_reduction * (bs - 1) * dilation_rate_multiplier
+
+        time_dim_reduction = total_reduction 
+        context_window = int(2 * (total_reduction // 2) + 1)
+
+    else:
+        time_dim_reduction = 0
+
+    inputs = conv2d_fixed_padding(prefix="c_init", inputs=inputs, filters=num_filters,
+                                  kernel_size=first_kernel_size, strides=conv_stride,
+                                  dilation_rate=1,        
+                                  data_format=data_format, conv_time_dim=conv_time_dim)
+
+    dilation_rate = 2
+
+    if resnet_version == 1:
+        inputs = add_sequential_layer("c_init_bn", {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=inputs)
+        inputs = add_sequential_layer("c_init_relu", {"class": "activation", "activation": "relu", "batch_norm": False}, from_=inputs)         
+
+    for i, num_blocks in enumerate(block_sizes):
+        filters = num_filters * (2**i)
+        inputs = block_layer(
+            prefix="c_%i" % i, inputs=inputs, filters=filters, bottleneck=bottleneck,
+            block_fn=block_fn, blocks=num_blocks,
+            strides=block_strides[i], dilation_rate=dilation_rate,  
+            kernel_size=kernel_size,
+            data_format=data_format, conv_time_dim=conv_time_dim)
+        dilation_rate *= block_strides[i]
+
+    if pre_activation:
+        inputs = add_sequential_layer("c_out_bn", {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=inputs)
+        inputs = add_sequential_layer("c_out_relu", {"class": "activation", "activation": "relu", "batch_norm": False}, from_=inputs)
+
+    inputs = add_sequential_layer("out_pool", {"class": "reduce", "mode": "avg", "axes": ("s:1"), "keep_dims": False}, from_=inputs)         
+    inputs = add_sequential_layer("linear", {"class": "linear", "activation": "relu", "dropout": 0.05, "n_out": 2048}, from_=inputs)
+    return inputs, context_window  
+
+inputs = add_sequential_layer("split", {"class": "split_dims", "axis": "f", "dims": (channel_num, feature_dim)}) # output: (batch, time, window = 61, feature = 40, channel = 1)
+inputs = add_sequential_layer("swap_axes", {"class": "swap_axes", "axis1": "s:1", "axis2": "f"})
+inputs = add_sequential_layer("data_aug", {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"})
+resnet_out, context_window = build_resnet(inputs=inputs)
+add_sequential_layer("output", {"class": "softmax", "loss": "ce", "dropout": 0.05, "n_out": num_outputs}, from_=resnet_out)
+
+train = get_sprint_dataset("train")
+dev = get_sprint_dataset("cv")
+
+############## debug stuff
+debug_print_layer_output_template = True  # useful for debugging
+#debug_print_layer_output_sizes = True
+#debug_print_layer_output_shape = True  # might be useful for debugging
+#debug_shell_in_runner = True
+log_batch_size = True
+tf_log_memory_usage = True
+############## debug stuff
+
+# trainer
+batching = "random"
+batch_size = 24 * (150 + context_window)
+max_seqs = 500
+chunking = "150:150"
+truncation = -1
+num_epochs = 100
+gradient_clip = 0
+gradient_noise = 0.0
+momentum = 0.99
+learning_rate = 5e-6
+learning_rate_file = "newbob.data"
+learning_rate_control = "newbob_multi_epoch"
+learning_rate_control_relative_error_relative_lr = True
+newbob_multi_num_epochs = 6
+newbob_multi_update_interval = 1
+model = "net-model/network"
+cleanup_old_models = True
+
+# log
+log = "log/crnn.train.log"
+log_verbosity = 5
+
diff --git a/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_logmel+d+dd.config b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_logmel+d+dd.config
new file mode 100644
index 00000000..c1db03f0
--- /dev/null
+++ b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_logmel+d+dd.config
@@ -0,0 +1,415 @@
+#!crnn/rnn.py
+# kate: syntax python;
+# see also file:///u/zeyer/setups/quaero-en/training/quaero-train11/50h/ann/2015-07-29--lstm-gt50/config-train/dropout01.3l.n500.custom_lstm.adam.lr1e_3.config
+
+import os
+import numpy
+from subprocess import check_output
+
+# task
+use_tensorflow = True
+task = "train"
+device = "gpu"
+multiprocessing = True
+update_on_device = True
+
+_cf_cache = {}
+
+def cf(filename):
+	"""Cache manager"""
+	if filename in _cf_cache:
+		return _cf_cache[filename]
+	if check_output(["hostname"]).strip().decode("utf8") in ["osmium", "sulfid", "zink", "cobalt", "niob"]:
+		print("use local file: %s" % filename)
+		return filename  # for debugging
+	cached_fn = check_output(["cf", filename]).strip().decode("utf8")
+	assert os.path.exists(cached_fn)
+	_cf_cache[filename] = cached_fn
+	return cached_fn
+
+# data
+context_window = 1
+window = 1
+feature_dim = 64  # LogMel 64-dim
+channel_num = 3
+num_inputs = feature_dim * channel_num * window
+num_outputs = 9001  # CART labels
+EpochSplit = 6
+
+def get_sprint_dataset(data):
+    assert context_window > 1
+    assert data in ["train", "cv"]
+    epochSplit = {"train": EpochSplit, "cv": 1}
+    # see /u/tuske/work/ASR/switchboard/corpus/readme
+    # and zoltans mail https://mail.google.com/mail/u/0/#inbox/152891802cbb2b40
+    files = {}
+    files["config"] = "config/training.config"
+    files["corpus"] = "/u/corpora/speech/switchboard-1/xml/swb1-all/swb1-all.corpus.gz"
+    files["segments"] = "dependencies/seg_%s" % {"train":"train", "cv":"cv_head3000"}[data]
+    files["features"] = "/u/bozheniuk/setups/switchboard/feature_extraction/cluster_setup/logmel64_30/data/logmel.train.bundle"    
+    files["lexicon"] = "/u/tuske/work/ASR/switchboard/corpus/train.lex.v1_0_3.ci.gz"
+    files["alignment"] = "dependencies/tuske__2016_01_28__align.combined.train"
+    files["cart"] = "/u/tuske/work/ASR/switchboard/initalign/data/%s" % {9001: "cart-9000"}[num_outputs]
+    for k, v in sorted(files.items()):
+        assert os.path.exists(v), "%s %r does not exist" % (k, v)
+    estimated_num_seqs = {"train": 227047, "cv": 3000}  # wc -l segment-file
+    args = [
+    "--config=" + files["config"],
+    lambda: "--*.corpus.file=" + cf(files["corpus"]),
+    lambda: "--*.corpus.segments.file=" + cf(files["segments"]),
+    {"train": "--*.corpus.segment-order-shuffle=true", "cv": "--*.corpus.segment-order-sort-by-time-length=true"}[data],
+    "--*.state-tying.type=cart",
+    lambda: "--*.state-tying.file=" + cf(files["cart"]),
+    "--*.trainer-output-dimension=%i" % num_outputs,
+    lambda: "--*.lexicon.file=" + cf(files["lexicon"]),
+    lambda: "--*.alignment-cache-path=" + cf(files["alignment"]),
+    lambda: "--*.feature-cache-path=" + cf(files["features"]),
+    "--*.log-channel.file=log/crnn.sprint.train-dataset.xml",
+    "--*.window-size=1",
+    "--*.trainer-output-dimension=%i" % num_outputs
+    ]
+    return {
+    "class": "ExternSprintDataset", "sprintTrainerExecPath": "sprint-executables/nn-trainer",
+    "sprintConfigStr": args,
+    "partitionEpoch": epochSplit[data],
+    "estimated_num_seqs": estimated_num_seqs[data] // (epochSplit[data] or 1),
+    "context_window": context_window}
+cache_size = "0"
+
+# network
+# (also defined by num_inputs & num_outputs)
+dropout = 0.05
+L2 = 0.1
+filter_size = (3, 3)  # for 2D conv on (window, feature) axes
+
+# bn params
+masked_time = False
+fused = True
+axes = ["f"]
+
+bn_momentum = 0.997
+bn_epsilon = 1e-5
+
+cur_feat_dim = feature_dim
+network = {}
+_last = "data"
+def add_sequential_layer(name, d, from_=None):
+    global _last, network
+    assert "from" not in d
+    if from_ is not None:
+        d["from"] = from_
+    else:
+        d["from"] = [_last]
+    assert name not in network
+    network[name] = d
+    _last = name
+    return name
+
+# data augmentation
+def summary(name, x):
+    """
+    :param str name:
+    :param tf.Tensor x: (batch,time,feature)
+    """
+    import tensorflow as tf
+    # tf.summary.image wants [batch_size, height,  width, channels],
+    # we have (batch, time, feature).
+    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
+    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
+    tf.summary.image(name, img, max_outputs=10)
+    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
+    mean = tf.reduce_mean(x)
+    tf.summary.scalar("%s_mean" % name, mean)
+    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
+    tf.summary.scalar("%s_stddev" % name, stddev)
+    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))
+
+
+def _mask(x, axis, pos, max_amount):
+    """
+    :param tf.Tensor x: (batch,time,feature)
+    :param int axis:
+    :param tf.Tensor pos: (batch,)
+    :param int max_amount: inclusive
+    """
+    import tensorflow as tf
+    ndim = x.get_shape().ndims
+    n_batch = tf.shape(x)[0]
+    dim = tf.shape(x)[axis]
+    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
+    pos2 = tf.minimum(pos + amount, dim)
+    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
+    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
+    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
+    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
+    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)])
+    from TFUtil import where_bc
+    x = where_bc(cond, 0.0, x)
+    return x
+
+
+def random_mask(x, axis, min_num, max_num, max_dims):
+    """
+    :param tf.Tensor x: (batch,time,feature)
+    :param int axis:
+    :param int|tf.Tensor min_num:
+    :param int|tf.Tensor max_num: inclusive
+    :param int max_dims: inclusive
+    """
+    import tensorflow as tf
+    n_batch = tf.shape(x)[0]
+    num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
+    # https://github.com/tensorflow/tensorflow/issues/9260
+    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
+    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
+    _, indices = tf.nn.top_k(z, tf.reduce_max(num))
+    _, x = tf.while_loop(
+        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
+        body=lambda i, x: (
+            i + 1, 
+            tf.where(
+                tf.less(i, num),
+                _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims),
+                x)),
+        loop_vars=(0, x))
+    return x
+
+
+def random_warp(x, std, scale):
+    """
+    :param tf.Tensor x: (batch,time,dim)
+    :param (float,float) std:
+    :param (float,float) scale:
+    :rtype: tf.Tensor
+    :return: x transformed
+    """
+    import tensorflow as tf
+    from TFUtil import create_random_warp_flow_2d, dense_image_warp
+    x = tf.expand_dims(x, axis=-1)
+    flow = create_random_warp_flow_2d(tf.shape(x)[:-1], std=std, scale=scale)
+    x = dense_image_warp(x, flow=flow)
+    x = tf.squeeze(x, axis=-1)
+    return x
+
+
+def transform(x, network):
+    import tensorflow as tf
+    def get_masked():
+        x_masked = x
+        x_masked = random_mask(x_masked, axis=1, min_num=0, max_num=4, max_dims=40)
+        x_masked = random_mask(x_masked, axis=2, min_num=0, max_num=2, max_dims=20)
+        return x_masked
+    x = network.cond_on_train(get_masked, lambda: x)
+    return x
+
+
+def fixed_padding(prefix, inputs, kernel_size, data_format, conv_time_dim):
+    pad_total = kernel_size - 1
+    feature_pad_beg = pad_total // 2
+    feature_pad_end = pad_total - feature_pad_beg
+
+    time_pad_beg = 0
+    time_pad_end = 0
+ 
+    return add_sequential_layer("%s_pad" % prefix, {"class": "pad", "axes": ("s:0", "s:1"), "padding": [(time_pad_beg, time_pad_end), (feature_pad_end, feature_pad_end)]}, from_=inputs)
+      
+
+def conv2d_fixed_padding(prefix, filters, kernel_size, strides, dilation_rate, 
+                         data_format, conv_time_dim, inputs=None):
+    """Strided 2-D convolution with explicit padding."""
+    # The padding is consistent and is based only on `kernel_size`, not on the
+    # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+    pad_out = fixed_padding("%s_pad" % prefix, inputs, kernel_size, data_format, conv_time_dim)    
+        
+    padding = 'VALID'
+    strides = (1, strides) if conv_time_dim else strides
+    filter_size = (kernel_size, kernel_size)
+    dilation_rate = (dilation_rate, 1) if conv_time_dim else (1, 1)
+
+    if data_format == 'channels_first':         
+        NCHW = True
+    else:
+        NCHW = False
+    return add_sequential_layer("%s_conv" % prefix, {"class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
+                                "strides": strides, "dilation_rate": dilation_rate, "padding": padding, "activation": None, "with_bias": False, "dropout": 0, 
+                                "forward_weights_init": "xavier", "L2": L2},
+                                from_=pad_out)
+
+
+def _building_block_v2(prefix, inputs, filters, projection_shortcut, strides, 
+                       dilation_rate, kernel_size, data_format, conv_time_dim):
+    bn1 = add_sequential_layer("%s_bn1" % prefix, {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=inputs)
+    relu1 = add_sequential_layer("%s_relu1" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False}, from_=bn1)
+    if strides > 1 and conv_time_dim:
+        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters, 
+                                    kernel_size=kernel_size, strides=1,
+                                    dilation_rate=dilation_rate, 
+                                    data_format=data_format, conv_time_dim=conv_time_dim)
+        conv1 = add_sequential_layer("%s_stride" % prefix, {"class": "slice", "axis": "s:1", "slice_step": strides}, from_=conv1)
+        dilation_rate *= 2
+    else:
+        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters, 
+                                     kernel_size=kernel_size, strides=strides,
+                                     dilation_rate=dilation_rate, 
+                                     data_format=data_format, conv_time_dim=conv_time_dim)
+                
+    bn2 = add_sequential_layer("%s_bn2" % prefix, {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=conv1)
+    relu2 = add_sequential_layer("%s_relu2" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False}, from_=bn2)
+
+    conv2 = conv2d_fixed_padding(prefix=("%s_conv_2" % prefix), inputs=relu2, filters=filters, 
+                                 kernel_size=kernel_size, strides=1,
+                                 dilation_rate=dilation_rate,
+                                 data_format=data_format, conv_time_dim=conv_time_dim)
+    # The projection shortcut should come after the first batch norm and ReLU
+    # since it performs a 1x1 convolution.
+    crop_lr = filter_size[0] - 1
+    crop_left = crop_lr // 2
+    crop_right = crop_lr - crop_left
+
+    if conv_time_dim:
+      if strides > 1:
+        crop = int(crop_left * (dilation_rate/2 + dilation_rate))
+      else:
+        crop = int(crop_left * 2 * dilation_rate)
+      shortcut = add_sequential_layer("%s_crop" % prefix, {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)
+      # shortcut = "%s_crop" % prefix
+      if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs=shortcut)
+    else:
+      crop = crop_left
+      shortcut = add_sequential_layer("%s_crop_1" % prefix, {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)
+      
+      if projection_shortcut is not None:
+          shortcut = projection_shortcut(inputs=shortcut)
+
+      shortcut = add_sequential_layer("%s_crop_2" % prefix, {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=shortcut)
+    return add_sequential_layer("%s_out" % prefix, {"class": "combine", "kind": "add"}, from_=[conv2, shortcut])
+    
+
+def block_layer(prefix, inputs, filters, bottleneck, block_fn, blocks, strides,
+                dilation_rate, kernel_size, data_format, conv_time_dim):
+    filters_out = filters * 4 if bottleneck else filters
+    def projection_shortcut(inputs=None):
+        return conv2d_fixed_padding(
+            prefix=("%s_sc" % prefix), filters=filters_out, kernel_size=1, strides=strides,
+            dilation_rate=1, data_format=data_format, conv_time_dim=conv_time_dim, inputs=inputs)
+    inputs = block_fn("%s_0" % prefix, inputs, filters, projection_shortcut, strides,
+                      dilation_rate, kernel_size, data_format, conv_time_dim)
+    if strides > 1:
+        dilation_rate *= strides
+    for i in range(1, blocks):
+        inputs = block_fn("%s_%i" % (prefix, i), inputs, filters, None, 1, dilation_rate, 
+                          kernel_size, data_format, conv_time_dim)
+    return inputs
+
+
+def build_resnet(inputs):
+    resnet_version = 2
+    conv_time_dim = True
+    bottleneck = False
+    num_filters = 64
+    first_kernel_size = 5
+    kernel_size = 3
+    conv_stride = 2
+    first_pool_size = (2, 1)
+    first_pool_stride = (1, 1)
+    block_sizes = [3, 3, 3, 3]
+    block_strides = [1, 2, 2, 2]
+    block_fn = _building_block_v2
+    data_format = 'channels_last'
+    pre_activation = resnet_version == 2
+
+    if data_format == 'channels_first':         
+        NCHW = True
+    else:
+        NCHW = False
+
+    if conv_time_dim:
+        multiplier = 1 if bottleneck else 2    
+        building_block_reduction = multiplier * 2 * (kernel_size // 2)
+        total_reduction = 2 * (first_kernel_size // 2)
+
+        dilation_rate_multiplier = 2
+        
+        for i, bs in enumerate(block_sizes):
+            total_reduction += building_block_reduction/multiplier * dilation_rate_multiplier
+            dilation_rate_multiplier *= block_strides[i]
+            total_reduction += building_block_reduction/multiplier * dilation_rate_multiplier
+            total_reduction += building_block_reduction * (bs - 1) * dilation_rate_multiplier
+
+        time_dim_reduction = total_reduction 
+        context_window = int(2 * (total_reduction // 2) + 1)
+
+    else:
+        time_dim_reduction = 0
+
+    inputs = conv2d_fixed_padding(prefix="c_init", inputs=inputs, filters=num_filters,
+                                  kernel_size=first_kernel_size, strides=conv_stride,
+                                  dilation_rate=1,        
+                                  data_format=data_format, conv_time_dim=conv_time_dim)
+
+    dilation_rate = 2
+
+    if resnet_version == 1:
+        inputs = add_sequential_layer("c_init_bn", {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=inputs)
+        inputs = add_sequential_layer("c_init_relu", {"class": "activation", "activation": "relu", "batch_norm": False}, from_=inputs)         
+
+    for i, num_blocks in enumerate(block_sizes):
+        filters = num_filters * (2**i)
+        inputs = block_layer(
+            prefix="c_%i" % i, inputs=inputs, filters=filters, bottleneck=bottleneck,
+            block_fn=block_fn, blocks=num_blocks,
+            strides=block_strides[i], dilation_rate=dilation_rate,  
+            kernel_size=kernel_size,
+            data_format=data_format, conv_time_dim=conv_time_dim)
+        dilation_rate *= block_strides[i]
+
+    if pre_activation:
+        inputs = add_sequential_layer("c_out_bn", {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused, "momentum": bn_momentum, "epsilon": bn_epsilon}, from_=inputs)
+        inputs = add_sequential_layer("c_out_relu", {"class": "activation", "activation": "relu", "batch_norm": False}, from_=inputs)
+
+    inputs = add_sequential_layer("out_pool", {"class": "reduce", "mode": "avg", "axes": ("s:1"), "keep_dims": False}, from_=inputs)         
+    inputs = add_sequential_layer("linear", {"class": "linear", "activation": "relu", "dropout": 0.05, "n_out": 2048}, from_=inputs)
+    return inputs, context_window  
+
+inputs = add_sequential_layer("split", {"class": "split_dims", "axis": "f", "dims": (channel_num, feature_dim)}) # output: (batch, time, window = 61, feature = 40, channel = 1)
+inputs = add_sequential_layer("swap_axes", {"class": "swap_axes", "axis1": "s:1", "axis2": "f"})
+inputs = add_sequential_layer("data_aug", {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"})
+resnet_out, context_window = build_resnet(inputs=inputs)
+add_sequential_layer("output", {"class": "softmax", "loss": "ce", "dropout": 0.05, "n_out": num_outputs}, from_=resnet_out)
+
+train = get_sprint_dataset("train")
+dev = get_sprint_dataset("cv")
+
+############## debug stuff
+debug_print_layer_output_template = True  # useful for debugging
+#debug_print_layer_output_sizes = True
+#debug_print_layer_output_shape = True  # might be useful for debugging
+#debug_shell_in_runner = True
+log_batch_size = True
+tf_log_memory_usage = True
+############## debug stuff
+
+# trainer
+batching = "random"
+batch_size = 24 * (150 + context_window)
+max_seqs = 500
+chunking = "150:150"
+truncation = -1
+num_epochs = 100
+gradient_clip = 0
+gradient_noise = 0.0
+momentum = 0.99
+learning_rate = 5e-6
+learning_rate_file = "newbob.data"
+learning_rate_control = "newbob_multi_epoch"
+learning_rate_control_relative_error_relative_lr = True
+newbob_multi_num_epochs = 6
+newbob_multi_update_interval = 1
+model = "net-model/network"
+cleanup_old_models = True
+
+# log
+log = "log/crnn.train.log"
+log_verbosity = 5
+