# Patch metadata of the original (whitespace-collapsed) git diff:
#   diff --git a/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_gt.config
#              b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_gt.config
#   new file mode 100644
#   index 00000000..10328d77
#   --- /dev/null
#   +++ b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_gt.config
#   @@ -0,0 +1,415 @@
#!crnn/rnn.py
# kate: syntax python;
# see also file:///u/zeyer/setups/quaero-en/training/quaero-train11/50h/ann/2015-07-29--lstm-gt50/config-train/dropout01.3l.n500.custom_lstm.adam.lr1e_3.config

import os
import numpy
from subprocess import check_output

# task
use_tensorflow = True
task = "train"
device = "gpu"
multiprocessing = True
update_on_device = True

_cf_cache = {}


def cf(filename):
    """
    Cache manager: map a file name to a locally cached copy via the external ``cf`` tool.

    On the listed debugging hosts the file name is returned unchanged (and not memoized).

    :param str filename:
    :return: path printed by ``cf filename``, or ``filename`` itself on a debugging host
    :rtype: str
    """
    if filename in _cf_cache:
        return _cf_cache[filename]
    if check_output(["hostname"]).strip().decode("utf8") in ["osmium", "sulfid", "zink", "cobalt", "niob"]:
        print("use local file: %s" % filename)
        return filename  # for debugging
    cached_fn = check_output(["cf", filename]).strip().decode("utf8")
    assert os.path.exists(cached_fn), "cf returned non-existing path %r for %r" % (cached_fn, filename)
    _cf_cache[filename] = cached_fn
    return cached_fn


# data
context_window = 1  # placeholder; overwritten below with the value returned by build_resnet()
window = 1
feature_dim = 40  # GT 40-dim
channel_num = 1
num_inputs = feature_dim * channel_num * window
num_outputs = 9001  # CART labels
EpochSplit = 6


def get_sprint_dataset(data):
    """
    Build the ``ExternSprintDataset`` options dict for the given data split.

    Must be called after ``context_window`` has been set from build_resnet() (asserted > 1).

    :param str data: "train" or "cv"
    :return: dataset options dict (class, Sprint executable, Sprint config args, epoch partitioning)
    :rtype: dict[str]
    """
    assert context_window > 1
    assert data in ["train", "cv"]
    epochSplit = {"train": EpochSplit, "cv": 1}
    # see /u/tuske/work/ASR/switchboard/corpus/readme
    # and zoltans mail https://mail.google.com/mail/u/0/#inbox/152891802cbb2b40
    files = {}
    files["config"] = "config/training.config"
    files["corpus"] = "/u/corpora/speech/switchboard-1/xml/swb1-all/swb1-all.corpus.gz"
    files["segments"] = "dependencies/seg_%s" % {"train": "train", "cv": "cv_head3000"}[data]
    files["features"] = "/u/tuske/work/ASR/switchboard/feature.extraction/gt40_40/data/gt.train.bundle"
    files["lexicon"] = "/u/tuske/work/ASR/switchboard/corpus/train.lex.v1_0_3.ci.gz"
    files["alignment"] = "dependencies/tuske__2016_01_28__align.combined.train"
    files["cart"] = "/u/tuske/work/ASR/switchboard/initalign/data/%s" % {9001: "cart-9000"}[num_outputs]
    for k, v in sorted(files.items()):
        assert os.path.exists(v), "%s %r does not exist" % (k, v)
    estimated_num_seqs = {"train": 227047, "cv": 3000}  # wc -l segment-file
    # Entries wrapped in a lambda defer the cf() call until the entry is evaluated.
    # (The original list passed --*.trainer-output-dimension twice with the same value;
    # the redundant second occurrence is dropped.)
    args = [
        "--config=" + files["config"],
        lambda: "--*.corpus.file=" + cf(files["corpus"]),
        lambda: "--*.corpus.segments.file=" + cf(files["segments"]),
        {"train": "--*.corpus.segment-order-shuffle=true",
         "cv": "--*.corpus.segment-order-sort-by-time-length=true"}[data],
        "--*.state-tying.type=cart",
        lambda: "--*.state-tying.file=" + cf(files["cart"]),
        "--*.trainer-output-dimension=%i" % num_outputs,
        lambda: "--*.lexicon.file=" + cf(files["lexicon"]),
        lambda: "--*.alignment-cache-path=" + cf(files["alignment"]),
        lambda: "--*.feature-cache-path=" + cf(files["features"]),
        "--*.log-channel.file=log/crnn.sprint.train-dataset.xml",
        "--*.window-size=1",
    ]
    return {
        "class": "ExternSprintDataset", "sprintTrainerExecPath": "sprint-executables/nn-trainer",
        "sprintConfigStr": args,
        "partitionEpoch": epochSplit[data],
        "estimated_num_seqs": estimated_num_seqs[data] // (epochSplit[data] or 1),
        "context_window": context_window}


cache_size = "0"

# network
# (also defined by num_inputs & num_outputs)
dropout = 0.05
L2 = 0.1
filter_size = (3, 3)  # for 2D conv on (window, feature) axes

# bn params
masked_time = False
fused = True
axes = ["f"]

bn_momentum = 0.997
bn_epsilon = 1e-5

cur_feat_dim = feature_dim
network = {}
_last = "data"


def add_sequential_layer(name, d, from_=None):
    """
    Register layer ``d`` under ``name`` in the global ``network`` dict.

    :param str name: layer name, must not exist in ``network`` yet
    :param dict[str] d: layer options without a "from" entry; it is modified in place
    :param str|list[str]|None from_: source layer(s); defaults to the most recently added layer
    :return: ``name``
    :rtype: str
    """
    global _last, network
    assert "from" not in d
    if from_ is not None:
        d["from"] = from_
    else:
        d["from"] = [_last]
    assert name not in network
    network[name] = d
    _last = name
    return name


def _bn_layer():
    """Return a fresh ``batch_norm`` layer dict built from the module-level BN settings."""
    return {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused,
            "momentum": bn_momentum, "epsilon": bn_epsilon}


def _relu_layer():
    """Return a fresh ReLU ``activation`` layer dict (without batch norm)."""
    return {"class": "activation", "activation": "relu", "batch_norm": False}


# data augmentation
def summary(name, x):
    """
    Add TensorFlow image/scalar/histogram summaries for ``x``.

    :param str name:
    :param tf.Tensor x: (batch,time,feature)
    """
    import tensorflow as tf
    # tf.summary.image wants [batch_size, height, width, channels],
    # we have (batch, time, feature).
    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
    tf.summary.image(name, img, max_outputs=10)
    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
    mean = tf.reduce_mean(x)
    tf.summary.scalar("%s_mean" % name, mean)
    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
    tf.summary.scalar("%s_stddev" % name, stddev)
    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))


def _mask(x, axis, pos, max_amount):
    """
    Zero out, per batch entry, a contiguous range along ``axis`` starting at ``pos``.

    The range length is drawn uniformly from [1, max_amount] and clipped at the axis end.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param tf.Tensor pos: (batch,)
    :param int max_amount: inclusive
    :return: ``x`` with the selected range set to 0.0
    :rtype: tf.Tensor
    """
    import tensorflow as tf
    ndim = x.get_shape().ndims
    n_batch = tf.shape(x)[0]
    dim = tf.shape(x)[axis]
    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
    pos2 = tf.minimum(pos + amount, dim)
    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
    # Keep only the batch axis and `axis`; all other axes become size 1 for broadcasting.
    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)])
    from TFUtil import where_bc
    x = where_bc(cond, 0.0, x)
    return x


def random_mask(x, axis, min_num, max_num, max_dims):
    """
    Apply a random number of masks (see :func:`_mask`) along ``axis``, independently per batch entry.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param int|tf.Tensor min_num:
    :param int|tf.Tensor max_num: inclusive
    :param int max_dims: inclusive
    :rtype: tf.Tensor
    """
    import tensorflow as tf
    n_batch = tf.shape(x)[0]
    num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
    _, indices = tf.nn.top_k(z, tf.reduce_max(num))
    # Iteration i applies the i-th mask only to the batch entries with i < num.
    _, x = tf.while_loop(
        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
        body=lambda i, x: (
            i + 1,
            tf.where(
                tf.less(i, num),
                _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims),
                x)),
        loop_vars=(0, x))
    return x


def random_warp(x, std, scale):
    """
    :param tf.Tensor x: (batch,time,dim)
    :param (float,float) std:
    :param (float,float) scale:
    :rtype: tf.Tensor
    :return: x transformed
    """
    import tensorflow as tf
    from TFUtil import create_random_warp_flow_2d, dense_image_warp
    x = tf.expand_dims(x, axis=-1)
    flow = create_random_warp_flow_2d(tf.shape(x)[:-1], std=std, scale=scale)
    x = dense_image_warp(x, flow=flow)
    x = tf.squeeze(x, axis=-1)
    return x


def transform(x, network):
    """
    Data augmentation used by the "data_aug" eval layer: random masking along axes 1 and 2.

    The masked branch is selected via ``network.cond_on_train``; the other branch returns ``x`` unchanged.

    :param tf.Tensor x: layer input; NOTE(review): fed from "swap_axes", so presumably 4-D here - confirm
    :param network: object providing ``cond_on_train`` (passed as ``self.network`` by the eval layer)
    :rtype: tf.Tensor
    """
    def get_masked():
        x_masked = x
        x_masked = random_mask(x_masked, axis=1, min_num=0, max_num=4, max_dims=40)
        x_masked = random_mask(x_masked, axis=2, min_num=0, max_num=2, max_dims=20)
        return x_masked
    x = network.cond_on_train(get_masked, lambda: x)
    return x


def fixed_padding(prefix, inputs, kernel_size, data_format, conv_time_dim):
    """
    Add a "pad" layer that pads axis "s:1" by ``kernel_size - 1`` in total and leaves "s:0" unpadded.

    :param str prefix: the layer is named "<prefix>_pad"
    :param str|list[str]|None inputs: source layer(s)
    :param int kernel_size:
    :param str data_format: unused here
    :param bool conv_time_dim: unused here
    :return: name of the added layer
    :rtype: str
    """
    pad_total = kernel_size - 1
    feature_pad_beg = pad_total // 2
    feature_pad_end = pad_total - feature_pad_beg

    time_pad_beg = 0
    time_pad_end = 0

    # Fix: the original used (feature_pad_end, feature_pad_end), which over-pads for even kernel sizes.
    # For the odd kernel sizes used in this config (1, 3, 5) both variants are identical.
    return add_sequential_layer(
        "%s_pad" % prefix,
        {"class": "pad", "axes": ("s:0", "s:1"),
         "padding": [(time_pad_beg, time_pad_end), (feature_pad_beg, feature_pad_end)]},
        from_=inputs)


def conv2d_fixed_padding(prefix, filters, kernel_size, strides, dilation_rate,
                         data_format, conv_time_dim, inputs=None):
    """
    Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    :param str prefix: layers are named "<prefix>_pad_pad" and "<prefix>_conv"
    :param int filters: number of output channels
    :param int kernel_size: square kernel size
    :param int strides: with ``conv_time_dim`` applied as (1, strides), otherwise passed through as is
    :param int dilation_rate: with ``conv_time_dim`` applied as (dilation_rate, 1), otherwise (1, 1)
    :param str data_format: 'channels_first' enables "auto_use_channel_first"
    :param bool conv_time_dim:
    :param str|list[str]|None inputs: source layer(s)
    :return: name of the conv layer
    :rtype: str
    """
    pad_out = fixed_padding("%s_pad" % prefix, inputs, kernel_size, data_format, conv_time_dim)

    conv_strides = (1, strides) if conv_time_dim else strides
    kernel_shape = (kernel_size, kernel_size)
    conv_dilation = (dilation_rate, 1) if conv_time_dim else (1, 1)
    use_channel_first = data_format == 'channels_first'

    return add_sequential_layer(
        "%s_conv" % prefix,
        {"class": "conv", "n_out": filters, "filter_size": kernel_shape,
         "auto_use_channel_first": use_channel_first,
         "strides": conv_strides, "dilation_rate": conv_dilation, "padding": 'VALID',
         "activation": None, "with_bias": False, "dropout": 0,
         "forward_weights_init": "xavier", "L2": L2},
        from_=pad_out)


def _building_block_v2(prefix, inputs, filters, projection_shortcut, strides,
                       dilation_rate, kernel_size, data_format, conv_time_dim):
    """
    Add one pre-activation residual block (BN, ReLU, conv, BN, ReLU, conv, plus shortcut).

    The shortcut branch is cropped along "T" by an amount derived from the kernel size and
    dilation rate before it is added to the conv branch.

    :param str prefix: name prefix of all layers of this block
    :param str inputs: source layer
    :param int filters:
    :param ((...)->str)|None projection_shortcut: called as ``projection_shortcut(inputs=...)`` if given
    :param int strides:
    :param int dilation_rate:
    :param int kernel_size:
    :param str data_format:
    :param bool conv_time_dim:
    :return: name of the block output layer ("<prefix>_out")
    :rtype: str
    """
    bn1 = add_sequential_layer("%s_bn1" % prefix, _bn_layer(), from_=inputs)
    relu1 = add_sequential_layer("%s_relu1" % prefix, _relu_layer(), from_=bn1)
    if strides > 1 and conv_time_dim:
        # Unstrided conv followed by an explicit subsampling slice on axis "s:1";
        # the second conv of the block then uses twice the dilation rate.
        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters,
                                     kernel_size=kernel_size, strides=1,
                                     dilation_rate=dilation_rate,
                                     data_format=data_format, conv_time_dim=conv_time_dim)
        conv1 = add_sequential_layer(
            "%s_stride" % prefix, {"class": "slice", "axis": "s:1", "slice_step": strides}, from_=conv1)
        dilation_rate *= 2
    else:
        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters,
                                     kernel_size=kernel_size, strides=strides,
                                     dilation_rate=dilation_rate,
                                     data_format=data_format, conv_time_dim=conv_time_dim)

    bn2 = add_sequential_layer("%s_bn2" % prefix, _bn_layer(), from_=conv1)
    relu2 = add_sequential_layer("%s_relu2" % prefix, _relu_layer(), from_=bn2)

    conv2 = conv2d_fixed_padding(prefix=("%s_conv_2" % prefix), inputs=relu2, filters=filters,
                                 kernel_size=kernel_size, strides=1,
                                 dilation_rate=dilation_rate,
                                 data_format=data_format, conv_time_dim=conv_time_dim)
    # The projection shortcut should come after the first batch norm and ReLU
    # since it performs a 1x1 convolution.
    # Fix: derive the crop from this block's kernel_size instead of the unrelated global
    # `filter_size` (both are 3 in this config, so the resulting network is unchanged).
    crop_left = (kernel_size - 1) // 2

    if conv_time_dim:
        if strides > 1:
            # dilation_rate was doubled above, so it is even here and // is exact.
            crop = crop_left * (dilation_rate // 2 + dilation_rate)
        else:
            crop = crop_left * 2 * dilation_rate
        shortcut = add_sequential_layer(
            "%s_crop" % prefix,
            {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)
        if projection_shortcut is not None:
            shortcut = projection_shortcut(inputs=shortcut)
    else:
        crop = crop_left
        shortcut = add_sequential_layer(
            "%s_crop_1" % prefix,
            {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)

        if projection_shortcut is not None:
            shortcut = projection_shortcut(inputs=shortcut)

        shortcut = add_sequential_layer(
            "%s_crop_2" % prefix,
            {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=shortcut)
    return add_sequential_layer("%s_out" % prefix, {"class": "combine", "kind": "add"}, from_=[conv2, shortcut])


def block_layer(prefix, inputs, filters, bottleneck, block_fn, blocks, strides,
                dilation_rate, kernel_size, data_format, conv_time_dim):
    """
    Add ``blocks`` residual blocks; only the first one gets the projection shortcut and ``strides``.

    :param str prefix: blocks are named "<prefix>_<i>", the shortcut conv "<prefix>_sc"
    :param str inputs: source layer
    :param int filters:
    :param bool bottleneck: if set, the projection shortcut uses ``4 * filters`` output channels
    :param block_fn: function with the signature of :func:`_building_block_v2`
    :param int blocks: number of blocks
    :param int strides: stride of the first block
    :param int dilation_rate: multiplied by ``strides`` after the first block if ``strides > 1``
    :param int kernel_size:
    :param str data_format:
    :param bool conv_time_dim:
    :return: name of the output layer of the last block
    :rtype: str
    """
    filters_out = filters * 4 if bottleneck else filters

    def projection_shortcut(inputs=None):
        return conv2d_fixed_padding(
            prefix=("%s_sc" % prefix), filters=filters_out, kernel_size=1, strides=strides,
            dilation_rate=1, data_format=data_format, conv_time_dim=conv_time_dim, inputs=inputs)

    inputs = block_fn("%s_0" % prefix, inputs, filters, projection_shortcut, strides,
                      dilation_rate, kernel_size, data_format, conv_time_dim)
    if strides > 1:
        dilation_rate *= strides
    for i in range(1, blocks):
        inputs = block_fn("%s_%i" % (prefix, i), inputs, filters, None, 1, dilation_rate,
                          kernel_size, data_format, conv_time_dim)
    return inputs


def build_resnet(inputs):
    """
    Add the full ResNet (initial conv, four block groups, pooling, linear layer) to ``network``.

    :param str inputs: source layer
    :return: (name of the final "linear" layer, context window size in frames)
    :rtype: (str, int)
    :raises NotImplementedError: if the hard-coded ``conv_time_dim`` is disabled
    """
    resnet_version = 2
    conv_time_dim = True
    bottleneck = False
    num_filters = 64
    first_kernel_size = 5
    kernel_size = 3
    conv_stride = 2
    block_sizes = [3, 3, 3, 3]
    block_strides = [1, 2, 2, 2]
    block_fn = _building_block_v2
    data_format = 'channels_last'
    pre_activation = resnet_version == 2

    if not conv_time_dim:
        # The original left `context_window` unassigned on this path and failed with an
        # UnboundLocalError at the return statement, after the network was already built.
        raise NotImplementedError("context_window is only derived for conv_time_dim=True")

    # Frames consumed along time by the unpadded convolutions. Integer arithmetic gives the
    # same values as the original float computation (337 for the settings above).
    multiplier = 1 if bottleneck else 2
    per_conv_reduction = 2 * (kernel_size // 2)
    total_reduction = 2 * (first_kernel_size // 2)
    dilation_rate_multiplier = 2
    for bs, block_stride in zip(block_sizes, block_strides):
        total_reduction += per_conv_reduction * dilation_rate_multiplier
        dilation_rate_multiplier *= block_stride
        total_reduction += per_conv_reduction * dilation_rate_multiplier
        total_reduction += multiplier * per_conv_reduction * (bs - 1) * dilation_rate_multiplier
    context_window = 2 * (total_reduction // 2) + 1

    inputs = conv2d_fixed_padding(prefix="c_init", inputs=inputs, filters=num_filters,
                                  kernel_size=first_kernel_size, strides=conv_stride,
                                  dilation_rate=1,
                                  data_format=data_format, conv_time_dim=conv_time_dim)

    dilation_rate = 2

    if resnet_version == 1:
        inputs = add_sequential_layer("c_init_bn", _bn_layer(), from_=inputs)
        inputs = add_sequential_layer("c_init_relu", _relu_layer(), from_=inputs)

    for i, num_blocks in enumerate(block_sizes):
        filters = num_filters * (2 ** i)
        inputs = block_layer(
            prefix="c_%i" % i, inputs=inputs, filters=filters, bottleneck=bottleneck,
            block_fn=block_fn, blocks=num_blocks,
            strides=block_strides[i], dilation_rate=dilation_rate,
            kernel_size=kernel_size,
            data_format=data_format, conv_time_dim=conv_time_dim)
        dilation_rate *= block_strides[i]

    if pre_activation:
        inputs = add_sequential_layer("c_out_bn", _bn_layer(), from_=inputs)
        inputs = add_sequential_layer("c_out_relu", _relu_layer(), from_=inputs)

    # NOTE: the original wrote ("s:1"), which is the plain string "s:1", not a tuple.
    inputs = add_sequential_layer(
        "out_pool", {"class": "reduce", "mode": "avg", "axes": "s:1", "keep_dims": False}, from_=inputs)
    inputs = add_sequential_layer(
        "linear", {"class": "linear", "activation": "relu", "dropout": 0.05, "n_out": 2048}, from_=inputs)
    return inputs, context_window


# Split the feature axis into (channel_num, feature_dim), then swap axis "s:1" with "f".
inputs = add_sequential_layer("split", {"class": "split_dims", "axis": "f", "dims": (channel_num, feature_dim)})
inputs = add_sequential_layer("swap_axes", {"class": "swap_axes", "axis1": "s:1", "axis2": "f"})
inputs = add_sequential_layer("data_aug", {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"})
resnet_out, context_window = build_resnet(inputs=inputs)
add_sequential_layer("output", {"class": "softmax", "loss": "ce", "dropout": 0.05, "n_out": num_outputs}, from_=resnet_out)

train = get_sprint_dataset("train")
dev = get_sprint_dataset("cv")

############## debug stuff
debug_print_layer_output_template = True  # useful for debugging
#debug_print_layer_output_sizes = True
#debug_print_layer_output_shape = True  # might be useful for debugging
#debug_shell_in_runner = True
log_batch_size = True
tf_log_memory_usage = True
############## debug stuff

# trainer
batching = "random"
batch_size = 24 * (150 + context_window)
max_seqs = 500
chunking = "150:150"
truncation = -1
num_epochs = 100
gradient_clip = 0
gradient_noise = 0.0
momentum = 0.99
learning_rate = 5e-6
learning_rate_file = "newbob.data"
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_relative_error_relative_lr = True
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1
model = "net-model/network"
cleanup_old_models = True

# log
log = "log/crnn.train.log"
log_verbosity = 5

# Patch metadata of the original (whitespace-collapsed) git diff:
#   diff --git a/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_logmel+d+dd.config
#              b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_logmel+d+dd.config
#   new file mode 100644
#   index 00000000..c1db03f0
#   --- /dev/null
#   +++ b/2019-asr-resnet-lace-cnn/ResNet/DataAug/resnet_specaug_logmel+d+dd.config
#   @@ -0,0 +1,415 @@
#!crnn/rnn.py
# kate: syntax python;
# see also file:///u/zeyer/setups/quaero-en/training/quaero-train11/50h/ann/2015-07-29--lstm-gt50/config-train/dropout01.3l.n500.custom_lstm.adam.lr1e_3.config

import os
import numpy
from subprocess import check_output

# task
use_tensorflow = True
task = "train"
device = "gpu"
multiprocessing = True
update_on_device = True

_cf_cache = {}


def cf(filename):
    """
    Cache manager: map a file name to a locally cached copy via the external ``cf`` tool.

    On the listed debugging hosts the file name is returned unchanged (and not memoized).

    :param str filename:
    :return: path printed by ``cf filename``, or ``filename`` itself on a debugging host
    :rtype: str
    """
    if filename in _cf_cache:
        return _cf_cache[filename]
    if check_output(["hostname"]).strip().decode("utf8") in ["osmium", "sulfid", "zink", "cobalt", "niob"]:
        print("use local file: %s" % filename)
        return filename  # for debugging
    cached_fn = check_output(["cf", filename]).strip().decode("utf8")
    assert os.path.exists(cached_fn), "cf returned non-existing path %r for %r" % (cached_fn, filename)
    _cf_cache[filename] = cached_fn
    return cached_fn


# data
context_window = 1  # placeholder; overwritten below with the value returned by build_resnet()
window = 1
feature_dim = 64  # LogMel 64-dim
channel_num = 3
num_inputs = feature_dim * channel_num * window
num_outputs = 9001  # CART labels
EpochSplit = 6


def get_sprint_dataset(data):
    """
    Build the ``ExternSprintDataset`` options dict for the given data split.

    Must be called after ``context_window`` has been set from build_resnet() (asserted > 1).

    :param str data: "train" or "cv"
    :return: dataset options dict (class, Sprint executable, Sprint config args, epoch partitioning)
    :rtype: dict[str]
    """
    assert context_window > 1
    assert data in ["train", "cv"]
    epochSplit = {"train": EpochSplit, "cv": 1}
    # see /u/tuske/work/ASR/switchboard/corpus/readme
    # and zoltans mail https://mail.google.com/mail/u/0/#inbox/152891802cbb2b40
    files = {}
    files["config"] = "config/training.config"
    files["corpus"] = "/u/corpora/speech/switchboard-1/xml/swb1-all/swb1-all.corpus.gz"
    files["segments"] = "dependencies/seg_%s" % {"train": "train", "cv": "cv_head3000"}[data]
    files["features"] = "/u/bozheniuk/setups/switchboard/feature_extraction/cluster_setup/logmel64_30/data/logmel.train.bundle"
    files["lexicon"] = "/u/tuske/work/ASR/switchboard/corpus/train.lex.v1_0_3.ci.gz"
    files["alignment"] = "dependencies/tuske__2016_01_28__align.combined.train"
    files["cart"] = "/u/tuske/work/ASR/switchboard/initalign/data/%s" % {9001: "cart-9000"}[num_outputs]
    for k, v in sorted(files.items()):
        assert os.path.exists(v), "%s %r does not exist" % (k, v)
    estimated_num_seqs = {"train": 227047, "cv": 3000}  # wc -l segment-file
    # Entries wrapped in a lambda defer the cf() call until the entry is evaluated.
    # (The original list passed --*.trainer-output-dimension twice with the same value;
    # the redundant second occurrence is dropped.)
    args = [
        "--config=" + files["config"],
        lambda: "--*.corpus.file=" + cf(files["corpus"]),
        lambda: "--*.corpus.segments.file=" + cf(files["segments"]),
        {"train": "--*.corpus.segment-order-shuffle=true",
         "cv": "--*.corpus.segment-order-sort-by-time-length=true"}[data],
        "--*.state-tying.type=cart",
        lambda: "--*.state-tying.file=" + cf(files["cart"]),
        "--*.trainer-output-dimension=%i" % num_outputs,
        lambda: "--*.lexicon.file=" + cf(files["lexicon"]),
        lambda: "--*.alignment-cache-path=" + cf(files["alignment"]),
        lambda: "--*.feature-cache-path=" + cf(files["features"]),
        "--*.log-channel.file=log/crnn.sprint.train-dataset.xml",
        "--*.window-size=1",
    ]
    return {
        "class": "ExternSprintDataset", "sprintTrainerExecPath": "sprint-executables/nn-trainer",
        "sprintConfigStr": args,
        "partitionEpoch": epochSplit[data],
        "estimated_num_seqs": estimated_num_seqs[data] // (epochSplit[data] or 1),
        "context_window": context_window}


cache_size = "0"

# network
# (also defined by num_inputs & num_outputs)
dropout = 0.05
L2 = 0.1
filter_size = (3, 3)  # for 2D conv on (window, feature) axes

# bn params
masked_time = False
fused = True
axes = ["f"]

bn_momentum = 0.997
bn_epsilon = 1e-5

cur_feat_dim = feature_dim
network = {}
_last = "data"


def add_sequential_layer(name, d, from_=None):
    """
    Register layer ``d`` under ``name`` in the global ``network`` dict.

    :param str name: layer name, must not exist in ``network`` yet
    :param dict[str] d: layer options without a "from" entry; it is modified in place
    :param str|list[str]|None from_: source layer(s); defaults to the most recently added layer
    :return: ``name``
    :rtype: str
    """
    global _last, network
    assert "from" not in d
    if from_ is not None:
        d["from"] = from_
    else:
        d["from"] = [_last]
    assert name not in network
    network[name] = d
    _last = name
    return name


def _bn_layer():
    """Return a fresh ``batch_norm`` layer dict built from the module-level BN settings."""
    return {"class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused,
            "momentum": bn_momentum, "epsilon": bn_epsilon}


def _relu_layer():
    """Return a fresh ReLU ``activation`` layer dict (without batch norm)."""
    return {"class": "activation", "activation": "relu", "batch_norm": False}


# data augmentation
def summary(name, x):
    """
    Add TensorFlow image/scalar/histogram summaries for ``x``.

    :param str name:
    :param tf.Tensor x: (batch,time,feature)
    """
    import tensorflow as tf
    # tf.summary.image wants [batch_size, height, width, channels],
    # we have (batch, time, feature).
    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
    tf.summary.image(name, img, max_outputs=10)
    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
    mean = tf.reduce_mean(x)
    tf.summary.scalar("%s_mean" % name, mean)
    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
    tf.summary.scalar("%s_stddev" % name, stddev)
    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))


def _mask(x, axis, pos, max_amount):
    """
    Zero out, per batch entry, a contiguous range along ``axis`` starting at ``pos``.

    The range length is drawn uniformly from [1, max_amount] and clipped at the axis end.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param tf.Tensor pos: (batch,)
    :param int max_amount: inclusive
    :return: ``x`` with the selected range set to 0.0
    :rtype: tf.Tensor
    """
    import tensorflow as tf
    ndim = x.get_shape().ndims
    n_batch = tf.shape(x)[0]
    dim = tf.shape(x)[axis]
    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
    pos2 = tf.minimum(pos + amount, dim)
    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
    # Keep only the batch axis and `axis`; all other axes become size 1 for broadcasting.
    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)])
    from TFUtil import where_bc
    x = where_bc(cond, 0.0, x)
    return x


def random_mask(x, axis, min_num, max_num, max_dims):
    """
    Apply a random number of masks (see :func:`_mask`) along ``axis``, independently per batch entry.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param int|tf.Tensor min_num:
    :param int|tf.Tensor max_num: inclusive
    :param int max_dims: inclusive
    :rtype: tf.Tensor
    """
    import tensorflow as tf
    n_batch = tf.shape(x)[0]
    num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
    _, indices = tf.nn.top_k(z, tf.reduce_max(num))
    # Iteration i applies the i-th mask only to the batch entries with i < num.
    _, x = tf.while_loop(
        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
        body=lambda i, x: (
            i + 1,
            tf.where(
                tf.less(i, num),
                _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims),
                x)),
        loop_vars=(0, x))
    return x


def random_warp(x, std, scale):
    """
    :param tf.Tensor x: (batch,time,dim)
    :param (float,float) std:
    :param (float,float) scale:
    :rtype: tf.Tensor
    :return: x transformed
    """
    import tensorflow as tf
    from TFUtil import create_random_warp_flow_2d, dense_image_warp
    x = tf.expand_dims(x, axis=-1)
    flow = create_random_warp_flow_2d(tf.shape(x)[:-1], std=std, scale=scale)
    x = dense_image_warp(x, flow=flow)
    x = tf.squeeze(x, axis=-1)
    return x


def transform(x, network):
    """
    Data augmentation used by the "data_aug" eval layer: random masking along axes 1 and 2.

    The masked branch is selected via ``network.cond_on_train``; the other branch returns ``x`` unchanged.

    :param tf.Tensor x: layer input; NOTE(review): fed from "swap_axes", so presumably 4-D here - confirm
    :param network: object providing ``cond_on_train`` (passed as ``self.network`` by the eval layer)
    :rtype: tf.Tensor
    """
    def get_masked():
        x_masked = x
        x_masked = random_mask(x_masked, axis=1, min_num=0, max_num=4, max_dims=40)
        x_masked = random_mask(x_masked, axis=2, min_num=0, max_num=2, max_dims=20)
        return x_masked
    x = network.cond_on_train(get_masked, lambda: x)
    return x


def fixed_padding(prefix, inputs, kernel_size, data_format, conv_time_dim):
    """
    Add a "pad" layer that pads axis "s:1" by ``kernel_size - 1`` in total and leaves "s:0" unpadded.

    :param str prefix: the layer is named "<prefix>_pad"
    :param str|list[str]|None inputs: source layer(s)
    :param int kernel_size:
    :param str data_format: unused here
    :param bool conv_time_dim: unused here
    :return: name of the added layer
    :rtype: str
    """
    pad_total = kernel_size - 1
    feature_pad_beg = pad_total // 2
    feature_pad_end = pad_total - feature_pad_beg

    time_pad_beg = 0
    time_pad_end = 0

    # Fix: the original used (feature_pad_end, feature_pad_end), which over-pads for even kernel sizes.
    # For the odd kernel sizes used in this config (1, 3, 5) both variants are identical.
    return add_sequential_layer(
        "%s_pad" % prefix,
        {"class": "pad", "axes": ("s:0", "s:1"),
         "padding": [(time_pad_beg, time_pad_end), (feature_pad_beg, feature_pad_end)]},
        from_=inputs)


def conv2d_fixed_padding(prefix, filters, kernel_size, strides, dilation_rate,
                         data_format, conv_time_dim, inputs=None):
    """
    Strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

    :param str prefix: layers are named "<prefix>_pad_pad" and "<prefix>_conv"
    :param int filters: number of output channels
    :param int kernel_size: square kernel size
    :param int strides: with ``conv_time_dim`` applied as (1, strides), otherwise passed through as is
    :param int dilation_rate: with ``conv_time_dim`` applied as (dilation_rate, 1), otherwise (1, 1)
    :param str data_format: 'channels_first' enables "auto_use_channel_first"
    :param bool conv_time_dim:
    :param str|list[str]|None inputs: source layer(s)
    :return: name of the conv layer
    :rtype: str
    """
    pad_out = fixed_padding("%s_pad" % prefix, inputs, kernel_size, data_format, conv_time_dim)

    conv_strides = (1, strides) if conv_time_dim else strides
    kernel_shape = (kernel_size, kernel_size)
    conv_dilation = (dilation_rate, 1) if conv_time_dim else (1, 1)
    use_channel_first = data_format == 'channels_first'

    return add_sequential_layer(
        "%s_conv" % prefix,
        {"class": "conv", "n_out": filters, "filter_size": kernel_shape,
         "auto_use_channel_first": use_channel_first,
         "strides": conv_strides, "dilation_rate": conv_dilation, "padding": 'VALID',
         "activation": None, "with_bias": False, "dropout": 0,
         "forward_weights_init": "xavier", "L2": L2},
        from_=pad_out)


def _building_block_v2(prefix, inputs, filters, projection_shortcut, strides,
                       dilation_rate, kernel_size, data_format, conv_time_dim):
    """
    Add one pre-activation residual block (BN, ReLU, conv, BN, ReLU, conv, plus shortcut).

    The shortcut branch is cropped along "T" by an amount derived from the kernel size and
    dilation rate before it is added to the conv branch.

    :param str prefix: name prefix of all layers of this block
    :param str inputs: source layer
    :param int filters:
    :param ((...)->str)|None projection_shortcut: called as ``projection_shortcut(inputs=...)`` if given
    :param int strides:
    :param int dilation_rate:
    :param int kernel_size:
    :param str data_format:
    :param bool conv_time_dim:
    :return: name of the block output layer ("<prefix>_out")
    :rtype: str
    """
    bn1 = add_sequential_layer("%s_bn1" % prefix, _bn_layer(), from_=inputs)
    relu1 = add_sequential_layer("%s_relu1" % prefix, _relu_layer(), from_=bn1)
    if strides > 1 and conv_time_dim:
        # Unstrided conv followed by an explicit subsampling slice on axis "s:1";
        # the second conv of the block then uses twice the dilation rate.
        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters,
                                     kernel_size=kernel_size, strides=1,
                                     dilation_rate=dilation_rate,
                                     data_format=data_format, conv_time_dim=conv_time_dim)
        conv1 = add_sequential_layer(
            "%s_stride" % prefix, {"class": "slice", "axis": "s:1", "slice_step": strides}, from_=conv1)
        dilation_rate *= 2
    else:
        conv1 = conv2d_fixed_padding(prefix=("%s_conv_1" % prefix), inputs=relu1, filters=filters,
                                     kernel_size=kernel_size, strides=strides,
                                     dilation_rate=dilation_rate,
                                     data_format=data_format, conv_time_dim=conv_time_dim)

    bn2 = add_sequential_layer("%s_bn2" % prefix, _bn_layer(), from_=conv1)
    relu2 = add_sequential_layer("%s_relu2" % prefix, _relu_layer(), from_=bn2)

    conv2 = conv2d_fixed_padding(prefix=("%s_conv_2" % prefix), inputs=relu2, filters=filters,
                                 kernel_size=kernel_size, strides=1,
                                 dilation_rate=dilation_rate,
                                 data_format=data_format, conv_time_dim=conv_time_dim)
    # The projection shortcut should come after the first batch norm and ReLU
    # since it performs a 1x1 convolution.
    # Fix: derive the crop from this block's kernel_size instead of the unrelated global
    # `filter_size` (both are 3 in this config, so the resulting network is unchanged).
    crop_left = (kernel_size - 1) // 2

    if conv_time_dim:
        if strides > 1:
            # dilation_rate was doubled above, so it is even here and // is exact.
            crop = crop_left * (dilation_rate // 2 + dilation_rate)
        else:
            crop = crop_left * 2 * dilation_rate
        shortcut = add_sequential_layer(
            "%s_crop" % prefix,
            {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)
        if projection_shortcut is not None:
            shortcut = projection_shortcut(inputs=shortcut)
    else:
        crop = crop_left
        shortcut = add_sequential_layer(
            "%s_crop_1" % prefix,
            {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=relu1)

        if projection_shortcut is not None:
            shortcut = projection_shortcut(inputs=shortcut)

        shortcut = add_sequential_layer(
            "%s_crop_2" % prefix,
            {"class": "slice", "axis": "T", "slice_start": crop, "slice_end": -crop}, from_=shortcut)
    return add_sequential_layer("%s_out" % prefix, {"class": "combine", "kind": "add"}, from_=[conv2, shortcut])


def block_layer(prefix, inputs, filters, bottleneck, block_fn, blocks, strides,
                dilation_rate, kernel_size, data_format, conv_time_dim):
    """
    Add ``blocks`` residual blocks; only the first one gets the projection shortcut and ``strides``.

    :param str prefix: blocks are named "<prefix>_<i>", the shortcut conv "<prefix>_sc"
    :param str inputs: source layer
    :param int filters:
    :param bool bottleneck: if set, the projection shortcut uses ``4 * filters`` output channels
    :param block_fn: function with the signature of :func:`_building_block_v2`
    :param int blocks: number of blocks
    :param int strides: stride of the first block
    :param int dilation_rate: multiplied by ``strides`` after the first block if ``strides > 1``
    :param int kernel_size:
    :param str data_format:
    :param bool conv_time_dim:
    :return: name of the output layer of the last block
    :rtype: str
    """
    filters_out = filters * 4 if bottleneck else filters

    def projection_shortcut(inputs=None):
        return conv2d_fixed_padding(
            prefix=("%s_sc" % prefix), filters=filters_out, kernel_size=1, strides=strides,
            dilation_rate=1, data_format=data_format, conv_time_dim=conv_time_dim, inputs=inputs)

    inputs = block_fn("%s_0" % prefix, inputs, filters, projection_shortcut, strides,
                      dilation_rate, kernel_size, data_format, conv_time_dim)
    if strides > 1:
        dilation_rate *= strides
    for i in range(1, blocks):
        inputs = block_fn("%s_%i" % (prefix, i), inputs, filters, None, 1, dilation_rate,
                          kernel_size, data_format, conv_time_dim)
    return inputs


def build_resnet(inputs):
    """
    Add the full ResNet (initial conv, four block groups, pooling, linear layer) to ``network``.

    :param str inputs: source layer
    :return: (name of the final "linear" layer, context window size in frames)
    :rtype: (str, int)
    :raises NotImplementedError: if the hard-coded ``conv_time_dim`` is disabled
    """
    resnet_version = 2
    conv_time_dim = True
    bottleneck = False
    num_filters = 64
    first_kernel_size = 5
    kernel_size = 3
    conv_stride = 2
    block_sizes = [3, 3, 3, 3]
    block_strides = [1, 2, 2, 2]
    block_fn = _building_block_v2
    data_format = 'channels_last'
    pre_activation = resnet_version == 2

    if not conv_time_dim:
        # The original left `context_window` unassigned on this path and failed with an
        # UnboundLocalError at the return statement, after the network was already built.
        raise NotImplementedError("context_window is only derived for conv_time_dim=True")

    # Frames consumed along time by the unpadded convolutions. Integer arithmetic gives the
    # same values as the original float computation (337 for the settings above).
    multiplier = 1 if bottleneck else 2
    per_conv_reduction = 2 * (kernel_size // 2)
    total_reduction = 2 * (first_kernel_size // 2)
    dilation_rate_multiplier = 2
    for bs, block_stride in zip(block_sizes, block_strides):
        total_reduction += per_conv_reduction * dilation_rate_multiplier
        dilation_rate_multiplier *= block_stride
        total_reduction += per_conv_reduction * dilation_rate_multiplier
        total_reduction += multiplier * per_conv_reduction * (bs - 1) * dilation_rate_multiplier
    context_window = 2 * (total_reduction // 2) + 1

    inputs = conv2d_fixed_padding(prefix="c_init", inputs=inputs, filters=num_filters,
                                  kernel_size=first_kernel_size, strides=conv_stride,
                                  dilation_rate=1,
                                  data_format=data_format, conv_time_dim=conv_time_dim)

    dilation_rate = 2

    if resnet_version == 1:
        inputs = add_sequential_layer("c_init_bn", _bn_layer(), from_=inputs)
        inputs = add_sequential_layer("c_init_relu", _relu_layer(), from_=inputs)

    for i, num_blocks in enumerate(block_sizes):
        filters = num_filters * (2 ** i)
        inputs = block_layer(
            prefix="c_%i" % i, inputs=inputs, filters=filters, bottleneck=bottleneck,
            block_fn=block_fn, blocks=num_blocks,
            strides=block_strides[i], dilation_rate=dilation_rate,
            kernel_size=kernel_size,
            data_format=data_format, conv_time_dim=conv_time_dim)
        dilation_rate *= block_strides[i]

    if pre_activation:
        inputs = add_sequential_layer("c_out_bn", _bn_layer(), from_=inputs)
        inputs = add_sequential_layer("c_out_relu", _relu_layer(), from_=inputs)

    # NOTE: the original wrote ("s:1"), which is the plain string "s:1", not a tuple.
    inputs = add_sequential_layer(
        "out_pool", {"class": "reduce", "mode": "avg", "axes": "s:1", "keep_dims": False}, from_=inputs)
    inputs = add_sequential_layer(
        "linear", {"class": "linear", "activation": "relu", "dropout": 0.05, "n_out": 2048}, from_=inputs)
    return inputs, context_window


# Split the feature axis into (channel_num, feature_dim), then swap axis "s:1" with "f".
inputs = add_sequential_layer("split", {"class": "split_dims", "axis": "f", "dims": (channel_num, feature_dim)})
inputs = add_sequential_layer("swap_axes", {"class": "swap_axes", "axis1": "s:1", "axis2": "f"})
inputs = add_sequential_layer("data_aug", {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"})
resnet_out, context_window = build_resnet(inputs=inputs)
add_sequential_layer("output", {"class": "softmax", "loss": "ce", "dropout": 0.05, "n_out": num_outputs}, from_=resnet_out)

train = get_sprint_dataset("train")
dev = get_sprint_dataset("cv")

############## debug stuff
debug_print_layer_output_template = True  # useful for debugging
#debug_print_layer_output_sizes = True
#debug_print_layer_output_shape = True  # might be useful for debugging
#debug_shell_in_runner = True
log_batch_size = True
tf_log_memory_usage = True
############## debug stuff

# trainer
batching = "random"
batch_size = 24 * (150 + context_window)
max_seqs = 500
chunking = "150:150"
truncation = -1
num_epochs = 100
gradient_clip = 0
gradient_noise = 0.0
momentum = 0.99
learning_rate = 5e-6
learning_rate_file = "newbob.data"
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_relative_error_relative_lr = True
# trainer (continued): remaining settings that go with learning_rate_control = "newbob_multi_epoch"
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1
model = "net-model/network"  # model file name prefix
cleanup_old_models = True

# log
log = "log/crnn.train.log"
log_verbosity = 5