#!crnn/rnn.py # kate: syntax python; # -*- mode: python -*- # sublime: syntax 'Packages/Python Improved/PythonImproved.tmLanguage' # vim:set expandtab tabstop=4 fenc=utf-8 ff=unix ft=python: import os import numpy from subprocess import check_output, CalledProcessError from Pretrain import WrapEpochValue # task use_tensorflow = True task = "train" device = "gpu" multiprocessing = True update_on_device = True setup_name = os.path.splitext(os.path.basename(__file__))[0] debug_mode = False if int(os.environ.get("RETURNN_DEBUG", "0")): print("** DEBUG MODE") debug_mode = True if config.has("beam_size"): beam_size = config.int("beam_size", 0) print("** beam_size %i" % beam_size) else: beam_size = 12 # data num_inputs = 40 num_outputs = {"classes": (10073, 1), "data": (num_inputs, 2)} # see vocab EpochSplit = 20 def get_dataset(key, subset=None, train_partition_epoch=None): d = { 'class': 'LibriSpeechCorpus', #'SpecAugCernEULibriUnescoUnogWipoCorpus', 'path': 'data/dataset_libri', "use_zip": False, "use_cache_manager": not debug_mode, "prefix": key, "bpe": { 'bpe_file': 'data/dataset_libri/trans.bpe.codes', 'vocab_file': 'data/dataset_libri/trans.bpe.vocab', 'seq_postfix': [0], 'unknown_label': ''}, "audio": { "norm_mean": "data/dataset_libri/stats.mean.txt", "norm_std_dev": "data/dataset_libri/stats.mean.txt"}, } if key.startswith("train"): d["partition_epoch"] = train_partition_epoch if key == "train": d["epoch_wise_filter"] = { (1, 5): { 'use_new_filter': True, #'max_mean_len': 50, # chars 'max_mean_len': 75, # chars 'subdirs': ['train-clean-100-libri', 'train-clean-360-libri']}, (5, 10): { 'use_new_filter': True, #'max_mean_len': 150, # chars 'max_mean_len': 150, # chars 'subdirs': ['train-clean-100-libri', 'train-clean-360-libri']}, (11, 20): { 'use_new_filter': True, 'subdirs': ['train-clean-100-libri', 'train-clean-360-libri']} } #d["audio"]["random_permute"] = { # "rnd_scale_lower": 1., "rnd_scale_upper": 1., # } num_seqs = 281241 #812155 #281241 # total d["seq_ordering"] = "laplace:%i" % (num_seqs // 1000) else: d["fixed_random_seed"] = 1 d["seq_ordering"] = "sorted_reverse" if subset: d["fixed_random_subset"] = subset # faster return d train = get_dataset("train", train_partition_epoch=EpochSplit) dev = get_dataset("dev", subset=3000) cache_size = "0" window = 1 # network # (also defined by num_inputs & num_outputs) target = "classes" EncKeyTotalDim = 1024 AttNumHeads = 1 EncKeyPerHeadDim = EncKeyTotalDim // AttNumHeads EncValueTotalDim = 2048 EncValuePerHeadDim = EncValueTotalDim // AttNumHeads LstmDim = EncValueTotalDim // 2 def summary(name, x): """ :param str name: :param tf.Tensor x: (batch,time,feature) """ import tensorflow as tf # tf.summary.image wants [batch_size, height, width, channels], # we have (batch, time, feature). img = tf.expand_dims(x, axis=3) # (batch,time,feature,1) img = tf.transpose(img, [0, 2, 1, 3]) # (batch,feature,time,1) tf.summary.image(name, img, max_outputs=10) tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x))) mean = tf.reduce_mean(x) tf.summary.scalar("%s_mean" % name, mean) stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean))) tf.summary.scalar("%s_stddev" % name, stddev) tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2)) def _mask(x, axis, pos, max_amount): """ :param tf.Tensor x: (batch,time,feature) :param int axis: :param tf.Tensor pos: (batch,) :param int max_amount: inclusive """ import tensorflow as tf ndim = x.get_shape().ndims n_batch = tf.shape(x)[0] dim = tf.shape(x)[axis] amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32) pos2 = tf.minimum(pos + amount, dim) idxs = tf.expand_dims(tf.range(0, dim), 0) # (1,dim) pos_bc = tf.expand_dims(pos, 1) # (batch,1) pos2_bc = tf.expand_dims(pos2, 1) # (batch,1) cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc)) # (batch,dim) cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)]) from TFUtil import where_bc x = where_bc(cond, 0.0, x) return x def random_mask(x, axis, min_num, max_num, max_dims): """ :param tf.Tensor x: (batch,time,feature) :param int axis: :param int|tf.Tensor min_num: :param int|tf.Tensor max_num: inclusive :param int max_dims: inclusive """ import tensorflow as tf n_batch = tf.shape(x)[0] num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32) # https://github.com/tensorflow/tensorflow/issues/9260 # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/ z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1))) _, indices = tf.nn.top_k(z, tf.reduce_max(num)) # indices should be sorted, and of shape (batch,num), entries (int32) in [0,dim) # indices = tf.Print(indices, ["indices", indices, tf.shape(indices)]) _, x = tf.while_loop( cond=lambda i, _: tf.less(i, tf.reduce_max(num)), body=lambda i, x: ( i + 1, tf.where( tf.less(i, num), _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims), x)), loop_vars=(0, x)) return x def transform(x, network): import tensorflow as tf # summary("features", x) #x = tf.clip_by_value(x, -3.0, 3.0) #summary("features_clip", x) def get_masked(): x_masked = x x_masked = random_mask(x_masked, axis=1, min_num=1, max_num=tf.maximum(tf.shape(x)[1] // 100, 1), max_dims=20) x_masked = random_mask(x_masked, axis=2, min_num=1, max_num=2, max_dims=num_inputs // 5) #summary("features_mask", x_masked) return x_masked x = network.cond_on_train(get_masked, lambda: x) return x network = { "source": {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"}, "source0": {"class": "split_dims", "axis": "F", "dims": (-1, 1), "from": "source"}, # (T,40,1) # Lingvo: ep.conv_filter_shapes = [(3, 3, 1, 32), (3, 3, 32, 32)], ep.conv_filter_strides = [(2, 2), (2, 2)] "conv0": {"class": "conv", "from": "source0", "padding": "same", "filter_size": (3, 3), "n_out": 32, "activation": None, "with_bias": True}, # (T,40,32) "conv0p": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1, 2), "from": "conv0"}, # (T,20,32) "conv1": {"class": "conv", "from": "conv0p", "padding": "same", "filter_size": (3, 3), "n_out": 32, "activation": None, "with_bias": True}, # (T,20,32) "conv1p": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1, 2), "from": "conv1"}, # (T,10,32) "conv_merged": {"class": "merge_dims", "from": "conv1p", "axes": "static"}, # (T,320) "lstm0_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["conv_merged"] }, "lstm0_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["conv_merged"] }, "lstm0_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (3,), "from": ["lstm0_fw", "lstm0_bw"], "trainable": False}, "lstm1_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm0_pool"], "dropout": 0.3 }, "lstm1_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm0_pool"], "dropout": 0.3 }, "lstm1_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (2,), "from": ["lstm1_fw", "lstm1_bw"], "trainable": False}, "lstm2_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm1_pool"], "dropout": 0.3 }, "lstm2_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm1_pool"], "dropout": 0.3 }, "lstm2_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm2_fw", "lstm2_bw"], "trainable": False}, "lstm3_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm2_pool"], "dropout": 0.3 }, "lstm3_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm2_pool"], "dropout": 0.3 }, "lstm3_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm3_fw", "lstm3_bw"], "trainable": False}, "lstm4_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm3_pool"], "dropout": 0.3 }, "lstm4_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm3_pool"], "dropout": 0.3 }, "lstm4_pool": {"class": "pool", "mode": "max", "padding": "same", "pool_size": (1,), "from": ["lstm4_fw", "lstm4_bw"], "trainable": False}, "lstm5_fw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": 1, "from": ["lstm4_pool"], "dropout": 0.3 }, "lstm5_bw" : { "class": "rec", "unit": "nativelstm2", "n_out" : LstmDim, "direction": -1, "from": ["lstm4_pool"], "dropout": 0.3 }, "encoder": {"class": "copy", "from": ["lstm5_fw", "lstm5_bw"]}, # dim: EncValueTotalDim "enc_ctx": {"class": "linear", "activation": None, "with_bias": True, "from": ["encoder"], "n_out": EncKeyTotalDim}, # preprocessed_attended in Blocks "inv_fertility": {"class": "linear", "activation": "sigmoid", "with_bias": False, "from": ["encoder"], "n_out": AttNumHeads}, "enc_value": {"class": "split_dims", "axis": "F", "dims": (AttNumHeads, EncValuePerHeadDim), "from": ["encoder"]}, # (B, enc-T, H, D'/H) "output": {"class": "rec", "from": [], 'cheating': config.bool("cheating", False), "unit": { 'output': {'class': 'choice', 'target': target, 'beam_size': beam_size, 'cheating': config.bool("cheating", False), 'from': ["output_prob"], "initial_output": 0}, "end": {"class": "compare", "from": ["output"], "value": 0}, 'target_embed': {'class': 'linear', 'activation': None, "with_bias": False, 'from': ['output'], "n_out": 621, "initial_output": 0}, # feedback_input "weight_feedback": {"class": "linear", "activation": None, "with_bias": False, "from": ["prev:accum_att_weights"], "n_out": EncKeyTotalDim}, "s_transformed": {"class": "linear", "activation": None, "with_bias": False, "from": ["s"], "n_out": EncKeyTotalDim}, "energy_in": {"class": "combine", "kind": "add", "from": ["base:enc_ctx", "weight_feedback", "s_transformed"], "n_out": EncKeyTotalDim}, "energy_tanh": {"class": "activation", "activation": "tanh", "from": ["energy_in"]}, "energy": {"class": "linear", "activation": None, "with_bias": False, "from": ["energy_tanh"], "n_out": AttNumHeads}, # (B, enc-T, H) "att_weights": {"class": "softmax_over_spatial", "from": ["energy"]}, # (B, enc-T, H) "accum_att_weights": {"class": "eval", "from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"], "eval": "source(0) + source(1) * source(2) * 0.5", "out_type": {"dim": AttNumHeads, "shape": (None, AttNumHeads)}}, "att0": {"class": "generic_attention", "weights": "att_weights", "base": "base:enc_value"}, # (B, H, V) "att": {"class": "merge_dims", "axes": "except_batch", "from": ["att0"]}, # (B, H*V) "s": {"class": "rec", "unit": "nativelstm2", "from": ["prev:target_embed", "prev:att"], "n_out": 1000}, # transform "readout_in": {"class": "linear", "from": ["s", "prev:target_embed", "att"], "activation": None, "n_out": 1000}, # merge + post_merge bias "readout": {"class": "reduce_out", "mode": "max", "num_pieces": 2, "from": ["readout_in"]}, "output_prob": { "class": "softmax", "from": ["readout"], "dropout": 0.3, "target": target, "loss": "ce", "loss_opts": {"label_smoothing": 0.1}} }, "target": target, "max_seq_len": "max_len_from('base:encoder')"}, "decision": { "class": "decide", "from": ["output"], "loss": "edit_distance", "target": target, "loss_opts": { #"debug_print": True } }, "ctc": {"class": "softmax", "from": ["encoder"], "loss": "ctc", "target": target, "loss_opts": {"beam_width": 1, "ctc_opts": {"ignore_longer_outputs_than_inputs": True}}} } search_output_layer = "decision" debug_print_layer_output_template = True # trainer batching = "random" log_batch_size = True batch_size = 10000 max_seqs = 200 max_seq_length = {"classes": 75} #chunking = "" # no chunking truncation = -1 def custom_construction_algo(idx, net_dict): # For debugging, use: python3 ./crnn/Pretrain.py config... Maybe set repetitions=1 below. StartNumLayers = 2 InitialDimFactor = 0.5 orig_num_lstm_layers = 0 while "lstm%i_fw" % orig_num_lstm_layers in net_dict: orig_num_lstm_layers += 1 assert orig_num_lstm_layers >= 2 orig_red_factor = 1 for i in range(orig_num_lstm_layers - 1): orig_red_factor *= net_dict["lstm%i_pool" % i]["pool_size"][0] net_dict["#config"] = {} if idx < 4: net_dict["#config"]["batch_size"] = 15000 idx = max(idx - 6, 0) # repeat first num_lstm_layers = idx + StartNumLayers # idx starts at 0. start with N layers if num_lstm_layers > orig_num_lstm_layers: # Finish. This will also use label-smoothing then. return None if num_lstm_layers == 2: net_dict["lstm0_pool"]["pool_size"] = (orig_red_factor,) # Skip to num layers. net_dict["encoder"]["from"] = ["lstm%i_fw" % (num_lstm_layers - 1), "lstm%i_bw" % (num_lstm_layers - 1)] # Delete non-used lstm layers. This is not explicitly necessary but maybe nicer. for i in range(num_lstm_layers, orig_num_lstm_layers): del net_dict["lstm%i_fw" % i] del net_dict["lstm%i_bw" % i] del net_dict["lstm%i_pool" % (i - 1)] # Thus we have layers 0 .. (num_lstm_layers - 1). layer_idxs = list(range(0, num_lstm_layers)) layers = ["lstm%i_fw" % i for i in layer_idxs] + ["lstm%i_bw" % i for i in layer_idxs] grow_frac = 1.0 - float(orig_num_lstm_layers - num_lstm_layers) / (orig_num_lstm_layers - StartNumLayers) dim_frac = InitialDimFactor + (1.0 - InitialDimFactor) * grow_frac for layer in layers: net_dict[layer]["n_out"] = int(net_dict[layer]["n_out"] * dim_frac) if "dropout" in net_dict[layer]: net_dict[layer]["dropout"] *= dim_frac net_dict["enc_value"]["dims"] = (AttNumHeads, int(EncValuePerHeadDim * dim_frac * 0.5) * 2) # Use label smoothing only at the very end. net_dict["output"]["unit"]["output_prob"]["loss_opts"]["label_smoothing"] = 0 return net_dict pretrain = {"repetitions": 5, "copy_param_mode": "subset", "construction_algo": custom_construction_algo} num_epochs = 250 model = "data/exp-%s/model" % setup_name #model = "net-model/network" cleanup_old_models = True gradient_clip = 0 #gradient_clip_global_norm = 1.0 adam = True optimizer_epsilon = 1e-8 accum_grad_multiple_step = 2 #debug_add_check_numerics_ops = True #debug_add_check_numerics_on_output = True #stop_on_nonfinite_train_score = False tf_log_memory_usage = True #debug_grad_summaries = True gradient_noise = 0.0 learning_rate = 0.0008 learning_rates = list(numpy.linspace(0.0001, learning_rate, num=20)) # warmup min_learning_rate = learning_rate / 50. learning_rate_control = "newbob_multi_epoch" #learning_rate_control_error_measure = "dev_score_output" learning_rate_control_relative_error_relative_lr = True learning_rate_control_min_num_epochs_per_new_lr = 3 use_learning_rate_control_always = True newbob_multi_num_epochs = EpochSplit newbob_multi_update_interval = 1 newbob_learning_rate_decay = 0.9 learning_rate_file = "data/exp-%s/train-scores.data" % setup_name # log #log = "| /u/zeyer/dotfiles/system-tools/bin/mt-cat.py >> log/crnn.seq-train.%s.log" % task log = "data/exp-%s/returnn.%s.$date.log" % (setup_name, task) log_verbosity = 5