#!crnn/rnn.py
# kate: syntax python;
# -*- mode: python -*-
# sublime: syntax 'Packages/Python Improved/PythonImproved.tmLanguage'
# vim:set expandtab tabstop=4 fenc=utf-8 ff=unix ft=python:
# run:
# returnn/rnn.py returnn.config
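# The task ("train" or "search") can also be overridden on the command line,
# e.g. (assuming the usual RETURNN "++" option syntax):
#   returnn/rnn.py <config> ++task search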
import os
from subprocess import check_output
import numpy
my_dir = os.path.dirname(os.path.abspath(__file__))
# task
use_tensorflow = True
task = config.value("task", "train")
device = "gpu"
multiprocessing = True
update_on_device = True
debug_mode = False
if int(os.environ.get("DEBUG", "0")):
    print("** DEBUG MODE")
    debug_mode = True

if config.has("beam_size"):
    beam_size = config.int("beam_size", 0)
    print("** beam_size %i" % beam_size)
elif task == "train":
    beam_size = 4
else:
    beam_size = 12

if task == "train":
    search_max_seq_len = "tf.to_int32(tf.to_float(max_len_from('data:classes')) * 1.2)"
else:
    search_max_seq_len = "max_len_from('encoder') * 3"
# data
# see: returnn/tools/dump-dataset.py "{'class':'TranslationDataset', 'path':'dataset', 'file_postfix':'train'}"
num_outputs = {'data': [24236, 1], 'classes': [24214, 1]}
num_inputs = num_outputs["data"][0]
# see: returnn/tools/dump-dataset.py "{'class':'TranslationDataset', 'path':'dataset', 'file_postfix':'train', 'seq_ordering':'sorted'}" --get_num_seqs
num_seqs = {'train': 4218414, 'dev': 3003}
# Via JTP: seeing the whole train dataset 5 times is enough.
EpochSplit = 50
SeqOrderTrainBins = num_seqs["train"] // 1000
TrainSeqOrder = "laplace:%i" % SeqOrderTrainBins
if debug_mode:
    TrainSeqOrder = "default"
_cf_cache = {}
def cf(filename):
    """Cache manager: copy the file to node-local disk via the "cf" tool and memoize the cached path."""
    if filename in _cf_cache:
        return _cf_cache[filename]
    if int(os.environ.get("CF_NOT_FOR_LOCAL", "1")) and check_output(["hostname"]).strip().decode("utf8") in ["cluster-cn-211", "sulfid"]:
        print("use local file: %s" % filename)
        return filename  # for debugging
    cached_fn = check_output(["cf", filename]).strip().decode("utf8")
    assert os.path.exists(cached_fn)
    _cf_cache[filename] = cached_fn
    return cached_fn
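# Note: cf() wraps the site-local "cf" cluster tool, which copies a file to
# node-local disk and prints the cached path. It is not applied to the dataset
# path below, but could be used like this (hypothetical example):
#   some_file = cf("/work/path/to/some/large.file")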
def get_dataset(data):
    epochSplit = {"train": EpochSplit}.get(data, 1)
    return {
        "class": "TranslationDataset",
        "path": "base/dataset",
        "file_postfix": data,
        "source_postfix": " </S>",
        "target_postfix": " </S>",
        "partition_epoch": epochSplit,
        "seq_ordering": {"train": TrainSeqOrder, "dev": "sorted"}.get(data, "default"),
        "estimated_num_seqs": (num_seqs[data] // epochSplit) if data in num_seqs else None}
train = get_dataset("train")
dev = get_dataset("dev")
dev_short = get_dataset("dev_short")
test_data = get_dataset("test")
eval_data = get_dataset("eval")
cache_size = "0"
window = 1
# network
# (also defined by num_inputs & num_outputs)
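# Rough sketch of the model defined below: a 4-layer bidirectional LSTM encoder
# (2x1000 units per layer) and an attention decoder with additive (MLP /
# Bahdanau-style) attention, extended by a fertility-scaled accumulation of the
# attention weights ("inv_fertility") as coverage-like feedback, a maxout-style
# readout ("reduce_out" with num_pieces=2), and a softmax output with label
# smoothing.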
network = {
    "source_embed": {"class": "linear", "activation": None, "with_bias": False, "n_out": 620},

    "lstm0_fw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": 1, "from": ["source_embed"]},
    "lstm0_bw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": -1, "from": ["source_embed"]},
    "lstm1_fw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": 1, "from": ["lstm0_fw", "lstm0_bw"]},
    "lstm1_bw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": -1, "from": ["lstm0_fw", "lstm0_bw"]},
    "lstm2_fw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": 1, "from": ["lstm1_fw", "lstm1_bw"]},
    "lstm2_bw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": -1, "from": ["lstm1_fw", "lstm1_bw"]},
    "lstm3_fw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": 1, "from": ["lstm2_fw", "lstm2_bw"]},
    "lstm3_bw": {"class": "rec", "unit": "nativelstm2", "n_out": 1000, "direction": -1, "from": ["lstm2_fw", "lstm2_bw"]},

    "encoder": {"class": "copy", "from": ["lstm3_fw", "lstm3_bw"]},
    "enc_ctx": {"class": "linear", "activation": None, "with_bias": True, "from": ["encoder"], "n_out": 1000},  # preprocessed_attended in Blocks
    "inv_fertility": {"class": "linear", "activation": "sigmoid", "with_bias": False, "from": ["encoder"], "n_out": 1},

    "output": {"class": "rec", "from": [], "unit": {
        "output": {"class": "choice", "target": "classes", "beam_size": beam_size, "from": ["output_prob"], "initial_output": 0},
        "end": {"class": "compare", "from": ["output"], "value": 0},
        "target_embed": {"class": "linear", "activation": None, "with_bias": False, "from": ["output"], "n_out": 621, "initial_output": 0},  # feedback_input
        "weight_feedback": {"class": "linear", "activation": None, "with_bias": False, "from": ["prev:accum_att_weights"], "n_out": 1000},
        "prev_s_state": {"class": "get_last_hidden_state", "from": ["prev:s"], "n_out": 2000},
        "prev_s_transformed": {"class": "linear", "activation": None, "with_bias": False, "from": ["prev_s_state"], "n_out": 1000},
        "energy_in": {"class": "combine", "kind": "add", "from": ["base:enc_ctx", "weight_feedback", "prev_s_transformed"], "n_out": 1000},
        "energy_tanh": {"class": "activation", "activation": "tanh", "from": ["energy_in"]},
        "energy": {"class": "linear", "activation": None, "with_bias": False, "from": ["energy_tanh"], "n_out": 1},  # (B, enc-T, 1)
        "att_weights": {"class": "softmax_over_spatial", "from": ["energy"]},  # (B, enc-T, 1)
        "accum_att_weights": {"class": "eval", "from": ["prev:accum_att_weights", "att_weights", "base:inv_fertility"],
                              "eval": "source(0) + source(1) * source(2) * 0.5", "out_type": {"dim": 1, "shape": (None, 1)}},
        "att": {"class": "generic_attention", "weights": "att_weights", "base": "base:encoder"},
        "s": {"class": "rnn_cell", "unit": "LSTMBlock", "from": ["target_embed", "att"], "n_out": 1000},  # transform
        "readout_in": {"class": "linear", "from": ["prev:s", "prev:target_embed", "att"], "activation": None, "n_out": 1000},  # merge + post_merge bias
        "readout": {"class": "reduce_out", "mode": "max", "num_pieces": 2, "from": ["readout_in"]},
        "output_prob": {
            "class": "softmax", "from": ["readout"], "dropout": 0.3,
            "loss_only_on_non_search": True,
            "target": "classes", "loss": "ce", "loss_opts": {"label_smoothing": 0.1}}
    }, "target": "classes", "max_seq_len": search_max_seq_len, "loss_scale": 0.01},

    "min_bleu": {
        "class": "copy", "from": ["output"], "only_on_search": True,
        "loss": "expected_loss", "target": "classes",
        "loss_opts": {"loss": {"class": "bleu"}, "loss_kind": "error"}
    },
    "decision": {
        "class": "decide", "from": ["output"], "loss": "edit_distance", "target": "classes",
        "only_on_eval": True, "only_on_search": True
    },
    "decision_bleu": {
        "class": "decide", "from": ["output"], "loss": "bleu", "target": "classes",
        "only_on_eval": True, "only_on_search": True
    }
}
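# Loss setup (note): the per-step cross-entropy on "output_prob" is scaled down
# to 0.01, while "min_bleu" adds an expected-loss criterion (expected BLEU over
# the beam, treated as an error measure) on the search output, i.e. a form of
# minimum-risk / expected-BLEU sequence training. "decision" / "decision_bleu"
# only report edit distance and BLEU of the final beam decision on eval.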
search_train_network_layers = ["output", "decision", "decision_bleu", "min_bleu"]
search_output_layer = "decision"
debug_print_layer_output_template = True
# trainer
batching = "random"
log_batch_size = True
# These might be overwritten during pretrain.
batch_size = 1400
max_seqs = 100
max_seq_length = 60
#chunking = "" # no chunking
truncation = -1
num_epochs = 200
model = "net-model/network"
cleanup_old_models = True
def custom_construction_algo(idx, net_dict):
    # For debugging, use: python3 ./crnn/Pretrain.py config...
    orig_num_lstm_layers = 0
    while "lstm%i_fw" % orig_num_lstm_layers in net_dict:
        orig_num_lstm_layers += 1
    num_lstm_layers = idx + 1  # idx starts at 0, so we start with 1 layer
    # The seq-training (min_bleu) happens only at the very end, thus remove it here.
    net_dict["output"]["loss_scale"] = 1.0
    del net_dict["min_bleu"]
    if task != "search":
        del net_dict["decision"]
        del net_dict["decision_bleu"]
    if task == "train":
        # We can (and should) use a larger batch size initially.
        net_dict["#config"] = {
            "batch_size": 4000,
            "max_seqs": 100,
            "max_seq_length": 75,
            "search_train_network_layers": [],
            "learning_rate": 0.001}
    if num_lstm_layers == orig_num_lstm_layers:
        if task == "train":
            # Do normal learning rate scheduling.
            del net_dict["#config"]["learning_rate"]
        # Do normal training for a while.
        net_dict["#repetition"] = 100
        return net_dict
    if num_lstm_layers == orig_num_lstm_layers + 1:
        # Now the original network (incl. min_bleu seq-training), but reset the learning rate.
        net_dict = network.copy()
        net_dict["#config"] = {"learning_rate": 0.0005}
        net_dict["#repetition"] = 1
        return net_dict
    if num_lstm_layers >= orig_num_lstm_layers:
        # I.e. num_lstm_layers >= orig_num_lstm_layers + 2: stop the pretrain construction.
        return None
    # Growing phase (num_lstm_layers < orig_num_lstm_layers):
    # Disable label smoothing initially.
    net_dict["output"]["unit"]["output_prob"]["loss_opts"]["label_smoothing"] = 0
    # Leave the last layer as-is, but only modify its source.
    net_dict["encoder"]["from"] = ["lstm%i_fw" % (num_lstm_layers - 1), "lstm%i_bw" % (num_lstm_layers - 1)]
    # Delete unused LSTM layers. Not strictly necessary, but cleaner.
    for i in range(num_lstm_layers, orig_num_lstm_layers):
        del net_dict["lstm%i_fw" % i]
        del net_dict["lstm%i_bw" % i]
    return net_dict
pretrain = {"repetitions": 5, "construction_algo": custom_construction_algo}
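# Pretrain schedule: each constructed network is trained for 5 epochs by
# default ("repetitions"); the "#repetition" overrides above stretch the
# full-depth CE phase to 100 epochs and limit the final seq-training network
# to 1 epoch before regular (non-pretrain) training continues.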
gradient_clip = 0
#gradient_clip_global_norm = 1.0
adam = True
optimizer_epsilon = 1e-8
#debug_add_check_numerics_ops = True
debug_add_check_numerics_on_output = True
tf_log_memory_usage = True
gradient_noise = 0.0
learning_rate = 0.001
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_error_measure = ["dev_score_extra_search:min_bleu", "dev_score"]
learning_rate_control_relative_error_relative_lr = True
learning_rate_control_min_num_epochs_per_new_lr = 3
use_learning_rate_control_always = True
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1
newbob_learning_rate_decay = 0.9
learning_rate_file = "newbob.data"
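# Newbob scheduling: decay the learning rate by factor 0.9 whenever the control
# measure (expected BLEU from search on dev, i.e. "min_bleu", with "dev_score"
# as fallback) stops improving relatively, evaluated over windows of 6
# sub-epochs and with at least 3 epochs between consecutive reductions.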
# log
#log = "| /u/zeyer/dotfiles/system-tools/bin/mt-cat.py >> log/crnn.seq-train.%s.log" % task
log = "log/crnn.%s.log" % task
log_verbosity = 5
#blocks_mt_model = "/u/peter/experiments/wmt/2017/2017-08-14_de-en/work/blocks/Train.JmZDphtmhGAU/output/model.0300000"
#blocks_mt_model = "%s/blocks-params.npz" % my_dir
#blocks_debug_dump_output = "/Users/az/Programmierung/blocks-mt-model-import/dump-seq0"