Commit f5f36fa, 1 parent 1259529
Showing 2 changed files with 620 additions and 0 deletions.
314 changes: 314 additions & 0 deletions
2019-asr-resnet-lace-cnn/LACE/DataAug/lace_specaug_logmel+d+dd.config
@@ -0,0 +1,314 @@
#!crnn/rnn.py
# kate: syntax python;
# see also file:///u/zeyer/setups/quaero-en/training/quaero-train11/50h/ann/2015-07-29--lstm-gt50/config-train/dropout01.3l.n500.custom_lstm.adam.lr1e_3.config

import os
import numpy
from subprocess import check_output

# task
use_tensorflow = True
task = "train"
device = "gpu"
multiprocessing = True
update_on_device = True

_cf_cache = {}

def cf(filename):
    """Cache manager: resolve `filename` via the cluster cache tool `cf`, memoized."""
    if filename in _cf_cache:
        return _cf_cache[filename]
    if check_output(["hostname"]).strip().decode("utf8") in ["sulfid", "zink", "cobalt", "niob"]:
        print("use local file: %s" % filename)
        return filename  # for debugging
    cached_fn = check_output(["cf", filename]).strip().decode("utf8")
    assert os.path.exists(cached_fn)
    _cf_cache[filename] = cached_fn
    return cached_fn

# data
context_window = 1

window = 1
feature_dim = 64  # LogMel 64-dim
channel_num = 3  # LogMel + delta + double-delta ("+d+dd" in the file name)
num_inputs = feature_dim * channel_num * window
num_outputs = 9001  # CART labels
EpochSplit = 6

def get_sprint_dataset(data):
    assert data in ["train", "cv"]
    epochSplit = {"train": EpochSplit, "cv": 1}

    # see /u/tuske/work/ASR/switchboard/corpus/readme
    # and Zoltan's mail https://mail.google.com/mail/u/0/#inbox/152891802cbb2b40
    files = {}
    files["config"] = "config/training.config"
    files["corpus"] = "/u/corpora/speech/switchboard-1/xml/swb1-all/swb1-all.corpus.gz"
    files["segments"] = "dependencies/seg_%s" % {"train": "train", "cv": "cv_head3000"}[data]
    files["features"] = "/u/bozheniuk/setups/switchboard/feature_extraction/cluster_setup/logmel64_30/data/logmel.train.bundle"
    files["lexicon"] = "/u/tuske/work/ASR/switchboard/corpus/train.lex.v1_0_3.ci.gz"
    files["alignment"] = "dependencies/tuske__2016_01_28__align.combined.train"
    files["cart"] = "/u/tuske/work/ASR/switchboard/initalign/data/%s" % {9001: "cart-9000"}[num_outputs]
    for k, v in sorted(files.items()):
        assert os.path.exists(v), "%s %r does not exist" % (k, v)
    estimated_num_seqs = {"train": 227047, "cv": 3000}  # wc -l segment-file

    # features: /u/tuske/work/ASR/switchboard/feature.extraction/gt40_40/data/gt.train.*
    # (the lambdas delay the cf() cache lookups until the dataset is actually created)
    args = [
        "--config=" + files["config"],
        lambda: "--*.corpus.file=" + cf(files["corpus"]),
        lambda: "--*.corpus.segments.file=" + cf(files["segments"]),
        {"train": "--*.corpus.segment-order-shuffle=true", "cv": "--*.corpus.segment-order-sort-by-time-length=true"}[data],
        "--*.state-tying.type=cart",
        lambda: "--*.state-tying.file=" + cf(files["cart"]),
        "--*.trainer-output-dimension=%i" % num_outputs,
        lambda: "--*.lexicon.file=" + cf(files["lexicon"]),
        lambda: "--*.alignment-cache-path=" + cf(files["alignment"]),
        lambda: "--*.feature-cache-path=" + cf(files["features"]),
        #"--*.mean-normalization.file=dependencies/setup-base/step254-hybrid-mlp-ibm-cmllr/mlp.4/train_mlp/normalize_layer1/mean",
        #"--*.variance-normalization.file=dependencies/setup-base/step254-hybrid-mlp-ibm-cmllr/mlp.4/train_mlp/normalize_layer1/std",
        "--*.log-channel.file=log/crnn.sprint.train-dataset.xml",
        "--*.window-size=1",
    ]
    return {
        "class": "ExternSprintDataset", "sprintTrainerExecPath": "sprint-executables/nn-trainer",
        "sprintConfigStr": args,
        "partitionEpoch": epochSplit[data],
        "estimated_num_seqs": estimated_num_seqs[data] // (epochSplit[data] or 1),
    }

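# ExternSprintDataset spawns the RASR/Sprint nn-trainer executable with the
# args above and streams features and CART-state alignments from it.
# partitionEpoch = 6 splits one pass over the training corpus into 6 RETURNN
# epochs, so all epoch-based settings below count in sixths of the corpus.
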
cache_size = "0"

# network
# (also defined by num_inputs & num_outputs)
dropout = 0.05
ldropout = 0.05
L2 = 0.1

# bn params
masked_time = True
fused = True
axes = ["f"]

bn_momentum = 0.997
bn_epsilon = 1e-5

cur_feat_dim = feature_dim
network = {}
_last = "data"

def add_sequential_layer(name, d, from_=None):
    """Add layer dict `d` to the global network, wired to the previous layer unless `from_` is given."""
    global _last, network
    assert "from" not in d
    if from_ is not None:
        d["from"] = from_
    else:
        d["from"] = [_last]
    assert name not in network
    network[name] = d
    _last = name

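# For illustration (hypothetical layer, not part of this network):
#   add_sequential_layer("ff1", {"class": "linear", "activation": "relu", "n_out": 512})
# would add network["ff1"] with "from": ["data"] and make "ff1" the default
# predecessor of the next add_sequential_layer() call.
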
# data augmentation
def summary(name, x):
    """
    Dump image/scalar/histogram summaries of a feature tensor to TensorBoard.

    :param str name:
    :param tf.Tensor x: (batch,time,feature)
    """
    import tensorflow as tf
    # tf.summary.image wants [batch_size, height, width, channels],
    # we have (batch, time, feature).
    img = tf.expand_dims(x, axis=3)  # (batch,time,feature,1)
    img = tf.transpose(img, [0, 2, 1, 3])  # (batch,feature,time,1)
    tf.summary.image(name, img, max_outputs=10)
    tf.summary.scalar("%s_max_abs" % name, tf.reduce_max(tf.abs(x)))
    mean = tf.reduce_mean(x)
    tf.summary.scalar("%s_mean" % name, mean)
    stddev = tf.sqrt(tf.reduce_mean(tf.square(x - mean)))
    tf.summary.scalar("%s_stddev" % name, stddev)
    tf.summary.histogram("%s_hist" % name, tf.reduce_max(tf.abs(x), axis=2))


def _mask(x, axis, pos, max_amount):
    """
    Zero out a random-length block along `axis`, starting at `pos`, per batch entry.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param tf.Tensor pos: (batch,)
    :param int max_amount: inclusive
    """
    import tensorflow as tf
    ndim = x.get_shape().ndims
    n_batch = tf.shape(x)[0]
    dim = tf.shape(x)[axis]
    amount = tf.random_uniform(shape=(n_batch,), minval=1, maxval=max_amount + 1, dtype=tf.int32)
    pos2 = tf.minimum(pos + amount, dim)
    idxs = tf.expand_dims(tf.range(0, dim), 0)  # (1,dim)
    pos_bc = tf.expand_dims(pos, 1)  # (batch,1)
    pos2_bc = tf.expand_dims(pos2, 1)  # (batch,1)
    cond = tf.logical_and(tf.greater_equal(idxs, pos_bc), tf.less(idxs, pos2_bc))  # (batch,dim)
    cond = tf.reshape(cond, [tf.shape(x)[i] if i in (0, axis) else 1 for i in range(ndim)])
    from TFUtil import where_bc
    x = where_bc(cond, 0.0, x)
    return x

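# Worked example for _mask (values assumed for illustration): with dim=6,
# pos=[1, 3] and sampled amount=[2, 1], the masked index ranges are [1,3) and
# [3,4), so cond per batch entry is [F,T,T,F,F,F] and [F,F,F,T,F,F]; where_bc
# broadcasts cond over the remaining axes and writes 0.0 at the True positions.
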
def random_mask(x, axis, min_num, max_num, max_dims):
    """
    Apply between `min_num` and `max_num` random masks of up to `max_dims` each along `axis`.

    :param tf.Tensor x: (batch,time,feature)
    :param int axis:
    :param int|tf.Tensor min_num:
    :param int|tf.Tensor max_num: inclusive
    :param int max_dims: inclusive
    """
    import tensorflow as tf
    n_batch = tf.shape(x)[0]
    num = tf.random_uniform(shape=(n_batch,), minval=min_num, maxval=max_num + 1, dtype=tf.int32)
    # https://github.com/tensorflow/tensorflow/issues/9260
    # https://timvieira.github.io/blog/post/2014/08/01/gumbel-max-trick-and-weighted-reservoir-sampling/
    z = -tf.log(-tf.log(tf.random_uniform((n_batch, tf.shape(x)[axis]), 0, 1)))
    _, indices = tf.nn.top_k(z, tf.reduce_max(num))
    _, x = tf.while_loop(
        cond=lambda i, _: tf.less(i, tf.reduce_max(num)),
        body=lambda i, x: (
            i + 1,
            tf.where(
                tf.less(i, num),
                _mask(x, axis=axis, pos=indices[:, i], max_amount=max_dims),
                x)),
        loop_vars=(0, x))
    return x

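# The two links above explain the trick used in random_mask: sampling i.i.d.
# Gumbel noise z = -log(-log(U)) and taking top_k of z draws mask start
# positions uniformly without replacement, vectorized over the batch. The
# while_loop then applies _mask once per drawn position, and the tf.where
# leaves batch entries whose own `num` masks are already placed unchanged.
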
def random_warp(x, std, scale):
    """
    Random spectro-temporal warping via a dense 2D warp flow.
    (Defined here but not used by transform() below.)

    :param tf.Tensor x: (batch,time,dim)
    :param (float,float) std:
    :param (float,float) scale:
    :rtype: tf.Tensor
    :return: x transformed
    """
    import tensorflow as tf
    from TFUtil import create_random_warp_flow_2d, dense_image_warp
    x = tf.expand_dims(x, axis=-1)
    flow = create_random_warp_flow_2d(tf.shape(x)[:-1], std=std, scale=scale)
    x = dense_image_warp(x, flow=flow)
    x = tf.squeeze(x, axis=-1)
    return x


def transform(x, network):
    """SpecAugment-style masking, applied only in training (see cond_on_train)."""
    import tensorflow as tf

    def get_masked():
        x_masked = x
        x_masked = random_mask(x_masked, axis=1, min_num=1, max_num=3, max_dims=40)  # time masks
        x_masked = random_mask(x_masked, axis=2, min_num=1, max_num=2, max_dims=30)  # frequency masks
        return x_masked

    x = network.cond_on_train(get_masked, lambda: x)
    return x

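# Net effect per training sequence: 1-3 time masks of up to 40 frames (axis=1)
# and 1-2 frequency masks of up to 30 of the 64 mel bins (axis=2, the mel axis
# after the split/swap below); at evaluation time cond_on_train passes the
# input through unchanged. random_warp is available but not enabled here.
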
def jump_net(prefix, inputs, filters, dilation_rate, data_format, conv_time_dim=False):
    NCHW = (data_format == 'channels_first')
    filter_size = (3, 3)
    strides = (1, 1)
    padding = "SAME"
    add_sequential_layer("%s_c1" % prefix, {
        "class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
        "strides": strides, "dilation_rate": (dilation_rate, 1), "padding": padding, "activation": None,
        "with_bias": False, "dropout": dropout, "forward_weights_init": "xavier", "L2": L2})
    add_sequential_layer("%s_bn1" % prefix, {
        "class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused,
        "momentum": bn_momentum, "epsilon": bn_epsilon})
    add_sequential_layer("%s_y1" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False})
    add_sequential_layer("%s_c2" % prefix, {
        "class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
        "strides": strides, "dilation_rate": (dilation_rate, 1), "padding": padding, "activation": None,
        "with_bias": False, "dropout": dropout, "forward_weights_init": "xavier", "L2": L2})

    # residual connection: add the block input to the second conv's output
    add_sequential_layer("%s_p" % prefix, {"class": "combine", "kind": "add"}, from_=["%s_c2" % prefix, inputs])
    add_sequential_layer("%s_bn2" % prefix, {
        "class": "batch_norm", "masked_time": masked_time, "axes": axes, "fused": fused,
        "momentum": bn_momentum, "epsilon": bn_epsilon})
    add_sequential_layer("%s_o" % prefix, {"class": "activation", "activation": "relu", "batch_norm": False})
    return "%s_o" % prefix

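# Each jump_net is a residual unit:
#   conv(3x3) -> BN -> ReLU -> conv(3x3) -> add skip input -> BN -> ReLU,
# with dilation applied along the time axis only, i.e. (dilation_rate, 1).
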
def jump_block(prefix, inputs, jump_net_num, filters, data_format, dilation_rate, attention=False, conv_time_dim=False):
    NCHW = (data_format == 'channels_first')
    filter_size = (3, 3)
    strides = (1, 1)
    padding = "SAME"
    add_sequential_layer("%s_c" % prefix, {
        "class": "conv", "n_out": filters, "filter_size": filter_size, "auto_use_channel_first": NCHW,
        "strides": strides, "dilation_rate": dilation_rate, "padding": padding, "activation": None,
        "with_bias": False, "dropout": dropout, "forward_weights_init": "xavier", "L2": L2})
    # stride-2 downsampling: keep every 2nd position along the s:1 axis (instead of a strided conv)
    add_sequential_layer("%s_c_strides" % prefix, {"class": "slice", "axis": "s:1", "slice_step": 2})

    dilation_rate *= 2
    # print("dilation_rate: ", dilation_rate)
    inputs = "%s_c_strides" % prefix
    for i in range(1, jump_net_num + 1):
        inputs = jump_net("%s_net%i" % (prefix, i + 1), inputs, filters, dilation_rate, data_format=data_format, conv_time_dim=conv_time_dim)
    return inputs

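# After its entry conv, each jump_block halves the s:1 axis (the mel axis
# after the split/swap below) with the stride-2 slice and doubles the time
# dilation of the residual units it stacks, so deeper blocks see exponentially
# larger time contexts on progressively smaller feature maps.
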
def lacea(inputs):
    block_sizes = [2, 2, 2, 2]
    filters = 128
    data_format = 'channels_last'
    conv_time_dim = True
    dilation_rate = 1
    attention = False

    for i, num_blocks in enumerate(block_sizes):
        inputs = jump_block("block%i" % (i + 1), inputs, num_blocks, filters, data_format, dilation_rate, attention=attention, conv_time_dim=conv_time_dim)
        filters *= 2
        dilation_rate *= 2
    return

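# LACE stack: 4 jump_blocks with 2 residual units each; filters grow
# 128 -> 256 -> 512 -> 1024 while the dilation rate doubles per block.
# lacea() only mutates the global `network` via add_sequential_layer, so the
# bare return is fine: the last layer added becomes the implicit input of the
# out_pool layer below.
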
add_sequential_layer("split", {"class": "split_dims", "axis": "f", "dims": (channel_num, feature_dim)}) # output: (batch, time, window = 61, feature = 40, channel = 1) | ||
add_sequential_layer("swap_axes", {"class": "swap_axes", "axis1": "s:1", "axis2": "f"}) | ||
add_sequential_layer("data_aug", {"class": "eval", "eval": "self.network.get_config().typed_value('transform')(source(0), network=self.network)"}) | ||
lacea(inputs="data_aug") | ||
add_sequential_layer("out_pool", {"class": "reduce", "mode": "avg", "axes": "s:1", "keep_dims": False}) | ||
add_sequential_layer("output", {"class": "softmax", "loss": "ce", "L2": L2, "n_out": num_outputs, "loss_opts": {"focal_loss_factor": 2.0}, "dropout": ldropout}) | ||
|
||
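# Pipeline summary: the 192-dim input (3 x 64 LogMel) is unpacked to
# (batch, time, mel, channel), SpecAugment-masked during training, run through
# the LACE conv stack, average-pooled over the remaining mel positions, and
# classified into the 9001 CART states with focal-loss-weighted cross-entropy.
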
train = get_sprint_dataset("train")
dev = get_sprint_dataset("cv")

############## debug stuff
debug_print_layer_output_template = True  # useful for debugging
#debug_print_layer_output_sizes = True
#debug_print_layer_output_shape = True  # might be useful for debugging
#debug_shell_in_runner = True
log_batch_size = True
tf_log_memory_usage = True
############## debug stuff

# trainer
batching = "random"
batch_size = 24 * 150  # 24 chunks of 150 frames each (see chunking below)
max_seqs = 500
chunking = "150:150"
truncation = -1
num_epochs = 120
gradient_clip = 0
gradient_noise = 0.1
momentum = 0.99
learning_rate = 5e-6
learning_rate_file = "newbob.data"
learning_rate_control = "newbob_multi_epoch"
learning_rate_control_relative_error_relative_lr = True
newbob_multi_num_epochs = 6
newbob_multi_update_interval = 1
model = "net-model/network"

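# Note: newbob_multi_num_epochs = 6 matches EpochSplit above, so (presumably)
# the newbob control evaluates its relative-error criterion over one full pass
# of the training corpus; num_epochs = 120 sub-epochs is then 20 full passes.
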
cleanup_old_models = True
store_metadata_mod_step = None

# log
log = "log/crnn.train.log"
log_verbosity = 5