In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import tensorflow as tf
import tensorflow.contrib.slim as slim

In [4]:
def data_feed(filenames, batch_size):
    """Build the queue-based input pipeline that yields padded sentence-pair batches.

    Every input line holds two tab-separated fields. Each field is a
    space-separated list of integer token ids; the first field becomes `eng`
    and the second becomes `jp`.

    Sentence pairs are grouped into length buckets before batching so that
    pairs of similar length are padded together.

    Args:
        filenames: list of paths of the tab-separated text files to read.
        batch_size: number of sentence pairs per emitted batch.

    Returns:
        A pair `(eng_batch, jp_batch)` of int32 tensors; within a batch the
        sequences are dynamically padded to a common length.
    """
    # Inclusive (max_eng_len, max_jp_len) limit of each bounded bucket, in
    # increasing order. Pairs exceeding every limit go to one overflow bucket.
    bucket_limits = [(51, 52), (101, 102)]
    # Keep the queue count in sync with the ids py_which_bucket can return
    # (the original hard-coded 5 buckets although only ids 0..2 are produced).
    num_buckets = len(bucket_limits) + 1

    file_q = tf.train.string_input_producer(filenames)
    reader = tf.TextLineReader()
    _, line = reader.read(file_q)

    eng, jp = tf.decode_csv(line, record_defaults=[[""], [""]], field_delim="\t")
    eng = tf.string_split([eng], delimiter=" ").values
    eng = tf.string_to_number(eng, tf.int32)
    jp = tf.string_split([jp], delimiter=" ").values
    jp = tf.string_to_number(jp, tf.int32)
    sequences = tf.tuple([eng, jp])

    def py_which_bucket(eng_ids, jp_ids):
        """Return the index of the first bucket whose limits fit both sequences."""
        for bucket_id, (max_eng, max_jp) in enumerate(bucket_limits):
            if len(eng_ids) <= max_eng and len(jp_ids) <= max_jp:
                return np.int32(bucket_id)
        return np.int32(len(bucket_limits))

    which_bucket = tf.py_func(py_which_bucket, sequences, tf.int32)
    capacity = 10000 + 8 * batch_size
    _, (eng_batch, jp_batch) = tf.contrib.training.bucket(
        [eng, jp],
        which_bucket,
        batch_size=batch_size,
        num_buckets=num_buckets,
        num_threads=8,
        dynamic_pad=True,
        capacity=capacity)
    return eng_batch, jp_batch

In [5]:
# Model hyperparameters read as globals by the graph-building functions below.
emb_size = 512  # width of the token and position embedding tables
n_hidden = 512  # hidden width; conv layers are asked for n_hidden * 2 channels, halved by the GLU
en_vocab_size = 131 + 4  # rows of the English embedding table; the "+ 4" is presumably special tokens -- TODO confirm
jp_vocab_size = 3202 + 4  # rows of the Japanese embedding table and width of the output logits; same "+ 4" caveat
n_layers = 4  # number of stacked conv layers (assigned again, to the same value, in the graph-building cell)
n_attn_heads = 8  # attention heads per decoder layer; each head projects to n_hidden // n_attn_heads
dropout_keep_prob = 0.75  # keep probability handed to tf.nn.dropout

In [6]:
def init_embedding(X, Y, reuse=False):
    """Embed source and target token ids and add learned position embeddings.

    All four tables live in the "embedding" variable scope and are pinned to
    the CPU. The position tables have a fixed number of rows (401 for the
    source, 402 for the target); only the first `tf.shape(.)[1]` rows are
    added, so axis 1 of `X` / `Y` must not exceed those sizes.

    Args:
        X: integer tensor of source token ids; axis 1 is the position axis.
        Y: integer tensor of target token ids; axis 1 is the position axis.
        reuse: passed to `tf.variable_scope` so later calls share the tables.

    Returns:
        A pair `(X_emb, Y_emb)` of float32 embeddings for `X` and `Y`.
    """
    def _table(name, n_rows):
        # Every table shares the width, dtype and initializer.
        return tf.get_variable(
            name,
            [n_rows, emb_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))

    with tf.variable_scope("embedding", reuse=reuse), tf.device("/cpu:0"):
        src_tokens = _table("en_emb", en_vocab_size)
        tgt_tokens = _table("jp_emb", jp_vocab_size)
        src_positions = _table("P_in", 401)
        tgt_positions = _table("P_out", 402)

        X_tok = tf.nn.embedding_lookup(src_tokens, X)
        X_pos = src_positions[:tf.shape(X)[1], :]
        Y_tok = tf.nn.embedding_lookup(tgt_tokens, Y)
        Y_pos = tgt_positions[:tf.shape(Y)[1], :]
        return X_tok + X_pos, Y_tok + Y_pos

In [7]:
def encoder_block(inp, n_hidden, filter_size):
    """Convolve a rank-3 input along axis 1 with symmetric zero padding.

    The input gets a singleton axis inserted at position 2 so it can go
    through a 2-D NHWC convolution, and is padded by `(filter_size[0] - 1) // 2`
    on both sides of axis 1 before a VALID convolution. For odd kernel widths
    this keeps the length of axis 1 unchanged.

    Args:
        inp: rank-3 tensor (presumably [batch, time, channels] -- confirm).
        n_hidden: number of output channels of the convolution.
        filter_size: 2-tuple kernel size; element 0 is the width along axis 1.

    Returns:
        The convolution output with the singleton axis removed; no activation
        is applied.
    """
    half_width = (filter_size[0] - 1) // 2
    paddings = [[0, 0], [half_width, half_width], [0, 0], [0, 0]]
    padded = tf.pad(tf.expand_dims(inp, 2), paddings)
    conv_out = slim.convolution(
        padded,
        n_hidden,
        filter_size,
        data_format="NHWC",
        padding="VALID",
        activation_fn=None)
    return tf.squeeze(conv_out, 2)

def decoder_block(inp, n_hidden, filter_size):
    """Convolve a rank-3 input along axis 1 with left-only zero padding.

    All `filter_size[0] - 1` padding rows are placed before the first position
    of axis 1, so with the VALID convolution each output position only covers
    the current and earlier input positions, and the length of axis 1 is
    unchanged.

    Args:
        inp: rank-3 tensor (presumably [batch, time, channels] -- confirm).
        n_hidden: number of output channels of the convolution.
        filter_size: 2-tuple kernel size; element 0 is the width along axis 1.

    Returns:
        The convolution output with the singleton axis removed; no activation
        is applied.
    """
    left_pad = filter_size[0] - 1
    paddings = [[0, 0], [left_pad, 0], [0, 0], [0, 0]]
    padded = tf.pad(tf.expand_dims(inp, 2), paddings)
    conv_out = slim.convolution(
        padded,
        n_hidden,
        filter_size,
        data_format="NHWC",
        padding="VALID",
        activation_fn=None)
    return tf.squeeze(conv_out, 2)

In [8]:
def glu(x):
    """Gated linear unit over the last axis of a rank-3 tensor.

    The last axis is cut into two equal halves; the first half is multiplied
    element-wise by the sigmoid of the second half.

    `tf.split` is used instead of slicing with `tf.shape(x)[2] // 2`: the
    result is numerically the same, but a statically known channel count is
    propagated to the output, which slicing with a dynamic bound may not do.

    Args:
        x: rank-3 tensor whose last axis has an even size.

    Returns:
        A tensor shaped like `x` except that the last axis is half as large.
    """
    values, gates = tf.split(x, 2, axis=2)
    return tf.multiply(values, tf.sigmoid(gates))

In [9]:
def layer(inp, conv_block, kernel_width, n_hidden, residual=None):
    """Apply one convolution block followed by a GLU and an optional residual.

    Args:
        inp: input tensor handed to `conv_block`.
        conv_block: callable `(inp, n_hidden, filter_size)` returning the
            convolution output, e.g. `encoder_block` or `decoder_block`.
        kernel_width: kernel width; the block is called with the filter size
            `(kernel_width, 1)`.
        n_hidden: number of channels requested from `conv_block` (the GLU
            halves it).
        residual: optional tensor added to the gated output; when omitted the
            constant 0 is added instead.

    Returns:
        The gated convolution output plus the residual term.
    """
    gated = glu(conv_block(inp, n_hidden, (kernel_width, 1)))
    if residual is None:
        return gated + 0
    return gated + residual

In [10]:
def encoder(inp, n_layers, device):
    """Run the stack of residual gated-convolution layers over the source embeddings.

    Dropout is applied to the input and after every layer. Each layer uses
    `encoder_block` with kernel width 3 and adds its own input as residual.

    Args:
        inp: embedded source sequence.
        n_layers: number of stacked layers (must be at least 1).
        device: GPU index; variables are reused when `device > 0`.

    Returns:
        A pair `(z, z + e)` where `z` is the output of the last layer and `e`
        is the dropped-out input embedding.
    """
    with tf.variable_scope("encoder", reuse=device > 0), tf.device("/gpu:%d" % device):
        embedded = tf.nn.dropout(inp, dropout_keep_prob)
        current = embedded
        for _ in range(n_layers):
            gated = layer(current, encoder_block, 3, n_hidden * 2, current)
            # The dropped-out output feeds the next layer and is the stack's result.
            current = stack_out = tf.nn.dropout(gated, dropout_keep_prob)
        return stack_out, stack_out + embedded

In [11]:
def decoder(inp, zu, ze, n_layers, device):
    """Run the decoder stack: gated convolution plus multi-head attention per layer.

    Each layer applies `decoder_block` (kernel width 3) with a GLU, then, for
    every attention head, projects the layer state, the target embedding,
    `zu` and `ze` down to `n_hidden // n_attn_heads` dims, scores the query
    against the projected `zu`, soft-maxes the scores and takes the weighted
    sum of the projected `ze`. The heads are concatenated, added to the layer
    state and passed through a final linear layer and dropout.

    Args:
        inp: embedded target sequence.
        zu: encoder output that is projected into the attention keys.
        ze: encoder output that is projected into the attention values.
        n_layers: number of stacked layers (must be at least 1).
        device: GPU index; variables are reused when `device > 0`.

    Returns:
        The output of the last decoder layer.
    """
    head_dim = n_hidden // n_attn_heads
    with tf.variable_scope("decoder", reuse=device > 0), tf.device("/gpu:%d" % device):
        target_emb = tf.nn.dropout(inp, dropout_keep_prob)
        current = target_emb
        for _layer_idx in range(n_layers):
            # The all-zero residual is numerically a no-op on the gated output.
            conv_state = layer(current, decoder_block, 3, n_hidden * 2,
                               residual=tf.zeros_like(current))
            head_contexts = []
            for _head_idx in range(n_attn_heads):
                # The slim.linear calls are made in a fixed order, since slim
                # auto-numbers the scopes of unnamed layers.
                state_proj = slim.linear(conv_state, head_dim)
                emb_proj = slim.linear(target_emb, head_dim)
                key_proj = slim.linear(zu, head_dim)
                value_proj = slim.linear(ze, head_dim)

                query = slim.linear(state_proj, head_dim) + emb_proj
                scores = tf.matmul(query, tf.transpose(key_proj, [0, 2, 1]))
                weights = tf.nn.softmax(scores)
                head_contexts.append(tf.matmul(weights, value_proj))
            context = tf.concat(head_contexts, 2)
            mixed = slim.linear(conv_state + context, n_hidden)
            # The dropped-out output feeds the next layer and is the stack's result.
            current = stack_out = tf.nn.dropout(mixed, dropout_keep_prob)
        return stack_out

In [12]:
def init_loss(hg, jp_batch, device):
    """Project decoder states to vocabulary logits and build the training loss.

    The decoder output at position t is scored against the target token at
    position t + 1: the last logit position and the first label position are
    dropped. Positions whose label id is 0 are excluded from the loss
    (presumably padding, given the padded batches -- confirm).

    Args:
        hg: decoder output; axis 1 is the target position axis.
        jp_batch: integer tensor of target token ids aligned with `hg`.
        device: GPU index; variables are reused when `device > 0`.

    Returns:
        Scalar tensor: mean softmax cross-entropy over the unmasked positions.
        The value is also registered as the 'softmax_loss' scalar summary.
    """
    with tf.variable_scope("logits", reuse=device > 0), tf.device("/gpu:%d" % device):
        # slim.fully_connected applies ReLU unless told otherwise; logits fed to
        # a softmax cross-entropy must stay unbounded, so the activation is
        # disabled. Variable names are unchanged, so existing checkpoints still
        # load, but weights trained through the ReLU were fit to clamped logits.
        logits = slim.fully_connected(hg, jp_vocab_size, activation_fn=None)
        logits = logits[:, :-1]
        logits = tf.reshape(logits, [-1, jp_vocab_size])
        labels = tf.reshape(jp_batch[:, 1:], [-1])

        loss_mask = labels > 0
        logits = tf.boolean_mask(logits, loss_mask)
        labels = tf.boolean_mask(labels, loss_mask)

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(loss)
        tf.summary.scalar('softmax_loss', loss)
        return loss

In [13]:
def init_optimizer(loss, params, global_step, device):
    """Create the Adam training op for one device.

    The optimizer is built inside the "optimizer_<device>" variable scope on
    the matching GPU. Applying the gradients is made to depend on every op in
    the UPDATE_OPS collection and increments `global_step`.

    Args:
        loss: scalar loss tensor to minimise.
        params: list of variables to compute gradients for.
        global_step: variable incremented each time the gradients are applied.
        device: GPU index used for the scope name and device placement.

    Returns:
        The op that applies the gradients.
    """
    with tf.variable_scope("optimizer_%d" % device), tf.device("/gpu:%d" % device):
        adam = tf.train.AdamOptimizer(learning_rate=1e-4)
        grads_and_vars = adam.compute_gradients(loss, var_list=params)
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(pending_updates):
            return adam.apply_gradients(grads_and_vars, global_step=global_step)

In [14]:
# Build the model graph: one input pipeline + encoder/decoder/loss per device.
import glob
import random

# Start from an empty default graph so re-running this cell does not stack a
# second copy of the model on top of the first.
tf.reset_default_graph()

n_gpus = 1
n_layers = 4
filenames = glob.glob("../data/translation_id.tsv")
# One loss tensor per device, indexed by device number.
losses = []
for device in range(n_gpus):    
    # Each device gets its own input pipeline over an in-place reshuffled file list.
    random.shuffle(filenames)
    en_batch, jp_batch = data_feed(filenames, batch_size=64)
    # Devices after the first reuse the variables created by device 0.
    en_emb, jp_emb = init_embedding(en_batch, jp_batch, reuse=device > 0)
    # zu / ze are the two encoder outputs consumed by the decoder's attention.
    zu, ze = encoder(en_emb, n_layers, device)
    hg = decoder(jp_emb, zu, ze, n_layers, device)
    loss = init_loss(hg, jp_batch, device)
    losses.append(loss)

In [15]:
# Collect the model's trainable variables; global_step is created afterwards
# and is non-trainable, so it is not part of `params`.
params = tf.trainable_variables()
global_step = tf.get_variable("global_step", shape=[], dtype=tf.int64, initializer=tf.constant_initializer(0), trainable=False)
# One training op per device; all of them update the same `params` and
# increment the same `global_step`.
train_ops = []
for device in range(n_gpus):
    train_op = init_optimizer(losses[device], params, global_step, device)
    train_ops.append(train_op)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [16]:
# Intervals, in global steps, read by train_loop.
log_every = 100  # flush the summary writer when step % log_every == 0
checkpoint_every = 1000  # save a checkpoint when step % checkpoint_every == 0

def train_loop(sess, coord, saver, summary, summary_writer, train_op, global_step, device, run_event):
    """Run training steps for one device until asked to stop or the input runs out.

    Every step runs `train_op`, records the merged summary under the fetched
    global step, flushes the writer every `log_every` steps and saves a
    checkpoint every `checkpoint_every` steps. The checkpoint path is built
    from the module-level `checkpoint_dir`, which must be defined before the
    first save.

    Args:
        sess: session used to run the ops and to save checkpoints.
        coord: coordinator; the loop ends once `coord.should_stop()` is true.
        saver: saver used for the periodic checkpoints.
        summary: summary op evaluated on every step.
        summary_writer: writer receiving the evaluated summaries.
        train_op: training op of this device.
        global_step: tensor holding the global step counter.
        device: device index, only used in the completion message.
        run_event: event object; the loop ends once it is cleared.
    """
    def keep_running():
        return not coord.should_stop() and run_event.is_set()

    try:
        while keep_running():
            fetched = sess.run([train_op, global_step, summary])
            step, serialized = fetched[1], fetched[2]
            summary_writer.add_summary(serialized, step)
            if step % log_every == 0:
                summary_writer.flush()
            if step % checkpoint_every == 0:
                saver.save(sess, checkpoint_dir + "/model.ckpt")
    except tf.errors.OutOfRangeError:
        # Raised by the session when the input queues are exhausted.
        print("device %d id done!" % device)

In [None]:
# Training driver: restore or initialise the model, then run one training
# thread per device until the input is exhausted or the cell is interrupted.
import glob
import os
import threading


checkpoint_dir = "../checkpoints/tatoeba_v3"
if not os.path.exists(checkpoint_dir):
    # makedirs rather than mkdir: also creates a missing "../checkpoints" parent.
    os.makedirs(checkpoint_dir)

saver = tf.train.Saver(params + [global_step])
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(checkpoint_dir)
    # Initialise everything first; a restore below overwrites the saved variables.
    sess.run(tf.global_variables_initializer())

    if glob.glob(checkpoint_dir + "/model.ckpt*"):
        saver.restore(sess, checkpoint_dir + "/model.ckpt")
        print("loaded checkpoint")

    else:
        print("start fresh")

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # Bound before the try block so the KeyboardInterrupt handler can always
    # reference them, even if the interrupt arrives before any thread starts.
    run_event = threading.Event()
    run_event.set()
    train_threads = []
    try:
        for device in range(n_gpus):
            t = threading.Thread(target=train_loop, args=[sess,
                                                         coord,
                                                         saver,
                                                         summary,
                                                         summary_writer,
                                                         train_ops[device],
                                                         global_step,
                                                         device,
                                                         run_event])
            t.start()
            train_threads.append(t)
        for t in train_threads:
            t.join()
    except KeyboardInterrupt:
        print("stopped")
        # Clearing the event makes every train_loop exit after its current step.
        run_event.clear()
        for t in train_threads:
            t.join()
        print("stopped all threads")
    finally:
        try:
            summary_writer.flush()
            saver.save(sess, checkpoint_dir + "/model.ckpt")
        finally:
            # Runs even if the final save fails, so the event file is closed
            # and the queue-runner threads are always asked to stop and joined.
            summary_writer.close()
            coord.request_stop()
            coord.join(threads)

start fresh
