In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import random 

import tensorflow as tf
import numpy as np
import math
import matplotlib.pyplot as plt

from tf_rl.models import Layer, LSTMLayer, MultiLSTMLayer, SequenceWrapper
from tf_rl.controller import DiscreteDeepQ

from os import listdir
from os.path import join

np.set_printoptions(formatter={'float_kind': lambda x: '%.3f' % (x,)})

# Data loading

In [4]:
DATAPATH = "/home/sidor/projects/Dali/data/babi/tasks/en/"

In [5]:
MAX_SENTENCE_LEN = 7

def extract_sentences(filename):
    with open(filename) as f:
        res = []
        for line in f:
            if '?' in line: continue
            line = line[:-1].replace('.', ' ')
            line = line.split(' ')[1:]
            line = [w for w in line if w != '' and w != ' ']
            if len(line) == 0 or len(line) > MAX_SENTENCE_LEN: continue
            res.append(line)
        res = [tuple(s) for s in res]
        res = list(set(res))
        return res

def load_data(datapath, num_tasks, tvt_examples):
    tasks = [f for f in listdir(datapath) if f.endswith('_train.txt')]
    tasks.sort(key=lambda x: int(x.split('_')[0][2:]))
    assert 0 <= num_tasks <= len(tasks)
    
    ntrain, nvalidate, ntest = tvt_examples
    train, validate, test = [], [], []
    tasks_processed = 0
    for task in tasks:
        s = extract_sentences(join(datapath, task))
        print(len(s), task)
        if len(s) < sum(tvt_examples): 
            continue
        random.shuffle(s)
        train.extend(s[:ntrain])
        validate.extend(s[ntrain:(ntrain+nvalidate)])
        test.extend(s[(ntrain+nvalidate):(ntrain+nvalidate+ntest)])
        tasks_processed += 1
        if tasks_processed == num_tasks:
            break
    if tasks_processed < num_tasks:
        raise Exception("Not enough tasks with sufficient examples")
    for dataset in [train, validate, test]:
        random.shuffle(dataset)
    return train, validate, test

train, validate, test = load_data(DATAPATH, 5, (120, 30, 30))
len(train), len(validate), len(test)

120 qa1_single-supporting-fact_train.txt
262 qa2_two-supporting-facts_train.txt
312 qa3_three-supporting-facts_train.txt
120 qa4_two-arg-relations_train.txt
367 qa5_three-arg-relations_train.txt
245 qa6_yes-no-questions_train.txt
346 qa7_counting_train.txt


(600, 150, 150)

In [6]:
all_sentences = train + test + validate
EOS = "**EOS**"
compute_dict = lambda sentences: list(set(w for s in sentences for w in s)) + [EOS]

assert len(compute_dict(train)) >= len(compute_dict(validate)) and \
       len(compute_dict(train)) >= len(compute_dict(test)), "Words in validate/test do not occur in train set"
dictionary = compute_dict(all_sentences)
SEQ_LEN = max(len(s) for s in all_sentences) + 1

In [7]:
from itertools import combinations, product

In [8]:
def move_element(s, x,y):
    indexes = list(range(len(s)))
    indexes.remove(x)
    new_index = indexes[y]
    element = s[x]
    v = s[:x] + s[(x+1):]
    return v[:new_index] + [element] + v[new_index:]

def pad_sentence(sentence):
    sentence = list(sentence[:])
    while len(sentence) < SEQ_LEN:
        sentence.append(EOS)
    return sentence

class Reshuffling(object):
    def __init__(self, sentence, max_len, dictionary, action_type):
        assert len(sentence) == max_len
        self.max_len = max_len
        if action_type == 'swap':
            self.actions = list(combinations(range(max_len), 2)) + ["STOP"]
        elif action_type == 'move':
            self.actions = list(product(range(max_len), range(max_len - 1))) + ["STOP"]
        else:
            assert False
        self.action_type = action_type
        self.num_actions = len(self.actions)
        
        self.observation_size = max_len
        self.goal    = sentence[:]
        self.state       = list(sentence[:])
        random.shuffle(self.state)
        
        self.steps       = 0
        self.stopped     = False
        self.last_objective = self.objective()
        
    def objective(self):
        return sum([1 if a == b else 0 for a, b in zip(self.goal, self.state)])
        
    def perform_action(self, action_idx):
        self.steps += 1
        a = self.actions[action_idx]
        if a == "STOP":
            self.stopped = True
        elif isinstance(a, tuple) and len(a) == 2:
            x, y = a
            if self.action_type == 'swap':
                self.state[x], self.state[y] = self.state[y], self.state[x]
            elif self.action_type == 'move':
                self.state = move_element(self.state, x, y)
            else:
                assert False
        else:
            assert False
            
    def observe(self):
        if self.stopped:
            return None
        else:
            return np.array([dictionary.index(w) for w in self.state])
        
    def done(self):
        return self.stopped
        
    def success(self):
        return r.stopped and self.objective() == self.max_len
    
    def collect_reward(self):
        if self.stopped:
            return 1 if self.objective() == len(self.goal) else 0
        else:
            obj = self.objective()
            reward = obj - self.last_objective 
            self.last_objective = obj
            return reward

In [9]:
r = Reshuffling(pad_sentence(train[0]), SEQ_LEN, dictionary, action_type='move')
print(r.state, r.objective())
r.perform_action(5)
print(r.actions[5])
print(r.state, r.objective())
print(r.goal)
print(r.observe())
print(r.collect_reward())
print(r.collect_reward())
print(r.success())
r.state = r.goal
print(r.success())
print(r.done())
r.perform_action(r.num_actions-1)
print(r.done())

['**EOS**', 'travelled', 'bathroom', '**EOS**', 'to', 'Daniel', 'the'] 1
(0, 5)
['travelled', 'bathroom', '**EOS**', 'to', 'Daniel', 'the', '**EOS**'] 1
['Daniel', 'travelled', 'to', 'the', 'bathroom', '**EOS**', '**EOS**']
[15 29 37  7 20 26 37]
0
0
False
False
False
True


# Model 

In [10]:
class NLPDeepQ(DiscreteDeepQ):
    def create_observation_variable(self, name):
        return tf.placeholder(tf.int32, (SEQ_LEN, None), name=name)

    def prepare_observation(self, observation):
        return observation[:, np.newaxis]

    def batch_samples(self, samples):
        # batch states
        states         = np.empty(self.observation_size + [len(samples)], dtype=np.int32)
        newstates      = np.empty(self.observation_size + [len(samples)], dtype=np.int32)
        action_mask    = np.zeros((len(samples), self.num_actions))

        newstates_mask = np.empty((len(samples),))
        rewards        = np.empty((len(samples),))

        for i, (state, action, reward, newstate) in enumerate(samples):
            states[:, i] = state
            action_mask[i] = 0
            action_mask[i][action] = 1
            rewards[i] = reward
            if newstate is not None:
                newstates[:, i] = newstate
                newstates_mask[i] = 1
            else:
                newstates[:, i] = 0
                newstates_mask[i] = 0

        return states, action_mask, rewards, newstates, newstates_mask

In [11]:
from tf_rl.models import base_name

class NLPLSTM(object):
    def __init__(self, embedding_size, nsymbols, lstm_hiddens, scope="NLPLSTM", initialize=True):
        self.embedding_size, self.nsymbols, self.lstm_hiddens = embedding_size, nsymbols, lstm_hiddens
        self.scope = scope
        self.embedding, self.lstm = None, None
        if initialize:
            with tf.variable_scope(self.scope):
                embedding_i =  tf.random_uniform_initializer(- 1.0 / math.sqrt(embedding_size), 
                                                             1.0 / math.sqrt(embedding_size))
                self.embedding = tf.get_variable('embedding', (nsymbols, embedding_size), initializer=embedding_i)
                self.lstm = MultiLSTMLayer(embedding_size, lstm_hiddens)

    def __call__(self, words):
        embedded = tf.nn.embedding_lookup(self.embedding, words, name="embedded")
        lstm_inputs  = [ embedded[i,:,:] for i in range(embedded.get_shape().as_list()[0])]

        rnn_outputs = []
        rnn_states = []
        batch_size = tf.shape(lstm_inputs[0])[0]
        state = self.lstm.initial_state(batch_size)
        for input_ in lstm_inputs:
            output, state = self.lstm(input_, state)
            rnn_outputs.append(output)
            rnn_states.append(state)
        
        return rnn_outputs, rnn_states
    
    def variables(self):
        return [self.embedding] + self.lstm.variables()
    
    def copy(self, scope=None):
        if scope is None:
            scope = self.scope + "_copy"
        res = NLPLSTM(self.embedding_size, self.nsymbols, self.lstm_hiddens, scope=scope, initialize=False)
        with tf.variable_scope(scope):
            res.embedding = tf.get_variable(base_name(self.embedding), self.embedding.get_shape(),
                        initializer=lambda x, dtype=tf.float32: self.embedding.initialized_value())
            res.lstm     = self.lstm.copy()
        return res

In [13]:
ACTION_TYPE='move'
r = Reshuffling(pad_sentence(train[0]), SEQ_LEN, dictionary, action_type=ACTION_TYPE)

In [12]:
EMBEDDING_SIZE = 50
HIDDEN_SIZES   = [100,100,50]
NSYMBOLS       = len(dictionary)
NACTIONS       = r.num_actions

In [15]:
tf.ops.reset_default_graph()
if 'session' in globals():
    session.close()
session = tf.InteractiveSession()

with tf.device("/cpu:0"):
    lstm = NLPLSTM(EMBEDDING_SIZE, NSYMBOLS, HIDDEN_SIZES, scope="nlplstm")
    decoder = Layer(HIDDEN_SIZES[-1], NACTIONS, scope="decoder")

    brain = SequenceWrapper([
        lstm,
        lambda x: x[0][-1][-1],
        decoder,   
    ])
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0005, beta1=0.5)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.005)

    assert r.observation_size == SEQ_LEN

    current_controller = NLPDeepQ(r.observation_size, r.num_actions, brain, optimizer, session,
                                  discount_rate=0.99, exploration_period=5000, max_experience=20000, 
                                  store_every_nth=1, train_every_nth=2,
                                  target_network_update_rate=0.001,
                                  summary_writer=None)

    session.run(tf.initialize_all_variables())

In [16]:
def run_example(sentence, training=True, debug=False):
    r = Reshuffling(pad_sentence(sentence), SEQ_LEN, dictionary, action_type=ACTION_TYPE)
    state = r.observe()
    while not r.done() and r.steps < 15:
        r_prev_state = r.state[:]
        action = current_controller.action(state, exploration=training)
        r.perform_action(action)
        reward = r.collect_reward()
        if debug: print('%d\t%s\t%.1f\t%s' % (r.steps, r.actions[action], float(reward), r_prev_state))
        newstate = r.observe()
        if training:
            current_controller.store(state, action, reward, newstate)
            current_controller.training_step()
        state = newstate
    return r

def accuracy(dataset, seed=None, repeats=1):
    if seed: random.seed(seed)
    result = 0.0
    avg_objective = 0.0
    for ex in dataset:
        for _ in range(repeats):
            r = run_example(ex, training=False, debug=False)
            result += 1.0 if r.success() else 0.0
            avg_objective += r.objective()
    avg_objective /= len(dataset) * repeats
    result /= len(dataset) * repeats
    if seed: random.seed()
    return result, avg_objective

In [17]:
saver = tf.train.Saver()
best_acc = 0.0

In [None]:
for epoch in range(0, 100):
    cur_acc, cur_obj = accuracy(validate, seed=123123, repeats=3)
    if cur_acc > best_acc:
        best_acc = cur_acc
        saver.save(session, "babi_model.chpt")
    print("Accuracy before epoch %d => accuracy: %.1f %%, avg_objective: %.1f" % (epoch, 100.0 * cur_acc, cur_obj))
    run_example(random.choice(validate), training=False, debug=True)
    random.shuffle(train)
    no_successful = 0
    for i, ex in enumerate(train):
        r = run_example(ex)
        if r.success(): no_successful += 1
        if i % 50 == 0:
            print(" Example %d: %d" % (i, no_successful), flush=True)

Accuracy before epoch 0 => accuracy: 0.0 %, avg_objective: 1.2
1	(2, 2)	0.0	['to', 'went', 'Fred', '**EOS**', '**EOS**', 'bedroom', 'the']
2	(2, 2)	0.0	['to', 'went', '**EOS**', 'Fred', '**EOS**', 'bedroom', 'the']
3	(2, 2)	0.0	['to', 'went', 'Fred', '**EOS**', '**EOS**', 'bedroom', 'the']
4	(2, 2)	0.0	['to', 'went', '**EOS**', 'Fred', '**EOS**', 'bedroom', 'the']
5	(2, 2)	0.0	['to', 'went', 'Fred', '**EOS**', '**EOS**', 'bedroom', 'the']
6	(2, 2)	0.0	['to', 'went', '**EOS**', 'Fred', '**EOS**', 'bedroom', 'the']
7	(2, 2)	0.0	['to', 'went', 'Fred', '**EOS**', '**EOS**', 'bedroom', 'the']
8	(2, 2)	0.0	['to', 'went', '**EOS**', 'Fred', '**EOS**', 'bedroom', 'the']
9	(2, 2)	0.0	['to', 'went', 'Fred', '**EOS**', '**EOS**', 'bedroom', 'the']
10	(2, 2)	0.0	['to', 'went', '**EOS**', 'Fred', '**EOS**', 'bedroom', 'the']
11	(2, 2)	0.0	['to', 'went', 'Fred', '**EOS**', '**EOS**', 'bedroom', 'the']
12	(2, 2)	0.0	['to', 'went', '**EOS**', 'Fred', '**EOS**', 'bedroom', 'the']
13	(2, 2)	0.0	['to', '

In [163]:
print(len(current_controller.experience))
run_example(validate[4], training=False, debug=True)

0
1	(0, 2)	0.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
2	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']
3	(0, 2)	-2.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
4	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']
5	(0, 2)	-2.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
6	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']
7	(0, 2)	-2.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
8	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']
9	(0, 2)	-2.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
10	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']
11	(0, 2)	-2.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
12	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']
13	(0, 2)	-2.0	['discarded', '**EOS**', 'there', 'John', 'milk', 'the']
14	(0, 2)	-2.0	['there', '**EOS**', 'discarded', 'John', 'milk', 'the']


<__main__.Reshuffling at 0x7fbe33b2fbe0>

In [137]:
accuracy(test, seed=4434)

(0.37333333333333335, 3.2666666666666666)

In [140]:
r = run_example(test[0], training=False, debug=True)

0	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
1	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
2	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
3	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
4	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
5	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
6	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
7	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
8	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
9	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
10	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
11	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
12	(1, 4)	['to', 'Daniel', '**EOS**', 'kitchen', 'the', 'travelled']
13	(1, 4)	['to', 'the', '**EOS**', 'kitchen', 'Daniel', 'travelled']
14	(1, 4)	['to', 'Daniel', '**EOS**', 'kitch