In [1]:
import gym
import textworld.gym
import numpy as np
import tensorflow as tf
from collections import deque
import nltk
import string
import time
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
import random

  from ._conv import register_converters as _register_converters


In [2]:
# Compiled TextWorld game this notebook trains on.
# NOTE(review): absolute, machine-specific path -- point it at your own copy
# of the game before running the notebook elsewhere.
GAME_FILE = "/home/sirzechlucifer/tw_games/custom_game.ulx"

# Extra information the environment returns in `infos` next to each observation.
request_infos = textworld.EnvInfos(
    admissible_commands=True,
    entities=True,
    description=True,
    command_templates=True,
)

# The per-episode step limit is an argument of `register_game`; it is not an
# `EnvInfos` field, so it is passed here instead of to `EnvInfos` above.
env_id = textworld.gym.register_game(GAME_FILE, request_infos, max_episode_steps=50)
env = gym.make(env_id)

# `infos` holds the requested fields for the initial state of the game.
obs, infos = env.reset()
env.render()




                    ________  ________  __    __  ________
                   |        \|        \|  \  |  \|        \
                    \$$$$$$$$| $$$$$$$$| $$  | $$ \$$$$$$$$
                      | $$   | $$__     \$$\/  $$   | $$
                      | $$   | $$  \     >$$  $$    | $$
                      | $$   | $$$$$    /  $$$$\    | $$
                      | $$   | $$_____ |  $$ \$$\   | $$
                      | $$   | $$     \| $$  | $$   | $$
                       \$$    \$$$$$$$$ \$$   \$$    \$$
              __       __   ______   _______   __        _______
             |  \  _  |  \ /      \ |       \ |  \      |       \
             | $$ / \ | $$|  $$$$$$\| $$$$$$$\| $$      | $$$$$$$\
             | $$/  $\| $$| $$  | $$| $$__| $$| $$      | $$  | $$
             | $$  $$$\ $$| $$  | $$| $$    $$| $$      | $$  | $$
             | $$ $$\$$\$$| $$  | $$| $$$$$$$\| $$      | $$  | $$
             | $$$$  \$$$$| $$__/ $$| $$  | $$| $$_____ | $$__/ $$
          

In [3]:
# Command templates reported by the environment; slots to fill appear in braces, e.g. {o}.
infos["command_templates"]

['close {c}',
 'close {d}',
 'drop {o}',
 'eat {f}',
 'examine {d}',
 'examine {o}',
 'examine {t}',
 'go east',
 'go north',
 'go south',
 'go west',
 'insert {o} into {c}',
 'inventory',
 'lock {c} with {k}',
 'lock {d} with {k}',
 'look',
 'open {c}',
 'open {d}',
 'put {o} on {s}',
 'take {o}',
 'take {o} from {c}',
 'take {o} from {s}',
 'unlock {c} with {k}',
 'unlock {d} with {k}']

In [4]:
# Entity names reported by the environment for this game.
infos["entities"]

['gateway',
 'door',
 'spherical locker',
 'toolbox',
 'type Z box',
 'spherical keycard',
 'type Z passkey',
 'insect',
 'counter',
 'shelf',
 'latchkey',
 'frisbee',
 'north',
 'south',
 'east',
 'west']

## Working code

**TODO**

- Shape and command updates are still pending.
- Parameter updates are still pending.

In [5]:
class LSTMDQN():
    """LSTM-DQN agent for a TextWorld game exposed through the gym API.

    The network embeds the tokens of the current room description, runs them
    through an LSTM, mean-pools the outputs and scores every command template
    (``pred_reward``) and every entity (``pred_object``) with two bias-free
    linear heads.  ``train`` plays the game epsilon-greedily and fits both
    heads with one-step Q-learning targets sampled from a replay memory.

    The ``game`` object is expected to provide:

    * ``reset()`` returning ``(obs, infos)`` and ``step(command)`` returning
      ``(obs, reward, done, infos)``, where ``infos`` contains the keys
      ``"description"``, ``"command_templates"`` and ``"entities"``;
    * ``render()``;
    * ``observation_space.w2id``, a mapping from word to integer id.
    """

    def __init__(self, game, rnn_size=1250, batch_size=25,
                 seq_length=200, embed_dim=200, layer_depth=3,
                 start_epsilon=1, epsilon_end_time=1000000,
                 memory_size=1000000,
                 checkpoint_dir="checkpoint", forward_only=False):
        """Store the hyper-parameters and build the TensorFlow graph.

        Args:
            game: environment to play (see the class docstring).  It is reset
                once here to read the command templates and the entities.
            rnn_size: number of units of each LSTM cell.
            batch_size: number of transitions per Q-learning minibatch.
            seq_length: number of token ids fed to the network per state.
            embed_dim: size of the word embeddings.
            layer_depth: number of cells put into ``stacked_cell``.
            start_epsilon: initial exploration rate.
            epsilon_end_time: unused; kept for interface compatibility.
            memory_size: default capacity of the replay memory.
            checkpoint_dir: default directory for checkpoints.
            forward_only: unused; kept for interface compatibility.
        """
        self.rnn_size = rnn_size
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.layer_depth = layer_depth

        self.embed_dim = embed_dim

        self.epsilon = self.start_epsilon = start_epsilon
        self.final_epsilon = 0.05
        self.observe = 500
        self.explore = 500
        self.gamma = 0.99
        self.num_action_per_step = 1
        self.memory_size = memory_size
        self.checkpoint_dir = checkpoint_dir
        self.game = game

        # `vectorize` pads with the id len(w2id), so the embedding table needs
        # one more row than the vocabulary.  Fall back to the previously
        # hard-coded size when the environment exposes no vocabulary.
        vocab = getattr(getattr(game, "observation_space", None), "w2id", None)
        self.vocab_size = len(vocab) + 1 if vocab is not None else 1251

        # Read the action space from the game itself rather than from a
        # notebook-level `infos` variable.
        _, first_infos = self.game.reset()
        self.command_templates = list(first_infos["command_templates"])
        self.entities = list(first_infos["entities"])
        self.num_action = len(self.command_templates)
        self.num_object = len(self.entities)

        self._attrs = ['epsilon', 'final_epsilon', 'observe', 'explore',
                       'gamma', 'memory_size', 'batch_size']
        self.build_model()

    def build_model(self):
        """Create placeholders, the LSTM encoder, both Q heads and the loss."""
        # The batch dimension is left open so the same graph serves both
        # single-state action selection and minibatch updates.
        self.inputs = tf.placeholder(tf.int32, [None, self.seq_length])
        embed = tf.get_variable("embed", [self.vocab_size, self.embed_dim])
        word_embeds = tf.nn.embedding_lookup(embed, self.inputs)

        # Every layer of the stack needs its own cell object; repeating one
        # object would make the layers share a single cell.
        self.cell = tf.nn.rnn_cell.LSTMCell(self.rnn_size)
        extra_cells = [tf.nn.rnn_cell.LSTMCell(self.rnn_size)
                       for _ in range(self.layer_depth - 1)]
        self.stacked_cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] + extra_cells)

        # As before, only the single `self.cell` encodes the sequence;
        # `stacked_cell` is built but not used by the network.
        outputs, _ = tf.nn.static_rnn(self.cell,
                                      tf.unstack(word_embeds, axis=1),
                                      dtype=tf.float32)

        output_embed = tf.stack(outputs, axis=1)  # [batch, seq_length, rnn_size]
        mean_pool = tf.nn.relu(tf.reduce_mean(output_embed, 1))

        # Action and object scorers. no bias in paper
        with tf.variable_scope('action'):
            self.pred_reward = tf.layers.dense(mean_pool, self.num_action, use_bias=False)
        with tf.variable_scope('object'):
            self.pred_object = tf.layers.dense(mean_pool, self.num_object, use_bias=False)

        self.true_reward = tf.placeholder(tf.float32, [None, self.num_action])
        self.true_object = tf.placeholder(tf.float32, [None, self.num_object])

        # Squared error on both heads.  The targets fed during training equal
        # the predictions everywhere except at the chosen action / object, so
        # only those entries contribute to the gradient.
        self.loss = (tf.reduce_sum(tf.square(self.true_reward - self.pred_reward)) +
                     tf.reduce_sum(tf.square(self.true_object - self.pred_object)))

        tf.summary.histogram("mean_pool", mean_pool)
        tf.summary.histogram("pred_reward", self.pred_reward)
        tf.summary.histogram("true_reward", self.true_reward)
        tf.summary.scalar("pred_reward_mean", tf.reduce_mean(self.pred_reward))
        tf.summary.scalar("true_reward_mean", tf.reduce_mean(self.true_reward))
        tf.summary.scalar("loss", self.loss)
        # Merged summary op for callers that attach their own summary writer;
        # `train` does not evaluate it.
        self.merged_sum = tf.summary.merge_all()

    def vectorize(self, text):
        """Turn ``text`` into a fixed-length vector of word ids.

        Tokens are split on whitespace and lower-cased.  A token that is not
        in the vocabulary is retried with leading/trailing punctuation
        stripped; tokens that are still unknown are skipped.  The result is
        truncated to ``seq_length`` ids and padded with ``len(w2id)``.

        Args:
            text: string to encode.

        Returns:
            ``numpy`` int32 array of shape ``(seq_length,)``.
        """
        w2id = self.game.observation_space.w2id
        null_idx = len(w2id)
        vector = np.full(self.seq_length, null_idx, dtype=np.int32)

        cnt = 0
        for raw in text.split():
            if cnt >= self.seq_length:
                break
            token = raw.lower()
            if token not in w2id:
                token = token.strip(string.punctuation)
            if token and token in w2id:
                vector[cnt] = w2id[token]
                cnt += 1
        return vector

    @staticmethod
    def _build_command(template, entity):
        """Fill every ``{...}`` slot of ``template`` with ``entity``.

        Templates without slots (e.g. ``"go east"``) are returned unchanged.
        The agent picks a single entity per step, so a template with two
        slots receives the same entity in both.
        """
        parts = []
        for literal, field, _spec, _conv in string.Formatter().parse(template):
            parts.append(literal)
            if field is not None:
                parts.append(entity)
        return "".join(parts)

    def save(self, checkpoint_dir, step):
        """Write a checkpoint of all variables to ``checkpoint_dir``.

        Must be called while a ``tf.Session`` is the default session (as is
        the case inside ``train``).

        Raises:
            RuntimeError: if there is no default session.
        """
        import os

        sess = tf.get_default_session()
        if sess is None:
            raise RuntimeError("save() needs an active default tf.Session")
        if getattr(self, "saver", None) is None:
            self.saver = tf.train.Saver()
        os.makedirs(checkpoint_dir, exist_ok=True)
        self.saver.save(sess, os.path.join(checkpoint_dir, "LSTMDQN"), global_step=step)

    def _q_learning_update(self, sess):
        """Run one optimisation step on a random minibatch; return the loss.

        Both heads are regressed towards
        ``r + gamma * (max_a Q(s', a) + max_o Q(s', o)) / 2``; the bootstrap
        term is dropped for transitions that ended an episode.
        """
        batch = random.sample(self.memory, self.batch_size)
        states = np.array([mem[0] for mem in batch])
        actions = np.array([mem[1] for mem in batch])
        objects = np.array([mem[2] for mem in batch])
        rewards = np.array([mem[3] for mem in batch], dtype=np.float32)
        next_states = np.array([mem[4] for mem in batch])
        finished = np.array([mem[5] for mem in batch], dtype=np.float32)

        next_reward, next_object = sess.run(
            [self.pred_reward, self.pred_object],
            feed_dict={self.inputs: next_states})
        best_next_q = (next_reward.max(axis=1) + next_object.max(axis=1)) / 2.0
        targets = rewards + self.gamma * best_next_q * (1.0 - finished)

        # Start from the current predictions so that only the entries of the
        # action / object actually taken produce a non-zero error.
        cur_reward, cur_object = sess.run(
            [self.pred_reward, self.pred_object],
            feed_dict={self.inputs: states})
        true_reward = np.array(cur_reward)
        true_object = np.array(cur_object)
        rows = np.arange(len(batch))
        true_reward[rows, actions] = targets
        true_object[rows, objects] = targets

        _, loss = sess.run([self.optim, self.loss], feed_dict={
            self.inputs: states,
            self.true_reward: true_reward,
            self.true_object: true_object,
        })
        return loss

    def train(self, max_iter=1000000,
              alpha=0.01, learning_rate=0.001,
              start_epsilon=1.0, final_epsilon=0.05, memory_size=5000,
              checkpoint_dir="checkpoint"):
        """Play the game epsilon-greedily and train the network.

        Args:
            max_iter: total number of environment steps.
            alpha: stored on the instance; not used by the update rule.
            learning_rate: Adam learning rate.
            start_epsilon: exploration rate at the start of training.
            final_epsilon: lower bound of the exploration rate.
            memory_size: capacity of the replay memory (oldest entries are
                dropped first).
            checkpoint_dir: directory that receives a checkpoint every 50
                training steps.
        """
        self.max_iter = max_iter
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.checkpoint_dir = checkpoint_dir
        self.epsilon = self.start_epsilon = start_epsilon
        self.final_epsilon = final_epsilon
        self.memory_size = memory_size
        self.step = 0

        # The optimizer adds its own variables, so it has to exist before the
        # initializer op and the saver are created.
        self.optim = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        self.saver = tf.train.Saver()
        init = tf.global_variables_initializer()

        self.memory = deque(maxlen=self.memory_size)
        epsilon_decay = (self.start_epsilon - self.final_epsilon) / float(self.explore)

        with tf.Session() as sess:
            sess.run(init)
            _, infos = self.game.reset()
            state_t = self.vectorize(infos["description"])

            start_time = time.time()
            win_count = 0
            print(" [*] Start")

            for step in range(self.max_iter):
                self.step = step

                # Epsilon greedy (purely random while still observing)
                if random.random() <= self.epsilon or step <= self.observe:
                    action_idx = random.randrange(self.num_action)
                    object_idx = random.randrange(self.num_object)
                else:
                    pred_reward, pred_object = sess.run(
                        [self.pred_reward, self.pred_object],
                        feed_dict={self.inputs: [state_t]})
                    # Ties between equally good choices are broken at random.
                    action_idx = np.random.choice(
                        np.flatnonzero(pred_reward[0] == pred_reward[0].max()))
                    object_idx = np.random.choice(
                        np.flatnonzero(pred_object[0] == pred_object[0].max()))

                if self.epsilon > self.final_epsilon and step > self.observe:
                    self.epsilon = max(self.final_epsilon, self.epsilon - epsilon_decay)

                # run and observe rewards
                print(infos["description"])
                command = self._build_command(self.command_templates[action_idx],
                                              self.entities[object_idx])
                # NOTE(review): the environment may report a cumulative score
                # here rather than a per-step reward -- confirm before relying
                # on `reward_t` as the Q-learning reward.
                _, reward_t, is_finished, infos = self.game.step(command)
                self.game.render()
                state_t1 = self.vectorize(infos["description"])
                self.memory.append((state_t, int(action_idx), int(object_idx),
                                    reward_t, state_t1, is_finished))

                if reward_t > 0:
                    win_count += 1

                # qLearnMinibatch : Q-learning updates
                if step > self.observe and len(self.memory) >= self.batch_size:
                    loss = self._q_learning_update(sess)

                    if step % 50 == 0:
                        self.save(checkpoint_dir, step)

                    if step % 10 == 0:
                        print("Step: [%2d/%7d] time: %4.4f, loss: %.8f, win: %4d" % (step, self.max_iter, time.time() - start_time, loss, win_count))

                if is_finished:
                    # Start a new episode from the initial state.
                    _, infos = self.game.reset()
                    state_t = self.vectorize(infos["description"])
                else:
                    state_t = state_t1

In [6]:
# Build the agent (and its TensorFlow graph) around the registered TextWorld environment.
obj = LSTMDQN(game=env)

