In [None]:
import tensorflow as tf
import numpy as np
import skimage.transform
import matplotlib.pyplot as plt
import imageio
%matplotlib inline

In [None]:
from tetris_env import TetrisEnv

# Create the Tetris environment (max_steps=10000) and reset it to obtain
# the first observation, which is kept in `ob`.
env = TetrisEnv(max_steps=10000)
ob = env.reset()

## Training a TRPO model

In [None]:
import sys
sys.path.append("..")
from rl_agent import RL_Agent
from rl_learner import TRPO_Learner

In [None]:
class Tetris_Agent(RL_Agent):
    """Concrete RL_Agent that supplies the policy network for Tetris.

    RL_Agent is treated as an abstract base: all that is left for this
    subclass is to provide the specific model used to choose an action.
    The base class (and the PG / TRPO learners) still imply that

    1) ``__init__`` defines all of its variables inside the ``model_name``
       variable scope, and
    2) the instance exposes the attributes ``self.session``,
       ``self.prob_layer`` and ``self.log_prob_layer``.

    The remaining functionality needed by the PG and TRPO learners is
    defined in the base class.
    """

    def __init__(self, model_name):
        """Build the policy graph and initialise its variables.

        Args:
            model_name: name passed to ``RL_Agent.__init__`` and used as the
                TensorFlow variable scope for every layer created here.
        """
        RL_Agent.__init__(self, model_name)
        with tf.variable_scope(model_name):
            self.session = tf.Session()
            self.n_actions = 4

            # Input placeholder of shape [batch, 20, 10]; a trailing channel
            # axis is appended because conv2d expects 4-D input.
            self.input_layer = tf.placeholder(shape=[None, 20, 10], dtype=tf.float32)
            self.input_expanded = tf.expand_dims(self.input_layer, axis=-1)

            # Two 3x3 "same" convolutions followed by a stride-2 max pool.
            self.conv_1 = tf.layers.conv2d(self.input_expanded, filters=8, kernel_size=3, strides=1, padding="same", activation=tf.nn.relu)
            self.conv_2 = tf.layers.conv2d(self.conv_1, filters=8, kernel_size=3, strides=1, padding="same", activation=tf.nn.relu)
            self.pool_1 = tf.layers.max_pooling2d(self.conv_2, pool_size=3, strides=2, padding="same")

            # Dense head; dense_2 holds one unnormalised logit per action.
            self.flat = tf.contrib.layers.flatten(self.pool_1)
            self.dense_1 = tf.layers.dense(self.flat, units=10, activation=tf.nn.relu)
            self.dense_2 = tf.layers.dense(self.dense_1, units=self.n_actions)

            self.prob_layer = tf.nn.softmax(self.dense_2)
            # Take log-probabilities straight from the logits: log(softmax(x))
            # yields -inf once a probability underflows to zero, whereas
            # log_softmax stays finite.
            self.log_prob_layer = tf.nn.log_softmax(self.dense_2)

            # Note: this initialises every global variable in the default
            # graph, not only the ones created in this scope.
            self.session.run(tf.global_variables_initializer())

In [None]:
import time

# Clear the default graph first so the agent below is built into a fresh one
# when this cell is re-run.
tf.reset_default_graph()
trpo = TRPO_Learner(rl_agent=Tetris_Agent("2018_02_01_tetris_trpo"), 
                    game_env=env,
                    discount=0.99, 
                    batch_size=1000, 
                    frame_cap=None,
                    trpo_delta=0.005,
                    line_search_option="max")

# Run 100 learner steps and report the elapsed wall-clock time.
start_time = time.time()
for i in range(100):
    trpo.step()
# Parenthesised single-argument print works on both Python 2 and Python 3;
# the bare print statement is a syntax error on Python 3.
print("Used time: {} seconds".format(time.time() - start_time))

## Playing a random game

In [None]:
# Begin the random rollout from a fresh episode. `env` was passed to the TRPO
# learner earlier in the notebook, so the `ob` captured right after creating
# the environment presumably no longer matches its current state.
ob = env.reset()
played_frames = [ob]

In [None]:
# Step the environment with random actions until it returns done=True,
# appending every returned observation to played_frames.
done = False
while not done:
    # Draw one of the action indices 0..3 at random.
    action = np.random.choice(range(4))
    ob, reward, done = env.step(action)
    played_frames.append(ob)

In [None]:
import os
from IPython.display import HTML

gif_location = "simulations/" + "random_agent" + ".gif"
# Make sure the output directory exists before writing; saving into a missing
# directory would otherwise fail. (isdir + makedirs keeps this runnable on
# Python 2, which has no exist_ok argument.)
gif_dir = os.path.dirname(gif_location)
if not os.path.isdir(gif_dir):
    os.makedirs(gif_dir)
imageio.mimsave(gif_location, played_frames)
# Last expression of the cell: renders the saved GIF inline.
HTML('<img src="' + gif_location + '" width="20%">')