In [1]:
import numpy as np
import copy
import sys
from ale_python_interface import ALEInterface
import cv2
import time

In [2]:
# Instantiate the Arcade Learning Environment interface; every later emulator call in this notebook goes through `ale`.
ale = ALEInterface()

In [3]:
# Read the emulator's "max_num_frames_per_episode" integer setting.
# NOTE(review): the value is not used by any cell visible in this notebook.
max_frames_per_episode = ale.getInt("max_num_frames_per_episode")

In [4]:
# Emulator settings, applied before the ROM is loaded in the next cell.
# NOTE(review): this seeds the emulator only; np.random, which samples the actions further down, is not seeded in the visible cells.
ale.setInt("random_seed",123)
# NOTE(review): presumably each act() call repeats the chosen action for 4 frames - confirm against the ALE docs.
ale.setInt("frame_skip",4)

In [5]:
# Name of the game ROM; it is loaded from the relative 'roms/' directory.
rom_name = 'breakout.bin'
rom_path = 'roms/' + rom_name
ale.loadROM(rom_path)

In [6]:
# Action codes from the emulator's minimal action set for the loaded ROM.
# Later cells index this object and read its `.size` when sampling random actions.
legal_actions = ale.getMinimalActionSet()
print(legal_actions)

[0 1 3 4]


In [7]:
# Map each emulator action code to its position in `legal_actions`.
action_map = {
    action_code: position
    for position, action_code in enumerate(legal_actions)
}

In [8]:
# Ask the emulator for its frame size and report it.
# NOTE(review): "widhth" in the label is a typo; it is kept so the message still matches the recorded output.
screen_width, screen_height = ale.getScreenDims()
print("widhth/height: " + "/".join([str(screen_width), str(screen_height)]))

widhth/height: 160/210


In [9]:
# Start OpenCV's window thread and create the "preview" window that the imshow cell further down draws into.
cv2.startWindowThread()
cv2.namedWindow("preview")

In [10]:
# Play 10 episodes with uniformly random actions, printing each episode's total reward.
for episode in range(10):
    total_reward = 0.0
    # Keep stepping the emulator until it reports the game is over.
    while not ale.game_over():
        action_index = np.random.randint(legal_actions.size)
        total_reward += ale.act(legal_actions[action_index])
    print("Episode " + str(episode) + " ended with score: " + str(total_reward))
    # Start a fresh game for the next episode.
    ale.reset_game()

Episode 0 ended with score: 0.0
Episode 1 ended with score: 0.0
Episode 2 ended with score: 1.0
Episode 3 ended with score: 0.0
Episode 4 ended with score: 1.0
Episode 5 ended with score: 2.0
Episode 6 ended with score: 0.0
Episode 7 ended with score: 2.0
Episode 8 ended with score: 0.0
Episode 9 ended with score: 0.0


In [11]:
# Capture the current frame: a flat uint8 buffer sized for height*width*3 values is handed
# to the emulator, then viewed as a (height, width, 3) array.
frame_shape = (screen_height, screen_width, 3)
numpy_surface = np.zeros(screen_height * screen_width * 3, dtype=np.uint8)
ale.getScreenRGB(numpy_surface)
image = numpy_surface.reshape(frame_shape)

In [12]:
# Show the captured frame in the "preview" window created earlier.
# NOTE(review): no cv2.waitKey() call follows; the repaint presumably relies on the window thread started earlier - confirm.
cv2.imshow('preview', image)

In [13]:
# NOTE(review): this import sits mid-notebook, while every other import lives in the first cell.
import tensorflow as tf
# Session object; a later cell calls sess.run() on it to initialize the variables.
sess = tf.InteractiveSession()

In [14]:
# Hyper-parameters and settings for the Q-network built in the cells below.
# Only 'num_act', 'discount', 'lr', 'rms_decay' and 'rms_eps' are read by cells
# visible in this notebook; the remaining entries are presumably consumed by
# training code that is not shown here.
params = {
#    'ckpt_file':None,
    'num_episodes': 250000,
    'rms_decay':0.99,
    'rms_eps':1e-6,
    'db_size': 1000000,
    'batch': 32,
    # Number of emulator actions. It sizes both the `actions` placeholder and
    # the network's output layer, so it must match the action set. It used to
    # be hard-coded to 0, which builds a network with zero outputs.
    'num_act': len(legal_actions),
    'input_dims' : [210, 160, 3],
    'input_dims_proc' : [84, 84, 4],
    'episode_max_length': 100000,
    'learning_interval': 1,
    'eps': 1.0,
    'eps_step':1000000,
    'discount': 0.95,
    'lr': 0.0002,
    'save_interval':20000,
    'train_start':100,
    'eval_mode':False
}

In [15]:
# Graph inputs; the leading None dimension leaves the batch size open.
x = tf.placeholder('float', shape=[None, 84, 84, 4])                 # network input
q_t = tf.placeholder('float', shape=[None])                          # value used in the target term
actions = tf.placeholder('float', shape=[None, params['num_act']])   # per-action mask
rewards = tf.placeholder('float', shape=[None])
terminals = tf.placeholder('float', shape=[None])

In [17]:
# conv1: 8x8 kernels, stride 4, 4 input channels -> 16 filters, followed by ReLU
size, channels, filters, stride = 8, 4, 16, 4

kernel_shape = [size, size, channels, filters]
w1 = tf.Variable(tf.random_normal(kernel_shape, stddev=0.01))
b1 = tf.Variable(tf.constant(0.1, shape=[filters]))
c1 = tf.nn.conv2d(x, w1, strides=[1, stride, stride, 1], padding='SAME')
o1 = tf.nn.relu(c1 + b1)

In [21]:
# conv2: 4x4 kernels, stride 2, 16 input channels -> 32 filters, followed by ReLU
size, channels, filters, stride = 4, 16, 32, 2

kernel_shape = [size, size, channels, filters]
w2 = tf.Variable(tf.random_normal(kernel_shape, stddev=0.01))
b2 = tf.Variable(tf.constant(0.1, shape=[filters]))
c2 = tf.nn.conv2d(o1, w2, strides=[1, stride, stride, 1], padding='SAME')
o2 = tf.nn.relu(c2 + b2)

In [23]:
# flat: record conv2's static output shape as a Python list; the next cell multiplies entries 1..3 to size the flattened vector.
o2_shape = o2.get_shape().as_list() 

In [25]:
# fc3: flatten conv2's output and feed it through a 256-unit fully connected ReLU layer
hiddens = 256

# Size of one flattened example = product of the three non-batch dimensions.
conv_rows, conv_cols, conv_maps = o2_shape[1:]
dim = conv_rows * conv_cols * conv_maps
o2_flat = tf.reshape(o2, [-1, dim])
w3 = tf.Variable(tf.random_normal([dim, hiddens], stddev=0.01))
b3 = tf.Variable(tf.constant(0.1, shape=[hiddens]))
ip3 = tf.matmul(o2_flat, w3) + b3
o3 = tf.nn.relu(ip3)

In [26]:
# fc4: linear output layer (no activation) with params['num_act'] units on top of the 256 fc3 units
dim, hiddens = 256, params['num_act']

w4 = tf.Variable(tf.random_normal([dim, hiddens], stddev=0.01))
b4 = tf.Variable(tf.constant(0.1, shape=[hiddens]))
y = tf.matmul(o3, w4) + b4

In [27]:
# Q-learning target, predicted Q-value and squared-error cost.
#
# The arithmetic uses Python operators instead of tf.mul / tf.sub: those two
# functions were removed in TensorFlow 1.0, whereas the operator overloads
# build the same element-wise ops on every release.

discount = tf.constant(params['discount'])
# Target: rewards + discount * q_t, with the discounted term zeroed where terminals == 1.
yj = rewards + (1.0 - terminals) * (discount * q_t)
# Weight the network outputs by `actions` and sum over the action axis
# (assumes `actions` is a one-hot mask of the action taken - TODO confirm with the feeding code).
Q_pred = tf.reduce_sum(y * actions, reduction_indices=1)
# Sum of squared differences between target and prediction over the batch.
cost = tf.reduce_sum(tf.pow(yj - Q_pred, 2))

In [32]:
# Checkpoint resume is currently disabled; the commented-out code is kept for reference.
#if params['ckpt_file'] is not None:
#    global_step = tf.Variable(int(params['ckpt_file'].split('_')[-1]), trainable=False)
#else:
#    global_step = tf.Variable(0, trainable=False)

# Non-trainable step counter handed to the optimizer's minimize() call.
global_step = tf.Variable(0, trainable=False)
optimizer = tf.train.RMSPropOptimizer(
    learning_rate=params['lr'],
    decay=params['rms_decay'],
    momentum=0.0,
    epsilon=params['rms_eps']
)
# `rmsprop` is the training op returned by minimize().
rmsprop = optimizer.minimize(cost, global_step=global_step)
sess.run(tf.initialize_all_variables())

#if params['ckpt_file'] is not None:
#    print 'loading checkpoint...'
#    saver = tf.train.Saver()
#    saver.restore(sess, params['ckpt_file'])