In [1]:
"""
Credit for the original code goes to EFavDB.
I only tuned the parameters, refactored parts of the code, and added comments.
"""

import tensorflow as tf
import numpy as np

# Game geometry: a one-dimensional board and the length of the hidden ship.
BOARD_SIZE = 20
SHIP_SIZE = 3

# Both network layers are as wide as the board (one unit per cell).
hidden_units = output_units = BOARD_SIZE

In [2]:
# Policy network: given the current board state, infer a probability for each
# possible action (one action per board cell).

# Board state for one game step, fed as a single 1 x BOARD_SIZE row.
input_positions = tf.placeholder(tf.float32, (1,BOARD_SIZE))
# Gradient-descent step size; a placeholder, so it is supplied via feed_dict
# at run time instead of being fixed when the graph is built.
learning_rate = tf.placeholder(tf.float32)

# Hidden layer: one input per board cell, tanh activation.
w1 = tf.Variable(tf.truncated_normal([BOARD_SIZE, hidden_units]))
b1 = tf.Variable(tf.zeros([hidden_units]))
h1 = tf.tanh(tf.matmul(input_positions, w1) + b1)

# Output layer: one unnormalised score (logit) per action.
w2 = tf.Variable(tf.truncated_normal([hidden_units, output_units]))
b2 = tf.Variable(tf.zeros([output_units]))
logits = tf.matmul(h1, w2) + b2

# Softmax turns the logits into the probability of each action.
probabilities = tf.nn.softmax(logits)

In [3]:
# Training of the policy network is facilitated by the learning_rate
# placeholder: the training loop feeds a value derived from the reward
# function defined below.

# `labels` holds the index of the action taken, i.e. an integer class id in
# range(BOARD_SIZE), not a one-hot vector, so the sparse cross-entropy is used.
labels = tf.placeholder(tf.int64)

# Cross-entropy between the network's logits and the action actually taken.
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels, name='xentropy')
# One plain gradient-descent step on `cost` with the fed learning rate.
train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)


In [4]:

def play_game(sess, training=True):
    """Play a single game of battleship using the current policy network.

    A ship of SHIP_SIZE contiguous cells is hidden at a random position on a
    one-dimensional board of BOARD_SIZE cells. Until the ship is sunk or every
    cell has been bombed, the network is asked for action probabilities, cells
    that were already bombed are masked out, and the next cell is chosen.

    Args:
        sess: tensorflow session used to evaluate `probabilities`
        training: if true, sample the next cell from the masked probabilities
            (exploration); otherwise pick the most probable untried cell

    Returns:
        board_position_log [[[int]]]: board snapshot before every move plus the
            final board. Each snapshot is a 1 x BOARD_SIZE nested list with
            -1 = untried, 0 = miss, 1 = hit,
            e.g. [[[-1,-1, ... -1]], [[-1,1, ... -1]], [[0,1, ... -1]], ...]
        action_log [int]: bombed positions in order, each in range(BOARD_SIZE),
            e.g. [1,0,6, ...]
        hit_log [int]: 1 for a hit, 0 for a miss, per move e.g. [1,0,0, ...]
    """
    # Select random location for ship
    ship_left = np.random.randint(BOARD_SIZE - SHIP_SIZE + 1)
    ship_positions = set(range(ship_left, ship_left + SHIP_SIZE))

    # Initialize logs for game
    board_position_log = []
    action_log = []
    hit_log = []

    # The board is kept as a 1 x BOARD_SIZE nested list so it can be fed to the
    # network directly; snapshots are copies so later moves do not alter them.
    current_board = [[-1 for _ in range(BOARD_SIZE)]]
    board_position_log.append([list(current_board[0])])

    # True for every cell that has not been bombed yet.
    untried = np.ones(BOARD_SIZE, dtype=bool)

    while (sum(hit_log) < SHIP_SIZE) and (len(action_log) < BOARD_SIZE):
        feed_dict = {
            input_positions: current_board
        }
        # Use the current policy network for inference.
        raw_probs = sess.run([probabilities], feed_dict=feed_dict)[0][0]

        # Work in float64: normalising float32 values can leave a sum that is
        # outside the tolerance np.random.choice accepts.
        probs = np.where(untried, np.asarray(raw_probs, dtype=np.float64), 0.0)
        total = probs.sum()
        if np.isfinite(total) and total > 0:
            probs = probs / total
        else:
            # A saturated (or diverged) softmax can leave no probability mass
            # on the untried cells. Dividing by that zero would yield NaNs, so
            # fall back to a uniform choice among the untried cells instead.
            probs = untried / float(untried.sum())

        if training:
            bomb_index = int(np.random.choice(BOARD_SIZE, p=probs))
        else:
            bomb_index = int(np.argmax(probs))

        # update board, logs
        hit = 1 if bomb_index in ship_positions else 0
        hit_log.append(hit)
        current_board[0][bomb_index] = hit
        board_position_log.append([list(current_board[0])])
        action_log.append(bomb_index)
        untried[bomb_index] = False
    return board_position_log, action_log, hit_log



# Reward function

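$r(a;t_0) = \sum_{t \geq t_0} \left(h(t) - \bar h(t)\right) \gamma^{(t-t_0)}$

Here $h(t)$ is 1 if the shot at step $t$ was a hit and 0 otherwise, $\bar h(t)$ is the hit rate expected from a uniformly random shot at step $t$ (remaining ship cells divided by remaining untried cells), and $\gamma$ is the discount factor.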

In [5]:

def calculate_rewards(hit_log, gamma=0.5):
    """Reward for every move of one completed game.

    Each move's outcome (1 for hit, 0 for miss) is first turned into an
    advantage by subtracting the hit rate a uniformly random shot would have
    had at that step: remaining ship cells / remaining untried cells. The
    reward of a move is the discounted sum of the advantages from that move to
    the end of the game:

        reward[t0] = sum over t >= t0 of advantage[t] * gamma ** (t - t0)

    The sum is evaluated with the backward recursion
    reward[t] = advantage[t] + gamma * reward[t + 1]. This never forms
    gamma ** t or gamma ** (-t), so it works for gamma == 0 and for long
    games, and takes linear time.

    Args:
        hit_log [int]: history of hit 1 or not hit 0 in the game e.g. [1,0,0, ...]
        gamma (float<1): diminishing return parameter to suppress postponed hits

    Return:
        rewards [float]: one reward per entry of hit_log, in the same order
    """
    # Advantage of each move over a random shot at the same step.
    advantages = []
    hits_so_far = 0
    for index, hit in enumerate(hit_log):
        expected_hit = float(SHIP_SIZE - hits_so_far) / float(BOARD_SIZE - index)
        advantages.append(hit - expected_hit)
        hits_so_far += hit

    # Discounted sum of future advantages, accumulated from the last move back.
    rewards = [0.0] * len(advantages)
    discounted_future = 0.0
    for index in range(len(advantages) - 1, -1, -1):
        discounted_future = advantages[index] + gamma * discounted_future
        rewards[index] = discounted_future
    return rewards

In [6]:
# Global step size; every update multiplies it by the reward of that move.
ALPHA = 0.06
# Number of moves each game took, in playing order.
game_lengths = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # play 10000 games
    for game_number in range(10000):
        board_position_log, action_log, hit_log = play_game(sess)
        rewards_log = calculate_rewards(hit_log)
        game_lengths.append(len(action_log))

        # Replay the finished game move by move. The board seen before move
        # `step` is board_position_log[step]; the final board has no move.
        for step, bombed_cell in enumerate(action_log):
            feed_dict = {
                input_positions: board_position_log[step],
                labels: [bombed_cell],
                # A negative reward gives a negative learning rate, which
                # pushes the policy away from the action that was taken.
                learning_rate: ALPHA * rewards_log[step],
            }
            sess.run(train_step, feed_dict=feed_dict)

        # Progress report every 1000 games; the cost is evaluated on the
        # last move of the game that was just replayed.
        if game_number % 1000 == 0:
            last_move_cost = sess.run(cost, feed_dict=feed_dict)
            print("cost: {}; game length: {}; action history: {}".format(
                last_move_cost,
                len(action_log),
                action_log
            ))
        
        


cost: [ 3.54401755]; game length: 14; action history: [10, 18, 0, 9, 2, 16, 12, 6, 5, 3, 13, 17, 1, 11]
cost: [ 0.10565664]; game length: 14; action history: [5, 3, 12, 11, 2, 14, 6, 16, 7, 17, 9, 10, 15, 13]
cost: [ 0.08510733]; game length: 16; action history: [16, 5, 3, 9, 15, 8, 6, 14, 2, 17, 10, 12, 11, 13, 1, 0]
cost: [ 0.06959347]; game length: 8; action history: [12, 8, 5, 2, 3, 1, 6, 4]
cost: [ 0.02949222]; game length: 6; action history: [12, 16, 11, 9, 5, 13]
cost: [ 0.00756459]; game length: 7; action history: [12, 8, 5, 17, 2, 18, 16]
cost: [ 0.18464011]; game length: 10; action history: [8, 9, 12, 16, 14, 2, 17, 5, 3, 4]
cost: [ 0.02112601]; game length: 4; action history: [8, 7, 10, 9]
cost: [ 0.00102634]; game length: 7; action history: [3, 16, 8, 9, 7, 5, 6]
cost: [ 0.00338767]; game length: 6; action history: [2, 10, 12, 11, 16, 9]


# Comments

<p>Possible issues:</p>
<ol>
  <li>Training may diverge for a large system (e.g. a bigger board).</li>
  <li>The network may overfit in high dimensions.</li>
</ol>