<a href="https://colab.research.google.com/github/sabrysm/seabattle-ai/blob/master/Sea_Battle_Training_Phase_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tf-agents[reverb]
!pip install dm-reverb[tensorflow]
!pip install tensorflow-probability
!pip install Pillow
!pip install protobuf

Collecting tf-agents[reverb]
  Downloading tf_agents-0.18.0-py3-none-any.whl (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.4 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting gym<=0.23.0,>=0.17.0 (from tf-agents[reverb])
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pygame==2.1.3 (from tf-agents[reverb])
  Downloading pygame-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.7 MB)
[

In [None]:
import os

from typing import Sequence
from absl import app
import reverb
import tensorflow as tf
import tensorflow_probability as tfp
import tf_agents as tfa
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import tf_py_environment, py_environment
from tf_agents.policies import policy_saver
from tf_agents.policies import py_tf_eager_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.specs import tensor_spec, array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import common
import time
import zipfile
import shutil
import numpy as np

# --- Game / training schedule -------------------------------------------------
BOARD_SIZE = 8                      # the board is BOARD_SIZE x BOARD_SIZE cells
ITERATIONS = 250_000                # collect/train iterations per training run
COLLECT_EPISODES_PER_ITERATION = 1  # episodes gathered before each gradient step
DISCOUNT = 0.995                    # discount handed to the environments in train_agent

# --- Replay buffer (Reverb) ---------------------------------------------------
REPLAY_BUFFER_CAPACITY = 2_000
REPLAY_BUFFER_TABLE_NAME = "uniform_table"

# --- Network / optimiser ------------------------------------------------------
FC_LAYER_PARAMS = 64                # base width; the hidden layer uses 3 * FC_LAYER_PARAMS units
LEARNING_RATE = 0.01

# --- Evaluation / persistence cadence (measured in training iterations) -------
NUM_EVAL_EPISODES = 20
EVAL_INTERVAL = 500
CHECKPOINT_INTERVAL = 2_000
DOWNLOAD_INTERVAL = 10_530

# --- Output locations ---------------------------------------------------------
# NOTE(review): the paths say "bs7" although BOARD_SIZE is 8 - confirm whether
# the naming is stale before renaming (existing checkpoints live under these paths).
LOGDIR = "./tf_agents_log_bs7"          # TensorBoard summaries
MODELDIR = "./"
POLICYDIR = "./bs7_downloads"           # exported SavedModel policy
train_dir = './bs7_training'            # checkpoints (agent, policy, replay buffer)
NAMING_CONVENTION = 'bs7_exported_cp_'  # prefix of the zipped checkpoint archives

In [None]:
# Lengths of the ships placed on the board; ship ids are 1-based positions in this array.
SHIPS_SIZE = np.array([2, 2, 3, 3, 4])
# An episode may use at most one strike per board cell.
MAX_STEPS_PER_EPISODE = BOARD_SIZE * BOARD_SIZE

# --- Rewards ------------------------------------------------------------------
# Per-strike rewards
HIT_REWARD = 1
MISS_REWARD = 0
REPEAT_STRIKE_REWARD = -1
REPEAT_MISS_REWARD = -1
# Terminal reward when the game is finished within MAX_STEPS_PER_EPISODE
FINISHED_GAME_REWARD = 10
# Terminal reward when the game is NOT finished within MAX_STEPS_PER_EPISODE
UNFINISHED_GAME_REWARD = -10

# --- Hidden board: 'occupied' means the cell is part of a ship ----------------
HIDDEN_BOARD_CELL_OCCUPIED = 1
HIDDEN_BOARD_CELL_UNOCCUPIED = 0

# --- Visible board: what the agent observes -----------------------------------
VISIBLE_BOARD_CELL_UNTRIED = 0
VISIBLE_BOARD_CELL_HIT = 1
VISIBLE_BOARD_CELL_DESTROYED = 2
VISIBLE_BOARD_CELL_MISS = -1

# Ship id -> ship length.
all_ships = dict(zip(range(1, 6), (2, 2, 3, 3, 4)))

# Ship id -> number of cells of that ship hit so far.
destroyed_ships = {ship_id: 0 for ship_id in all_ships}


class BattleshipEnv(py_environment.PyEnvironment):
    """Single-player Battleship game exposed as a TF-Agents ``PyEnvironment``.

    Each action is a flat cell index in ``[0, board_size**2 - 1]``
    (``row = action // board_size``, ``col = action % board_size``). The
    observation is the visible board: a ``(board_size, board_size)`` float32
    array whose cells hold one of the ``VISIBLE_BOARD_CELL_*`` values.

    An episode ends when every ship cell has been hit
    (``FINISHED_GAME_REWARD``) or once ``max_steps`` strikes have been fired
    without finishing (``UNFINISHED_GAME_REWARD``).

    All per-episode bookkeeping (including the per-ship hit counters used to
    detect sunk ships) lives on the instance and is rebuilt on every reset, so
    separate environments (e.g. train and eval) never share game state.
    """

    # Bonus added to HIT_REWARD for the strike that sinks a whole ship.
    _SUNK_SHIP_BONUS = 4

    def __init__(self,
                 board_size=BOARD_SIZE,
                 ships_size=SHIPS_SIZE,
                 discount=0.9,
                 max_steps=MAX_STEPS_PER_EPISODE) -> None:
        """Create the environment and place the first random fleet.

        Args:
            board_size: Side length of the square board; must be at least 4.
            ships_size: Sequence of ship lengths; ship ids are the 1-based
                positions in this sequence.
            discount: Discount attached to every non-terminal transition.
            max_steps: Maximum number of strikes per episode.
        """
        assert board_size >= 4
        self._board_size = board_size
        # Copy so the (shared) default array can never be mutated through us.
        self._ships_size = np.array(ships_size)
        self._total_ship_cells = int(np.sum(self._ships_size))
        self._strike_count = 0
        self._discount = discount
        self._max_steps = max_steps
        self._episode_ended = False
        self._action_spec = array_spec.BoundedArraySpec(
            (), np.int32, minimum=0, maximum=self._board_size**2 - 1)
        # The upper bound must cover VISIBLE_BOARD_CELL_DESTROYED, which is
        # written into the observation when a ship is sunk.
        self._observation_spec = array_spec.BoundedArraySpec(
            (self._board_size, self._board_size),
            np.float32,
            minimum=VISIBLE_BOARD_CELL_MISS,
            maximum=VISIBLE_BOARD_CELL_DESTROYED)
        self._time_step_spec = ts.time_step_spec(self._observation_spec)
        self.set_boards()

    def set_boards(self):
        """Start a fresh game: blank visible board, new random fleet, zeroed counters."""
        self._hit_count = 0
        # Number of cells hit so far for each ship (index = ship id - 1).
        self._ship_hits = np.zeros(len(self._ships_size), dtype=np.int64)
        self._visible_board = np.zeros((self._board_size, self._board_size))
        self._hidden_board = self.place_ships(self._board_size, self._ships_size)

    def current_time_step(self):
        """Return the most recent time step produced by reset()/step()."""
        return self._current_time_step

    def observation_spec(self):
        """Return observation_spec."""
        return self._observation_spec

    def action_spec(self):
        """Return action_spec."""
        return self._action_spec

    def _observation(self):
        """Return a float32 snapshot (copy) of the visible board."""
        return np.array(self._visible_board, dtype=np.float32)

    def _reset(self):
        """Return initial_time_step."""
        self._episode_ended = False
        self._strike_count = 0
        self.set_boards()
        return ts.restart(self._observation())

    def _step(self, action):
        """Apply action and return new time_step.

        A strike on an untried ship cell is a hit (plus a bonus when it sinks
        the ship), a strike on an untried empty cell is a miss, and a strike on
        any already-tried cell is penalised as a repeat. The strike that hits
        the last remaining ship cell terminates the episode with
        ``FINISHED_GAME_REWARD``; otherwise the ``max_steps``-th strike
        terminates it with ``UNFINISHED_GAME_REWARD``.
        """
        if self._episode_ended:
            # The previous step ended the episode: ignore the action and start over.
            return self.reset()

        self._strike_count += 1
        action_x = action // self._board_size
        action_y = action % self._board_size

        untried = (
            self._visible_board[action_x][action_y] == VISIBLE_BOARD_CELL_UNTRIED)
        # 0 for an empty cell, otherwise the 1-based id of the ship on that cell.
        ship_id = int(self._hidden_board[action_x][action_y])

        if not untried:
            # Repeat strike (on a previous hit, miss or destroyed cell).
            reward = REPEAT_STRIKE_REWARD
        elif ship_id >= 1:
            # Hit
            reward = HIT_REWARD
            self._hit_count += 1
            self._visible_board[action_x][action_y] = VISIBLE_BOARD_CELL_HIT
            self._ship_hits[ship_id - 1] += 1
            if self._ship_hits[ship_id - 1] == self._ships_size[ship_id - 1]:
                # Every cell of this ship has been hit: mark the whole ship destroyed.
                self._visible_board[self._hidden_board == ship_id] = (
                    VISIBLE_BOARD_CELL_DESTROYED)
                reward += self._SUNK_SHIP_BONUS
            if self._hit_count == self._total_ship_cells:
                # Game finished
                self._episode_ended = True
                return ts.termination(self._observation(), FINISHED_GAME_REWARD)
        else:
            # Miss
            self._visible_board[action_x][action_y] = VISIBLE_BOARD_CELL_MISS
            reward = MISS_REWARD

        if self._strike_count >= self._max_steps:
            # Out of strikes. The board is deliberately NOT reset here so the
            # terminal observation shows the real final state; the reset
            # happens on the next reset()/step() call.
            self._episode_ended = True
            return ts.termination(self._observation(), UNFINISHED_GAME_REWARD)

        return ts.transition(self._observation(), reward, self._discount)

    def place_ships(self, grid_size, ships_length):
        """Randomly place non-overlapping ships on an empty grid.

        Args:
            grid_size: Side length of the square grid.
            ships_length: Iterable of ship lengths; the i-th ship (1-based) is
                written onto the grid as the value ``i``.

        Returns:
            A ``(grid_size, grid_size)`` float array holding 0 for empty cells
            and the 1-based ship id for occupied cells.
        """
        ships_location = np.zeros((grid_size, grid_size))
        for ship_id, ship_length in enumerate(ships_length, start=1):
            # The orientation is chosen once per ship; only the position is
            # re-sampled until the ship fits without overlapping another one.
            horizontal = np.random.randint(0, 2) == 0
            while True:
                line = np.random.randint(0, grid_size)
                start = np.random.randint(0, grid_size - ship_length + 1)
                span = slice(start, start + ship_length)
                cells = (line, span) if horizontal else (span, line)
                if not ships_location[cells].any():
                    break
            ships_location[cells] = ship_id
        return ships_location

In [None]:
def compute_avg_return_and_steps(environment, policy, num_episodes=10):
    """Compute average return, # of steps and # of repeated actions per episode.

    Args:
        environment: Batched (batch size 1) environment exposing ``reset()`` and
            ``step(action)``; rewards and actions are read with ``.numpy()[0]``.
        policy: Policy exposing ``action(time_step)``.
        num_episodes: Number of episodes to play; must be positive.

    Returns:
        A tuple ``(average_return, average_episode_steps, average_repeated_steps)``.
        An action counts as repeated when the same action was already chosen
        earlier in the same episode.

    Raises:
        ValueError: If ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        raise ValueError("num_episodes must be a positive integer")

    total_return = 0.0
    total_steps = 0.0
    total_repeated_steps = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        # A set gives O(1) membership tests for the repeat detection.
        seen_actions = set()
        while not time_step.is_last():
            action_step = policy.action(time_step)
            # Convert once; batch size is 1, so element 0 is the chosen cell.
            chosen_action = int(action_step.action.numpy()[0])
            if chosen_action in seen_actions:
                total_repeated_steps += 1
            else:
                seen_actions.add(chosen_action)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
            total_steps += 1
        total_return += episode_return
    average_return = total_return / num_episodes
    average_episode_steps = total_steps / num_episodes
    average_repeated_steps = total_repeated_steps / num_episodes
    return average_return.numpy()[0], average_episode_steps, average_repeated_steps

In [None]:
def collect_episode(environment, policy, num_episodes, replay_buffer_observer):
    """Collect game episode trajectories.

    Runs ``policy`` in ``environment`` for ``num_episodes`` complete episodes,
    starting from a freshly reset environment, and passes every trajectory to
    ``replay_buffer_observer``.

    Args:
        environment: Python environment to collect from.
        policy: TF policy used to pick actions (wrapped in a PyTFEagerPolicy).
        num_episodes: Number of full episodes to run.
        replay_buffer_observer: Observer called with each collected trajectory.
    """
    driver = py_driver.PyDriver(
        environment,
        py_tf_eager_policy.PyTFEagerPolicy(policy, use_tf_function=True),
        [replay_buffer_observer],
        max_episodes=num_episodes,
    )
    # Reset exactly once; an extra reset would only build and discard a board.
    driver.run(environment.reset())

In [None]:
def create_zip_file(dirname, base_filename):
    """Zip the contents of ``dirname`` into ``<base_filename>.zip``.

    Args:
        dirname: Directory whose contents are archived.
        base_filename: Archive path without the ``.zip`` extension.

    Returns:
        The name of the archive file that was created.
    """
    archive_path = shutil.make_archive(
        base_name=base_filename,
        format='zip',
        root_dir=dirname,
    )
    return archive_path

In [None]:
def train_agent(iterations, modeldir, logdir, policydir):
    """Train a REINFORCE agent on ``BattleshipEnv`` and export its policy.

    Training resumes from the checkpoints under ``train_dir`` when they exist.
    Each iteration collects ``COLLECT_EPISODES_PER_ITERATION`` episodes with the
    collect policy, runs one training step on them and clears the replay
    buffer. Checkpoints, zipped exports and evaluations are produced every
    ``CHECKPOINT_INTERVAL`` / ``DOWNLOAD_INTERVAL`` / ``EVAL_INTERVAL``
    iterations of this run.

    TensorBoard summaries are indexed by the agent's global train step (not the
    loop counter) so that resumed runs continue the curve instead of starting
    again at step 0.

    Args:
        iterations: Number of collect/train iterations to run.
        modeldir: Currently unused; kept so existing callers keep working.
        logdir: Directory that receives the TensorBoard summaries.
        policydir: Directory the collect policy is exported to (SavedModel).
    """

    train_py_env = BattleshipEnv(
        board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2
    )
    eval_py_env = BattleshipEnv(
        board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2
    )

    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Alternatively you could use ActorDistributionNetwork as actor_net
    actor_net = tfa.networks.Sequential(
        [
            tfa.keras_layers.InnerReshape([BOARD_SIZE, BOARD_SIZE], [BOARD_SIZE**2]),
            tf.keras.layers.Dense(3*FC_LAYER_PARAMS, activation="relu"),
            tf.keras.layers.Dense(BOARD_SIZE**2),
            tf.keras.layers.Lambda(lambda t: tfp.distributions.Categorical(logits=t)),
        ],
        input_spec=train_py_env.observation_spec(),
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    global_step = tf.compat.v1.train.get_or_create_global_step()

    tf_agent = reinforce_agent.ReinforceAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        actor_network=actor_net,
        optimizer=optimizer,
        normalize_returns=True,
        train_step_counter=global_step,
    )

    tf_agent.initialize()

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    # The collect policy is the one exported for later use.
    tf_policy_saver = policy_saver.PolicySaver(collect_policy)

    # Use reverb as replay buffer
    replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
    replay_buffer_signature = tensor_spec.add_outer_dim(replay_buffer_signature)
    table = reverb.Table(
        REPLAY_BUFFER_TABLE_NAME,
        max_size=REPLAY_BUFFER_CAPACITY,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=replay_buffer_signature,
    )  # specify signature here for validation at insertion time

    reverb_server = reverb.Server([table])

    replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
        tf_agent.collect_data_spec,
        sequence_length=None,
        table_name=REPLAY_BUFFER_TABLE_NAME,
        local_server=reverb_server,
    )

    replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
        replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME, REPLAY_BUFFER_CAPACITY
    )

    # Checkpointers: full training state, evaluation policy, and replay buffer.
    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        max_to_keep=1,
        agent=tf_agent,
        policy=tf_agent.policy,
        replay_buffer=replay_buffer,
        global_step=global_step
    )
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=eval_policy,
        global_step=global_step,
    )
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
        max_to_keep=1,
        replay_buffer=replay_buffer,
    )
    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()
    policy_checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()
    # Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)

    logger = tf.get_logger()

    # Evaluate the agent's policy once before training.
    initial_return, initial_episode_length, _ = compute_avg_return_and_steps(
        eval_env, tf_agent.policy, NUM_EVAL_EPISODES
    )
    logger.info(
        "before training: Average Return = {0}, Average Episode Length = {1}".format(
            initial_return, initial_episode_length
        )
    )

    summary_writer = tf.summary.create_file_writer(logdir)

    try:
        for i in range(iterations):
            # Collect a few episodes using collect_policy and save to the replay buffer.
            collect_episode(
                train_py_env,
                collect_policy,
                COLLECT_EPISODES_PER_ITERATION,
                replay_buffer_observer,
            )

            # Use data from the buffer and update the agent's network.
            iterator = iter(replay_buffer.as_dataset(sample_batch_size=1))
            trajectories, _ = next(iterator)
            tf_train = tf_agent.train(experience=trajectories)
            replay_buffer.clear()

            # Global train step after this update (continues across resumed runs).
            current_step = int(global_step.numpy())

            if i % CHECKPOINT_INTERVAL == 0:
                train_checkpointer.save(global_step=current_step)
                rb_checkpointer.save(global_step=current_step)
                policy_checkpointer.save(global_step=current_step)
                print(f"\nSaved Checkpoint: {current_step}")
            if i % DOWNLOAD_INTERVAL == 0:
                tf_policy_saver.save(policydir)
                create_zip_file(train_dir, NAMING_CONVENTION + '%d' % current_step)
                print("Downloaded the Policy")
            if i % EVAL_INTERVAL == 0:
                avg_return, avg_episode_length, avg_repeated_steps = compute_avg_return_and_steps(
                    eval_env, eval_policy, NUM_EVAL_EPISODES
                )
                with summary_writer.as_default():
                    tf.summary.scalar("Average return", avg_return, step=current_step)
                    tf.summary.scalar(
                        "Average episode length", avg_episode_length, step=current_step)
                    summary_writer.flush()
                logger.info(
                    "iteration = {0}: Average Return = {1}, Average Episode Length = {2}".format(
                        i, avg_return, avg_episode_length
                    )
                )
                seconds = time.time()
                local_time = time.ctime(seconds)
                print(f"Continuing from step {current_step} at {local_time}")
                print(f"Loss: {tf_train.loss}")
                print(
                    'iteration = {0}: avg Return = {1}, avg Episode Length = {2}, avg repeated actions = {3}'
                    .format(current_step, avg_return, avg_episode_length, avg_repeated_steps))
    finally:
        # Flush and release the event file even if training is interrupted.
        summary_writer.close()

    tf_policy_saver.save(policydir)

In [None]:
# Launch (or resume) training with the settings from the configuration cell.
train_agent(
    iterations=ITERATIONS,
    modeldir=MODELDIR,
    logdir=LOGDIR,
    policydir=POLICYDIR,
)




Saved Checkpoint: 720027




Downloaded the Policy
Continuing from step 720027 at Mon Oct 16 12:52:32 2023
Loss: -2.3879926204681396
iteration = 720027: avg Return = 6.300000190734863, avg Episode Length = 51.45, avg repeated actions = 6.7
Continuing from step 720527 at Mon Oct 16 12:57:18 2023
Loss: -0.5716915130615234
iteration = 720527: avg Return = 2.5, avg Episode Length = 59.25, avg repeated actions = 6.2
Continuing from step 721027 at Mon Oct 16 13:02:01 2023
Loss: -5.934597969055176
iteration = 721027: avg Return = 6.5, avg Episode Length = 54.55, avg repeated actions = 4.9
Continuing from step 721527 at Mon Oct 16 13:06:38 2023
Loss: 9.449164390563965
iteration = 721527: avg Return = 9.850000381469727, avg Episode Length = 50.05, avg repeated actions = 4.45

Saved Checkpoint: 722027
Continuing from step 722027 at Mon Oct 16 13:11:19 2023
Loss: 16.29084014892578
iteration = 722027: avg Return = 3.3499999046325684, avg Episode Length = 53.65, avg repeated actions = 8.85
Continuing from step 722527 at Mon Oc



Downloaded the Policy
Continuing from step 731027 at Mon Oct 16 14:33:04 2023
Loss: -6.930057525634766
iteration = 731027: avg Return = 11.850000381469727, avg Episode Length = 50.45, avg repeated actions = 3.35
Continuing from step 731527 at Mon Oct 16 14:37:51 2023
Loss: 1.4612560272216797
iteration = 731527: avg Return = 11.949999809265137, avg Episode Length = 51.4, avg repeated actions = 3.45

Saved Checkpoint: 732027
Continuing from step 732027 at Mon Oct 16 14:42:30 2023
Loss: -3.922785758972168
iteration = 732027: avg Return = 14.399999618530273, avg Episode Length = 45.7, avg repeated actions = 2.7


### Testing the Agent

In [None]:
from tf_agents.policies import policy_loader


# Load the exported SavedModel policy and play 20 evaluation games against a
# freshly created environment, timing the whole evaluation.
saved_policy = tf.saved_model.load(POLICYDIR)

eval_py_env = BattleshipEnv(
    max_steps=BOARD_SIZE**2,
    discount=DISCOUNT,
    board_size=BOARD_SIZE,
)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

start = time.time()
avg_return, avg_episode_length, avg_repeat = compute_avg_return_and_steps(
    environment=eval_env,
    policy=saved_policy,
    num_episodes=20,
)
print('Average Return = {0}, Average Episode Length = {1}'.format(avg_return, avg_episode_length))
end = time.time()

print(f"secs: {end-start} avg repeat: {avg_repeat}")