## Tic Tac Toe + Reinforcement Learning with SageMaker
The goal of this experiment is to end up with a TensorFlow model that knows how to play Tic Tac Toe. With it, we will be able to create a new game or integrate the model into an existing one.  
We will use **Ray RLlib** + **OpenAI Gym** running as a SageMaker training job. So, let's get our hands dirty and:
- create an OpenAI Gym custom environment that represents the board and the rules of the game
- create a heuristic (rule-based engine) that will play against the agent so that it can learn
- train our model using SageMaker
- test the model in a Tic Tac Toe match

In [None]:
!rm -rf tictactoe && mkdir -p tictactoe

### 1/4) First we need to create a new OpenAI Gym that represents the board and the game rules
This is a multi-agent experiment, so we will create an Env that supports two players simultaneously. To make this work, besides the 9 possible positions on the board, we need one additional action that represents a player waiting for the other one to move.

In [None]:
%%writefile tictactoe/tictactoe.py
import gym
import numpy as np
import random
import ray.rllib as rllib

class TicTacToeEnv(rllib.env.MultiAgentEnv, gym.Env):
    """Tic Tac Toe board shared by two agents, 'agent_x' and 'agent_o'.

    Every step receives one action per agent. Actions 0-8 address a board
    cell in row-major order and action 9 (WAIT) means "it is not my turn".
    Each observation is the nine flattened cells (0 empty, 1 X, 2 O) followed
    by the id of the player that has to move next.
    """
    PLAYER_X=1
    PLAYER_O=2
    WAIT=9
    marker = ['-', 'X', 'O'] # printable symbol for each cell value (see render)
    def __init__(self):
        self.action_space = gym.spaces.Discrete(9 + 1) # 9 valid + wait
        self.observation_space = gym.spaces.Box(0,2, [9+1]) # values between 0 and 2
        self.reset()

    def __defense_detection__(self, board, enemy_id):
        """Lists the empty cells that would block an imminent win of `enemy_id`.

        A line (row, column or diagonal) is a threat when the enemy owns two of
        its cells and the remaining one is still empty.

        Args:
            board: 3x3 array holding 0 (empty) or a player id per cell.
            enemy_id: id of the player whose threats are being searched.

        Returns:
            List of (row, col) tuples, one per threatened line; empty if none.
        """
        board_play=board==enemy_id # cells owned by the enemy
        board_mask=board==0 # empty cells
        h = np.sum(board_play, axis=1)# horizontal
        v = np.sum(board_play, axis=0) # vertical
        # diagonals
        diagA,diagAMask = np.diagonal(board_play),np.diagonal(board_mask)
        diagB,diagBMask = np.fliplr(board_play).diagonal(),np.fliplr(board_mask).diagonal()
        defense_options = []
        for idx,row in enumerate(h): # scan rows
            if row==2 and np.sum(board_mask[idx])>0:
                # argmax over the boolean mask gives the position of the empty cell
                defense_options.append((idx,np.argmax(board_mask[idx])))
        for idx,col in enumerate(v): # scan cols
            if col==2 and np.sum(board_mask[:,idx])>0:
                defense_options.append((np.argmax(board_mask[:,idx]),idx))
        if np.sum(diagA)==2 and np.sum(diagAMask)>0: # scan diagonal A
            idx = np.argmax(diagAMask); defense_options.append((idx,idx))
        if np.sum(diagB)==2 and np.sum(diagBMask)>0: # scan diagonal B
            # diagonal B is read from the left-right flipped board, hence 2-idx
            idx = np.argmax(diagBMask); defense_options.append((idx,2-idx))
        return defense_options

    def step(self, action_dict):
        """One step in the simulation

        Args:
            action_dict: mapping with the keys 'agent_x' and 'agent_o', each
                holding an action in 0..9 (9 == WAIT).

        Returns:
            Tuple (obs, reward, done, info). The first three are dicts keyed
            by agent id (`done` additionally has the '__all__' key); info is
            an empty dict.
        """
        done=False
        reward=[0,0,0] # indexed by player id; slot 0 is unused
        action_x = action_dict['agent_x']
        action_o = action_dict['agent_o']

        if self.turn==self.PLAYER_X:
            player_id,enemy_id = (self.PLAYER_X,self.PLAYER_O)
            action_player,action_enemy = (action_x,action_o)
        else:
            player_id,enemy_id = (self.PLAYER_O,self.PLAYER_X)
            action_player,action_enemy = (action_o,action_x)

        # check movement current player
        if action_player == self.WAIT or self.board[action_player//3, action_player%3] != 0:
            # invalid movement: the turn is NOT passed to the enemy
            reward[player_id] = -7
        else:
            # next time the enemy will play
            self.turn = enemy_id
            row,col=action_player//3,action_player%3
            # is it a critical situation that requires defense?
            # The threats have to be collected BEFORE the marker is placed:
            # afterwards the played cell is no longer empty, so it could never
            # be reported as a defense option and the bonus below would be
            # unreachable.
            defense_options = self.__defense_detection__(self.board, enemy_id)
            # valid movement
            self.board[row, col] = player_id
            if len(defense_options) > 0:
                if any(r==row and c==col for r,c in defense_options):
                    reward[player_id] = 8 ## woohoo! defended
                else:
                    reward[player_id] = -10 # threat ignored, probably will lose
            else:
                reward[player_id] = 1

        # Enemy should be waiting
        if action_enemy != self.WAIT: reward[enemy_id] = -7

        # the 8 lines that can hold three in a row: 2 diagonals, 3 rows, 3 cols
        tests = [np.diagonal(self.board), np.fliplr(self.board).diagonal()]
        for i in range(3): tests += [self.board[i],self.board[:,i]]
        # check board status
        # NOTE(review): only the player that moved gets a win/defeat reward;
        # the opponent is not penalized when the mover wins.
        if   (np.array(tests)==player_id).all(axis=1).any(): done,reward[player_id] = True,15 # win
        elif (np.array(tests)==enemy_id).all(axis=1).any(): done,reward[player_id] = True,-15 # defeat
        elif not (np.array(tests)==0).any(): done,reward[player_id],reward[enemy_id] = True,2,2 # draw
        elif self.trials < 1: done,reward[player_id],reward[enemy_id] = True,-5,-5 # step budget exhausted

        self.trials -= 1

        if done: self.render()

        # both agents see the same thing: the board plus who moves next
        obs = {
            'agent_x': np.concatenate([self.board.flatten(),[self.turn]], axis=0),
            'agent_o': np.concatenate([self.board.flatten(),[self.turn]], axis=0)
        }
        reward = {'agent_x': reward[1], 'agent_o': reward[2]}
        done = {'agent_x': done, 'agent_o': done, '__all__': done}

        return obs, reward, done, {}

    def reset(self):
        """Clears the board, restores the step budget and draws the first player.

        Returns:
            Dict with the initial observation of 'agent_x' and 'agent_o'.
        """
        self.trials = 20
        self.turn = np.random.randint(1,3) # 1 (X) or 2 (O); upper bound is exclusive
        self.board = np.zeros((3,3), dtype=np.uint8)
        obs = {
            'agent_x': np.concatenate([self.board.flatten(),[self.turn]], axis=0),
            'agent_o': np.concatenate([self.board.flatten(),[self.turn]], axis=0)
        }
        return obs

    def render(self, mode='none', close=False):
        """Prints the board as three rows of '-', 'X', 'O'.

        With the default mode ('none') nothing is printed; `close` is unused.
        """
        if mode=='none': return
        for i in range(9):
            print(self.marker[self.board[i//3,i%3]], end='\n' if i % 3 == 2 else ' ')
        print()

    def seed(self, seed=None):
        """Seeds Python's and NumPy's global random generators.

        `seed` defaults to None so the method can also be called without
        arguments; None leaves both generators untouched.
        """
        print(f"TicTacToeEnv - Seeding {seed}")
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

### 2/4) Then you need to create the heuristics/policy

This policy will be used as a rival player to train the agent. It follows a set of rules that are applied stochastically, so it sometimes behaves as a harder player and sometimes as an easier one.

In [None]:
%%writefile tictactoe/heuristics.py
from ray.rllib.policy.policy import Policy
import random
import numpy as np
from tictactoe import TicTacToeEnv

class SemiSmartTicTacToeHeuristicsPolicy(Policy):
    """Starts with random movements but tries to avoid defeat

    Rule-based opponent that always plays as PLAYER_O. It is not trainable:
    get_weights/set_weights are no-ops.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()
        # The third positional argument is read as the policy config (a dict).
        # Look the seed up defensively so a config passed by keyword, or one
        # without a 'seed' entry, does not crash the constructor.
        config = args[2] if len(args) > 2 else kwargs.get('config')
        seed = (config or {}).get('seed')
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

    def __attack_or_defend__(self, board, agent_id):
        """Finds the cell that completes a line where `agent_id` owns two cells.

        Called with the own id this is a winning move (attack); called with the
        rival's id it is the blocking move (defense).

        Returns:
            (row, col) of the first such empty cell (rows, then columns, then
            diagonals) or (None, None) when there is none.
        """
        board_play=board==agent_id # agent id
        board_mask=board==0 # empty cells
        h = np.sum(board_play, axis=1)# horizontal
        v = np.sum(board_play, axis=0) # vertical
        # diagonals
        diagA,diagAMask = np.diagonal(board_play),np.diagonal(board_mask)
        diagB,diagBMask = np.fliplr(board_play).diagonal(),np.fliplr(board_mask).diagonal()
        for idx,row in enumerate(h): # scan rows
            if row==2 and np.sum(board_mask[idx])>0:
                return (idx,np.argmax(board_mask[idx]))
        for idx,col in enumerate(v): # scan cols
            if col==2 and np.sum(board_mask[:,idx])>0:
                return (np.argmax(board_mask[:,idx]),idx)
        if np.sum(diagA)==2 and np.sum(diagAMask)>0: # scan diagonal A
            idx = np.argmax(diagAMask); return (idx,idx)
        if np.sum(diagB)==2 and np.sum(diagBMask)>0: # scan diagonal B
            idx = np.argmax(diagBMask); return (idx,2-idx)
        return None, None

    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Chooses one action per observation in `obs_batch`.

        Each observation holds the nine board cells followed by the id of the
        player to move. Only `obs_batch` is used; the other arguments are
        ignored.

        Returns:
            Tuple (actions, [], {}).

        Raises:
            Exception: if it is this policy's turn and the board has no empty cell.
        """

        def determine_action(obs):
            # Wait if it's not player's turn.
            if obs[-1] == TicTacToeEnv.PLAYER_X: return TicTacToeEnv.WAIT

            cells = obs[:-1] # drop the trailing turn marker
            board = cells.reshape((3,3))
            if random.randint(0,3) == 0: # randint is inclusive: 25% hard - 75% potentially dumb
                row,col = self.__attack_or_defend__(board, TicTacToeEnv.PLAYER_O) # attack
                if row is not None: return (row*3)+col
                row,col = self.__attack_or_defend__(board, TicTacToeEnv.PLAYER_X) # defend
                if row is not None: return (row*3)+col

            # Otherwise pick any empty cell at random (board cells only).
            empty_cells = [i for i, symbol in enumerate(cells) if symbol == 0]
            if empty_cells: return random.choice(empty_cells)
            raise Exception('Heuristic did not find empty.')

        return [determine_action(obs) for obs in obs_batch], [], {}

    def get_weights(self):
        """Nothing to export: the policy has no learnable parameters."""
        return None

    def set_weights(self, weights):
        """Ignores `weights`: the policy has no learnable parameters."""
        return None

### 3/4) Training the model
SageMaker expects that you share a python script with the estimator to execute the training. The following script defines the whole training process using Ray+RLLib + Tensorflow 2 + OpenAI Gym.

In [None]:
%%writefile tictactoe/train.py
import sys
import subprocess
# install the pinned versions of the SageMaker training toolkit and Ray RLlib before importing them
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "sagemaker-training==3.9.2", "ray[rllib]==1.2.0"])

import copy
import os
import argparse
import traceback
import random
import time
import numpy as np
import glob
import re

import ray
import ray.tune
import ray.rllib as rllib
from ray.rllib.agents.registry import get_agent_class

import gym
from gym import error, spaces, utils
from gym.utils import seeding
from gym.envs.registration import register

from sagemaker_training import environment, intermediate_output, logging_config, params, files

from heuristics import SemiSmartTicTacToeHeuristicsPolicy

def start_file_sync(env):
    """Start the background sync of the intermediate output directory to S3.

    The handle returned by `intermediate_output.start_sync` is stored in the
    module-level `intermediate_sync`.

    Args:
        env: object exposing `sagemaker_s3_output()`, the S3 destination.
    """
    global logger, intermediate_sync
    # "AWS_REGION" wins; otherwise use the variable named by params.REGION_NAME_ENV
    fallback_region = os.environ.get(params.REGION_NAME_ENV)
    aws_region = os.environ.get("AWS_REGION", fallback_region)
    endpoint = os.environ.get(params.S3_ENDPOINT_URL, None)

    message = "Starting intermediate sync. %s: %s - %s" % (aws_region, env.sagemaker_s3_output(), endpoint)
    logger.info(message)

    intermediate_sync = intermediate_output.start_sync(
        env.sagemaker_s3_output(),
        aws_region,
        endpoint_url=endpoint,
    )
    
def get_latest_checkpoint(env, algo):
    """Locate the newest checkpoint of the newest `algo` experiment.

    Experiments are searched in `<env.output_intermediate_dir>/training` and
    ordered by the digits of the last two '_'-separated fields of their path
    (presumably a date and a time stamp - verify against the directory names
    Ray produces). Checkpoints are ordered naturally, so 'checkpoint_120'
    comes after 'checkpoint_60'.

    Args:
        env: object exposing `output_intermediate_dir`.
        algo: prefix of the experiment directory names.

    Returns:
        Tuple (ckpt_path, ckpt_meta_filename): the checkpoint directory and the
        'checkpoint-<n>' file name expected inside it.

    Raises:
        Exception: when no experiment directory or no checkpoint is found.
    """
    global logger
    logger.info("Latest checkpoint")
    # get the latest experiment
    experiments = glob.glob(os.path.join(env.output_intermediate_dir,'training', f'{algo}*'))
    # Fail loudly instead of implicitly returning None, which made the callers
    # crash on tuple unpacking with an unrelated TypeError.
    if len(experiments) == 0: raise Exception("No experiment found!")
    experiments.sort(key=lambda x: [int(c) if c.isdigit() else c for c in ''.join(x.replace('-','').split('_')[-2:])])
    exp_name = experiments[-1]

    chkpts = glob.glob(f'{exp_name}/checkpoint*')
    # natural sort: numeric chunks are compared as integers
    chkpts.sort(key=lambda x: [int(c) if c.isdigit() else c for c in re.split(r'(\d+)', x)])

    if len(chkpts) == 0: raise Exception("No checkpoint found!")
    ckpt_path=chkpts[-1]
    # 'checkpoint_000060' -> 'checkpoint-60'
    ckpt_meta_filename=os.path.basename(ckpt_path).split('_')
    ckpt_meta_filename=f'{ckpt_meta_filename[0]}-{int(ckpt_meta_filename[1])}'
    logger.info(f'{ckpt_path}/{ckpt_meta_filename}')
    return ckpt_path, ckpt_meta_filename

def save_model(env_vars, experiment_params):
    """Restore the latest checkpoint and export the 'agent_x' policy model.

    The agent is rebuilt from a private copy of the training config with
    monitoring off, one worker and no GPU; the model is exported to the
    "1" sub-directory of `env_vars.model_dir`.

    Args:
        env_vars: object exposing `model_dir` (and whatever
            `get_latest_checkpoint` reads from it).
        experiment_params: experiment definition; its 'training' entry must
            hold the 'run', 'env' and 'config' keys.
    """
    global logger
    training = experiment_params['training']

    # work on a copy so the caller's experiment definition stays untouched
    config = copy.deepcopy(training['config'])
    config.update({"monitor": False, "num_workers": 1, "num_gpus": 0})
    logger.info(experiment_params)

    algo, env_name = training['run'], training['env']
    logger.info(f'{algo} - {env_name}')
    agent = get_agent_class(algo)(env=env_name, config=config)

    ckpt_path, ckpt_meta_filename = get_latest_checkpoint(env_vars, algo)
    checkpoint_file = os.path.join(ckpt_path, ckpt_meta_filename)

    logger.info('Restoring agent...')
    agent.restore(checkpoint_file)
    logger.info(f'Exporting model to {env_vars.model_dir}...')
    export_dir = os.path.join(env_vars.model_dir, "1")
    agent.export_policy_model(export_dir, 'agent_x')
    
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a command-line boolean.

        argparse's `type=bool` turns ANY non-empty string - including "False" -
        into True, so the textual value has to be interpreted explicitly.
        """
        if isinstance(value, bool): return value
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y"): return True
        if text in ("", "0", "false", "f", "no", "n"): return False
        raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")

    env_vars = environment.Environment()
    parser = argparse.ArgumentParser()
    logging_config.configure_logger(env_vars.log_level)

    parser.add_argument("--log-level", type=int, default=0)
    parser.add_argument("--record-videos", type=_str2bool, default=False)
    # NOTE(review): max() requests at least 3 workers even with fewer CPUs - confirm this is intended
    parser.add_argument("--num-workers", type=int, default=max(env_vars.num_cpus-1, 3))
    parser.add_argument("--num-gpus", type=int, default=env_vars.num_gpus)
    parser.add_argument("--batch-mode", type=str, default="complete_episodes")
    parser.add_argument("--episode-reward-mean", type=float, default=3.5)
    parser.add_argument("--learning-rate", type=float, default=0.001)
    parser.add_argument("--init-seed", type=int, default=-1)
    parser.add_argument("--refining-iter", type=int, default=4)
    # unknown arguments are tolerated instead of aborting the job
    args,unknown = parser.parse_known_args()

    # -1 means "no fixed seed"
    seed=args.init_seed if args.init_seed != -1 else None

    random.seed(seed)
    np.random.seed(seed)

    # module-level names used (via `global`) by the helper functions of this script
    logger = logging_config.get_logger()
    intermediate_sync = None

    env_name='TicTacToeEnv-v0'
    register(
        id=env_name,
        entry_point='tictactoe:TicTacToeEnv'
    )
    env = gym.make(env_name)
    env.seed(seed)

    experiment_params = {
        "training": {
            "env": env_name,
            "run": "A3C",
            "stop": {
                "episode_reward_mean": args.episode_reward_mean,
            },
            "local_dir": env_vars.output_intermediate_dir,
            "checkpoint_at_end": True,
            "checkpoint_freq": 60,
            #"export_formats": ["h5"],
            "config": {
                "log_level": args.log_level,
                "monitor": args.record_videos,
                #"framework": "tfe",
                "lr": args.learning_rate,
                "model": {
                    # https://docs.ray.io/en/master/rllib-models.html#default-model-config-settings
                },
                "multiagent": {
                    # agent_x is the learner (None == default policy of the algorithm);
                    # agent_o is the fixed rule-based opponent
                    "policies": {
                        "agent_x": (None, env.observation_space, env.action_space, {}),
                        "agent_o": (SemiSmartTicTacToeHeuristicsPolicy, env.observation_space, env.action_space, {})
                    },
                    # agent ids and policy ids are the same strings
                    "policy_mapping_fn": lambda x: x,
                    "policies_to_train": ["agent_x"],
                },
                "num_workers": args.num_workers,
                "num_gpus": args.num_gpus,
                "batch_mode": args.batch_mode,
                "seed": seed
            }
        }
    }

    try:
        start_file_sync(env_vars)
        # main program
        ray.init()
        ray.tune.register_env(env_name, lambda x: env)
        # first run: train with the initial seed until the stop criterion is met
        ray.tune.run_experiments(copy.deepcopy(experiment_params))
        # refining runs: resume from the latest checkpoint with a fresh, time-based seed
        for i in range(args.refining_iter):
            seed = int(time.time())
            random.seed(seed)
            np.random.seed(seed)
            env.seed(seed)
            algo = experiment_params['training']['run']
            ckpt_path, ckpt_meta_filename = get_latest_checkpoint(env_vars, algo)
            experiment_params['training']['config']['seed'] = seed
            experiment_params['training']['restore'] = os.path.join(ckpt_path, ckpt_meta_filename)
            ray.tune.run_experiments(copy.deepcopy(experiment_params))
        save_model(env_vars, experiment_params)
        ray.shutdown()

        files.write_success_file()
        logger.info("Reporting training SUCCESS")
    except Exception as e:
        failure_msg = "framework error: \n%s\n%s" % (traceback.format_exc(), str(e))
        logger.error("Reporting training FAILURE: %s" % failure_msg)
        files.write_failure_file(failure_msg)
    finally:
        # wait for the background sync (if it was started) before exiting
        if intermediate_sync:
            intermediate_sync.join()

#### Training using SageMaker RL container


In [None]:
import sagemaker
import boto3
# S3 bucket: use the session's default bucket as the output location
sagemaker_session = sagemaker.session.Session()
s3_bucket = sagemaker_session.default_bucket()  
s3_output_path = 's3://{}/'.format(s3_bucket)

# region of the current boto3 session and the execution role used by the training job
aws_region = boto3.Session().region_name
role = sagemaker.get_execution_role()
print("S3 bucket path: {}".format(s3_output_path))

In [None]:
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework
import time

# Ray RL container image in the notebook's region.
# NOTE: the tag says ray-1.1.0, but tictactoe/train.py pip-installs ray[rllib]==1.2.0 when it starts.
image_name=f"462105765813.dkr.ecr.{aws_region}.amazonaws.com/sagemaker-rl-ray-container:ray-1.1.0-tf-gpu-py36"
estimator = RLEstimator(
    image_uri=image_name,
    entry_point="train.py",
    source_dir='tictactoe', # directory filled by the %%writefile cells above
    role=role,
    instance_type='ml.p3.2xlarge',
    #instance_type='local_gpu',
    max_run=60*(60 * 2), # 7200 - presumably seconds, i.e. 2 hours; confirm against the SDK docs
    instance_count=1,
    output_path=s3_output_path,
    metric_definitions=RLEstimator.default_metric_definitions(RLToolkit.RAY),
    # each key becomes a "--<key>" command line argument parsed by train.py
    hyperparameters={
        "log-level": 20,
        "record-videos": False,
        "batch-mode": "complete_episodes",
        "episode-reward-mean": 5.0,
        "learning-rate": 0.0001,
        "init-seed": 1, # seed == 1 makes the agent learn faster but it gets biased
        "refining-iter": 4 # refining iterations are to make the agent generalize to random matches
    }
)

#### Kick-off the training job

In [None]:
!sudo rm -rf /tmp/tmp*
# wait=True blocks the cell until the training job finishes
estimator.fit(wait=True)
job_name = estimator.latest_training_job.job_name
print("Training job: %s" % job_name)

### 4/4) Testing the agent

In [None]:
# estimator.output_path already ends with '/', so the job name is appended directly
s3_uri=f'{estimator.output_path}{estimator.latest_training_job.name}/output/model.tar.gz'
print(s3_uri)
!aws s3 cp $s3_uri /tmp/
!mkdir -p model
!tar -xzvf /tmp/model.tar.gz -C model

In [None]:
import tensorflow as tf
import os
import numpy as np
from tensorflow.python.saved_model import tag_constants

# loading
# train.py exports the 'agent_x' policy to the "1" sub-directory of the model dir
model_version = '1'
model_dir = 'model'
export_dir = os.path.join(model_dir, model_version)
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    tf.compat.v1.saved_model.loader.load(sess, [tag_constants.SERVING], export_dir)
    graph = tf.compat.v1.get_default_graph()
    
    # uncomment to list the operation names if the tensors below are not found
    #for o in graph.get_operations(): print(o.name)    
    x = graph.get_tensor_by_name('agent_x/observations:0')    
    # presumably one score per action (0-8 = board cell, 9 = wait) - verify against the exported graph
    y = graph.get_tensor_by_name('agent_x/fc_out/BiasAdd:0')
    # observation = 9 board cells (0 empty, 1 X, 2 O) + id of the player to move
    #obs = [2,2,1,0,1,0,0,0,0,1]
    #obs = [0,0,0,0,0,0,0,0,0,1]
    obs = [2,0,1,0,2,0,0,0,0,1]
    payload = np.array([obs], dtype=np.float32)
    preds = sess.run(y, feed_dict={x: payload})
    
    # show the board and the index of the highest-scoring output
    print(payload[0][:-1].reshape((3,3)))
    print(np.argmax(preds[0]))