# Proximal Policy Optimization algorithm 
  
I took this implementation from the [marload repository](https://github.com/marload/DeepRL-TensorFlow2/blob/master/PPO/PPO_Discrete.py) and adapted it to the ConnectX game.

In [None]:
# Install the package that provides the "connectx" environment used below.
!pip install kaggle-environments

A slightly modified ConnectX environment

In [None]:
from kaggle_environments import evaluate, make, utils
from gym import spaces
class ConnectX:
    """Gym-style wrapper around the kaggle-environments "connectx" game.

    The wrapper re-encodes the board from the point of view of the player
    being trained, reshapes the rewards, and ends the episode with a penalty
    when a full column is chosen.
    """

    # Rewards handed to the agent at the end of an episode.
    DRAW = 0.5
    WIN = 1.0
    LOSE = -1.0
    ERROR = -10.0  # invalid move, or the underlying env reported no reward

    def __init__(self, pair=None, config=None):
        """Create the environment.

        Args:
            pair: two-element agent list passed to ``ks_env.train``; defaults
                to ``[None, "random"]``.
            config: dict with the keys "rows", "columns" and "inarow";
                defaults to the standard 6x7 board with 4 in a row.
        """
        # Build the defaults per call instead of sharing one mutable object
        # between every instance.
        if pair is None:
            pair = [None, "random"]
        if config is None:
            config = {"rows": 6, "columns": 7, "inarow": 4}

        self.ks_env = make("connectx", config, debug=True)
        self.pair = pair
        self.env = self.ks_env.train(pair)
        self.config = config
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(config["columns"])
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from recent NumPy releases, so the builtin is used directly.
        # NOTE(review): get_board() returns floats in {0, 0.5, 1}, which does
        # not match these bounds/dtype; only ``.shape`` is read in this file.
        self.observation_space = spaces.Box(
            low=0, high=2,
            shape=(config["rows"], config["columns"], 1), dtype=int)

        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def reset(self):
        """Start a new game and return the encoded initial board."""
        self.obs = self.get_board(self.env.reset(), self.config)
        return self.obs

    def switch_pair(self):
        """Reverse the order of the agent pair and rebuild the trainer."""
        self.pair = self.pair[::-1]
        self.env = self.ks_env.train(self.pair)

    def change_pair(self, pair):
        """Replace the agent pair and rebuild the trainer."""
        self.pair = pair
        self.env = self.ks_env.train(self.pair)

    def change_reward(self, reward, done):
        """Map the raw environment reward to the reward used for training.

        On a finished episode ``None`` becomes ERROR, 1 becomes WIN, -1
        becomes LOSE and 0 becomes DRAW; any other value is returned as is.
        Every non-terminal step costs ``-1 / (rows * columns)``.
        """
        if not done:
            return -1 / (self.config['rows'] * self.config['columns'])

        if reward is None:  # Error
            return ConnectX.ERROR
        if reward == 1:
            return ConnectX.WIN
        if reward == -1:
            return ConnectX.LOSE
        if reward == 0:
            return ConnectX.DRAW
        return reward

    def get_board(self, observation, configuration):
        """Encode the raw observation as a (rows, columns, 1) float array.

        Empty cells are 0, cells holding ``observation["mark"]`` are 0.5 and
        all other occupied cells are 1.0.
        """
        rows = configuration['rows']
        columns = configuration['columns']

        board = np.array(observation['board']).reshape((rows, columns, 1))
        new_board = np.zeros_like(board)

        mark = observation["mark"]
        new_board[board == mark] = 1
        new_board[(board != mark) & (board != 0)] = 2
        return new_board / 2  # normalization

    def step(self, action):
        """Play ``action`` (a column index) and return (obs, reward, done, info).

        Choosing a column with no empty cell ends the episode immediately
        with the ERROR reward, without calling the underlying environment.
        """
        if not np.any(self.obs[:, action] == 0):
            return self.obs, ConnectX.ERROR, True, {}

        raw_obs, raw_reward, done, info = self.env.step(int(action))
        reward = self.change_reward(raw_reward, done)
        self.obs = self.get_board(raw_obs, self.config)
        return self.obs, reward, done, info

In [None]:
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, Flatten, Dense, Dropout, BatchNormalization

import numpy as np

# Use float64 as the Keras default float type; Actor.train casts its one-hot
# actions to tf.float64 to match.
tf.keras.backend.set_floatx('float64')

class Actor:
    """Policy network trained with the PPO clipped surrogate objective."""

    def __init__(self, state_dim, action_dim, clip_ratio = 0.1, lr = 0.005):
        """Build the policy model and its Adam optimizer.

        Args:
            state_dim: shape of a single observation (without batch axis).
            action_dim: number of discrete actions.
            clip_ratio: PPO clipping range; the ratio is kept inside
                ``[1 - clip_ratio, 1 + clip_ratio]``.
            lr: Adam learning rate.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(lr)

        self.clip_ratio = clip_ratio

    def create_model(self):
        """Return an MLP that maps a state to a softmax over the actions."""
        initializer = tf.initializers.VarianceScaling(scale=2.0)
        return tf.keras.Sequential([
            Input(self.state_dim),
            Flatten(),
            Dense(100, activation='relu', kernel_regularizer='l2', kernel_initializer=initializer),
            BatchNormalization(),
            Dropout(0.1),
            Dense(20, activation='relu', kernel_regularizer='l2', kernel_initializer=initializer),
            BatchNormalization(),
            Dropout(0.1),
            Dense(self.action_dim, activation='softmax')
        ])

    def compute_loss(self, old_policy, new_policy, actions, gaes):
        """Return the mean clipped PPO surrogate loss over the batch.

        Args:
            old_policy: (batch, action_dim) action probabilities recorded
                when the actions were sampled.
            new_policy: (batch, action_dim) probabilities from the current
                model.
            actions: (batch, action_dim) one-hot encoding of taken actions.
            gaes: one advantage estimate per sample.
        """
        # One advantage per row, as a column, so it multiplies the per-sample
        # ratio element-wise.
        gaes = tf.reshape(tf.stop_gradient(gaes), [-1, 1])

        # Probability of the taken action for EACH sample. Reducing without
        # an axis would collapse the whole batch into a single scalar and
        # make the ratio below a ratio of summed probabilities.
        old_p = tf.reduce_sum(old_policy * actions, axis=1, keepdims=True)
        new_p = tf.reduce_sum(new_policy * actions, axis=1, keepdims=True)

        old_log_p = tf.stop_gradient(tf.math.log(old_p))
        log_p = tf.math.log(new_p)

        ratio = tf.math.exp(log_p - old_log_p)
        clipped_ratio = tf.clip_by_value(
            ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
        surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
        return tf.reduce_mean(surrogate)

    def train(self, old_policy, states, actions, gaes):
        """Run one gradient step on the policy and return the loss.

        ``actions`` holds integer action indices; they are one-hot encoded
        here before the loss is computed.
        """
        actions = tf.one_hot(actions, self.action_dim)
        actions = tf.reshape(actions, [-1, self.action_dim])
        actions = tf.cast(actions, tf.float64)

        # NOTE(review): the 'l2' kernel regularizers are not part of this
        # loss; only the surrogate objective is differentiated.
        with tf.GradientTape() as tape:
            probs = self.model(states, training=True)
            loss = self.compute_loss(old_policy, probs, actions, gaes)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


class Critic:
    """State-value network fitted to the TD targets with an MSE loss."""

    def __init__(self, state_dim, lr = 0.005):
        """Build the value model for ``state_dim`` inputs and its Adam optimizer."""
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(lr)

    def create_model(self):
        """Return an MLP that maps a state to a single linear value output."""
        he_init = tf.initializers.VarianceScaling(scale=2.0)

        stack = [Input(self.state_dim), Flatten()]
        # Two hidden stages: Dense -> BatchNormalization -> Dropout.
        for width in (100, 20):
            stack.append(Dense(width, activation='relu', kernel_regularizer='l2',
                               kernel_initializer=he_init))
            stack.append(BatchNormalization())
            stack.append(Dropout(0.1))
        stack.append(Dense(1, activation='linear'))

        return tf.keras.Sequential(stack)

    def compute_loss(self, v_pred, td_targets):
        """Return the mean squared error between targets and predictions."""
        loss_fn = tf.keras.losses.MeanSquaredError()
        return loss_fn(td_targets, v_pred)

    def train(self, states, td_targets):
        """Run one gradient step on the value model and return the loss."""
        with tf.GradientTape() as tape:
            predicted = self.model(states, training=True)
            # Shapes must agree exactly, otherwise the loss would broadcast.
            assert predicted.shape == td_targets.shape
            fixed_targets = tf.stop_gradient(td_targets)
            loss = self.compute_loss(predicted, fixed_targets)

        weights = self.model.trainable_variables
        gradients = tape.gradient(loss, weights)
        self.opt.apply_gradients(zip(gradients, weights))
        return loss


class Agent:
    """PPO agent that owns an Actor and a Critic and trains them on ``env``."""

    def __init__(self, env, discount = 0.99, lmbda = 0.95):
        """Create the networks from the environment's spaces.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset()`` and ``step(action)``.
            discount: reward discount factor.
            lmbda: GAE smoothing factor.
        """
        self.env = env
        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.n

        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim)

        self.discount = discount
        self.lmbda = lmbda

    def gae_target(self, rewards, v_values, next_v_value, done):
        """Compute advantage estimates and value targets for one rollout.

        Args:
            rewards: array of per-step rewards, first axis is time.
            v_values: critic values for the same steps.
            next_v_value: critic value of the state after the last step;
                ignored when ``done`` is true.
            done: whether the rollout ended the episode.

        Returns:
            ``(gae, n_step_targets)``, both shaped like ``rewards``.
        """
        n_step_targets = np.zeros_like(rewards)
        gae = np.zeros_like(rewards)
        gae_cumulative = 0
        forward_val = 0

        # Only bootstrap from the next state when the episode continues.
        if not done:
            forward_val = next_v_value

        # Walk backwards so each step can reuse the accumulated advantage.
        for k in reversed(range(0, len(rewards))):
            delta = rewards[k] + self.discount * forward_val - v_values[k]
            gae_cumulative = self.discount * self.lmbda * gae_cumulative + delta
            gae[k] = gae_cumulative
            forward_val = v_values[k]
            n_step_targets[k] = gae[k] + v_values[k]
        return gae, n_step_targets

    def list_to_batch(self, list_):
        """Stack a list of arrays along their first axis into one array."""
        # A single concatenate avoids re-copying the growing batch for every
        # element, which repeated np.append calls would do.
        return np.concatenate(list_, axis=0)

    def _print_statistics(self, rewards):
        """Print how many of the final rewards were wins, losses, draws, errors."""
        rewards = np.array(rewards)
        print("Wins:", (rewards == ConnectX.WIN).sum())
        print("Loses:", (rewards == ConnectX.LOSE).sum())
        print("Draws:", (rewards == ConnectX.DRAW).sum())
        print("Errors:", (rewards == ConnectX.ERROR).sum())

    def train(self, episodes=1000, every = 100, switch = False, update_interval = 2, epochs = 3):
        """Train the actor and critic by playing ``episodes`` games.

        Args:
            episodes: number of episodes to play.
            every: print win/loss statistics (and optionally switch sides)
                every this many episodes.
            switch: when true, call ``env.switch_pair()`` every ``every``
                episodes.
            update_interval: number of collected steps that triggers a
                network update (an update also happens at episode end).
            epochs: gradient steps per update on the same mini-rollout.

        Returns:
            List with the summed reward of every episode.
        """
        results = []
        total_rewards = []

        for ep in range(episodes):
            state_batch = []
            action_batch = []
            reward_batch = []
            old_policy_batch = []

            episode_reward, done = 0, False
            state = self.env.reset()

            while not done:
                state = np.reshape(state, [1, *self.state_dim])
                # Calling the model directly is the recommended way to run
                # inference on a single small input inside a loop;
                # Model.predict is meant for large batched inputs.
                probs = self.actor.model(state, training=False).numpy()
                action = np.random.choice(self.action_dim, p=probs[0])

                next_state, reward, done, _ = self.env.step(action)

                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, *self.state_dim])
                reward = np.reshape(reward, [1, 1])

                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)
                old_policy_batch.append(probs)

                if len(state_batch) >= update_interval or done:
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    rewards = self.list_to_batch(reward_batch)
                    old_policys = self.list_to_batch(old_policy_batch)

                    v_values = self.critic.model(states, training=False).numpy()
                    next_v_value = self.critic.model(
                        next_state, training=False).numpy()

                    gaes, td_targets = self.gae_target(
                        rewards, v_values, next_v_value, done)

                    for epoch in range(epochs):
                        actor_loss = self.actor.train(
                            old_policys, states, actions, gaes)
                        critic_loss = self.critic.train(states, td_targets)

                    state_batch = []
                    action_batch = []
                    reward_batch = []
                    old_policy_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]

            if (ep + 1) % 5 == 0:
                print('EP {} EpisodeReward={}'.format(ep + 1, episode_reward))

            total_rewards.append(episode_reward)
            # The last reward of the episode encodes its outcome.
            results.append(reward[0][0])

            if (ep + 1) % every == 0:
                if switch:
                    self.env.switch_pair()
                self._print_statistics(results)
                results = []

        return total_rewards

def plot(x, h = 100):
    """Show the moving average of ``x`` over a window of ``h`` values."""
    window = np.ones(h)
    smoothed = np.convolve(x, window, 'valid') / h

    plt.plot(smoothed)
    plt.xlabel('Episode')
    plt.ylabel('Rewards')
    plt.show()

In [None]:
# Experiment on a reduced game only: a 4x5 board where 3 in a row wins ("Connect 3").
# The opponent is the built-in "random" agent; presumably the None slot is the
# seat driven through env.step (kaggle-environments train API - confirm).
env = ConnectX(pair = ["random", None ], config = {"rows": 4, "columns": 5, "inarow": 3})
agent = Agent(env)

In [None]:
# Train for 7000 episodes, reversing the agent pair every 100 episodes,
# then plot the 100-episode moving average of the episode rewards.
total_rewards = agent.train(episodes = 7000, every = 100, switch = True, update_interval = 2)
plot(total_rewards, 100)

In [None]:
# Plot the last 500 games, smoothed with a 30-episode moving average.
plot(total_rewards[-500:], 30)

In [None]:
# Continue training the same agent, now against the built-in "negamax"
# opponent, with the same schedule as before; total_rewards is overwritten.
env.change_pair([None, "negamax"])
total_rewards = agent.train(episodes= 7000, every = 100, switch = True, update_interval = 2)
plot(total_rewards, 100)

In [None]:
# Plot the last 500 games of the negamax run, smoothed with a 30-episode moving average.
plot(total_rewards[-500:], 30)

# Conclusion  
As you can see, PPO doesn't solve the Connect 3 game, and I have no idea why. I also tried DQN and dueling double DQN with different model architectures, but got the same result.  

Please leave a comment if you can see where I might have made a mistake. 

My other notebooks:  
1) [Usual DQN](https://www.kaggle.com/masurte/deep-q-learning)   
2) [Dueling double DQN](https://www.kaggle.com/masurte/dueling-double-dqn)    
3) [Usual DQN, my own implementation](https://www.kaggle.com/masurte/deep-q-learning-implementation)   