# Deep Q-Network

In [None]:
!pip install kaggle-environments

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD
from kaggle_environments import evaluate, make, utils, agent

# Discount factor applied to the best future Q-value when building targets.
DISCOUNT = 0.99

# Shaped rewards that ConnectX.change_reward substitutes for the raw
# end-of-game results reported by the environment.
DRAW = 0.3
WIN = 1
LOSE = -1
# Used when the game ends with no reward at all (reward is None).
ERROR = -5

# Exploration rate for epsilon-greedy action selection.
EPSILON = 1  # not a constant: ConnectX.play_game decays it after every game
# Multiplicative decay applied to EPSILON once per game.
EPSILON_DECAY = 0.9993
# Lower bound below which EPSILON is never decayed.
MIN_EPSILON = 0.001


class ConnectX:
    """Wrapper around the kaggle ``connectx`` environment that plays
    epsilon-greedy episodes and records them for training.

    Args:
        pair: Two-element list handed to ``env.train``. Defaults to
            ``[None, "random"]``.
        config: Environment configuration with at least the keys ``rows``
            and ``columns``. Defaults to the standard 6x7 connect-four board.
    """

    def __init__(self, pair=None, config=None):
        # Build the defaults per instance; mutable default arguments would be
        # shared between every ConnectX object.
        if pair is None:
            pair = [None, "random"]
        if config is None:
            config = {"rows": 6, "columns": 7, "inarow": 4}
        self.env = make("connectx", config, debug=True)
        self.config = config
        self.set_pair(pair)

    def set_pair(self, pair):
        """Store the agent pair used for the next call to ``env.train``."""
        self.pair = pair

    def switch_pair(self):
        """Swap the two seats so the players exchange move order."""
        self.pair = self.pair[::-1]

    def change_reward(self, reward, done):
        """Map the raw environment reward to the shaped training reward.

        Args:
            reward: Reward returned by ``trainer.step``.
            done: Whether the step ended the game.

        Returns:
            ``ERROR``/``WIN``/``LOSE``/``DRAW`` for a finished game, otherwise
            a small per-move penalty of ``-1 / (rows * columns)``.
        """
        if not done:
            # Small negative reward for every non-terminal move.
            return -1 / (self.config['rows'] * self.config['columns'])

        if reward is None:  # Error - e.g. the chosen column was already full
            return ERROR
        if reward == 1:
            return WIN
        if reward == -1:
            return LOSE
        if reward == 0:
            return DRAW
        return reward

    def _choose_action(self, current_qs, epsilon):
        """Pick a column epsilon-greedily and return it as a plain ``int``."""
        if np.random.random() > epsilon:
            return int(np.argmax(current_qs))
        # randint's upper bound is exclusive, so pass the column count itself;
        # ``columns - 1`` would never explore the right-most column.
        return int(np.random.randint(self.config["columns"]))

    def play_game(self, model, render=False):
        """Play one full game and decay the global ``EPSILON`` afterwards.

        Args:
            model: Object with a ``predict`` method returning one row of
                Q-values per input board.
            render: If true, render the finished game in the notebook.

        Returns:
            Tuple ``(reward, steps)`` where ``reward`` is the shaped reward of
            the final move and ``steps`` is a list of
            ``(state, q_values, action, reward, done)`` tuples.
        """
        global EPSILON
        trainer = self.env.train(self.pair)
        observation = trainer.reset()
        steps = []

        while not self.env.done:
            current_state = get_board(observation, self.config)
            current_qs = model.predict(current_state[None, :])[0]
            action = self._choose_action(current_qs, EPSILON)
            observation, reward, done, info = trainer.step(action)
            reward = self.change_reward(reward, done)

            steps.append((current_state, current_qs, action, reward, done))

        if render:
            self.env.render(mode="ipython", width=500, height=500)

        # Decay epsilon, never dropping below the configured minimum.
        if EPSILON > MIN_EPSILON:
            EPSILON = max(MIN_EPSILON, EPSILON * EPSILON_DECAY)

        return (reward, steps)

In [None]:
class DQN:
    """Small convolutional Q-network plus the routine that fits it to
    recorded game steps.

    Args:
        input_shape: Shape of one encoded board, ``(rows, columns, 1)``.
            Defaults to the 4x5 Connect 3 board.
        num_actions: Number of Q-values to output (one per column).
            Defaults to 5.
    """

    def __init__(self, input_shape=(4, 5, 1), num_actions=5):
        initializer = tf.initializers.VarianceScaling(scale=2.0)
        layers = [

# Model for Connect 2

#             Conv2D(64, 3, activation='relu', padding='same', kernel_initializer=initializer,
#                    kernel_regularizer='l2', input_shape=(3,3,1)),
#             BatchNormalization(),
#             Flatten(),
#             Dense(100, activation='relu', kernel_regularizer='l2', kernel_initializer=initializer),
#             BatchNormalization(),
#             Dropout(0.1),
#             Dense(3)


# Model for Connect 3 (active)

            Conv2D(64, 3, activation='relu', kernel_initializer=initializer,
                   kernel_regularizer='l2', input_shape=input_shape),
            BatchNormalization(),
            Flatten(),
            Dense(100, activation='relu', kernel_regularizer='l2', kernel_initializer=initializer),
            BatchNormalization(),
            Dropout(0.1),
            Dense(20, activation='relu', kernel_regularizer='l2', kernel_initializer=initializer),
            BatchNormalization(),
            Dropout(0.1),
            Dense(num_actions)


# Model for Connect 4

#             Conv2D(64, 3, activation='relu', kernel_initializer=initializer, input_shape=(6,7,1)),
#             Conv2D(64, 3, activation='relu', kernel_initializer=initializer),
#             Flatten(),
#             Dense(50, activation='relu', kernel_initializer=initializer),
#             Dropout(0.1),
#             Dense(7)
        ]
        model = tf.keras.Sequential(layers)
        # Gradient clipping is needed to prevent gradient explosion. Only
        # clipnorm is set: current Keras rejects combining it with clipvalue.
        # `learning_rate` replaces the deprecated `lr` argument, and no
        # accuracy metric is tracked because this is a regression on Q-values.
        model.compile(loss="mse", optimizer=Adam(learning_rate=0.001, clipnorm=1.))

        self.model = model

    def train(self, steps, epochs=10):
        """Build Q-learning targets from recorded steps and fit the network.

        Args:
            steps: List of ``(state, q_values, action, reward, done)`` tuples
                in playing order; several games may be concatenated. The
                ``q_values`` arrays are overwritten in place with the targets.
            epochs: Number of epochs passed to ``model.fit``.
        """
        if len(steps) == 0:
            # Nothing to fit on.
            return

        X = []
        y = []

        last_index = len(steps) - 1
        # Walk backwards so that the next step's (already updated) targets
        # are available when computing the current step's target.
        for index in range(last_index, -1, -1):
            current_state, current_qs, action, reward, done = steps[index]

            # If not a terminal state, bootstrap from the following step;
            # otherwise (or if there is no following step) use the reward.
            if not done and index < last_index:
                max_future_q = np.max(steps[index + 1][1])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Select the *positions* of the actions that were not taken.
            # Comparing the Q-values themselves with the action index would
            # match (almost) every entry, including the target written below.
            other_actions = np.arange(len(current_qs)) != action

            # Update Q value for the taken action, shift the others by reward
            current_qs[action] = new_q
            current_qs[other_actions] -= reward

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

            # Add the horizontally mirrored state with mirrored targets
            X.append(current_state[:, ::-1, :])
            y.append(current_qs[::-1])

        self.model.fit(np.array(X), np.array(y), batch_size=64, epochs=epochs)

    def predict(self, X):
        """Return the network's Q-values for a batch of encoded boards."""
        return self.model.predict(X)
    

class Trainer:
    """Alternates between playing batches of games and fitting the model.

    Args:
        game: Object providing ``play_game(model)`` and ``switch_pair()``.
        model: Object providing ``train(steps, epochs)``.
    """

    def __init__(self, game, model):
        self.game = game
        self.model = model

    def _print_statistics(self, rewards):
        """Print how many games in the batch ended in each outcome."""
        rewards = np.array(rewards)
        print("Wins:", (rewards == WIN).sum())
        print("Loses:", (rewards == LOSE).sum())
        # Draws are reported too so the counts add up to the batch size.
        print("Draws:", (rewards == DRAW).sum())
        print("Errors:", (rewards == ERROR).sum())

    def train(self, num_of_games=1000, every=100, epochs=50, switch=True,
              min_leftover_steps=200):
        """Play ``num_of_games`` games, fitting the model every ``every`` games.

        Args:
            num_of_games: Total number of games to play.
            every: Number of games collected before each fit.
            epochs: Epochs passed to ``model.train`` for each fit.
            switch: If true, swap the players' seats after every batch.
            min_leftover_steps: Steps remaining after the last full batch are
                only trained on when there are more than this many of them.
        """
        games = []
        rewards = []

        for i in range(1, num_of_games + 1):
            reward, steps = self.game.play_game(self.model)
            # Extend in place instead of rebuilding the list for every game.
            games.extend(steps)
            rewards.append(reward)

            if i % every == 0:
                if switch:
                    self.game.switch_pair()
                self._print_statistics(rewards)
                self.model.train(games, epochs)
                games = []
                rewards = []

        if len(games) > min_leftover_steps:
            self._print_statistics(rewards)
            self.model.train(games, epochs)
            
            
def get_board(observation, configuration):
    """Encode the flat board as a ``(rows, columns, 1)`` array seen from the
    current player's side: 0 for empty, 0.5 for own pieces, 1 for the
    opponent's pieces.
    """
    shape = (configuration['rows'], configuration['columns'], 1)
    grid = np.array(observation['board']).reshape(shape)

    own_mark = observation["mark"]
    # Integer codes first (0 empty, 1 own, 2 opponent), scaled afterwards.
    codes = np.where(grid == own_mark, 1, np.where(grid == 0, 0, 2))
    return codes / 2


def agent(observation, configuration):
    """Greedy ConnectX agent: return the column with the highest Q-value
    predicted by the module-level ``model`` for the current board.
    """
    board = get_board(observation, configuration)
    # Add a leading batch axis for predict, then take the single result row.
    q_values = model.predict(board[None, :])[0]
    best_column = np.argmax(q_values)
    return int(best_column)

In [None]:
# Build the network, a 4x5 connect-three environment, and the training loop.
model = DQN()
game = ConnectX(config={"rows": 4, "columns": 5, "inarow": 3})
trainer = Trainer(game, model)

In [None]:
# Smoke test: play one rendered game against `agent`, which itself queries
# the global `model` greedily.
game.set_pair([None, agent])
t = game.play_game(model, render=True)

In [None]:
# Restart exploration from fully random play, then train against the
# built-in 'random' opponent, fitting and swapping seats every 100 games.
EPSILON = 1.0
game.set_pair(['random', None])
trainer.train(num_of_games=10000, every=100, epochs=15, switch=True)

In [None]:
# Continue training against the built-in 'negamax' opponent. EPSILON is not
# reset here, so exploration carries on from its already decayed value.
game.set_pair([None, 'negamax'])
trainer.train(num_of_games=10000, every=100, epochs=15, switch=True)

# Conclusion  
As you can see, this DQN doesn't solve the Connect 3 game, and I have no idea why. I also tried dueling double DQN and the PPO algorithm with different model architectures, but got the same result.  

Please leave a comment if you can spot where I might have made a mistake. 

My other notebooks:  
1) [Usual DQN](https://www.kaggle.com/masurte/deep-q-learning)   
2) [Dueling double DQN](https://www.kaggle.com/masurte/dueling-double-dqn)    
3) [PPO](https://www.kaggle.com/masurte/ppo-algorithm)  