# Double DQN

## Installs & Imports

In [None]:
# !pip install 'kaggle-environments>=0.1.6'

In [None]:
from kaggle_environments import evaluate, make, utils
from kaggle_environments import agent as KAgent
from tqdm.notebook import tqdm
from random import choice

import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt
import numpy as np
import random
import gym
import inspect
import os
import sys

## Create Environment

Create an environment class that provides the following operations:
- reset
- render
- step
- initialize

In [None]:
class connectX(gym.Env):
    """Gym-style wrapper around a kaggle-environments 'connectx' game.

    The wrapper keeps a two-entry agent list (`None` and 'negamax') and a
    trainer built from it.  On every reset the list may be reversed, with
    probability `switch_prob`, and the trainer rebuilt.
    NOTE(review): `None` presumably marks the seat played by the learning
    agent, so reversing the list swaps who moves first - confirm against the
    kaggle-environments documentation.
    """

    def __init__(self, switch_prob=0.5):
        # chance of reversing the agent list at the start of an episode
        self.switch_prob = switch_prob

        # underlying game plus a trainer for the current seating order
        self.env = make('connectx', debug=True)
        self.pair = [None, 'negamax']
        self.trainer = self.env.train(self.pair)

        # one action per column; observation space sized as columns * rows
        cfg = self.env.configuration
        n_columns = cfg.columns
        self.action_space = gym.spaces.Discrete(n_columns)
        self.observation_space = gym.spaces.Discrete(n_columns * cfg.rows)

    def switch_trainer(self):
        """Reverse the agent list and build a new trainer from it."""
        self.pair = list(reversed(self.pair))
        self.trainer = self.env.train(self.pair)

    def step(self, action):
        """Forward `action` to the trainer and return whatever it returns."""
        return self.trainer.step(action)

    def reset(self):
        """Possibly reverse the seating, then reset the trainer.

        Returns the value produced by the trainer's own `reset()`.
        """
        roll = random.uniform(0, 1)
        if roll < self.switch_prob:
            self.switch_trainer()
        return self.trainer.reset()

    def render(self, **kwargs):
        """Delegate rendering, with all keyword arguments, to the wrapped env."""
        return self.env.render(**kwargs)

In [None]:
# create the connectX wrapper; switch_prob keeps its default value of 0.5
env = connectX()

In [None]:
# start an episode and keep what the trainer's reset() returned
state = env.reset()

In [None]:
# show the Python type of the object returned by reset() (notebook cell output)
type(state)

## Debug/Train Agent

We will use the Deep Q-learning method: a neural network estimates the value of each action, and we train its weights so that it chooses the correct action.

**Deep Learning Model**

In [None]:
class model(nn.Module):
    """Fully connected network with sigmoid hidden layers and a linear output.

    Args:
        num_states: size of the input vector.
        hidden_units: sequence with the width of each hidden layer, in order.
        num_actions: size of the output vector (one value per action).
    """

    def __init__(self, num_states, hidden_units, num_actions):
        super().__init__()

        # layer k maps the width of layer k-1 (or the input) to hidden_units[k]
        input_widths = [num_states] + list(hidden_units[:-1])
        self.hidden_layers = nn.ModuleList(
            nn.Linear(n_in, n_out)
            for n_in, n_out in zip(input_widths, hidden_units)
        )

        # final projection from the last hidden layer to the action values
        self.output_layer = nn.Linear(hidden_units[-1], num_actions)

    def forward(self, x):
        """Return the output-layer values for input `x` (no output activation)."""
        activation = x
        for hidden in self.hidden_layers:
            activation = torch.sigmoid(hidden(activation))
        return self.output_layer(activation)

**DQN Class**

In [None]:
class DQN:
    """Q-network together with its optimizer, loss and replay buffer.

    Args:
        num_states: length of the preprocessed state vector fed to the network.
        num_actions: number of actions (network outputs).
        hidden_units: widths of the hidden layers, passed on to `model`.
        gamma: discount factor applied to the next-state value.
        max_experiences: replay buffer capacity; the oldest entry is dropped
            once it is reached.
        min_experiences: `train` does nothing until the buffer holds this many
            transitions.
        batch_size: number of transitions sampled per training step.
        lr: learning rate of the Adam optimizer.
    """

    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        # hyperparameters
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = gamma

        # Q-network, optimizer and loss
        self.model = model(num_states, hidden_units, num_actions)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

        # replay buffer: parallel lists, one entry per stored transition
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

    def predict(self, inputs):
        """Return the network output for a NumPy array of preprocessed states."""
        return self.model(torch.from_numpy(inputs).float())

    def train(self, targetNet):
        """Run one optimisation step on a random batch from the replay buffer.

        Args:
            targetNet: DQN whose `predict` supplies the next-state values.

        Returns:
            0 when the buffer holds fewer than `min_experiences` transitions
            (no update is made), otherwise the batch loss as a float.
        """
        buffer_size = len(self.experience['s'])
        if buffer_size < self.min_experiences:
            return 0

        # sample indices with replacement
        ids = np.random.randint(low=0, high=buffer_size, size=self.batch_size)

        states = np.asarray([self.preprocess(self.experience['s'][i]) for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.preprocess(self.experience['s2'][i]) for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])

        # Bootstrapped target: reward plus discounted best next-state value,
        # taken from the network passed in as `targetNet` (not from a global).
        # No gradient is needed for the target, hence no_grad.
        with torch.no_grad():
            value_next = np.max(targetNet.predict(states_next).numpy(), axis=1)
        actual_values = np.where(dones, rewards, rewards + self.gamma * value_next)
        actual_values = torch.as_tensor(actual_values, dtype=torch.float32)

        # Q-value of the action that was actually taken in each sampled state
        action_index = torch.as_tensor(actions, dtype=torch.long).unsqueeze(1)
        selected_action_values = self.predict(states).gather(1, action_index).squeeze(1)

        # gradient step
        self.optimizer.zero_grad()
        loss = self.criterion(selected_action_values, actual_values)
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def get_action(self, state, epsilon):
        """Pick a column with an epsilon-greedy policy.

        Columns whose first `num_actions` board entries are non-zero are
        treated as unavailable in both the random and the greedy branch.

        Args:
            state: mapping with a 'board' sequence and a 'mark' entry.
            epsilon: probability of choosing a random available column.

        Returns:
            The chosen column index as an int.
        """
        board = state['board']

        # explore: uniform choice among the available columns
        if np.random.random() < epsilon:
            return int(np.random.choice([c for c in range(self.num_actions) if board[c] == 0]))

        # exploit: best predicted value; inference only, so skip autograd
        with torch.no_grad():
            prediction = self.predict(np.atleast_2d(self.preprocess(state)))[0].numpy()

        # push unavailable columns far below any real prediction
        for c in range(self.num_actions):
            if board[c] != 0:
                prediction[c] = -1e7

        return int(np.argmax(prediction))

    def add_experience(self, exp):
        """Append one transition to the replay buffer.

        Args:
            exp: dict with the keys 's', 'a', 'r', 's2' and 'done'.
        """
        # buffer full: drop the oldest transition from every list
        if len(self.experience['s']) >= self.max_experiences:
            for values in self.experience.values():
                values.pop(0)

        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        """Load the weights of `TrainNet` into this object's network.

        `TrainNet` may be another DQN (its `.model` is used) or an object that
        itself provides `state_dict()`.
        """
        source = getattr(TrainNet, 'model', TrainNet)
        self.model.load_state_dict(source.state_dict())

    def save_weights(self, path):
        """Write the network's state dict to `path` with `torch.save`."""
        torch.save(self.model.state_dict(), path)

    def load_weights(self, path):
        """Load a state dict previously stored at `path` into the network."""
        self.model.load_state_dict(torch.load(path))

    def preprocess(self, state):
        """Return a new list: the board entries followed by the player's mark.

        The input mapping is not modified.  'mark' is read by key, like
        'board', so plain dicts work as well.
        NOTE(review): this assumes the environment's observation object is a
        dict subclass exposing 'mark' as a key - confirm for the installed
        kaggle-environments version.
        """
        result = list(state['board'])
        result.append(state['mark'])
        return result

**Play-game function**

In [None]:
def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    """Play one episode, training `TrainNet` after every move.

    Each transition is stored in `TrainNet`'s replay buffer with a shaped
    reward: 0.5 for a non-terminal move and, on the terminal move, 20 when the
    raw reward equals 1, -20 when it equals 0 and 10 otherwise.
    NOTE(review): the 1 = win / 0 = loss / other = draw reading of the raw
    reward should be checked against the installed kaggle-environments version.

    Every `copy_step` moves `TargetNet.copy_weights(TrainNet)` is called.  The
    move counter starts at zero on every call, so a `copy_step` larger than the
    number of moves in an episode never triggers a copy.

    Args:
        env: object with `reset()` and `step(action)`.
        TrainNet: agent being trained.
        TargetNet: agent passed to `TrainNet.train` and periodically synced.
        epsilon: exploration rate handed to `TrainNet.get_action`.
        copy_step: number of moves between target-network syncs.

    Returns:
        The sum of the shaped rewards collected during the episode.
    """
    total_reward = 0
    step_count = 0
    finished = False
    current = env.reset()

    while not finished:
        # epsilon-greedy move, then advance the environment
        chosen = TrainNet.get_action(current, epsilon)
        following, raw_reward, finished, _ = env.step(chosen)

        # reward shaping
        if not finished:
            shaped = 0.5
        elif raw_reward == 1:
            shaped = 20
        elif raw_reward == 0:
            shaped = -20
        else:
            shaped = 10
        total_reward += shaped

        # remember the transition and learn from the buffer
        TrainNet.add_experience(
            {'s': current, 'a': chosen, 'r': shaped, 's2': following, 'done': finished}
        )
        current = following
        TrainNet.train(TargetNet)

        # periodic target-network refresh
        step_count += 1
        if step_count % copy_step == 0:
            TargetNet.copy_weights(TrainNet)

    return total_reward

**Define hyperparameters**

In [None]:
# discount factor applied to the next-state value in DQN.train
gamma = 0.99
# play_game syncs the target network whenever its move counter is a multiple of this
copy_step = 25
hidden_units = [128, 128, 128, 128, 128]   # width of each hidden layer
# five hidden layers in total
# replay buffer capacity, and the minimum fill level before training starts
max_experiences = 10000
min_experiences = 100
# transitions sampled per training step
batch_size = 32
# Adam learning rate
lr = 1e-2
# exploration rate: starting value, per-episode multiplicative decay, lower bound
epsilon = 0.5
decay = 0.9999
min_epsilon = 0.1
# number of training episodes
episodes = 20000

# NOTE(review): not referenced anywhere else in the visible notebook
precision = 7

**Training**

In [None]:
# network input and output sizes
# (+1 because DQN.preprocess appends the player's mark to the board entries)
num_states = env.observation_space.n + 1
num_actions = env.action_space.n

# per-episode logs; np.empty leaves them uninitialised until the training loop fills them
all_total_rewards = np.empty(episodes)
all_avg_rewards = np.empty(episodes) # running mean over the most recent episodes
all_epsilons = np.empty(episodes)

Since we are using the Double DQN algorithm, we will create a training network and a target network.

In [None]:
# initialize the models
# both networks are built with identical hyperparameters; each gets its own
# independently initialised weights, optimizer and (empty) replay buffer
TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)

Train the model for episodes. We will use the trained weights for our submission agent.

Using Double DQN algorithm.

In [None]:
# number of most recent episodes averaged for the reward curve
reward_window = 100

for i in tqdm(range(episodes)):
    # decay the exploration rate once per episode, never below the floor
    epsilon = max(min_epsilon, epsilon * decay)

    # play (and train on) one episode
    total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)

    # log the results
    all_total_rewards[i] = total_reward
    # episodes window_start..i inclusive: exactly `reward_window` entries once
    # enough episodes exist (the previous i - 100 start averaged 101 of them)
    window_start = max(0, i - reward_window + 1)
    avg_reward = all_total_rewards[window_start:(i + 1)].mean()
    all_avg_rewards[i] = avg_reward
    all_epsilons[i] = epsilon

**Plots**

Create some plots regarding the training work.

In [None]:
# running-average episode reward recorded by the training loop
plt.plot(all_avg_rewards)
plt.xlabel('Episode')
plt.ylabel('Avg rewards (100)')
plt.show()

In [None]:
# exploration rate used in each episode
plt.plot(all_epsilons)
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.show()

We will use the trained weights to create an agent, so we have to save them for future use.

In [None]:
# save the weights
# NOTE(review): the leading '/' makes this an absolute path at the filesystem
# root rather than a file in the working directory - confirm this is intended.

TrainNet.save_weights('/weights.pth')

## Create an Agent

Extract each layer's weights and biases from the network and store them in a list.

In [None]:
# flat list that the following cells fill with weight and bias entries, layer by layer
model_layers = []

In [None]:
# Append two entries per hidden layer, in network order: the transposed weight
# matrix as a nested list, then the bias as a list.
for idx, _ in enumerate(hidden_units):
    hidden_layer = TrainNet.model.hidden_layers[idx]
    model_layers += [
        hidden_layer.weight.T.tolist(),
        hidden_layer.bias.tolist(),
    ]

In [None]:
# The output layer goes last, in the same two-entry format:
# transposed weight matrix first, bias second.
final_layer = TrainNet.model.output_layer
model_layers.append(final_layer.weight.T.tolist())
model_layers.append(final_layer.bias.tolist())

In [None]:
# Group the flat [w0, b0, w1, b1, ...] list into [[w0, b0], [w1, b1], ...].
# Plain slicing is used instead of np.reshape: the entries are nested lists of
# different shapes, and recent NumPy versions refuse to build an array from
# such a ragged sequence.
model_layers = [model_layers[k:k + 2] for k in range(0, len(model_layers), 2)]

Now, create an agent that uses the weights and biases extracted above from the trained model.

In [None]:
def my_agent(observation, configuration):
    """Choose a column by running the extracted network with plain NumPy.

    Reads the module-level `model_layers`: a sequence of (weights, bias) pairs
    whose last pair is the output layer.  Hidden layers use a sigmoid
    activation; the output layer is linear.

    Args:
        observation: mapping with a 'board' sequence and a 'mark' entry.
        configuration: object exposing `columns`.

    Returns:
        Index (int) of the highest-valued column whose top cell is empty.
    """
    # hidden layers as float32 arrays
    hidden = [
        (np.array(w, dtype=np.float32), np.array(b, dtype=np.float32))
        for w, b in model_layers[:-1]
    ]

    # output layer
    ol_w = np.array(model_layers[-1][0], dtype=np.float32)
    ol_b = np.array(model_layers[-1][1], dtype=np.float32)

    # network input: copy of the board followed by the player's mark
    board = observation['board']
    state = list(board)
    state.append(observation['mark'])
    res = np.array(state, dtype=np.float32)

    # forward pass
    for w, b in hidden:
        res = np.matmul(res, w) + b
        res = 1 / (1 + np.exp(-res))
    res = np.matmul(res, ol_w) + ol_b

    # Full columns must never win the argmax, so push them far below any real
    # value.  (A tiny positive number such as 1e-7 would beat every negative
    # prediction and select an illegal move.)
    for col in range(configuration.columns):
        if board[col] != 0:
            res[col] = -1e7

    # return best action
    return int(np.argmax(res))

## Test Agent

**Against negamax**

In [None]:
# reset environment
# (may reverse the seating order, see connectX.reset; the returned value is discarded)
env.reset()

# get the opponent
# this is the trainer object the wrapper currently holds
trainer = env.trainer

# get starting configurations
# the trainer is reset a second time here, and this observation is the one used below
observation = trainer.reset()
configuration = env.env.configuration

In [None]:
done = False

# while episode is not finished
while not done:
    # pick a move with the NumPy agent
    my_action = my_agent(observation, env.env.configuration)
    print("My Action", my_action)
    
    # keep playing
    # the trainer returns the next observation, the reward, the done flag and an info value
    observation, reward, done, info = trainer.step(my_action)

# show the board after the episode has ended
env.render(mode="ipython")

**Against random**

In [None]:
# reset the wrapper (this may also reverse its seating order)
env.reset()

# play
# run a full game on the underlying environment: my_agent in the first seat, "random" in the second
env.env.run([my_agent, "random"])
env.render(mode="ipython", width=500, height=450)

## Evaluate Agent

Import our agent in the way it will be used by the tester.

In [None]:
def mean_reward(rewards):
    """Return the mean of the first entry of every item in `rewards`.

    `rewards` is a sized collection of indexable items; only position 0 of
    each item contributes to the average.
    """
    total = 0
    for episode in rewards:
        total += episode[0]
    return total / float(len(rewards))

In [None]:
# run multiple episodes
# run multiple episodes
# 10 episodes against "random"; mean_reward averages the first entry of each result
print("My agent vs random agent: ", mean_reward(evaluate("connectx", [my_agent, "random"], num_episodes=10)))

In [None]:
# 10 episodes against "negamax"; mean_reward averages the first entry of each result
print("My agent vs Negamax agent: ", mean_reward(evaluate("connectx", [my_agent, "negamax"], num_episodes=10)))

## Write Submission File

We have to write it as a string because the dictionary cannot be saved.

In [None]:
# create agent
#
# Build the source code of a stand-alone `my_agent` function as one string.
# The extracted weights and biases are written into the function body as
# literals, so the generated code only needs NumPy.
# NOTE: from here on `my_agent` is rebound from the function defined earlier
# to this source string.
# NOTE: every generated line keeps a 4-space indent because it belongs to the
# body of the generated function.

parts = ['''def my_agent(observation, configuration):
    # import required libraries
    import numpy as np
    
''']

# hidden layers' weights and biases, numbered from 1
for i, (w, b) in enumerate(model_layers[:-1]):
    parts.append('    hl{}_w = np.array({}, dtype=np.float32)\n'.format(i+1, w))
    parts.append('    hl{}_b = np.array({}, dtype=np.float32)\n'.format(i+1, b))

# output layer's weights and biases
parts.append('    ol_w = np.array({}, dtype=np.float32)\n'.format(model_layers[-1][0]))
parts.append('    ol_b = np.array({}, dtype=np.float32)\n'.format(model_layers[-1][1]))

# network input: the board followed by the player's mark
parts.append('''
    state = observation['board'][:]
    state.append(observation.mark)
    
    # create result array
    res = np.array(state, dtype=np.float32)
    
''')

# forward pass through every hidden layer (affine map, then sigmoid)
for layer_no in range(1, len(model_layers)):
    parts.append('    res = np.matmul(res, hl{0}_w) + hl{0}_b \n'.format(layer_no))
    parts.append('    res = 1 / (1 + np.exp(-res)) \n')

# output layer (no activation)
parts.append('    res = np.matmul(res, ol_w) + ol_b\n')

# Full columns get a large negative value so the argmax can never pick them.
# (The previous 1e-7 was a tiny positive number, which wins whenever every
# real prediction is negative.)
parts.append('''
    # mask out full columns so they can never be picked
    for i in range(configuration.columns):
        if observation['board'][i] != 0:
            res[i] = -1e7
    
    # return best action
    return int(np.argmax(res)) 
    ''')

my_agent = ''.join(parts)

In [None]:

# save our agent in a python file
# (relative path: the file is created in the current working directory)
with open('submission.py', 'w') as f:
    f.write(my_agent)

## Validate Submission

Play against itself.

In [None]:
import sys
from kaggle_environments import agent

# remember stdout and restore it afterwards
# (presumably loading the agent may redirect it - confirm)
out = sys.stdout
# NOTE(review): the file was written with a relative path but is read from
# /kaggle/working - this only matches when that is the working directory.
submission = utils.read_file("/kaggle/working/submission.py")
# NOTE(review): `submission` is passed both as the first argument and as
# `path`; confirm `path` is not meant to receive the file name instead.
a = agent.get_last_callable(submission, path=submission)
sys.stdout = out

In [None]:
# remember stdout and restore it afterwards
# (presumably loading the agent may redirect it - confirm)
out = sys.stdout

# read the file
submission = utils.read_file("/kaggle/working/submission.py")

# get the agent
# NOTE(review): `submission` is passed both as the first argument and as
# `path`; confirm `path` is not meant to receive the file name instead.
# This also rebinds the name `agent`, which an earlier cell imported as a module.
agent = KAgent.get_last_callable(submission, path=submission)
sys.stdout = out

In [None]:
# play against itself
# NOTE: `env` is rebound here from the connectX wrapper to a raw kaggle environment
env = make("connectx", debug=True)
env.run([agent, agent])

# the run counts as a success only when both seats report the status "DONE"
print("Success!" if env.state[0].status == env.state[1].status == "DONE" else "Failed...")