# Introduction
Hopefully this code will help you create an agent based on a deep reinforcement learning approach.
I have created two classes: one to interact with Gym more easily, and one that makes it possible to use a model based on neural networks.
You will see that the implemented network is not really deep (among other details). But I think this code can be a good starting point to tackle the problem using neural networks.
You can choose to change the network, add layers, change the rewards and the criteria that determine the outcome of an episode, as well as the agent used to train the network.
Thanks to everyone who has shared their kernels. I have seen almost all of them and they have helped me to have a better idea of the problem and to learn a lot.
Finally, I would like to acknowledge that I have used code from the following links: https://www.kaggle.com/alexisbcook/deep-reinforcement-learning and https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic.
You can use those links to have a better explanation of what the code does. Happy Kaggling!

## **Setup**
Import necessary packages and configure global settings.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install kaggle-environments --upgrade

In [None]:
import collections
import gym
#!pip install 'tensorflow==1.15.0'
import tensorflow as tf
tf.__version__
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

from kaggle_environments import make, evaluate

from gym import spaces

import random

## MABanditGym class (Open AI Gym)
Here we create a class for better interaction with the OpenAI Gym environment.

In [None]:
class MABanditGym:
    """Gym-style wrapper around the Kaggle "mab" (multi-armed bandit) environment.

    The observation returned to the learner is a flat vector of length
    ``nrounds * 2``. Row ``r`` of the underlying ``grid`` stores the two
    entries of ``lastActions`` reported after round ``r + 1``; rounds that
    have not been played yet hold ``-1``.
    """

    def __init__(self, agent2="random"):
        """Create the Kaggle environment and the Gym-style spaces.

        Args:
            agent2: Opponent specification passed to ``ks_env.train``.
        """
        ks_env = make("mab", debug=True)
        self.env = ks_env.train([None, agent2])
        self.nrounds = 2000
        self.banditCount = ks_env.configuration.banditCount
        self.prev_reward = 0

        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(self.banditCount)
        # The observation is the flattened (nrounds, 2) history, so the space
        # must have nrounds * 2 entries, each in [-1, banditCount - 1].
        low = -np.ones((self.nrounds * 2,), dtype=np.float32)
        high = -low * (self.banditCount - 1)
        self.observation_space = spaces.Box(low, high, dtype=np.float32)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        self._clear_history()
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _clear_history(self):
        """Reset the action history to "nothing played yet" (-1 everywhere)."""
        self.grid = -np.ones((self.nrounds, 2))
        self.obs = np.array(self.grid).reshape(self.nrounds * 2)

    def reset(self):
        """Restart the underlying game and return the empty observation."""
        self.env.reset()
        self._clear_history()
        self.prev_reward = 0
        return self.obs

    def change_reward(self, old_reward, done):
        """Map a raw reward to a shaped reward.

        Not called by ``step``; kept for callers that want to experiment
        with reward shaping.

        Returns:
            0 when ``old_reward`` is 1000, -10 when ``done`` is true,
            otherwise ``old_reward`` unchanged.
        """
        if old_reward == 1000:  # The agent won the game
            return 0
        elif done:  # The opponent won the game
            return -10
        else:  # Reward 1/2000
            return old_reward

    def step(self, action):
        """Play one round.

        Args:
            action: Index of the bandit to pull; converted with ``int``.

        Returns:
            ``(obs, reward, done, info)`` where ``obs`` is the flattened
            history, ``reward`` is the increase of the cumulative score
            reported in the observation since the previous call, and
            ``info`` is an empty dict. An out-of-range action does not touch
            the environment and yields ``reward = -10`` and ``done = True``.
        """
        info = {}
        move = int(action)
        if move not in range(self.banditCount):
            # End the game and penalize agent
            return self.obs, -10, True, info

        # kaggle-environments' trainer returns [observation, reward, done, info].
        observation, _env_reward, env_done, _env_info = self.env.step(move)

        round_index = observation['step'] - 1
        for pos in range(2):
            self.grid[round_index][pos] = observation['lastActions'][pos]
        self.obs = np.array(self.grid).reshape(self.nrounds * 2)

        # The observation carries the cumulative score; hand out the delta.
        total_reward = observation['reward']
        reward = total_reward - self.prev_reward
        self.prev_reward = total_reward

        # Finish whenever the environment says so or the last round has been
        # played, regardless of the score. Previously a game ending with a
        # score >= 600 was never reported as done, so the caller kept
        # stepping an already finished environment.
        done = bool(env_done) or observation['step'] >= self.nrounds - 1
        return self.obs, reward, done, info

    def seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``."""
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

# Training environment: our agent plays against the built-in "random" opponent.
env = MABanditGym(agent2="random")

# Seed every random number source with the same value for reproducibility.
seed = 42
for seeder in (env.seed, random.seed, tf.random.set_seed, np.random.seed):
    seeder(seed)

# Smallest float32 increment; added to denominators to avoid division by zero.
eps = float(np.finfo(np.float32).eps)




## Model
The Actor and Critic will be modeled using one neural network that generates the action probabilities and critic value respectively. We use model subclassing to define the model.

In [None]:
class ActorCritic(tf.keras.Model):
  """Network with one shared hidden layer feeding an actor and a critic head."""

  def __init__(self, num_actions: int, num_hidden_units: int):
    """Create the three dense layers.

    Args:
      num_actions: Number of outputs of the actor head (one per action).
      num_hidden_units: Width of the shared ReLU hidden layer.
    """
    super().__init__()
    self.common = layers.Dense(num_hidden_units, activation="relu")
    self.actor = layers.Dense(num_actions)
    self.critic = layers.Dense(1)

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Return ``(actor output, critic output)`` for a batch of inputs."""
    hidden = self.common(inputs)
    actor_out = self.actor(hidden)
    critic_out = self.critic(hidden)
    return actor_out, critic_out

In [None]:
# The actor needs one output per action in the environment's discrete action space.
num_actions = env.action_space.n
num_hidden_units = 128

model = ActorCritic(
    num_actions=num_actions,
    num_hidden_units=num_hidden_units)

## Training
To train the agent, we will follow these steps:
 
1. Run the agent on the environment to collect training data per episode.
2. Compute expected return at each time step.
3. Compute the loss for the combined actor-critic model.
4. Compute gradients and update network parameters.
5. Repeat 1-4 until either success criterion or max episodes has been reached.

### 1. Collecting training data

In [None]:
# Wrap OpenAI Gym's `env.step` call as an operation in a TensorFlow function.
# This would allow it to be included in a callable TensorFlow graph.

def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Advance the global `env` by one action.

  Returns:
    The next state as float32, and the reward and done flag as int32 arrays
    (the dtypes declared by `tf_env_step`).
  """
  next_state, step_reward, is_done, _info = env.step(action)
  state_f32 = next_state.astype(np.float32)
  reward_i32 = np.array(step_reward, np.int32)
  done_i32 = np.array(is_done, np.int32)
  return state_f32, reward_i32, done_i32


def tf_env_step(action: tf.Tensor) -> List[tf.Tensor]:
  """Wrap `env_step` with `tf.numpy_function` so it can be called from TensorFlow code."""
  # Output dtypes, in order: state, reward, done.
  output_dtypes = [tf.float32, tf.int32, tf.int32]
  return tf.numpy_function(env_step, [action], output_dtypes)

In [None]:
def run_episode(
    initial_state: tf.Tensor,  
    model: tf.keras.Model, 
    max_steps: int) -> List[tf.Tensor]:
  """Runs a single episode to collect training data.

  Args:
    initial_state: State the episode starts from.
    model: Network returning `(action_logits, value)` for a batched state.
    max_steps: Upper bound on the number of environment steps.

  Returns:
    Three stacked tensors with one entry per step taken: the probability the
    policy assigned to the sampled action, the critic value, and the reward.
  """

  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)

    # Run the model to get action logits and critic value
    action_logits_t, value = model(state)

    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store the probability of the chosen action (the log is taken later,
    # when the loss is computed)
    action_probs = action_probs.write(t, action_probs_t[0, action])

    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(action)
    # tf.numpy_function loses static shape information; restore it so the
    # loop variable keeps a consistent shape.
    state.set_shape(initial_state_shape)

    # Store reward
    rewards = rewards.write(t, reward)

    if tf.cast(done, tf.bool):
      break

  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()

  return action_probs, values, rewards

### 2. Computing expected returns

In [None]:
def get_expected_return(
    rewards: tf.Tensor, 
    gamma: float, 
    standardize: bool = True) -> tf.Tensor:
  """Compute the discounted return for every timestep of an episode.

  Args:
    rewards: Per-step rewards in chronological order.
    gamma: Discount factor applied to future rewards.
    standardize: When true, shift and scale the returns to zero mean and
      unit standard deviation (with `eps` added to the divisor).

  Returns:
    A float32 tensor with one return per timestep, in chronological order.
  """

  num_steps = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=num_steps)

  # Walk the rewards backwards so each return can reuse the one after it.
  reversed_rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  running_total = tf.constant(0.0)
  running_total_shape = running_total.shape
  for idx in tf.range(num_steps):
    running_total = reversed_rewards[idx] + gamma * running_total
    running_total.set_shape(running_total_shape)
    returns = returns.write(idx, running_total)
  # Flip back to chronological order.
  returns = returns.stack()[::-1]

  if not standardize:
    return returns

  mean = tf.math.reduce_mean(returns)
  std = tf.math.reduce_std(returns)
  return (returns - mean) / (std + eps)

### 3. Computing the actor-critic loss ([Huber loss](https://en.wikipedia.org/wiki/Huber_loss))

In [None]:
# Huber loss summed over all timesteps; used for the critic.
huber_loss = tf.keras.losses.Huber(
    reduction=tf.keras.losses.Reduction.SUM)


def compute_loss(
    action_probs: tf.Tensor,  
    values: tf.Tensor,  
    returns: tf.Tensor) -> tf.Tensor:
  """Return the sum of the actor loss and the critic loss.

  Args:
    action_probs: Probabilities the policy assigned to the actions taken.
    values: Critic estimates for the visited states.
    returns: Expected returns for the same timesteps.
  """
  # How much better (or worse) the outcome was than the critic predicted.
  advantage = returns - values

  log_probs = tf.math.log(action_probs)
  actor_loss = -tf.math.reduce_sum(log_probs * advantage)
  critic_loss = huber_loss(values, returns)

  total_loss = actor_loss + critic_loss
  return total_loss

### 4. Defining the training step to update parameters

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(
    initial_state: tf.Tensor, 
    model: tf.keras.Model, 
    optimizer: tf.keras.optimizers.Optimizer, 
    gamma: float, 
    max_steps_per_episode: int) -> tf.Tensor:
  """Play one episode and apply a single gradient update to `model`.

  Args:
    initial_state: State the episode starts from.
    model: Actor-critic network to train.
    optimizer: Optimizer used to apply the gradients.
    gamma: Discount factor for future rewards.
    max_steps_per_episode: Upper bound on the episode length.

  Returns:
    The sum of the rewards collected during the episode.
  """
  with tf.GradientTape() as tape:
    # Collect the episode under the tape so the loss can be differentiated.
    action_probs, values, rewards = run_episode(
        initial_state, model, max_steps_per_episode)

    returns = get_expected_return(rewards, gamma)

    # Add a trailing axis to each series before computing the loss.
    action_probs = tf.expand_dims(action_probs, 1)
    values = tf.expand_dims(values, 1)
    returns = tf.expand_dims(returns, 1)

    loss = compute_loss(action_probs, values, returns)

  # Back-propagate the loss and update the network parameters.
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  return tf.math.reduce_sum(rewards)

### 5. Run the training loop

In [None]:
%%time

max_episodes = 1
max_steps_per_episode = 2000

# Training stops early once the running reward exceeds this threshold.
reward_threshold = 600
# Exponential moving average of the episode reward (weight 0.01 on the
# newest episode).
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

# Records whether the threshold was actually reached, so the final message
# does not claim success when the loop merely ran out of episodes.
solved = False

with tqdm.trange(max_episodes) as t:
  for i in t:
    # Start a fresh game; reset() returns the initial observation vector.
    new_env = env.reset()

    initial_state = tf.constant(new_env, dtype=tf.float32)
    episode_reward = int(train_step(
        initial_state, model, optimizer, gamma, max_steps_per_episode))
    running_reward = episode_reward*0.01 + running_reward*.99

    t.set_description(f'Episode {i}')
    t.set_postfix(
        episode_reward=episode_reward, running_reward=running_reward)

    # Show average episode reward every 5 episodes
    if i % 5 == 0:
      print(f'Episode {i}: average reward: {running_reward}')

    if running_reward > reward_threshold:
      solved = True
      break

if solved:
  print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')
else:
  print(f'\nStopped after {max_episodes} episode(s) without reaching the '
        f'reward threshold: average reward: {running_reward:.2f}')

## Save model

In [None]:

# Path the trained model is written to (no file extension).
filename = 'bandit_model'
# NOTE(review): the path has no '.h5' suffix, so with TF2 Keras this presumably
# writes a SavedModel directory named 'bandit_model' rather than an HDF5 file
# 'bandit_model.h5' — confirm for the installed TensorFlow version.
model.save(filename)
#del model  # deletes the existing model


### Load model

In [None]:
from tensorflow.keras.models import load_model
# Reload the model from the 'bandit_model' path used when saving.
# NOTE(review): the model is never compiled in this notebook (it is trained
# with a custom gradient step), so the reloaded model is not a compiled one.
model_bandit = load_model('bandit_model')

### Random agent

In [None]:
%%writefile agent_random_.py
import random
def agent_random_(obs, config):
    """Return a bandit index chosen uniformly at random.

    The observation is ignored; only ``config['banditCount']`` is read.
    """
    bandit_count = config['banditCount']
    candidates = list(range(bandit_count))
    return random.choice(candidates)

### Keras Agent (submission)
Agent trained with the Actor-Critic method implemented above in TensorFlow. This is the submission file.

In [None]:
%%writefile submission.py

import random
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import load_model
# returns a compiled model
# identical to the previous one
model_bandit = load_model('bandit_model')


class MABanditPlayer:
    """Keeps the game history and asks the trained network for the next bandit.

    The history is an ``(episodeSteps, 2)`` grid holding the ``lastActions``
    reported after each round (``-1`` for rounds not played yet); its
    flattened form is the network input.
    """

    def __init__(self, observation, configuration):
        """Size the history from the game configuration.

        Args:
            observation: Unused; accepted for signature compatibility.
            configuration: Mapping with ``'episodeSteps'`` and ``'banditCount'``.
        """
        self.nrounds = configuration['episodeSteps']
        self.banditCount = configuration['banditCount']
        self.reset()

    def reset(self):
        """Clear the history and the reward tracker; return the empty observation.

        The player owns no environment, so only its own state is reset
        (the previous ``self.env.reset()`` call raised ``AttributeError``).
        """
        self.grid = -np.ones((self.nrounds, 2))
        self.obs = np.array(self.grid).reshape(self.nrounds * 2)
        self.prev_reward = 0
        return self.obs

    def play(self, observation, configuration):
        """Choose the bandit to pull for the current step.

        Args:
            observation: Mapping with ``'step'``, ``'reward'`` and
                ``'lastActions'`` for the current turn.
            configuration: Mapping with ``'banditCount'``.

        Returns:
            A NumPy integer scalar: a uniformly random bandit on step 0,
            otherwise an action sampled from the network's logits.
        """
        if observation['step'] <= 0:
            # Nothing has been played yet, so there is no history to feed the
            # network: start with a random bandit.
            valid_moves = list(range(configuration['banditCount']))
            return np.int32(random.choice(valid_moves))

        # Record what both players did in the round that just finished.
        round_index = observation['step'] - 1
        for pos in range(2):
            self.grid[round_index][pos] = observation['lastActions'][pos]
        self.prev_reward = observation['reward']
        self.obs = np.array(self.grid).reshape(self.nrounds * 2)

        # Convert state into a batched float32 tensor (batch size = 1); the
        # grid is float64 by default.
        state = tf.expand_dims(self.obs.astype(np.float32), 0)
        # Run the model to get action logits and critic value
        action_logits_t, _value = model_bandit(state)
        # Sample next action from the action probability distribution
        action = tf.random.categorical(action_logits_t, 1)[0, 0]
        # Eager tensors expose their value directly; no Session is required.
        return action.numpy()

    def seed(self, seed=None):
        """Create ``self.np_random`` seeded with ``seed`` and return ``[seed]``."""
        self.np_random = np.random.RandomState(seed)
        return [seed]
# Create the environment


# Set seed for experiment reproducibility
# Seed Python's RNG so the random first move is reproducible.
seed = 2021
random.seed(seed)

# Placeholder first observation and the game configuration used to build the
# module-level player.
observation0 = [
    {
        'remainingOverageTime': 60,
        'step': 0,
        'agentIndex': 0,
        'reward': 0,
        'lastActions': [],
    }
]
configuration0 = {
    'episodeSteps': 2000,
    'actTimeout': 0.25,
    'runTimeout': 1200,
    'banditCount': 100,
    'decayRate': 0.97,
    'sampleResolution': 100,
}

mab_player = MABanditPlayer(observation0, configuration0)

def keras_agent(observation, configuration):
    """Agent entry point: delegate to the module-level player.

    Returns:
        The chosen bandit as a plain Python ``int``.
    """
    chosen = mab_player.play(observation, configuration)
    # play() returns a NumPy scalar; unwrap it into a built-in int.
    return int(chosen.item())

### Testing the agent

In [None]:
# Play one full game: the trained submission against the random agent.
env_test = make("mab", debug=True)
agents = ["submission.py", "agent_random_.py"]
steps = env_test.run(agents)

In [None]:
# Display the last entry of the list returned by env_test.run (the final step of the game).
steps[-1]