In [None]:
!pip install pettingzoo[classic]==1.22.3

In [None]:
# !unzip '/content/to_upload.zip'

In [None]:
import os
import ray
import gymnasium as gym
from gym.spaces import Box, Discrete
from ray import tune
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.algorithms.dqn.dqn_torch_model import DQNTorchModel
from ray.rllib.env import PettingZooEnv
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MAX
from ray.tune.registry import register_env
from ray import air
from pettingzoo.classic import hanabi_v4

torch, nn = try_import_torch()

# Adapted from [1] and [2]

if __name__ == "__main__":
    # Always start from a fresh local Ray runtime (4 CPUs, worker logs muted).
    ray.shutdown()
    ray.init(log_to_driver=False, num_cpus=4)

    alg_name = "DQN"
    env_name = "hanabi_v4"

    def env_creator():
        """Return a new reduced-size, two-player Hanabi AEC environment."""
        return hanabi_v4.env(
            colors=2,
            ranks=5,
            players=2,
            hand_size=2,
            max_information_tokens=3,
            max_life_tokens=1,
        )

    # Make the wrapped environment available to RLlib under `env_name`.
    register_env(env_name, lambda config: PettingZooEnv(env_creator()))

    # Throw-away instance that is only used to read the observation/action spaces.
    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    act_space = test_env.action_space

    # Torso network: number/size of hidden layers and their activation function.
    network_spec = {"fcnet_hiddens": [512, 512], 'fcnet_activation': 'relu'}

    # Prioritized replay buffer settings.
    replay_spec = {"capacity": 1000000, "prioritized_replay_alpha": 0.5}

    # Exploration class and the arguments for its constructor.
    exploration_spec = {
        "type": "EpsilonGreedy",
        "initial_epsilon": 0.000025,
        "final_epsilon": 0.0,
        # Timesteps over which epsilon is annealed.
        "epsilon_timesteps": 1000,
    }

    # One independent policy per player; each agent id maps to the policy of the same name.
    player_policies = {
        player_id: (None, obs_space, act_space, {})
        for player_id in ("player_0", "player_1")
    }

    config = (
        DQNConfig()
        .environment(env=env_name)
        .rollouts(num_rollout_workers=3, rollout_fragment_length=4)
        .training(
            noisy=True,
            num_atoms=51,  # atoms of the distributional DQN
            train_batch_size=32,  # batch size
            hiddens=[],  # dense layers of the advantage/value branches
            dueling=False,  # dueling DQN disabled
            gamma=0.99,  # discount factor
            double_q=True,
            model=network_spec,
            num_steps_sampled_before_learning_starts=500,
            target_network_update_freq=500,  # hard target-network update every 500 steps
            replay_buffer_config=replay_spec,
            adam_epsilon=0.00003125,
            n_step=1,  # n of the n-step update
        )
        .multi_agent(
            policies=player_policies,
            policy_mapping_fn=(lambda agent_id, *args, **kwargs: agent_id),
        )
        .resources(num_gpus=0)
        .debugging(log_level="DEBUG")
        .framework(framework="torch")  # pytorch or tf
        .exploration(exploration_config=exploration_spec)
    )
#     .build())


In [None]:
# tune.run(
#         alg_name,
#         name="DQN",
#         restore = '/kaggle/input/checkpoint-20mil/kaggle_upload/checkpoint_019880',
#         storage_path = '/kaggle/working/',
#         stop={"timesteps_total": 22000000},
#         checkpoint_freq=20,
#         config=config.to_dict(),
#     )

In [None]:
# !zip -r distributional_DQN_5.zip /root/ray_results

In [None]:
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from pettingzoo.classic import hanabi_v4


# Restart Ray with its default resources for the evaluation phase.
ray.shutdown()
ray.init()

alg_name = "DQN"
env_name = "hanabi_v4"


def env_creator():
    """Return a new reduced-size, two-player Hanabi AEC environment."""
    hanabi_settings = dict(
        colors=2,
        ranks=5,
        players=2,
        hand_size=2,
        max_information_tokens=3,
        max_life_tokens=1,
    )
    return hanabi_v4.env(**hanabi_settings)


# Raw (unwrapped by RLlib) environment that the evaluation loop steps through directly.
env = env_creator()

# The restored algorithm looks the environment up by this registered name.
register_env(env_name, lambda config: PettingZooEnv(env_creator()))

# Restore the trained DQN agent from its checkpoint directory.
DQNAgent = Algorithm.from_checkpoint(
    '/kaggle/input/checkpoint-20mil/kaggle_upload/checkpoint_019880'
)


In [None]:
import torch.nn.functional as F
import torch
import torch.nn as nn

# the norm aware algorithm

class MultiOutputNormNet(torch.nn.Module):
    """Shared MLP trunk followed by four independent linear output heads.

    The trunk is ``in_features -> 256 -> 512 -> 256`` with ReLU activations and
    dropout (p=0.20) applied to the final hidden representation. The first head
    has ``num_classes1`` outputs; the remaining three heads each have
    ``num_classes2`` outputs. The evaluation loop in this notebook unpacks the
    four outputs as ``action_out, behav_norm, action_norm, state_norm``.

    Attribute names are part of the checkpoint format (``state_dict`` keys) and
    must not be renamed.

    Args:
        num_classes1: Number of outputs of the first head.
        num_classes2: Number of outputs of each of the other three heads.
        in_features: Length of the input vector. Defaults to 171, the value
            that was previously hard-coded.
    """

    def __init__(self, num_classes1, num_classes2, in_features=171):
        super().__init__()

        # Shared trunk.
        self.fc1 = torch.nn.Linear(in_features, 256)
        self.act = nn.ReLU()
        self.fc2 = torch.nn.Linear(256, 512)
        self.act2 = nn.ReLU()
        self.fc21 = torch.nn.Linear(512, 256)
        self.act3 = nn.ReLU()
        self.dropout = nn.Dropout(0.20)

        # Output heads, all fed by the same 256-dimensional trunk output.
        self.fc3 = torch.nn.Linear(256, num_classes1)
        self.fc4 = torch.nn.Linear(256, num_classes2)
        self.fc5 = torch.nn.Linear(256, num_classes2)
        self.fc6 = torch.nn.Linear(256, num_classes2)

    def forward(self, x):
        """Run the trunk once and return the raw outputs of the four heads.

        Args:
            x: Tensor whose last dimension has ``in_features`` entries.

        Returns:
            Tuple ``(fc3_out, fc4_out, fc5_out, fc6_out)`` of un-normalized
            head outputs (no softmax is applied).
        """
        hidden = self.act(self.fc1(x))
        hidden = self.act2(self.fc2(hidden))
        hidden = self.act3(self.fc21(hidden))
        # Dropout is only active in training mode; it is a no-op after .eval().
        hidden = self.dropout(hidden)
        return self.fc3(hidden), self.fc4(hidden), self.fc5(hidden), self.fc6(hidden)

In [None]:
# Norm-aware network: 11 outputs on the first head, 3 on each of the other three heads.
model = MultiOutputNormNet(11, 3)
# Load the trained weights. map_location="cpu" lets a checkpoint that was saved
# from a GPU be restored on a CPU-only runtime; the evaluation loop feeds the
# model CPU tensors created with torch.from_numpy.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        '/kaggle/input/data-checpoint/data_checkpoints/norm_learning_model.pth',
        map_location="cpu",
    )
)
# Switch to inference mode (disables dropout).
model.eval()

In [None]:
import numpy as np
from ray.rllib.models.preprocessors import get_preprocessor


# Probability, at each decision step, that the norm network is allowed to
# override the DQN policy. 1.0 reproduces the original behaviour (always allowed).
NORM_FOLLOW_PROB = 1.0
episodes = 1000

# Cumulative reward per agent, summed over all episodes.
reward_sums = {a: 0 for a in env.possible_agents}

env.reset()

# Build the observation preprocessors and fetch the trained policies once,
# instead of re-creating them on every environment step.
preprocessors = {}
policies = {}
for agent_id in env.possible_agents:
    agent_obs_space = env.observation_space(agent_id)
    preprocessors[agent_id] = get_preprocessor(agent_obs_space)(agent_obs_space)
    # Agent ids and policy ids are identical (the policy mapping is the identity).
    policies[agent_id] = DQNAgent.get_policy(agent_id)

# iterate over episodes
for episode in range(episodes):
    env.reset()
    first_agent_done = False

    print('Episode',episode)
    # iterate over agents
    for agent in env.agent_iter():
        # get environment observation
        observation, reward, termination, truncation, info = env.last()
        reward_sums[agent] += reward  # increment reward

        if termination or truncation:
            # a finished agent must step with None
            action = None
        else:
            obs = observation["observation"]  # the observation vector

            # 1 -> the norm network may override the DQN policy on this step
            lambda_val = np.random.choice(
                [0, 1], p=[1.0 - NORM_FOLLOW_PROB, NORM_FOLLOW_PROB]
            )

            # Single forward pass of the norm network; no gradients are needed.
            with torch.no_grad():
                action_out, behav_norm, action_norm, state_norm = model(
                    torch.from_numpy(obs)
                )
            # predicted class of each norm head
            behav_norm = int(torch.argmax(behav_norm))
            action_norm = int(torch.argmax(action_norm))
            state_norm = int(torch.argmax(state_norm))

            if lambda_val == 1 and (action_norm == 1 or behav_norm == 1 or state_norm == 1):
                # Some head predicts class 1: take the norm network's action.
                # NOTE(review): this argmax ignores observation["action_mask"], so it
                # may select an action that is illegal in the current state - confirm
                # whether masking is wanted here.
                action = int(torch.argmax(action_out))
            else:
                # Otherwise take the DQN policy's action. The dict observation is
                # flattened into the layout the policy network expects.
                flat_obs = preprocessors[agent].transform(observation)
                # explore=False selects the action deterministically, independent of
                # the exploration schedule stored with the policy.
                out = policies[agent].compute_actions_from_input_dict(
                    {"obs": flat_obs.reshape(1, -1)}, explore=False
                )
                # out[0] is the batch of actions; the batch has a single entry
                action = int(out[0][0])

        env.step(action)

        if termination or truncation:
            if first_agent_done:
                # Second terminal agent of this episode: report the running
                # cumulative rewards (totals over all episodes so far).
                print("reward_final :", reward_sums)
            first_agent_done = True

        env.render()


# Rewards of all agents summed, averaged over the number of episodes.
print("episode reward mean :",        sum(reward_sums.values() ) /episodes) # final mean episode reward

# References

[1] “hanabi-learning-environment/hanabi_learning_environment/agents/rainbow/rainbow_agent.py at master · deepmind/hanabi-learning-environment,” GitHub. https://github.com/deepmind/hanabi-learning-environment/blob/master/hanabi_learning_environment/agents/rainbow/rainbow_agent.py (accessed Sep. 07, 2023).

[2] “PettingZoo/tutorials/Ray/rllib_leduc_holdem.py at master · Farama-Foundation/PettingZoo,” GitHub. https://github.com/Farama-Foundation/PettingZoo/blob/master/tutorials/Ray/rllib_leduc_holdem.py (accessed Sep. 07, 2023).