In [1]:
import asyncio

import numpy as np
from gymnasium.spaces import Box, Space
from gymnasium.utils.env_checker import check_env
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from tabulate import tabulate
from keras.layers import Dense, Flatten
from keras.models import Sequential
from keras.optimizers.legacy import Adam

from poke_env.environment.abstract_battle import AbstractBattle
from poke_env.environment import Move
from poke_env.player import (
    Player,
    Gen8EnvSinglePlayer,
    MaxBasePowerPlayer,
    ObsType,
    RandomPlayer,
    SimpleHeuristicsPlayer,
    background_cross_evaluate,
    background_evaluate_player,
    wrap_for_old_gym_api,
)
from poke_env.data import GenData


In [2]:
# NOTE(review): machine-specific working directory. The relative paths used in
# later cells ("./python", "./data/...") are resolved against this directory.
%cd ~/cs/15888/poke
!ls

/Users/richardzhan/cs/15888/poke
[1m[36mdata[m[m                [1m[36mpokemon-showdown[m[m    [1m[36mscripts[m[m
poke.code-workspace [1m[36mpython[m[m              [1m[36msrc[m[m


In [3]:
import sys
# Make packages under ./python (relative to the current working directory)
# importable -- presumably where the `rzlib` package imported below lives.
sys.path.append("./python")

In [4]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project-local modules (found through the sys.path entry added above).
from rzlib.env import embed
from rzlib.env.simple_rl_player import SimpleRLPlayer

In [5]:
# Battle format passed to every player created in this notebook.
_battle_format = "gen8ou"
# Team files, relative to the working directory: team 1 is loaded for the
# scripted opponents, team 2 for the RL player (see the factory functions below).
_team1_fname = "./data/team1.txt"
_team2_fname = "./data/team2.txt"

In [6]:
def load_team(fname):
    """Return the full text of the team file at ``fname`` as a single string."""
    with open(fname, "r") as team_file:
        return team_file.read()

def create_random_player():
    """Build a ``RandomPlayer`` for the notebook's battle format, using team 1."""
    team = load_team(_team1_fname)
    return RandomPlayer(
        battle_format=_battle_format,
        team=team,
    )

def create_max_power_player():
    """Build a ``MaxBasePowerPlayer`` for the notebook's battle format, using team 1."""
    team = load_team(_team1_fname)
    return MaxBasePowerPlayer(
        battle_format=_battle_format,
        team=team,
    )

def create_rl_player(opponent, **kwargs):
    """Build a ``SimpleRLPlayer`` environment that plays team 2 against ``opponent``.

    The player is created with ``start_challenging=True``; any extra keyword
    arguments are forwarded unchanged to ``SimpleRLPlayer``.
    """
    rl_team = load_team(_team2_fname)
    player = SimpleRLPlayer(
        battle_format=_battle_format,
        opponent=opponent,
        start_challenging=True,
        team=rl_team,
        **kwargs
    )
    return player

def create_rl_env_random(sanity_check=True):
    """Create a (train_env, eval_env) pair of RL environments facing random players.

    Args:
        sanity_check: When true, first build a throw-away environment and run
            ``check_env`` on it to verify the environment class is consistent
            with the gym API before building the real environments.

    Returns:
        A ``(train_env, eval_env)`` tuple. Each is a separate RL player with its
        own random opponent, wrapped with ``wrap_for_old_gym_api``.
    """
    print("create_rl_env_random()...")
    if sanity_check:
        sanity_check_env = create_rl_player(opponent=create_random_player())
        try:
            check_env(sanity_check_env)
        finally:
            # Always release the throw-away environment, even when the check
            # fails; otherwise it is leaked for the rest of the kernel session.
            sanity_check_env.close()

    # One environment for training and an independent one for evaluation.
    train_env = create_rl_player(opponent=create_random_player())
    train_env = wrap_for_old_gym_api(train_env)

    eval_env = create_rl_player(opponent=create_random_player())
    eval_env = wrap_for_old_gym_api(eval_env)

    return train_env, eval_env


def create_model(train_env, memory_limit=10000, anneal_steps=10000):
    """Build and compile a double-DQN agent sized for ``train_env``.

    Args:
        train_env: Environment whose ``action_space.n`` and
            ``observation_space.shape`` determine the network's output and
            input dimensions.
        memory_limit: Capacity of the replay memory.
        anneal_steps: Number of steps over which epsilon is linearly annealed
            from 1.0 down to 0.01.

    Returns:
        The compiled ``DQNAgent``.
    """
    print("create_model()...")

    # Network dimensions; the leading 1 matches the memory's window_length.
    num_actions = train_env.action_space.n
    obs_shape = (1,) + train_env.observation_space.shape

    # Fully connected Q-network with one linear output per action.
    q_network = Sequential(
        [
            Dense(128, activation="elu", input_shape=obs_shape),
            Flatten(),
            Dense(128, activation="elu"),
            Dense(128, activation="elu"),
            Dense(num_actions, activation="linear"),
        ]
    )

    replay_memory = SequentialMemory(limit=memory_limit, window_length=1)

    # Epsilon-greedy exploration, annealed during training and greedy at test time.
    exploration_policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.01,
        value_test=0.0,
        nb_steps=anneal_steps,
    )

    agent = DQNAgent(
        model=q_network,
        nb_actions=num_actions,
        policy=exploration_policy,
        memory=replay_memory,
        nb_steps_warmup=1000,
        gamma=0.99,
        target_model_update=100,
        delta_clip=0.01,
        enable_double_dqn=True,
    )
    agent.compile(Adam(learning_rate=0.001), metrics=["mae"])
    return agent

def train_model(*, dqn: DQNAgent, train_env, num_steps=10000):
    """Train ``dqn`` on ``train_env`` for ``num_steps`` steps, then close the env.

    Args:
        dqn: Agent to train; its ``fit`` method is called with the environment.
        train_env: Training environment. It is closed before this function
            returns, so the caller must not reuse it afterwards.
        num_steps: Number of training steps passed to ``fit`` as ``nb_steps``.

    Raises:
        Whatever ``dqn.fit`` raises; the environment is still closed first.
    """
    print("train_model()...", num_steps)
    try:
        dqn.fit(train_env, nb_steps=num_steps)
    finally:
        # Release the environment even when training fails, so a crashed run
        # does not leave it open for the rest of the kernel session.
        train_env.close()

def eval_model(*, agent: DQNAgent, eval_env, opponent: Player, num_eval_episodes=100):
    """Evaluate ``agent`` against ``opponent`` and print the win count.

    The environment is first reset with ``restart=True`` and the given
    opponent, then ``agent.test`` is run for ``num_eval_episodes`` episodes.
    The result is reported from the environment's ``n_won_battles`` and
    ``n_finished_battles`` counters. The environment is left open.
    """
    print("eval_model()...")

    eval_env.reset_env(restart=True, opponent=opponent)
    header = f"Results against {opponent.username}:"
    print(header)

    agent.test(
        eval_env,
        nb_episodes=num_eval_episodes,
        verbose=False,
        visualize=False,
    )

    wins = eval_env.n_won_battles
    finished = eval_env.n_finished_battles
    print(f"DQN Evaluation: {wins} victories out of {finished} episodes")


In [15]:

# Build the environments once. The evaluation environment is re-created after
# training (see below), so the one from this pair is closed right away instead
# of being left open, unused, for the whole training run.
train_env, initial_eval_env = create_rl_env_random()
initial_eval_env.close()

# Create the model; only the environment's action/observation spaces are read.
dqn = create_model(
    train_env=train_env,
    anneal_steps=10000,
    memory_limit=100000,
)

# Train the model. train_model() closes train_env when it finishes, so a fresh
# environment must be created before any further training call.
train_model(
    dqn=dqn,
    train_env=train_env,
    num_steps=1000000
)

# Evaluate the model on a freshly created eval_env (an environment created
# before training sometimes breaks if the training takes too long). The
# environment class was already sanity-checked above, and the training
# environment that comes with the new pair is not needed, so it is closed.
unused_train_env, eval_env = create_rl_env_random(sanity_check=False)
unused_train_env.close()
eval_model(
    agent=dqn,
    eval_env=eval_env,
    opponent=create_random_player(),
    num_eval_episodes=100,
)

create_rl_env_random()...
create_model()...


2023-11-30 10:28:13.974186: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_26_1/kernel/Assign' id:5197 op device:{requested: '', assigned: ''} def:{{{node dense_26_1/kernel/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_26_1/kernel, dense_26_1/kernel/Initializer/stateless_random_uniform)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


create_rl_env_random()...
train_model()... 1000000
Training for 1000000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 33:17 - reward: 0.0000e+00

2023-11-30 10:28:16.512349: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_27/BiasAdd' id:5113 op device:{requested: '', assigned: ''} def:{{{node dense_27/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_27/MatMul, dense_27/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-11-30 10:28:16.597761: W tensorflow/c/c_api.cc:305] Operation '{name:'total_24/Assign' id:5332 op device:{requested: '', assigned: ''} def:{{{node total_24/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](total_24, total_24/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after runni

 1000/10000 [==>...........................] - ETA: 3:29 - reward: -4.1000

2023-11-30 10:28:39.992987: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_27_1/BiasAdd' id:5231 op device:{requested: '', assigned: ''} def:{{{node dense_27_1/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_27_1/MatMul, dense_27_1/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-11-30 10:28:40.116312: W tensorflow/c/c_api.cc:305] Operation '{name:'loss_27/AddN' id:5463 op device:{requested: '', assigned: ''} def:{{{node loss_27/AddN}} = AddN[N=2, T=DT_FLOAT, _has_manual_control_dependencies=true](loss_27/mul, loss_27/mul_1)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-11-

882 episodes - episode_reward: -27.211 [-100.000, 100.000] - loss: 0.111 - mae: 4.632 - mean_q: 5.801 - mean_eps: 0.456

Interval 2 (10000 steps performed)
1012 episodes - episode_reward: -6.719 [-100.000, 100.000] - loss: 0.110 - mae: 5.481 - mean_q: 5.680 - mean_eps: 0.010

Interval 3 (20000 steps performed)
1308 episodes - episode_reward: 15.138 [-100.000, 100.000] - loss: 0.139 - mae: 21.644 - mean_q: 22.797 - mean_eps: 0.010

Interval 4 (30000 steps performed)
1310 episodes - episode_reward: 19.237 [-100.000, 100.000] - loss: 0.152 - mae: 24.242 - mean_q: 24.682 - mean_eps: 0.010

Interval 5 (40000 steps performed)
1313 episodes - episode_reward: 35.872 [-100.000, 100.000] - loss: 0.160 - mae: 29.797 - mean_q: 30.816 - mean_eps: 0.010

Interval 6 (50000 steps performed)
1338 episodes - episode_reward: 39.163 [-100.000, 100.000] - loss: 0.167 - mae: 35.393 - mean_q: 34.798 - mean_eps: 0.010

Interval 7 (60000 steps performed)
1345 episodes - episode_reward: 41.115 [-100.000, 100.00

In [17]:
# Re-evaluate the trained agent against a fresh random opponent. This uses the
# shared eval_model() helper instead of repeating its body inline, so both
# evaluation paths stay in sync.
opponent = create_random_player()

# Both environments are closed by the cells that follow.
train_env, eval_env = create_rl_env_random()
eval_model(
    agent=dqn,
    eval_env=eval_env,
    opponent=opponent,
    num_eval_episodes=100,
)

create_rl_env_random()...
Results against RandomPlayer 70:
DQN Evaluation: 78 victories out of 100 episodes


In [None]:
# Release the training environment created in the cell above.
train_env.close()

In [None]:
# Release the evaluation environment used for the last evaluation run.
eval_env.close()

KeyboardInterrupt: 