In [1]:
import asyncio

import numpy as np
from gym.spaces import Box, Space
from gym.utils.env_checker import check_env
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from tabulate import tabulate
from keras.layers import Dense, Flatten
from keras.models import Sequential
from keras.optimizers.legacy import Adam

from poke_env.environment.abstract_battle import AbstractBattle
from poke_env.environment import Move
from poke_env.player import (
    Player,
    Gen8EnvSinglePlayer,
    MaxBasePowerPlayer,
    ObsType,
    RandomPlayer,
    SimpleHeuristicsPlayer,
    background_cross_evaluate,
    background_evaluate_player,
    wrap_for_old_gym_api,
)
from poke_env.data import GenData


In [2]:
# Switch to the project root: later cells use paths relative to the current
# working directory ("./python", "./data/team1.txt", "./data/team2.txt").
# NOTE(review): this location is machine-specific — adjust it to your checkout.
%cd ~/cs/15888/poke
!ls

/Users/richardzhan/cs/15888/poke
[1m[36mdata[m[m                [1m[36mpokemon-showdown[m[m    [1m[36mscripts[m[m
poke.code-workspace [1m[36mpython[m[m              [1m[36msrc[m[m


In [3]:
import sys

# Make modules under ./python importable (presumably where the `rzlib` package
# imported in the next cell lives — confirm against the repository layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
if "./python" not in sys.path:
    sys.path.append("./python")

In [4]:
%load_ext autoreload
%autoreload 2
from rzlib.env import embed
from rzlib.env.simple_rl_player import SimpleRLPlayer

In [5]:
# Battle format string passed as `battle_format` to every player created below.
_battle_format = "gen8ou"
# Team file loaded for the scripted opponents (RandomPlayer / MaxBasePowerPlayer).
_team1_fname = "./data/team1.txt"
# Team file loaded for the RL agent (SimpleRLPlayer).
_team2_fname = "./data/team2.txt"

In [6]:
def load_team(fname):
    """Return the full text of the team file at ``fname`` as a single string."""
    with open(fname, "r") as team_file:
        team_text = team_file.read()
    return team_text

def create_random_player():
    """Build a ``RandomPlayer`` for the configured battle format using team 1."""
    opponent_team = load_team(_team1_fname)
    return RandomPlayer(battle_format=_battle_format, team=opponent_team)

def create_max_power_player():
    """Build a ``MaxBasePowerPlayer`` for the configured battle format using team 1."""
    opponent_team = load_team(_team1_fname)
    return MaxBasePowerPlayer(battle_format=_battle_format, team=opponent_team)

def create_rl_player(opponent, **kwargs):
    """Build a ``SimpleRLPlayer`` that plays team 2 against ``opponent``.

    Args:
        opponent: Player passed through as the ``opponent`` argument.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``SimpleRLPlayer``.

    Returns:
        The new ``SimpleRLPlayer``, constructed with ``start_challenging=True``.
    """
    agent_team = load_team(_team2_fname)
    rl_player = SimpleRLPlayer(
        battle_format=_battle_format,
        opponent=opponent,
        start_challenging=True,
        team=agent_team,
        **kwargs,
    )
    return rl_player

def create_rl_env_random(sanity_check=True):
    """Create a training and an evaluation environment facing random opponents.

    Args:
        sanity_check: When true, first build a throwaway environment, run
            ``check_env`` on it and close it before creating the real ones.

    Returns:
        ``(train_env, eval_env)``; each is an RL player created by
        ``create_rl_player`` and passed through ``wrap_for_old_gym_api``.
    """
    print("create_rl_env_random()...")
    if sanity_check:
        # First test the environment to ensure the class is consistent
        # with the OpenAI API
        sanity_check_env = create_rl_player(opponent=create_random_player())
        try:
            check_env(sanity_check_env)
        finally:
            # Close the throwaway environment even when the check raises, so a
            # failed check does not leave it open.
            sanity_check_env.close()

    # Create one environment for training and one for evaluation
    train_env = wrap_for_old_gym_api(
        create_rl_player(opponent=create_random_player())
    )
    eval_env = wrap_for_old_gym_api(
        create_rl_player(opponent=create_random_player())
    )

    return train_env, eval_env


def create_model(train_env, memory_limit=10000, anneal_steps=10000):
    """Build and compile a double-DQN agent sized to ``train_env``.

    Args:
        train_env: Environment whose ``action_space.n`` and
            ``observation_space.shape`` determine the network's output and
            input sizes.
        memory_limit: ``limit`` of the ``SequentialMemory`` replay buffer.
        anneal_steps: ``nb_steps`` over which the policy's ``eps`` attribute is
            linearly annealed between ``value_max=1.0`` and ``value_min=0.01``.

    Returns:
        The ``DQNAgent`` after ``compile`` has been called on it.
    """
    print("create_model()...")

    # Sizes derived from the environment's spaces.  The leading 1 in the input
    # shape presumably corresponds to window_length=1 of the replay memory.
    num_actions = train_env.action_space.n
    obs_shape = (1,) + train_env.observation_space.shape

    # Q-network: three 128-unit ELU layers followed by a linear output per action
    q_network = Sequential(
        [
            Dense(128, activation="elu", input_shape=obs_shape),
            Flatten(),
            Dense(128, activation="elu"),
            Dense(128, activation="elu"),
            Dense(num_actions, activation="linear"),
        ]
    )

    # Epsilon-greedy exploration with a linearly annealed epsilon
    exploration_policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.01,
        value_test=0.0,
        nb_steps=anneal_steps,
    )

    # Replay buffer
    replay_memory = SequentialMemory(limit=memory_limit, window_length=1)

    # Assemble and compile the agent
    agent = DQNAgent(
        model=q_network,
        nb_actions=num_actions,
        policy=exploration_policy,
        memory=replay_memory,
        nb_steps_warmup=1000,
        gamma=0.95,
        target_model_update=100,
        delta_clip=0.01,
        enable_double_dqn=True,
    )
    agent.compile(Adam(learning_rate=0.001), metrics=["mae"])
    return agent

def train_model(*, dqn: DQNAgent, train_env, num_steps=10000):
    """Train ``dqn`` on ``train_env`` for ``num_steps`` steps, then close the env.

    Args:
        dqn: Agent whose ``fit`` method is called.
        train_env: Environment to train on; it is closed before this function
            returns or propagates an exception.
        num_steps: Passed to ``fit`` as ``nb_steps``.
    """
    print("train_model()...", num_steps)
    try:
        dqn.fit(train_env, nb_steps=num_steps)
    finally:
        # Release the environment even when training raises, so a failed run
        # does not leave it open.
        train_env.close()

def eval_model(*, agent: DQNAgent, eval_env, opponent: Player, num_eval_episodes=100):
    """Run ``agent`` against ``opponent`` on ``eval_env`` and print the win count.

    Args:
        agent: Agent whose ``test`` method is called.
        eval_env: Environment to evaluate on; it is reset with
            ``reset_env(restart=True, opponent=opponent)`` first and is not
            closed by this function.
        opponent: Player to evaluate against; its ``username`` is printed.
        num_eval_episodes: Passed to ``test`` as ``nb_episodes``.
    """
    print("eval_model()...")

    # Point the environment at the requested opponent before testing.
    eval_env.reset_env(restart=True, opponent=opponent)
    header = f"Results against {opponent.username}:"
    print(header)

    agent.test(
        eval_env,
        nb_episodes=num_eval_episodes,
        verbose=False,
        visualize=False,
    )

    # Win/finished counters are read from the environment after the test run.
    summary = f"DQN Evaluation: {eval_env.n_won_battles} victories out of {eval_env.n_finished_battles} episodes"
    print(summary)


In [7]:

# Build the environments once (this first call also runs the API sanity check).
# The evaluation environment is recreated after training, so the one returned
# here is closed right away instead of being left open and unused.
train_env, unused_eval_env = create_rl_env_random()
unused_eval_env.close()

# create the model
dqn = create_model(
    train_env=train_env,
    anneal_steps=10000,
)

# Training the model (train_model closes train_env when it is done)
train_model(
    dqn=dqn,
    train_env=train_env,
    num_steps=100000,
)

# Evaluating the model
## recreate eval_env (sometimes breaks if the training takes too long)
# The sanity check already ran above, so it is skipped here; the training
# environment returned alongside eval_env is not needed and is closed
# immediately rather than discarded while still open.
unused_train_env, eval_env = create_rl_env_random(sanity_check=False)
unused_train_env.close()
eval_model(
    agent=dqn,
    eval_env=eval_env,
    opponent=create_random_player(),
    num_eval_episodes=100,
)

create_rl_env_random()...


  logger.warn(
  logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


create_model()...


2023-11-29 21:45:11.806229: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
2023-11-29 21:45:11.817913: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_3_1/bias/Assign' id:232 op device:{requested: '', assigned: ''} def:{{{node dense_3_1/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_3_1/bias, dense_3_1/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


create_rl_env_random()...
train_model()... 100000
Training for 100000 steps ...
Interval 1 (0 steps performed)
    7/10000 [..............................] - ETA: 3:46 - reward: -14.2857  

  updates=self.state_updates,
2023-11-29 21:45:14.208841: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_3/BiasAdd' id:119 op device:{requested: '', assigned: ''} def:{{{node dense_3/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_3/MatMul, dense_3/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-11-29 21:45:14.222919: W tensorflow/c/c_api.cc:305] Operation '{name:'count_3/Assign' id:373 op device:{requested: '', assigned: ''} def:{{{node count_3/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](count_3, count_3/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modi

 1001/10000 [==>...........................] - ETA: 2:59 - reward: -4.9950

2023-11-29 21:45:34.224893: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_3_1/BiasAdd' id:237 op device:{requested: '', assigned: ''} def:{{{node dense_3_1/BiasAdd}} = BiasAdd[T=DT_FLOAT, _has_manual_control_dependencies=true, data_format="NHWC"](dense_3_1/MatMul, dense_3_1/BiasAdd/ReadVariableOp)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-11-29 21:45:34.294230: W tensorflow/c/c_api.cc:305] Operation '{name:'loss_3/AddN' id:469 op device:{requested: '', assigned: ''} def:{{{node loss_3/AddN}} = AddN[N=2, T=DT_FLOAT, _has_manual_control_dependencies=true](loss_3/mul, loss_3/mul_1)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-11-29 21:45:3

886 episodes - episode_reward: -44.695 [-100.000, 100.000] - loss: 0.096 - mae: 3.003 - mean_q: -0.473 - mean_eps: 0.456

Interval 2 (10000 steps performed)
929 episodes - episode_reward: -45.102 [-100.000, 100.000] - loss: 0.090 - mae: 1.376 - mean_q: -0.110 - mean_eps: 0.010

Interval 3 (20000 steps performed)
891 episodes - episode_reward: -46.801 [-100.000, 100.000] - loss: 0.093 - mae: 1.324 - mean_q: -0.319 - mean_eps: 0.010

Interval 4 (30000 steps performed)
877 episodes - episode_reward: -40.251 [-100.000, 100.000] - loss: 0.090 - mae: 1.432 - mean_q: -0.550 - mean_eps: 0.010

Interval 5 (40000 steps performed)
893 episodes - episode_reward: -41.769 [-100.000, 100.000] - loss: 0.091 - mae: 1.972 - mean_q: -0.183 - mean_eps: 0.010

Interval 6 (50000 steps performed)
883 episodes - episode_reward: -42.922 [-100.000, 100.000] - loss: 0.092 - mae: 2.172 - mean_q: -0.449 - mean_eps: 0.010

Interval 7 (60000 steps performed)
  765/10000 [=>............................] - ETA: 3:58 -

In [None]:
# Re-run the evaluation of the trained agent against a fresh random opponent.
opponent = create_random_player()

# Both environments stay bound to names so they can be closed afterwards.
train_env, eval_env = create_rl_env_random()

# Use the shared helper instead of repeating its body inline: it resets
# eval_env against `opponent`, runs the test episodes and prints the win count.
eval_model(
    agent=dqn,
    eval_env=eval_env,
    opponent=opponent,
    num_eval_episodes=100,
)

create_rl_env_random()...
Results against RandomPlayer 10:
DQN Evaluation: 78 victories out of 100 episodes


In [None]:
# Shut down the environment currently bound to `train_env`.
train_env.close()

In [None]:
# Shut down the environment currently bound to `eval_env`.
eval_env.close()

KeyboardInterrupt: 