In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
import torch
import numpy as np

from darts_pro.base import DartBoard, AbstractDartsGame, compute_probability_lookup
from darts_pro.dqn import (
    DQN,
    AgentPlayer,
    Agent,
    ReplayMemory,
    EpsilonGreedyStrategy,
    PureGreedyStrategy,
    Experience,
    QValues
)
from darts_pro.games.o_one import X01Game, X01GameState

In [29]:
# Throw accuracy: the two sigmas handed to compute_probability_lookup.
ACCURACY_SIGMA_X = ACCURACY_SIGMA_Y = 0.25

# Turn bounds forwarded to compute_winner_reward.
MIN_WIN_TURNS = 3
MAX_TURNS = 50

# Training run
N_EPISODES = 1_000                # games played by the training loop
MEMORY_CAPACITY = 100_000         # capacity passed to ReplayMemory
TRAIN_BATCH_SIZE = 64             # experiences sampled per optimisation step
LEARNING_RATE = 0.001             # Adam learning rate
GAMMA = 0.9999                    # multiplier applied to next-state Q-values
NETWORK_SHAPE = [128] * 2         # inner layer sizes of the Q-network
TARGET_NET_UPDATE = 10            # episodes between target-network syncs
AGENT_THROWS_BETWEEN_UPDATE = 10  # throws between exploration-strategy steps

# Explore/exploit parameters (arguments of EpsilonGreedyStrategy)
START_EXPLORATION_RATE = 1.0
END_EXPLORATION_RATE = 0.1
EXPLORATION_DECAY_RATE = 0.95

In [36]:
class Single301Network(DQN):
    """Fully connected Q-network: a stack of linear layers with ReLU between them.

    Args:
        input_size: Number of features in the state tensor.
        output_size: Number of outputs (one Q-value per action).
        inner_layer_sizes: Width of each hidden layer, in order. May be empty,
            in which case the network is a single linear map.
    """

    def __init__(self, input_size: int, output_size: int, inner_layer_sizes: list[int]):
        super().__init__()
        layers = []
        last_layer_size = input_size
        for size in inner_layer_sizes:
            layers.append(torch.nn.Linear(last_layer_size, size))
            last_layer_size = size
        # Use last_layer_size rather than the loop variable so that an empty
        # inner_layer_sizes does not raise NameError.
        layers.append(torch.nn.Linear(last_layer_size, output_size))
        self._layers = torch.nn.ModuleList(layers)

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Return the Q-value estimates for the state tensor(s) ``t``."""
        # ReLU goes on the hidden layers only. The output layer must stay
        # linear: Q-values can be negative (the training loop hands out a
        # reward of -1.0), and a ReLU on the output would clamp them to >= 0.
        for layer in self._layers[:-1]:
            t = torch.nn.functional.relu(layer(t))
        return self._layers[-1](t)

def extract_tensors(
    experiences: list[Experience]
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Split a batch of experiences into four batched tensors.

    Returns:
        ``(states, actions, rewards, next_states)``. ``states`` and
        ``next_states`` are the per-experience tensors stacked along a new
        leading dimension, ``actions`` is an int64 tensor and ``rewards`` is
        built with the ``torch.Tensor`` constructor.
    """
    before, chosen, earned, after = [], [], [], []
    for exp in experiences:
        before.append(exp.initial_state)
        chosen.append(exp.action)
        earned.append(exp.reward)
        after.append(exp.next_state)

    action_array = np.array(chosen, dtype=np.int64)
    return (
        torch.stack(before),
        torch.from_numpy(action_array),
        torch.Tensor(earned),
        torch.stack(after),
    )

def get_starting_score(episode_number: int, total_episodes: int):
    """Return the score a game starts from for the given episode.

    Both arguments are currently ignored: every episode starts at 301.
    """
    starting_score = 301
    return starting_score

# def compute_winner_reward(turn_number, max_turns, min_win_turns) -> float:
#     return 1 - ((turn_number - min_win_turns) / (max_turns - min_win_turns))

def compute_winner_reward(turn_number, max_turns, min_win_turns, decay_rate: float = 0.95) -> float:
    """Reward for winning a game, decaying geometrically with the turns taken.

    Args:
        turn_number: Turn on which the game was won.
        max_turns: Unused by this exponential form; kept so the signature
            stays interchangeable with the linear variant.
        min_win_turns: Turn number that earns the full reward of 1.0.
        decay_rate: Factor applied once per turn beyond ``min_win_turns``.
            Defaults to 0.95, the value that was previously hard-coded.

    Returns:
        ``decay_rate ** (turn_number - min_win_turns)``.
    """
    dist = turn_number - min_win_turns
    return np.power(decay_rate, dist)

In [37]:
# Build everything the training loop needs: exploration strategy, board,
# replay memory, policy/target networks, optimiser and the single-player team.
strategy = EpsilonGreedyStrategy(START_EXPLORATION_RATE, END_EXPLORATION_RATE, EXPLORATION_DECAY_RATE)
board = DartBoard.get_default_dartboard(False)
# The action space is the set of keys of the board's indexed targets.
all_actions = list(board.indexed_targets.keys())

prob_lookup = compute_probability_lookup(ACCURACY_SIGMA_X, ACCURACY_SIGMA_Y, len(board.radial_targets))

# Throwaway state, used below only to measure the length of the state tensor.
# NOTE(review): the meaning of the positional arguments is defined by
# X01GameState, which is not visible in this notebook - confirm against it.
dummy_state = X01GameState({0: 0}, 301, 0, 0, '', 0)
agent = Agent(strategy, all_actions)
memory = ReplayMemory(MEMORY_CAPACITY)

# Two identically shaped networks: state-tensor length in, one output per indexed target out.
policy_network = Single301Network(dummy_state.to_tensor().shape[0], len(board.indexed_targets), NETWORK_SHAPE)
target_network = Single301Network(dummy_state.to_tensor().shape[0], len(board.indexed_targets), NETWORK_SHAPE)
# Start the target network as an exact copy of the policy network and put it in eval mode;
# only the policy network's parameters are handed to the optimiser.
target_network.load_state_dict(policy_network.state_dict())
target_network.eval()
optimizer = torch.optim.Adam(params=policy_network.parameters(), lr=LEARNING_RATE)

# A single team (id 0) containing one agent-driven player.
player = AgentPlayer(agent, policy_network, 0, 'agent', prob_lookup)
team_one = [player]
teams = { 0: team_one }

In [38]:
# Throws since the exploration strategy was last stepped (reset to 0 each time it is).
total_agent_throws = 0

for episode in range(N_EPISODES):
    starting_score = get_starting_score(episode, N_EPISODES)
    starting_scores = { 0: starting_score }
    game = X01Game(teams, board, starting_scores=starting_scores)

    done = False
    winner = None

    while not done:
        reward = 0
        pre_throw_state = game.state()
        throw_result, winner = game.play_next_throw()

        # Step the exploration schedule once every AGENT_THROWS_BETWEEN_UPDATE throws.
        # The counter has just been incremented, so it is always positive here.
        total_agent_throws += 1
        if total_agent_throws % AGENT_THROWS_BETWEEN_UPDATE == 0:
            agent.increment_strategy_step()
            total_agent_throws = 0

        done = throw_result.ended_game
        post_throw_state = game.state()

        # Only the throw that ends the game is rewarded: a turn-dependent bonus
        # when there is a winner, -1.0 when the game ended without one.
        if done:
            reward = compute_winner_reward(pre_throw_state.turn_number, MAX_TURNS, MIN_WIN_TURNS) if winner is not None else -1.0

        experience = Experience(
            pre_throw_state.to_tensor(),
            agent.last_action_taken,
            reward,
            post_throw_state.to_tensor()
        )

        memory.push(experience)

        if memory.can_provide_sample(TRAIN_BATCH_SIZE):
            experiences = memory.sample(TRAIN_BATCH_SIZE)
            states, actions, rewards, next_states = extract_tensors(experiences)

            current_q_values = QValues.get_current(policy_network, states, actions)
            next_q_values = QValues.get_next(target_network, next_states)
            # Bellman target. Detached so the loss only back-propagates through
            # the policy network, never into the target network.
            # NOTE(review): whether QValues.get_next returns 0 for terminal next
            # states is not visible here - confirm, otherwise mask them out.
            target_q_values = ((next_q_values * GAMMA) + rewards).detach()

            # Regress onto the Bellman target. The previous code compared
            # against next_q_values, so rewards and GAMMA never reached the loss.
            loss = torch.nn.functional.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    print(f"Finished game. Final score: {post_throw_state.team_scores[0]}, turn: {post_throw_state.turn_number}, reward: {reward}")
    # Periodically sync the target network with the freshly trained policy network.
    if episode % TARGET_NET_UPDATE == 0:
        target_network.load_state_dict(policy_network.state_dict())

Finished game. Final score: 0, turn: 19, reward: 0.44012666865176536
Finished game. Final score: 0, turn: 30, reward: 0.2503440897424549
Finished game. Final score: 0, turn: 7, reward: 0.8145062499999999
Finished game. Final score: 0, turn: 38, reward: 0.16608338398760716
Finished game. Final score: 0, turn: 23, reward: 0.3584859224085419
Finished game. Final score: 0, turn: 14, reward: 0.5688000922764597
Finished game. Final score: 0, turn: 11, reward: 0.6634204312890623
Finished game. Final score: 0, turn: 17, reward: 0.48767497911552954
Finished game. Final score: 0, turn: 13, reward: 0.5987369392383787
Finished game. Final score: 0, turn: 34, reward: 0.2039068257457904
Finished game. Final score: 0, turn: 7, reward: 0.8145062499999999
Finished game. Final score: 0, turn: 45, reward: 0.11598222130000556
Finished game. Final score: 0, turn: 26, reward: 0.3073568677250236
Finished game. Final score: 0, turn: 27, reward: 0.2919890243387724
Finished game. Final score: 0, turn: 28, rewar

In [47]:
# Ask the trained policy for its greedy choice in one hand-built game state.
# Agent's second argument is the action list (as in the training setup), not
# the network; the network is supplied to select_action instead.
new_agent = Agent(PureGreedyStrategy(), all_actions)
# [None] adds a leading batch dimension to the state tensor.
# NOTE: this rebinds dummy_state, which the setup cell uses for an X01GameState, to a tensor.
dummy_state = X01GameState({0: 5}, 301, 17, 0, '', 0).to_tensor()[None]
act = new_agent.select_action(dummy_state, policy_network)
print(board.indexed_targets[act])

Target(value=2, multiplier=3, is_bullseye=False)
