In [1]:
from tictac import TicTacToeEnv
from opponents import Opponent, RuleBasedOpponent
from model import DQN, minimaxDQN
from dataset import ReplayDataset

import random

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F

from IPython.display import clear_output

In [2]:
# --- Hyperparameters ---------------------------------------------------------
# Sizes for replay / training. BUFFER_SIZE, BATCH_SIZE and EPOCHS are not read
# by any cell visible in this part of the notebook; presumably a training loop
# elsewhere consumes them -- TODO confirm.
BUFFER_SIZE = 1_000_000
BATCH_SIZE = 20
EPOCHS = 200

# Number of games per run. NOTE: the evaluation cells further down rebind
# EPISODES to 100, so the value a cell sees depends on execution order.
EPISODES = 1000

# Optimiser settings. LR is the SGD learning rate; LR_STEP_SIZE and GAMMA are
# passed to StepLR as `step_size` and `gamma`, i.e. GAMMA is the learning-rate
# decay factor here, not an RL discount factor.
LR = 1e-4
LR_STEP_SIZE = 500
GAMMA = 0.1

In [3]:
# Environment shared by every cell below.
env = TicTacToeEnv()

# Online network: the one whose parameters the optimiser updates.
model = DQN()
model.train()

# Target network: starts as an exact copy of the online weights and is kept in
# evaluation mode.
target_model = DQN()
online_weights = model.state_dict()
target_model.load_state_dict(online_weights)
target_model.eval()

# Plain SGD on the online network; StepLR multiplies the learning rate by
# GAMMA every LR_STEP_SIZE scheduler steps.
optimizer = torch.optim.SGD(params=model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=LR_STEP_SIZE,
    gamma=GAMMA,
)

In [4]:
# Opponent used by the cells that follow, until a later cell rebinds
# `opponent` (the evaluation cells further down each create their own).
opponent = Opponent()
# Alternative: swap in the rule-based opponent instead.
# opponent = RuleBasedOpponent()

In [5]:
# # Demo
# observation, info = env.reset()
# terminated = False
# while not terminated:
#     player_action = int(input())
#     opponent_action = opponent.choose_action(env)

#     clear_output(wait=False)
#     observation, reward, terminated, _, info = env.step(player_action, opponent_action)

#     print(env.render())
#     print(observation)

In [6]:
# model = DQN()
# opponent = Opponent()
# EPISODES = 100

# wins, losses, draws = 0, 0, 0
# for episode in range(EPISODES):
#     observation, info = env.reset()
#     terminated = False
#     while not terminated:
#         player_action = model.choose_action(observation['board'])
#         opponent_action = opponent.choose_action(env)

#         # clear_output(wait=False)
#         observation, reward, terminated, _, info = env.step(player_action, opponent_action)

#     if info['winner'] == -1: losses += 1
#     elif info['winner'] == 1: wins += 1
#     elif info['winner'] is None: draws += 1

# print("wins", "losses", "draws", sep="\t")
# print(wins, losses, draws, sep="\t")

In [7]:
# model = DQN()
# model.load_state_dict(torch.load('saved_models/dqn2.pth', weights_only=True))
# opponent = Opponent()
# EPISODES = 100

# wins, losses, draws = 0, 0, 0
# for episode in range(EPISODES):
#     observation, info = env.reset()
#     terminated = False
#     while not terminated:
#         player_action = model.choose_action(observation['board'])
#         opponent_action = opponent.choose_action(env)

#         # clear_output(wait=False)
#         observation, reward, terminated, _, info = env.step(player_action, opponent_action)
#         # print(env.render())

#     if info['winner'] == -1: losses += 1
#     elif info['winner'] == 1: wins += 1
#     elif info['winner'] is None: draws += 1

# print("wins", "losses", "draws", sep="\t")
# print(wins, losses, draws, sep="\t")

In [8]:
# Baseline evaluation: a freshly initialised (untrained) minimaxDQN plays
# EPISODES games against the default Opponent and the outcomes are tallied.
# NOTE: this cell rebinds the notebook-level `model`, `opponent` and `EPISODES`.
model = minimaxDQN()
# A newly constructed nn.Module is in training mode; switch to evaluation mode
# for inference, matching how `target_model` is handled in the setup cell.
model.eval()
opponent = Opponent()
EPISODES = 100

wins, losses, draws = 0, 0, 0
# Nothing is trained here, so autograd bookkeeping is skipped for the rollouts.
with torch.no_grad():
    for episode in range(EPISODES):
        observation, info = env.reset()
        terminated = False
        while not terminated:
            # Both moves are chosen from the current position and submitted
            # to the environment together.
            player_action = model.choose_action(observation['board'])
            opponent_action = opponent.choose_action(env)

            observation, reward, terminated, _, info = env.step(player_action, opponent_action)
            # print(env.render())  # uncomment to watch the games

        # info['winner']: 1 is counted as a win, -1 as a loss, None as a draw.
        if info['winner'] == -1: losses += 1
        elif info['winner'] == 1: wins += 1
        elif info['winner'] is None: draws += 1

print("wins", "losses", "draws", sep="\t")
print(wins, losses, draws, sep="\t")

wins	losses	draws
8	83	9


In [9]:
# Scratch check of the sampling policy on a random 9-element input.
# The model output is viewed as a 9x9 grid. Rows are indexed by the player's
# action (the sampled row index becomes `player_action`); the min over dim=1 is
# presumably the worst case over the opponent's reply -- confirm against
# minimaxDQN.
with torch.no_grad():
    # Call the module itself rather than .forward() so registered hooks run.
    q_values = model(torch.randn(9)).reshape(9, 9)

# Softmax over each row's worst-case value gives a distribution over the
# 9 player actions.
player_policy = F.softmax(q_values.min(dim=1).values, dim=0).numpy()

player_action = np.random.choice(np.arange(9), p=player_policy)
player_action

np.int64(0)

In [10]:
# Sanity check: the sampled-from policy should sum to ~1 (float32 rounding
# makes it slightly off from exactly 1.0).
np.sum(player_policy)

np.float32(1.0000001)

In [11]:
# Overrides MAX_TURNS on the shared `env` (presumably the per-episode turn
# limit -- confirm in TicTacToeEnv). This mutates state used by every later
# cell; evaluation cells run before this one used the environment's own value.
env.MAX_TURNS = 100

In [12]:
# Evaluate the saved minimax DQN checkpoint: EPISODES games against the
# rule-based opponent, with the model called with best=True.
# NOTE: this cell rebinds the notebook-level `model`, `opponent` and `EPISODES`.
model = minimaxDQN()
model.load_state_dict(torch.load('saved_models/minmax_dqn3.pth', weights_only=True))
# load_state_dict does not change the module's mode and a new nn.Module starts
# in training mode, so switch to evaluation mode before running inference.
model.eval()
opponent = RuleBasedOpponent()
# opponent = Opponent()
EPISODES = 100

wins, losses, draws = 0, 0, 0
# Pure inference: no gradients are needed for the rollouts.
with torch.no_grad():
    for episode in range(EPISODES):
        observation, info = env.reset()
        terminated = False
        while not terminated:
            # Alternative move sources that were tried:
            # opponent_action = random.sample(env.valid_actions(), 1)[0]
            player_action = model.choose_action(observation['board'], best=True)
            opponent_action = opponent.choose_action(env)
            # player_action, _ = model.choose_actions(observation['board'], opponent_action, best=True)

            observation, reward, terminated, _, info = env.step(player_action, opponent_action)
            # print(env.render())  # uncomment to watch the games

        # info['winner']: 1 is counted as a win, -1 as a loss, None as a draw.
        if info['winner'] == -1: losses += 1
        elif info['winner'] == 1: wins += 1
        elif info['winner'] is None: draws += 1

print("wins", "losses", "draws", sep="\t")
print(wins, losses, draws, sep="\t")

wins	losses	draws
1	0	99


In [13]:
# # Demo
# model = minimaxDQN()
# model.load_state_dict(torch.load('saved_models/minmax_dqn3.pth', weights_only=True))

# observation, info = env.reset()
# terminated = False
# while not terminated:
#     player_action = int(input())
#     opponent_action, _ = model.choose_actions(observation['board'])

#     clear_output(wait=False)
#     observation, reward, terminated, _, info = env.step(player_action, opponent_action)

#     print(env.render())
#     print(observation)

In [14]:
from dataset import ReplayDataset
import numpy as np
import random

In [15]:
# Opponent choosing invalid action gives a positive reward

In [16]:
# Greedy counterpart of the sampling cell: take each row's minimum, then pick
# the row whose minimum is largest. `max_val` is that value and
# `player_action` its row index.
max_val, player_action = q_values.min(dim=1).values.max(dim=0)
player_action

tensor(3)

In [17]:
# Scratch: a random 4-element float tensor bound to `state`
# (a later cell rebinds `state` to the environment's board).
state = torch.randn((4,))
state

tensor([ 1.4247, -0.1421, -0.3173,  1.2412])

In [18]:
# Scratch: a second random 4-element float tensor.
player_vec = torch.randn((4,))
player_vec

tensor([-1.9531, -0.0063,  0.4903,  0.9780])

In [19]:
# Pool of available opponents, plus a uniformly drawn index into that pool.
opponents = [
    Opponent(),
    RuleBasedOpponent(),
]

opponent_no = random.randrange(len(opponents))

In [20]:
from model import PolicyNet

# Build the policy network and restore its saved weights. The result of
# load_state_dict is left as the cell's last expression so the key-matching
# report is displayed.
policynet = PolicyNet()
policynet_weights = torch.load('saved_models/policynet.pth', weights_only=True)
policynet.load_state_dict(policynet_weights)

<All keys matched successfully>

In [21]:
# Start a fresh game and keep the board part of the observation as `state`.
# NOTE: this rebinds `state`, which an earlier scratch cell set to a random
# 4-element tensor; after a reset the board displays as nine zeros.
observation, info = env.reset()

state = observation['board']
state

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [29]:
# Q-values for the current board, viewed as a 9x9 grid (same layout the
# action-selection cells above reduce over).
with torch.no_grad():
    # Build the float32 input in one step and call the module itself rather
    # than .forward(), so any registered hooks run.
    q_values = model(torch.tensor(state, dtype=torch.float32)).reshape(9, 9)
q_values.shape

torch.Size([9, 9])

In [30]:
# Shape of the policy PolicyNet returns for the current board (9 entries).
# The meaning of the second argument (0) is not visible in this notebook --
# TODO document it here.
policynet.return_policy(state, 0).shape

torch.Size([9])

In [37]:
# Broadcasting check: the 9-element policy is multiplied against a (5, 9)
# random matrix, then summed along each row, giving one value per row.
policy = policynet.return_policy(state, 0)
noise = torch.randn(5, 9)
out = policy * noise
out.sum(dim=1)

tensor([-0.0239,  0.5513,  0.4318, -0.0925,  0.2675])