In [1]:
from tictac import TicTacToeEnv
from opponents import Opponent, RuleBasedOpponent
from model import DQN, minimaxDQN, PolicyNet
from dataset import ReplayDataset

import random

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')

import torch
from torch.utils.data import DataLoader

from IPython.display import clear_output

In [140]:
env = TicTacToeEnv(max_turns=30)
model = minimaxDQN()
policynet = PolicyNet()

policynet.load_state_dict(torch.load('saved_models/policynet.pth', weights_only=True))
model.load_state_dict(torch.load('saved_models/minmax_dqn3.pth', weights_only=True))

opponents = [Opponent(), RuleBasedOpponent()]
# opponent = Opponent()

EPISODES = 100
LOOK_AHEAD = 1
wins, losses, draws = 0, 0, 0
for episode in range(EPISODES):

    opponent_num = 0
    opponent = opponents[opponent_num]

    observation, info = env.reset()
    terminated = False

    while not terminated:
        
        opponent_action = opponent.choose_action(env)
        # opponent_action = model.choose_action(-observation['board'], best=False)

        player_action = model.choose_action(observation['board'], opponent_num, policynet, depth=LOOK_AHEAD, best=True)
        # player_action = model.choose_action(observation['board'], opponent_action, best=True)

        # clear_output(wait=False)
        observation, reward, terminated, _, info = env.step(player_action, opponent_action)
        # print(env.render())

    if info['winner'] == -1: losses += 1
    elif info['winner'] == 1:
        wins += 1
        # print(player_action, opponent_action)
        # print(env.render())
    elif info['winner'] is None: draws += 1

    clear_output(wait=False)
    print("wins", "losses", "draws", sep="\t")
    print(wins, losses, draws, sep="\t")

wins	losses	draws
100	0	0


In [None]:
torch.max(torch.tensor([-7.8296, -7.5128, -6.5433, -7.7056, -5.6361, -7.7585, -7.3463, -8.6944,
        -7.3719]))

In [65]:
from sympy import Matrix

In [85]:
obs, _ = env.reset()
state = obs['board']

state[[0, 4]] = -1
state[[2, 8]] = 1

In [86]:
Matrix(state.reshape((3,3)))

Matrix([
[-1.0,  0.0, 1.0],
[ 0.0, -1.0, 0.0],
[ 0.0,  0.0, 1.0]])

In [94]:
dqn = DQN()
dqn.load_state_dict(torch.load("saved_models/dqn2.pth", weights_only=True))

<All keys matched successfully>

In [117]:
q_vals = np.round(model.forward(torch.tensor(state).to(torch.float)).detach().reshape((9, 9)), 2)

Matrix(q_vals)

Matrix([
[-5.72, -4.41, -7.92,  -4.5, -7.23, -7.42, -4.88, -5.93, -6.29],
[-1.35, -3.18, -1.92, -0.18, -0.39,  -3.0, -1.06,  0.34, -1.93],
[ -7.2, -6.09, -7.21, -7.05, -7.25, -6.49, -6.11,  -7.3, -6.34],
[-2.21,  -0.7, -2.16, -2.48,  -1.7, -1.43,  1.51, -0.79, -1.49],
[-7.48,  -2.6, -7.59, -5.71, -5.88, -3.18, -5.32, -6.28, -7.37],
[ 6.33,  8.32,  -1.8,  6.67,  5.29, -2.05,  6.62,  7.49, -0.33],
[-2.19,   1.2, -1.39, -1.21, -1.57, -1.59, -2.07,  0.54,  0.14],
[-2.98,  0.17, -4.22, -0.52, -1.83, -4.76, -1.68, -2.61, -3.15],
[-8.36, -6.33, -6.48,  -6.5, -7.16, -6.77, -6.17, -7.46, -6.23]])

In [125]:
torch.tensor(state).to(torch.float)

array([-1.,  0.,  1.,  0., -1.,  0.,  0.,  0.,  1.])

In [129]:
state.reshape((3,3))

array([[-1.,  0.,  1.],
       [ 0., -1.,  0.],
       [ 0.,  0.,  1.]])

In [127]:
policynet.return_policy(state, 0)

tensor(1.)

In [128]:
Matrix(policynet.return_policy(state, 0).numpy().round(2).reshape(1, -1))

Matrix([[0, 0.03, 0, 0.12, 0, 0.28, 0.42, 0.14, 0]])

In [None]:
env = TicTacToeEnv(max_turns=30)
model = minimaxDQN()
model.load_state_dict(torch.load('saved_models/minmax_dqn3.pth', weights_only=True))

opponent = RuleBasedOpponent()
opponent = Opponent()

EPISODES = 100

wins, losses, draws = 0, 0, 0
for episode in range(EPISODES):
    observation, info = env.reset()
    terminated = False
    while not terminated:
        
        
        opponent_action = opponent.choose_action(env)
        # opponent_action = model.choose_action(-observation['board'], best=False)

        player_action = model.choose_action(observation['board'], best=True)
        # player_action = model.choose_action(observation['board'], opponent_action, best=True)

        # clear_output(wait=False)
        observation, reward, terminated, _, info = env.step(player_action, opponent_action)
        # print(env.render())

    if info['winner'] == -1: losses += 1
    elif info['winner'] == 1:
        wins += 1
        # print(player_action, opponent_action)
        # print(env.render())
    elif info['winner'] is None: draws += 1

print("wins", "losses", "draws", sep="\t")
print(wins, losses, draws, sep="\t")

In [142]:
# Demo
model = minimaxDQN()
model.load_state_dict(torch.load('saved_models/minmax_dqn3.pth', weights_only=True))

observation, info = env.reset()
terminated = False
while not terminated:
    player_action = int(input())
    opponent_action = model.choose_action(-observation['board'], best=True)

    clear_output(wait=False)
    observation, reward, terminated, _, info = env.step(player_action, opponent_action)

    print(env.render())
    print(observation)

[[ 0.  0.  0.]
 [ 0. -1.  0.]
 [ 0.  1.  0.]]
{'board': array([ 0.,  0.,  0.,  0., -1.,  0.,  0.,  1.,  0.])}


ValueError: invalid literal for int() with base 10: ''