In [12]:
import warnings

# Install a blanket "ignore" filter: no category/module is given, so it applies
# to every warning raised from here on in this kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing it to the specific category that is noisy.
warnings.filterwarnings("ignore")

In [13]:
from env import ShogiEnv
import gymnasium as gym

# Register the project's ShogiEnv (class `ShogiEnv` in module `env`) under the
# id "Shogi-v0" with no extra constructor kwargs.
gym.register(id="Shogi-v0", entry_point="env:ShogiEnv", kwargs={})
# Build the environment through the registry. This notebook-level `env` is the
# object the training function further down reads as a global.
# NOTE(review): gym.make can return the environment wrapped in gymnasium's
# default wrappers, so the ShogiEnv annotation (and direct access to custom
# attributes on `env`) may rely on wrapper attribute forwarding — confirm
# against the installed gymnasium version.
env: ShogiEnv = gym.make("Shogi-v0")

## Train the Agent

In [14]:
import pandas as pd
import torch
from model.shogi_agent import ShogiAgent
import matplotlib.pyplot as plt

In [15]:
def Q_learning(agent: ShogiAgent, games_to_play: int):
    """Train ``agent`` on the notebook-level ``env`` and plot the results.

    For every game the environment is reset and moves are played until the
    environment reports ``terminated`` or ``truncated``. After each move the
    transition is stored with ``agent.remember``, one call to
    ``agent.learn_experience_replay`` is made and its return value is recorded
    as the loss for that step, and ``agent.adaptiveEGreedy`` is called to
    adjust the exploration rate. Once all games are played, the summed reward
    per game and the loss per step are plotted with their moving averages.

    Args:
        agent: Agent used to select moves, store transitions and learn.
        games_to_play: Number of complete games to play.

    Returns:
        None. Progress is printed and a matplotlib figure is shown.

    Note:
        The function reads the module-level ``env`` created earlier in the
        notebook; it is not passed in as an argument.
    """
    loss = []
    final_score = []

    # Games are numbered from 1 in both progress messages so that the per-game
    # banner and the per-move line agree on which game is being played.
    for game in range(1, games_to_play + 1):
        print(f"Game {game} of {games_to_play}")

        board_score_after = 0
        terminated = False
        truncated = False

        env.reset()

        # Play until the environment signals the end of the episode.
        while not (terminated or truncated):
            print(f"Game: {game}. Move: {env.move}")

            # The agent picks the move (per the original author's note, this is
            # where it decides whether to explore or exploit).
            action_index, move, current_bit_state, valid_moves_array = (
                agent.select_action(env)
            )
            # The fourth return value is handed to torch.from_numpy, i.e. it is
            # treated as a NumPy array here and flattened to one dimension.
            valid_moves = torch.from_numpy(valid_moves_array.flatten())

            next_bit_state, reward, terminated, truncated, _ = env.step(move)
            board_score_after += reward

            # Mask computed after the move has been applied, i.e. for the
            # position the stored transition leads to.
            mask, _ = env.mask_and_valid_moves()
            mask_tensor = torch.from_numpy(mask.flatten())

            # Store the transition in the agent's memory.
            # NOTE(review): agent.MAX_PRIORITY is presumably the initial replay
            # priority, and next_bit_state[0] presumably the board encoding
            # inside the observation — confirm against ShogiAgent / ShogiEnv.
            agent.remember(
                agent.MAX_PRIORITY,
                current_bit_state,
                action_index,
                reward,
                next_bit_state[0],
                (terminated or truncated),
                valid_moves,
                mask_tensor,
            )

            # One learning step per move; whatever it returns is logged as loss.
            # NOTE(review): if learn_experience_replay can return None (e.g.
            # before the memory is filled), the rolling mean below will not be
            # numeric — confirm its return contract.
            loss.append(agent.learn_experience_replay(debug=False))

            # Adjust epsilon (exploration rate).
            agent.adaptiveEGreedy()

        final_score.append(board_score_after)

    _plot_training_results(final_score, loss)


def _moving_average_window(sample_count: int) -> int:
    """Return a fifth of ``sample_count``, but never less than 1.

    A plain ``sample_count // 5`` is 0 for fewer than five samples, which
    leaves no usable moving average to draw for short runs.
    """
    return max(1, sample_count // 5)


def _plot_training_results(final_score: list, loss: list) -> None:
    """Show per-game scores and per-step losses side by side with moving averages."""
    score_df = pd.DataFrame(final_score, columns=["score"])
    score_df["ma"] = (
        score_df["score"].rolling(window=_moving_average_window(len(score_df))).mean()
    )
    loss_df = pd.DataFrame(loss, columns=["loss"])
    loss_df["ma"] = (
        loss_df["loss"].rolling(window=_moving_average_window(len(loss_df))).mean()
    )

    # One figure, two panels: scores on the left, losses on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2)

    # Raw values are drawn thin so the moving average stands out.
    ax1.plot(score_df.index, score_df["score"], linewidth=0.2)
    ax1.plot(score_df.index, score_df["ma"])
    ax1.set_title("Final score by game")
    ax1.set_xlabel("game")
    ax1.set_ylabel("score")

    ax2.plot(loss_df.index, loss_df["loss"], linewidth=0.1)
    ax2.plot(loss_df.index, loss_df["ma"])
    ax2.set_title("Loss by training step")
    ax2.set_xlabel("training step")
    ax2.set_ylabel("loss")

    plt.show()

In [None]:
# Create a new agent using ShogiAgent's default constructor arguments.
agent = ShogiAgent()
# Q_learning calls env.reset() itself at the start of every game, so this
# reset is not required for the training call below.
env.reset()

# First training run: play 10 games, then show the score/loss plots.
Q_learning(agent, games_to_play=10)

Game 0 of 10
Game: 1. Move: 0
Game: 1. Move: 1
Game: 1. Move: 2
Game: 1. Move: 3
Game: 1. Move: 4
Game: 1. Move: 5
Game: 1. Move: 6
Game: 1. Move: 7
Game: 1. Move: 8
Game: 1. Move: 9
Game: 1. Move: 10
Game: 1. Move: 11
Game: 1. Move: 12
Game: 1. Move: 13
Game: 1. Move: 14
Game: 1. Move: 15
Game: 1. Move: 16
Game: 1. Move: 17
Game: 1. Move: 18
Game: 1. Move: 19
Game: 1. Move: 20
Game: 1. Move: 21
Game: 1. Move: 22
Game: 1. Move: 23
Game: 1. Move: 24
Game: 1. Move: 25
Game: 1. Move: 26
Game: 1. Move: 27
Game: 1. Move: 28
Game: 1. Move: 29
Game: 1. Move: 30
Game: 1. Move: 31
Game: 1. Move: 32
Game: 1. Move: 33
Game: 1. Move: 34
Game: 1. Move: 35
Game: 1. Move: 36
Game: 1. Move: 37
Game: 1. Move: 38
Game: 1. Move: 39
Game: 1. Move: 40
Game: 1. Move: 41
Game: 1. Move: 42
Game: 1. Move: 43
Game: 1. Move: 44
Game: 1. Move: 45
Game: 1. Move: 46
Game: 1. Move: 47
Game: 1. Move: 48
Game: 1. Move: 49
Game: 1. Move: 50
Game: 1. Move: 51
Game: 1. Move: 52
Game: 1. Move: 53
Game: 1. Move: 54
Game: 1

In [None]:
# Second training run of 20 games. It reuses the `agent` object created in the
# earlier cell (no new ShogiAgent is constructed here), so that cell must have
# been executed first.
Q_learning(agent, games_to_play=20)

In [None]:
# Persist the agent through its save_model method, using a path relative to
# the notebook's working directory.
# NOTE(review): presumably the "shogimodel" directory has to exist already —
# confirm whether save_model creates it.
agent.save_model("shogimodel/second_model.pth")