In [47]:
import torch
import numpy as np
import json
import pandas as pd
import random
from torch.utils.tensorboard import SummaryWriter

In [48]:
# Root folder of the dataset. Paths are built from it by plain string
# concatenation, so the trailing slash is required.
data_dir = "dataset/"

In [49]:
# Read the episode index table from the dataset folder and preview it.
episodes_csv = data_dir + 'episodes.csv'
master_df = pd.read_csv(episodes_csv)
master_df.head()

Unnamed: 0,Id,EpisodeId,Index,Reward,State,SubmissionId,InitialConfidence,InitialScore,UpdatedConfidence,UpdatedScore
0,132759051,58420861,0,1.0,2,41789980,,,200.0,600.0
1,132759052,58420861,1,4.0,2,41789980,,,200.0,600.0
2,132765118,58423894,1,5.0,2,41789980,200.0,980.923901,200.0,1040.8575
3,132765797,58424233,0,5.0,2,41789980,200.0,1040.857579,200.0,1088.025
4,132760394,58421533,1,5.0,2,41789980,200.0,600.0,200.0,733.9188


In [50]:
def load_episode_json(file_pathes):
    """Load one randomly chosen episode file and return its parsed JSON.

    Args:
        file_pathes: Non-empty sequence of candidate JSON file paths; one of
            them is picked with ``random.choice``.

    Returns:
        The decoded JSON content, or ``None`` when the chosen file does not
        exist (an error line is printed in that case).
    """
    file_path = random.choice(file_pathes)
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    # The context manager closes the file even if decoding fails.
    with handle:
        return json.load(handle)


In [51]:
# Candidate episode files: one "<SubmissionId>_<EpisodeId>.json" path per
# distinct (submission, episode) pair found in master_df.
file_pathes = []
for sub_id, df in master_df.groupby("SubmissionId"):
    file_pathes.extend(
        data_dir + f"{sub_id}_{ep_id}.json"
        for ep_id in df["EpisodeId"].unique()
    )

## 学習

In [46]:
from model import *

In [None]:
def _valid_positions(positions):
    """Return the (N, 2) integer positions that are not the (-1, -1) placeholder."""
    coords = np.asarray(positions, dtype=np.int64).reshape(-1, 2)
    return coords[~np.all(coords == -1, axis=1)]


def build_tile_graph(map_features, relic_nodes, units, team, device='cuda',
                     tile_embedder=None):
    """Embed the map tiles and return the embedding under each friendly unit.

    A three-channel grid is assembled per cell: tile type (cells holding a
    relic node are overwritten with 3), tile energy, and the number of enemy
    units standing on the cell. The grid is passed through the tile embedder
    and the embedding vectors at the positions of ``team``'s units are
    returned.

    Args:
        map_features: Mapping with 2-D ``'tile_type'`` and ``'energy'`` grids
            of equal shape. NumPy arrays and nested lists (e.g. straight from
            ``json.load``) are both accepted; the inputs are not modified.
        relic_nodes: Iterable of ``(x, y)`` relic positions; ``(-1, -1)``
            entries are ignored.
        units: Mapping whose ``'position'`` entry is indexable by team and
            yields ``(x, y)`` pairs; ``(-1, -1)`` entries are ignored.
        team: Index (0 or 1) of the friendly team; ``1 - team`` is the enemy.
        device: Torch device for the input grid and the returned tensor.
        tile_embedder: Optional callable mapping the ``(W, H, 3)`` float
            tensor to a per-cell embedding tensor. When ``None`` a new
            ``TileEmbedding()`` is constructed for this call and, if it is a
            ``torch.nn.Module``, moved to ``device``.
            NOTE(review): a per-call default embedder is not registered with
            any optimizer in this notebook; pass a persistent instance if its
            output should stay consistent across calls or be trained.

    Returns:
        Tensor on ``device`` with one row per friendly unit that has a valid
        position (zero rows when there is none).
    """
    # np.array copies, so the relic overwrite below never touches caller data.
    tile_type = np.array(map_features['tile_type'], dtype=np.float32)
    energy = np.asarray(map_features['energy'], dtype=np.float32)

    # Mark relic-node cells with tile type 3.
    relics = _valid_positions(relic_nodes)
    tile_type[relics[:, 0], relics[:, 1]] = 3

    # Count enemy units per cell; np.add.at accumulates repeated coordinates.
    # The grid takes its size from the map instead of a fixed 24x24.
    enemy_count = np.zeros_like(tile_type)
    enemies = _valid_positions(units['position'][1 - team])
    np.add.at(enemy_count, (enemies[:, 0], enemies[:, 1]), 1)

    # (W, H, 3): tile type, energy, enemy count.
    tiles = np.stack([tile_type, energy, enemy_count], axis=-1)

    if tile_embedder is None:
        tile_embedder = TileEmbedding()
        # The input tensor lives on `device`, so the module must as well.
        if isinstance(tile_embedder, torch.nn.Module):
            tile_embedder.to(device)
    embed_tile = tile_embedder(torch.tensor(tiles, dtype=torch.float32, device=device))

    # Gather the embeddings under the friendly units; index tensors also
    # yield a well-formed empty result when no unit is present.
    own = torch.as_tensor(
        _valid_positions(units['position'][team]),
        dtype=torch.long,
        device=embed_tile.device,
    )
    return embed_tile[own[:, 0], own[:, 1]].to(device)


def build_unit_graph(units, units_mask, team, device='cuda'):
    """Build the node feature matrix ``[x, y, energy]`` for one team's units.

    Args:
        units: Mapping with ``'position'`` and ``'energy'`` entries, each
            indexable by team. NumPy arrays and nested lists (e.g. straight
            from ``json.load``) are both accepted.
        units_mask: Per-team sequence of flags; only units whose flag is
            truthy are included.
        team: Index of the team to extract.
        device: Torch device on which the result is created.

    Returns:
        ``float32`` tensor of shape ``(n_selected_units, 3)`` holding x, y
        and energy per selected unit.
    """
    # np.asarray makes boolean-mask indexing work for plain lists as well.
    mask = np.asarray(units_mask[team], dtype=bool)
    team_positions = np.asarray(units['position'][team]).reshape(-1, 2)[mask]
    team_energies = np.asarray(units['energy'][team])[mask]

    # Node features: [x, y, energy]
    team_nodes = np.zeros((len(team_positions), 3), dtype=np.float32)
    team_nodes[:, :2] = team_positions
    team_nodes[:, 2] = team_energies

    return torch.tensor(team_nodes, dtype=torch.float32, device=device)

In [None]:
# Model and hyper-parameter initialisation.

# Seed every RNG used by this notebook before the model is built, so weight
# initialisation and episode sampling are repeatable on a fresh kernel
# (best effort: some GPU kernels are not bit-wise deterministic).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
imitator = GATActor().to(device)
optimizer = torch.optim.Adam(imitator.parameters(), lr=0.001)
alpha = 1           # weight of the behaviour-cloning (BC) loss
beta = 1 - alpha    # weight of the RL loss (zero while alpha == 1)
gamma_base = 0.95   # discount rate
NUM_LEARN = 10000   # number of optimisation steps
BATCH_SIZE = 32     # samples accumulated per optimisation step
NUM_STEPS = 100     # steps per episode segment
writer = SummaryWriter(log_dir='./logs')

In [None]:
# Training loop.
# Each optimisation step accumulates the loss of up to BATCH_SIZE randomly
# sampled (episode file, step) pairs and imitates the winning team's action.

# Number of NUM_STEPS-long segments sampled from per episode file.
# presumably five matches are logged per episode -- TODO confirm against the data
MATCHES_PER_EPISODE = 5


def _point_lead(step_entry, team):
    """Return `team`'s points minus the opponent's points for one logged step."""
    points = step_entry['team_points']
    return points[team] - points[1 - team]


try:
    for learn_step in range(NUM_LEARN):
        total_bc_loss = 0
        total_rl_loss = 0
        num_samples = 0

        for _ in range(BATCH_SIZE):
            data = load_episode_json(file_pathes)
            if data is None:
                # load_episode_json returns None for a missing file; skip the
                # sample instead of failing on data['rewards'].
                continue

            # Imitate the team with the highest final reward.
            winner = int(np.argmax(data['rewards']))
            ep = random.randint(0, MATCHES_PER_EPISODE - 1)
            step = random.randint(1, NUM_STEPS)
            # Each segment occupies NUM_STEPS + 1 consecutive log entries.
            step_index = ep * (NUM_STEPS + 1) + step

            step_log = data['steps'][step_index][winner]
            obs = step_log['observations']['obs']
            sample_actions = step_log['action']

            # Graph construction.
            # NOTE(review): unit_graph has 3 feature columns and its rows are
            # selected by units_mask, while tile_graph rows are selected by
            # position != (-1, -1). torch.cat along dim 0 needs equal feature
            # widths -- confirm the intended axis against GATActor's input.
            unit_graph = build_unit_graph(
                obs['units'], obs['units_mask'], winner, device=device)
            tile_graph = build_tile_graph(
                obs['map_features'], obs['relic_nodes'], obs['units'], winner,
                device=device)
            input_graph = torch.cat([unit_graph, tile_graph])

            # Action prediction; calling the module (not .forward) runs hooks.
            action_probs, action_values = imitator(input_graph)

            # BC (imitation) loss: negative log-likelihood of the logged action.
            bc_loss = -torch.log(action_probs[sample_actions] + 1e-8).mean()

            # RL loss: reward is the change in point lead since the previous step.
            current_point_diff = _point_lead(step_log, winner)
            if step == 0:
                previous_point_diff = 0
            else:
                previous_point_diff = _point_lead(
                    data['steps'][step_index - 1][winner], winner)
            reward = current_point_diff - previous_point_diff
            gamma = gamma_base ** (NUM_STEPS - step)
            q = action_values[sample_actions].sum()
            rl_loss = ((1 - torch.tanh(torch.tensor(reward) + gamma * q)) ** 2).mean()

            # Accumulate the per-sample losses.
            total_bc_loss += bc_loss
            total_rl_loss += rl_loss
            num_samples += 1

        if num_samples == 0:
            # Nothing was loaded, so there is no graph to back-propagate.
            print(f"Step {learn_step}: no episode could be loaded; skipping update.")
            continue

        # Mean over the samples that were actually loaded.
        avg_bc_loss = total_bc_loss / num_samples
        avg_rl_loss = total_rl_loss / num_samples
        avg_loss = alpha * avg_bc_loss + beta * avg_rl_loss

        # Gradient computation and parameter update.
        optimizer.zero_grad()
        avg_loss.backward()
        optimizer.step()

        # TensorBoard logging.
        writer.add_scalar('Loss/Total', avg_loss.item(), learn_step)
        writer.add_scalar('Loss/BC', avg_bc_loss.item(), learn_step)
        writer.add_scalar('Loss/RL', avg_rl_loss.item(), learn_step)
finally:
    # Flush and close the TensorBoard writer even if training fails or is
    # interrupted.
    writer.close()
