In [47]:
import torch
import numpy as np
import json
import pandas as pd
import random

In [48]:
# Directory prefix (with trailing slash) that every dataset path below is built from.
data_dir = "dataset/"

In [49]:
# Load the episode metadata table. Its EpisodeId and SubmissionId columns are
# used further down to build the replay file names.
master_df = pd.read_csv(f"{data_dir}episodes.csv")
master_df.head()

Unnamed: 0,Id,EpisodeId,Index,Reward,State,SubmissionId,InitialConfidence,InitialScore,UpdatedConfidence,UpdatedScore
0,132759051,58420861,0,1.0,2,41789980,,,200.0,600.0
1,132759052,58420861,1,4.0,2,41789980,,,200.0,600.0
2,132765118,58423894,1,5.0,2,41789980,200.0,980.923901,200.0,1040.8575
3,132765797,58424233,0,5.0,2,41789980,200.0,1040.857579,200.0,1088.025
4,132760394,58421533,1,5.0,2,41789980,200.0,600.0,200.0,733.9188


In [50]:
def load_episode_json(file_pathes):
    """Load one randomly selected JSON file from a list of paths.

    Args:
        file_pathes: Non-empty sequence of file paths; one is picked with
            ``random.choice``.

    Returns:
        The parsed JSON content of the picked file, or ``None`` (after printing
        an error message) when that file does not exist.
    """
    chosen_path = random.choice(file_pathes)
    try:
        handle = open(chosen_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: File {chosen_path} not found.")
        return None
    # Only a missing file is tolerated; JSON decoding errors still propagate.
    with handle:
        return json.load(handle)


In [51]:
# Replay file paths: one "<SubmissionId>_<EpisodeId>.json" entry for every
# distinct episode of every submission, in groupby order.
file_pathes = [
    data_dir + f"{submission_id}_{episode_id}.json"
    for submission_id, submission_rows in master_df.groupby("SubmissionId")
    for episode_id in submission_rows["EpisodeId"].unique()
]

## 学習

In [46]:
from model import *

In [None]:
# Number of steps in a single match
NUM_STEPS = 100
# Number of samples per batch
BATCH_SIZE = 64
# Number of training iterations
NUM_LEARN = 10_000

# Blend ratio of the two loss terms (the weights sum to 1)
alpha = 0.5
beta = 1.0 - alpha

In [None]:
# Model instance that the training loop below queries for action probabilities
# and a value estimate. GATActor is not defined in this notebook; presumably it
# comes from `from model import *` — confirm.
# NOTE(review): input_dim=10 looks like a per-node feature size — confirm against the model definition.
imitator = GATActor(input_dim=10)

In [53]:
# Load one randomly chosen replay and show its top-level keys.
# Note: load_episode_json returns None when the chosen file is missing, in
# which case the `.keys()` call below raises AttributeError.
data = load_episode_json(file_pathes)
data.keys()

dict_keys(['configuration', 'description', 'id', 'info', 'name', 'rewards', 'schema_version', 'specification', 'statuses', 'steps', 'title', 'version'])

In [83]:
# Print every field of the observation stored for the first player entry
# (index 0) at step entry 505 of the loaded replay.
step_observation = data['steps'][505][0]['observation']
for field_name, field_value in step_observation.items():
    print(field_name, field_value)

obs {"units": {"position": [[[4, 3], [16, 2], [15, 9], [13, 7], [1, 3], [16, 7], [0, 1], [15, 7], [15, 10], [14, 6], [13, 8], [15, 6], [-1, -1], [-1, -1], [-1, -1], [-1, -1]], [[-1, -1], [19, 10], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [17, 10], [-1, -1], [-1, -1], [-1, -1], [16, 14], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]], "energy": [[109, 256, 23, 271, 94, -5, 95, 9, 27, 134, 190, 319, -1, -1, -1, -1], [-1, 49, -1, -1, -1, -1, 67, -1, -1, -1, 97, -1, -1, -1, -1, -1]]}, "units_mask": [[true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false], [false, true, false, false, false, false, true, false, false, false, true, false, false, false, false, false]], "sensor_mask": [[true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false], [true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false,

In [101]:
# Display the action list recorded for the first player entry (index 0) at
# step entry 504: a list of three-element actions, as the output below shows.
data['steps'][504][0]['action']

[[2, 1, 0],
 [3, 0, 1],
 [2, 1, 0],
 [2, 1, 0],
 [2, 1, 0],
 [3, 0, 1],
 [0, 0, 0],
 [3, 0, 1],
 [2, 1, 0],
 [2, 1, 0],
 [3, 0, 1],
 [2, 1, 0],
 [0, 0, 0],
 [0, 0, 0],
 [0, 0, 0],
 [0, 0, 0]]

In [None]:
def _decode_obs(step_entry):
    """Return the observation dict stored in one per-player step entry.

    The observation lives under ``step_entry['observation']['obs']``. When that
    value is JSON text it is decoded with ``json.loads``; a value that is
    already a dict is returned as-is.
    """
    raw_obs = step_entry['observation']['obs']
    return json.loads(raw_obs) if isinstance(raw_obs, str) else raw_obs


def _point_diff(step_entry, obs, team):
    """Return ``team``'s points minus the other team's points at one step.

    NOTE(review): ``team_points`` is looked up in the decoded observation first
    and in the step entry itself second — confirm where the replay format
    actually stores it.
    """
    source = obs if 'team_points' in obs else step_entry
    points = source['team_points']
    return points[team] - points[1 - team]


# NOTE(review): assumes GATActor is a torch.nn.Module (callable, exposes
# .parameters()) — confirm against the model definition. Optimizer choice and
# default learning rate are placeholders to be tuned.
optimizer = torch.optim.Adam(imitator.parameters())

for _ in range(NUM_LEARN):
    batch_losses = []
    for _ in range(BATCH_SIZE):
        data = load_episode_json(file_pathes)
        if data is None:
            # The chosen replay file is missing (already reported); skip this sample.
            continue

        # Imitate the moves of the player with the highest final reward.
        winner = int(np.argmax(data['rewards']))
        match = random.randint(0, 4)
        # step >= 1, so the previous step entry always exists within the match.
        step = random.randint(1, NUM_STEPS)
        step_index = match * (NUM_STEPS + 1) + step

        step_log = data['steps'][step_index][winner]
        prev_step_log = data['steps'][step_index - 1][winner]
        obs = _decode_obs(step_log)
        prev_obs = _decode_obs(prev_step_log)

        # NOTE(review): nothing else in this notebook defines `units` / `tiles`;
        # the raw observation sub-dicts are passed here. Confirm this matches
        # what build_unit_graph / build_tile_graph expect.
        units = obs['units']
        units_mask = obs['units_mask']
        tiles = obs['map_features']

        # Compute the action distribution and value estimate.
        unit_graph = build_unit_graph(units, units_mask)
        tile_graph = build_tile_graph(tiles, units, units_mask)
        input_graph = torch.cat([unit_graph, tile_graph])
        action_probs, action_value = imitator(input_graph)

        # Imitation-learning cross-entropy loss: negative log-likelihood of the
        # recorded action, averaged over the winner's existing units.
        # NOTE(review): assumes action_probs is (num_units, num_action_types) and
        # that the first element of each recorded action is the action type — TODO confirm.
        sample_actions = torch.as_tensor(
            step_log['action'], dtype=torch.long, device=action_probs.device
        )
        action_types = sample_actions[:, 0]
        alive = torch.as_tensor(
            units_mask[winner], dtype=torch.bool, device=action_probs.device
        )
        unit_index = torch.arange(action_types.shape[0], device=action_probs.device)
        chosen_probs = action_probs[unit_index, action_types]
        # Clamp before the log so a zero probability cannot produce -inf.
        nll = -torch.log(chosen_probs.clamp_min(1e-8))
        bc_loss = (nll * alive).sum() / alive.sum().clamp_min(1)

        # Reinforcement-learning style loss on the change in point difference.
        reward = _point_diff(step_log, obs, winner) - _point_diff(prev_step_log, prev_obs, winner)
        gamma = 0.95 ** (NUM_STEPS - step)
        # torch.mean (not np.mean) keeps the value connected to the autograd graph.
        rl_loss = torch.mean((1 - torch.sigmoid(reward + gamma * action_value)) ** 2)

        batch_losses.append(alpha * bc_loss + beta * rl_loss)

    if not batch_losses:
        # Every sample of this batch was skipped; nothing to optimise.
        continue

    # One optimisation step on the mean loss of the batch.
    loss = torch.stack(batch_losses).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()