In [17]:
import torch
import numpy as np
import json
import pandas as pd
import random

In [18]:
# Root folder (with trailing slash) that holds episodes.csv and the replay JSON files.
data_dir = "dataset/"

In [19]:
# Read the episode metadata table stored next to the replay files.
episodes_csv = data_dir + 'episodes.csv'
master_df = pd.read_csv(episodes_csv)
# Last expression of the cell: preview the first rows.
master_df.head()

Unnamed: 0,Id,EpisodeId,Index,Reward,State,SubmissionId,InitialConfidence,InitialScore,UpdatedConfidence,UpdatedScore
0,133522497,58802584,0,5.0,2,41862933,200.0,600.0,200.0,701.73895
1,133523193,58802932,0,5.0,2,41862933,200.0,701.738952,200.0,790.51196
2,133523896,58803281,0,5.0,2,41862933,200.0,790.511986,200.0,872.51587
3,133527407,58805040,0,5.0,2,41862933,200.0,1250.139385,200.0,1315.7439
4,133533045,58807859,0,3.0,2,41862933,200.0,1829.229017,200.0,1941.3763


In [20]:
def load_episode_json(file_pathes):
    """Load one randomly chosen episode from a list of JSON file paths.

    Args:
        file_pathes: Non-empty sequence of paths to JSON files. One of them
            is picked uniformly at random with ``random.choice``.

    Returns:
        The parsed JSON content of the chosen file, or ``None`` when that
        file does not exist or does not contain valid JSON. An error message
        is printed in both failure cases, so callers must check for ``None``.

    Raises:
        IndexError: If ``file_pathes`` is empty (raised by ``random.choice``).
    """
    file_path = random.choice(file_pathes)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
    except json.JSONDecodeError as exc:
        # A truncated or corrupt file is reported and skipped exactly like a
        # missing one, instead of aborting the caller with an exception.
        print(f"Error: File {file_path} is not valid JSON ({exc}).")
    return None


In [21]:
# Build the list of replay paths: one "<SubmissionId>_<EpisodeId>.json" entry
# for every distinct episode of every submission in master_df.
file_pathes = []
for sub_id, sub_df in master_df.groupby("SubmissionId"):
    file_pathes.extend(
        data_dir + f"{sub_id}_{ep_id}.json"
        for ep_id in sub_df["EpisodeId"].unique()
    )

## 学習

In [46]:
from model import *

In [None]:
# Number of steps per match
NUM_STEPS = 100
# Batch size
BATCH_SIZE = 64
# Number of training iterations
NUM_LEARN = 10_000

# Blend ratio of the two loss terms (alpha + beta == 1)
alpha = 0.5
beta = 1 - alpha

In [None]:
# Model instance whose forward() is called in the training loop below.
# NOTE(review): GATActor is not defined in this notebook; presumably it comes
# from `from model import *` — confirm, and confirm what input_dim=10 refers to.
imitator = GATActor(input_dim=10)

In [None]:
# Training loop: NUM_LEARN iterations of BATCH_SIZE samples each. Every sample
# is one random step of one random episode, seen from the winner's side, for
# which an imitation loss (bc_loss) and a value-style loss (rl_loss) are computed.
for _ in range(NUM_LEARN):
    for _ in range(BATCH_SIZE):
        data = load_episode_json(file_pathes)
        if data is None:
            # load_episode_json returns None when the file could not be read;
            # skip this sample instead of failing on the subscripts below.
            continue
        env_cfg = data['configuration']['env_cfg']
        # imitate winner move
        winner = int(np.argmax(data['rewards']))
        steps = data['steps']
        # random.randint needs both bounds (random.randint(500) raises
        # TypeError). randrange samples from [0, 500); min() keeps the index
        # valid for episodes with fewer recorded steps.
        step_idx = random.randrange(min(500, len(steps)))

        step_log = steps[step_idx][winner]
        current_step = step_log['match_step']

        obs = step_log['observations']
        sample_actions = step_log['actions']

        units_mask = obs['units_mask']
        units_pos = obs['units']['position']
        units_energy = obs['units']['energy']

        tile_mask = obs['tile_mask']
        tile_types = obs['map_features']['type']
        tile_energy = obs['map_features']['energy']

        # compute action
        # NOTE(review): `units` and `tiles` are not assigned anywhere in the
        # visible cells. Presumably they should be built from units_pos /
        # units_energy and tile_types / tile_energy, but that depends on the
        # signatures of build_unit_graph / build_tile_graph — confirm.
        unit_graph = build_unit_graph(units, units_mask)
        tile_graph = build_tile_graph(tiles, units, units_mask)
        input_graph = torch.cat([unit_graph, tile_graph])
        action_probs, action_value = imitator.forward(input_graph)

        # Imitation-learning cross-entropy loss: negative log-likelihood of the
        # recorded actions. torch.log (not np.log) keeps the autograd graph.
        # NOTE(review): assumes sample_actions is a valid index into
        # action_probs — confirm against the model's output layout.
        bc_loss = -torch.log(action_probs[sample_actions])

        # Reinforcement-learning style loss.
        # Reward = change of the winner's point lead since the previous step.
        team_points = step_log['team_points']
        current_point_diff = team_points[winner] - team_points[1 - winner]
        if step_idx == 0 or current_step == 0:
            # No previous step to compare with. The step_idx check also stops
            # index -1 from wrapping around to the last step of the episode.
            previous_point_diff = 0
        else:
            prev_points = steps[step_idx - 1][winner]['team_points']
            previous_point_diff = prev_points[winner] - prev_points[1 - winner]
        reward = current_point_diff - previous_point_diff
        gamma = 0.95 ** (NUM_STEPS - current_step)
        # torch.mean (not np.mean) so the result stays a differentiable tensor.
        rl_loss = torch.mean((1 - torch.sigmoid(reward + gamma * action_value)) ** 2)
        # NOTE(review): in the lines visible here bc_loss and rl_loss are never
        # combined (alpha / beta), back-propagated, or passed to an optimizer.