<a href="https://colab.research.google.com/github/ryu622/gnn-counterattack-xai-v2/blob/feat%2Fnew-file/scientificdata_one6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

卒論で使用した最終的なデータ前処理コード

In [None]:
# Mount Google Drive at /content/drive; the data paths used by later cells
# all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install torch_geometric

In [None]:
!pip install floodlight

In [None]:
import torch
import numpy as np
import pandas as pd
import os
import xml.etree.ElementTree as ET
from collections import defaultdict
from torch_geometric.data import Data
from torch_geometric.nn import radius_graph
import floodlight.io.dfl as dfl_io

# ==========================================
# 1. GNNDataBuilder クラス (全メソッド完全・距離閾値グラフ版)
# ==========================================
class GNNDataBuilder:
    """Build PyTorch Geometric graphs from wide tracking data and event data.

    The pipeline has two steps:

    1. ``extract_sequences`` cuts a short observation window out of the
       tracking frame for every ball-recovery event and attaches a binary
       label describing where the ball is ``PREDICTION_TARGET`` frames later.
    2. ``to_pyg_data`` converts each window into two graphs (original and
       y-mirrored) whose nodes are the players and the ball.
    """

    def __init__(self):
        self.OBSERVATION_WINDOW = 25   # rows kept per sequence (1 s at the 25 fps used below)
        self.PREDICTION_TARGET = 125   # rows ahead at which the ball position is checked (5 s at 25 fps)
        self.dt_velocity = 0.20        # seconds covered by the 5-row difference used for velocities
        self.pitch_length_half = 52.5  # x normalisation constant
        self.pitch_width_half = 34.0   # y normalisation constant
        self.SUCCESS_X_THRESHOLD = 25.0  # flipped ball x the target frame must exceed for label 1

    def extract_sequences(self, df_pos, df_event, match_id_idx, manual_flip, left_team_id):
        """Cut one observation window per ball-recovery event and label it.

        Args:
            df_pos: Wide tracking frame with at least ``frame_idx`` and
                ``Ball_x`` columns, one row per frame.
            df_event: Event frame with ``eID``, ``period``, ``tID`` and
                ``gameclock`` columns.
            match_id_idx: Integer used to namespace ``SequenceID``
                (``event index + match_id_idx * 1000``).
            manual_flip: Sign (1.0 / -1.0) multiplied onto the per-event
                attack-direction flip.
            left_team_id: Team ID compared against the event's ``tID`` to
                decide the attack direction per half.

        Returns:
            A list of DataFrames (copies of ``OBSERVATION_WINDOW`` consecutive
            rows of ``df_pos``) with the extra columns ``offset_x_val``,
            ``flip_factor``, ``scale_val``, ``label`` and ``SequenceID``.
        """
        recovery_types = ['TacklingGame', 'BallClaiming', 'BallDeflection']
        recovery_events = df_event[df_event['eID'].isin(recovery_types)]
        sequences = []

        # Heuristic unit detection: ball x beyond 500 is taken to mean centimetres.
        is_cm = df_pos['Ball_x'].abs().max() > 500
        scale = 100.0 if is_cm else 1.0

        # Origin detection: a ball x below -30 is taken to mean a centre-origin
        # coordinate system; otherwise x is shifted by half the pitch length.
        min_x_raw = df_pos['Ball_x'].min() / scale
        offset_x = 0.0 if min_x_raw < -30 else 52.5

        stadium_flip = float(manual_flip)

        # Plain arrays give positional access that does not depend on the
        # DataFrame's index labels (the previous label-as-position lookup only
        # worked for a default RangeIndex).
        frame_numbers = df_pos['frame_idx'].to_numpy()
        ball_x = df_pos['Ball_x'].to_numpy()
        n_frames = len(df_pos)

        for idx, event in recovery_events.iterrows():
            period = str(event['period'])
            striking_team = str(event['tID']).strip()

            # The recovering team attacks to the right when it is the "left"
            # team in the first half, or the other team in any later period.
            if period in ['1', 'firstHalf']:
                raw_is_attacking_right = (striking_team == left_team_id)
            else:
                raw_is_attacking_right = (striking_team != left_team_id)

            dynamic_flip = (1.0 if raw_is_attacking_right else -1.0) * stadium_flip

            # Locate the first tracking row at or after the event.
            # NOTE(review): this compares gameclock * 25 against frame_idx
            # without looking at `period`; it assumes both run on the same
            # 25 fps clock starting at the same origin - TODO confirm against
            # the raw data (per-half clocks or offset frame numbers would map
            # events to the wrong rows).
            gameclock = event['gameclock']
            if pd.isna(gameclock):
                # An event without a timestamp cannot be located; skip it
                # instead of aborting the whole match.
                continue
            start_f_raw = int(gameclock * 25)
            candidates = np.flatnonzero(frame_numbers >= start_f_raw)
            if candidates.size == 0:
                continue
            start_idx = int(candidates[0])
            target_idx = start_idx + self.PREDICTION_TARGET
            if target_idx >= n_frames:
                continue

            # Label: ball ends up beyond the threshold AND has advanced > 5 units.
            start_x_flipped = ((ball_x[start_idx] / scale) - offset_x) * dynamic_flip
            target_x_flipped = ((ball_x[target_idx] / scale) - offset_x) * dynamic_flip

            is_in_deep_area = target_x_flipped > self.SUCCESS_X_THRESHOLD
            is_progressing = (target_x_flipped - start_x_flipped) > 5.0

            label = 1 if (is_in_deep_area and is_progressing) else 0

            # Observation window plus the per-sequence metadata that
            # to_pyg_data needs to reproduce the same coordinate transform.
            obs_frames = df_pos.iloc[start_idx : start_idx + self.OBSERVATION_WINDOW].copy()
            obs_frames.loc[:, 'offset_x_val'] = float(offset_x)
            obs_frames.loc[:, 'flip_factor'] = float(dynamic_flip)
            obs_frames.loc[:, 'scale_val'] = float(scale)
            obs_frames.loc[:, 'label'] = int(label)
            # NOTE(review): IDs collide if the event index contains duplicate
            # labels or values >= 1000 - verify how df_event is assembled.
            obs_frames.loc[:, 'SequenceID'] = int(idx + (match_id_idx * 1000))
            sequences.append(obs_frames)

        return sequences

    def to_pyg_data(self, sequences, team_map):
        """Convert observation windows into PyTorch Geometric ``Data`` graphs.

        For every sequence two graphs are emitted: the original and a copy
        mirrored along y (``y_aug = -1``). Each node carries
        ``[px, py, vx, vy, 0.0, 0.0, team]`` where ``team`` is 0.0 for Home,
        1.0 for Away and 2.0 for the ball. Nodes whose current x value is
        missing, NaN or exactly 0 are left out.

        Args:
            sequences: Output of ``extract_sequences``.
            team_map: ``{'Home': [player ids], 'Away': [player ids]}``; the
                ids are used as ``<id>_x`` / ``<id>_y`` column prefixes.

        Returns:
            A list of ``Data`` objects with ``x``, ``edge_index``, ``y``,
            ``pos``, ``prev_pos``, ``pprev_pos`` and ``sequence_id``.
            Sequences that are too short or contain no usable node are skipped.
        """
        pyg_list = []
        if not sequences: return []

        # Fallback unit guess for sequences that carry no 'scale_val' column.
        # It looks at every sequence: a single 1 s window in which the ball
        # stays near x = 0 would otherwise be misread as metres.
        fallback_is_cm = any(seq['Ball_x'].abs().max() > 500 for seq in sequences)
        fallback_scale = 100.0 if fallback_is_cm else 1.0

        # (x column, y column, team flag) for every candidate node; this does
        # not depend on the sequence, so it is built once.
        entities = [
            (f"{p_id}_x", f"{p_id}_y", team_val)
            for team_prefix, team_val in (('Home', 0.0), ('Away', 1.0))
            for p_id in team_map[team_prefix]
        ]
        entities.append(('Ball_x', 'Ball_y', 2.0))

        # Edge radius in normalised coordinates. x and y are divided by
        # different constants, so 0.6 corresponds to 31.5 m along x
        # (0.6 * 52.5) and 20.4 m along y (0.6 * 34.0).
        r = 0.6

        for seq in sequences:
            # Three rows are needed: current, 5 rows back and 10 rows back.
            if len(seq) < 11:
                continue
            frame_pprev = seq.iloc[-11]  # 10 rows before the current one
            frame_prev  = seq.iloc[-6]   # 5 rows before the current one
            frame_curr  = seq.iloc[-1]   # current (last) row

            off_x = frame_curr['offset_x_val']
            flip = frame_curr['flip_factor']
            label = int(frame_curr['label'])
            sid = int(frame_curr['SequenceID'])
            # Use the scale chosen by extract_sequences so that labels and
            # node coordinates share one transform.
            scale = float(frame_curr['scale_val']) if 'scale_val' in frame_curr else fallback_scale

            def transform_x(raw_val):
                return (((raw_val / scale) - off_x) * flip) / self.pitch_length_half

            # NOTE(review): unlike x, y gets no origin offset - presumably y is
            # always centre-origin; confirm for data where off_x is 52.5.
            def transform_y(raw_val, y_aug=1.0):
                return (((raw_val / scale) * flip) * y_aug) / self.pitch_width_half

            for y_aug in [1.0, -1.0]:
                node_features, pos_list, prev_pos_list, pprev_pos_list = [], [], [], []

                for col_x, col_y, t_val in entities:
                    if col_x not in frame_curr or pd.isna(frame_curr[col_x]) or frame_curr[col_x] == 0:
                        continue

                    px, py = transform_x(frame_curr[col_x]), transform_y(frame_curr[col_y], y_aug)
                    px_p, py_p = transform_x(frame_prev[col_x]), transform_y(frame_prev[col_y], y_aug)
                    px_pp, py_pp = transform_x(frame_pprev[col_x]), transform_y(frame_pprev[col_y], y_aug)

                    # Finite-difference velocity over the 5-row gap.
                    vx = (px - px_p) / self.dt_velocity
                    vy = (py - py_p) / self.dt_velocity

                    node_features.append([px, py, vx, vy, 0.0, 0.0, t_val])
                    pos_list.append([px, py])
                    prev_pos_list.append([px_p, py_p])
                    pprev_pos_list.append([px_pp, py_pp])

                if not node_features:
                    # No usable node: an empty list would become a 1-D tensor
                    # and make torch.cdist fail, so skip this graph.
                    continue

                x_tensor = torch.tensor(node_features, dtype=torch.float)
                pos_tensor = torch.tensor(pos_list, dtype=torch.float)

                # --- Distance-threshold graph ---
                # Pairwise distance matrix, shape [N, N].
                dist_matrix = torch.cdist(pos_tensor, pos_tensor, p=2)
                # All (i, j) pairs within the radius ...
                edge_index = (dist_matrix <= r).nonzero(as_tuple=False).t()
                # ... minus the self-loops (i == j).
                mask = edge_index[0] != edge_index[1]
                edge_index = edge_index[:, mask]

                # NOTE(review): the +5000 offset marking the mirrored copy can
                # equal another match's SequenceID (IDs are spaced 1000 per
                # match) - verify before grouping by sequence_id.
                pyg_list.append(Data(
                    x=x_tensor,
                    edge_index=edge_index,
                    y=torch.tensor([label], dtype=torch.long),
                    pos=pos_tensor,
                    prev_pos=torch.tensor(prev_pos_list, dtype=torch.float),
                    pprev_pos=torch.tensor(pprev_pos_list, dtype=torch.float),
                    sequence_id=torch.tensor([sid if y_aug == 1.0 else sid + 5000], dtype=torch.long)
                ))
        return pyg_list

# ==========================================
# 2. 補助関数の定義
# ==========================================
def get_match_direction_map(e_path):
    """Read which team is on the left/right side for each game section.

    Every ``Event`` element below the root of the XML file at ``e_path`` that
    has a ``KickOff`` child contributes one entry keyed by the kick-off's
    ``GameSection`` attribute; a later kick-off in the same section replaces
    the earlier entry.

    Returns:
        ``{game_section: {'Left': TeamLeft, 'Right': TeamRight}}``.
    """
    directions = {}
    for event_node in ET.parse(e_path).getroot().findall('.//Event'):
        kickoff = event_node.find('KickOff')
        if kickoff is None:
            continue
        directions[kickoff.get('GameSection')] = {
            'Left': kickoff.get('TeamLeft'),
            'Right': kickoff.get('TeamRight'),
        }
    return directions

def parse_dfl_positions_to_wide(p_path):
    """Parse a positions XML file into one wide DataFrame row per frame.

    Each ``FrameSet`` element supplies a ``PersonId`` and ``GameSection``;
    each of its ``Frame`` children supplies the frame number ``N`` and the
    values ``X``, ``Y`` and ``S``. The result has the columns ``frame_idx``,
    ``period`` and ``<id>_x`` / ``<id>_y`` / ``<id>_s`` for every person id
    (sorted), where an id containing ``0000XT`` is renamed to ``Ball``.
    Rows are ordered by frame number and missing values are forward-filled.

    NOTE(review): the forward fill also carries the last known values of an
    id into every later frame where it has no data - presumably this keeps
    substituted-off players at their last position; verify that is intended.
    """
    root = ET.parse(p_path).getroot()
    frames = defaultdict(dict)
    person_ids = set()

    for frameset in root.findall('.//FrameSet'):
        person_id = frameset.get('PersonId')
        section = frameset.get('GameSection')
        person_ids.add(person_id)
        for frame in frameset.findall('Frame'):
            record = frames[int(frame.get('N'))]
            record[person_id] = [float(frame.get(attr)) for attr in ('X', 'Y', 'S')]
            record['period'] = section

    # Pair every id with its column prefix once, in sorted id order.
    named_ids = [('Ball' if '0000XT' in pid else pid, pid) for pid in sorted(person_ids)]

    rows = []
    for frame_number in sorted(frames):
        record = frames[frame_number]
        row = {'frame_idx': frame_number, 'period': record.get('period')}
        for prefix, pid in named_ids:
            x_val, y_val, s_val = record.get(pid, [np.nan, np.nan, np.nan])
            row[f'{prefix}_x'] = x_val
            row[f'{prefix}_y'] = y_val
            row[f'{prefix}_s'] = s_val
        rows.append(row)

    wide = pd.DataFrame(rows)
    return wide.ffill()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 3      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = 1.0     # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "00000Q"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("->  成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 6      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = -1.0    # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "00000P"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("->  成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 5      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = 1.0     # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "00000H"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("->  成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 4      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = -1.0    # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "000005"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("->  成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 2      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = 1.0     # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "000011"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("->  成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 1      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = 1.0     # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "00000B"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("-> 成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

In [None]:
# ==========================================
# 3. Per-match processing (one match per cell)
# ==========================================
import traceback  # local to this cell: used only to report failures in full

raw_data_path = "/content/drive/MyDrive/GNN_Football_Analysis/Raw_Data"
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
os.makedirs(save_dir, exist_ok=True)

builder = GNNDataBuilder()

# --- Manually chosen settings for this match ---
TARGET_MATCH_IDX = 0      # 0-based position in the sorted match-information file list
SET_MANUAL_FLIP = 1.0     # sign multiplied onto the per-event attack-direction flip (-1.0 mirrors x)
SET_LEFT_TEAM = "00000G"  # team ID passed to extract_sequences as left_team_id

# List the directory once, sorted, so file selection is deterministic.
raw_files = sorted(os.listdir(raw_data_path))
info_files = [f for f in raw_files if "matchinformation" in f]
i_f = info_files[TARGET_MATCH_IDX]
match_id_str = i_f.split('_')[-1].replace('.xml', '')

print(f"\n===== 【実行】 試合 {TARGET_MATCH_IDX + 1}: {match_id_str} =====")

i_path = os.path.join(raw_data_path, i_f)
p_name = next((f for f in raw_files if match_id_str in f and "positions_raw" in f), None)
e_name = next((f for f in raw_files if match_id_str in f and "events_raw" in f), None)
if p_name is None or e_name is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"positions_raw / events_raw file for match {match_id_str} not found in {raw_data_path}"
    )
p_path = os.path.join(raw_data_path, p_name)
e_path = os.path.join(raw_data_path, e_name)

try:
    # 1. Load team sheets, tracking data and events.
    sheets = dfl_io.read_teamsheets_from_mat_info_xml(i_path)
    df_pos = parse_dfl_positions_to_wide(p_path)
    events, _, _ = dfl_io.read_event_data_xml(e_path, i_path)

    # Flatten the per-half / per-team event tables into one frame, tagging
    # each row with its team ID and half.
    all_events_list = []
    for half in events:
        for team_label in events[half]:
            df_ev = events[half][team_label].events.copy()
            df_ev['tID'] = str(sheets[team_label].teamsheet['tID'].iloc[0]).strip()
            df_ev['period'] = half
            if 'type' in df_ev.columns: df_ev = df_ev.rename(columns={'type': 'eID'})
            all_events_list.append(df_ev)
    df_event_all = pd.concat(all_events_list)

    # 2. Build sequences and graphs.
    team_map = {'Home': list(sheets['Home'].teamsheet['pID']), 'Away': list(sheets['Away'].teamsheet['pID'])}

    match_sequences = builder.extract_sequences(
        df_pos=df_pos,
        df_event=df_event_all,
        match_id_idx=TARGET_MATCH_IDX + 1,
        manual_flip=SET_MANUAL_FLIP,
        left_team_id=SET_LEFT_TEAM
    )

    pyg_data = builder.to_pyg_data(match_sequences, team_map)

    # 3. Save the graphs of this match.
    if pyg_data:
        save_path = os.path.join(save_dir, f"match_{TARGET_MATCH_IDX + 1}.pt")
        torch.save(pyg_data, save_path)
        print(f"->  成功: {len(pyg_data)} シーンを保存しました。")
    else:
        print("->  成功シーンが0件でした。")

except Exception as e:
    # Notebook top-level boundary: keep the short message, but also show the
    # traceback so the failing step can be located.
    print(f"->  エラー: {e}")
    traceback.print_exc()

クロスバリデーションのために、そのまま保存

In [None]:
import os
import torch
from collections import Counter

# ==========================================
# Section 2: load the per-match files and stamp each graph with a match_id
# ==========================================
save_dir = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/matches_v17"
final_output_path = "/content/drive/MyDrive/GNN_Football_Analysis/Processed_Data/gnn_data_v18_final.pt"

# Every "match_*.pt" file in save_dir, in lexical order.
# NOTE(review): lexical order puts match_10 before match_2 - harmless while
# there are fewer than ten files; confirm if more matches are added.
match_files = sorted(
    os.path.join(save_dir, name)
    for name in os.listdir(save_dir)
    if name.startswith('match_') and name.endswith('.pt')
)
all_data = []

print(f"{len(match_files)} 試合分の統合開始...")

# The match ID is the 1-based position in the sorted list, independent of the
# number that appears in the file name.
for current_match_id, match_path in enumerate(match_files, start=1):
    # weights_only=False unpickles arbitrary Python objects, so only files
    # produced by this notebook should be loaded here.
    m_data = torch.load(match_path, weights_only=False)

    # Attach the match ID to every graph so folds can be split by match.
    for graph in m_data:
        graph.match_id = torch.tensor([current_match_id])

    all_data.extend(m_data)
    print(f" -> {os.path.basename(match_path)}: {len(m_data)} frames loaded. (Marked as Match {current_match_id})")

# ==========================================
# Section 3: summary and save
# ==========================================
print(f"\n--- 最終データ構成（CV用・ID刻印済み） ---")
print(f"総フレーム数: {len(all_data)}")

# Label distribution over all graphs (1 = success, 0 = failure).
all_lbls = Counter(int(graph.y.item()) for graph in all_data)
print(f"全データ内訳: 成功 {all_lbls[1]} 枚 / 失敗 {all_lbls[0]} 枚")

# Graph count per match, for a quick sanity check.
match_counts = Counter(int(graph.match_id.item()) for graph in all_data)
print(f"試合別フレーム数: {dict(sorted(match_counts.items()))}")

# NOTE(review): the description says 'v16' while the inputs come from
# matches_v17 and the output file is named v18 - confirm which is intended.
save_obj = {
    'all_data': all_data,
    'description': 'v16 integrated data with explicit match_id'
}

torch.save(save_obj, final_output_path)
print(f"\n 保存完了: {final_output_path}")