# 概要

GNNを用いたシンプルなモデルです。

特徴量も限りなく少ないので改善すれば結構スコア伸びるかもしれません。


NNの構造周りについては、NN系初心者なので基本的なミスもあるかもしれません。

もしミスを見つけた方（もしくは「こうしてみても良いかも！」という案をお持ちの方）がいらっしゃれば、
コメントいただけると嬉しいです。


# 事前準備

In [1]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric  -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html  -Uq

# ライブラリの読み込み

In [1]:
import os
import random
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from tqdm import tqdm
from fastprogress import progress_bar

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import NNConv, TransformerConv, PNAConv

import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import wandb


import datetime
from pytz import timezone
# Timestamp (current UTC time converted to Asia/Tokyo) formatted as
# YYYYMMDD_HHMM; used below in the checkpoint and submission file names.
now = datetime.datetime.now(timezone('UTC'))
yyyymmdd_hhmm = "{0:%Y%m%d_%H%M}".format(now.astimezone(timezone('Asia/Tokyo')))

In [1]:
# 乱数シードの固定
def set_seed(seed: int):
    """Seed the random number generators used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds ``random``,
    NumPy and PyTorch (``torch.manual_seed`` and ``torch.cuda.manual_seed``),
    and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Seed value shared by every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every library-level generator receives the same seed.
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(seed=42)

In [1]:
NOTE_ID = 'CLB_N007'

In [1]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

# データの読み込み・前処理・マージ

In [1]:
# Directory holding the competition CSV files; list its contents as a sanity check.
DATA_DIR = Path("/kaggle/input/shigglecup-2nd/")
print(os.listdir(DATA_DIR))

In [1]:
# Load every input table from DATA_DIR.
df_train = pd.read_csv(DATA_DIR / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test.csv")
df_team = pd.read_csv(DATA_DIR / "team_id.csv")  # joined on `team_id` in merge_team
df_pokemon = pd.read_csv(DATA_DIR / "pokemon.csv")  # joined on `pokemon_id` in merge_pokemon
df_type = pd.read_csv(DATA_DIR / "typetable.csv")  # not referenced again in the visible code
df_sub = pd.read_csv(DATA_DIR / "sample_submission.csv")

In [1]:
# Encode the Legendary flag as 1 (value equals True) or 0 (anything else).
df_pokemon['Legendary'] = df_pokemon['Legendary'].apply(lambda x: 1 if x == True else 0)

In [1]:
# Standardization: rescale the numeric status columns with StandardScaler,
# fitted on the full pokemon table.
status_cols = ['HP', 'Attack', 'Defense',
       'Sp_Atk', 'Sp_Def', 'Speed', 'Generation']
scaler = StandardScaler()
df_pokemon[status_cols] = scaler.fit_transform(df_pokemon[status_cols])

In [1]:
def merge_team(df_mart, df_team):
    """Attach the six pokemon ids of the `first` and `second` teams.

    For each side, ``df_mart[side]`` is looked up against
    ``df_team['team_id']`` and the six ``pokemon_id_N`` columns are appended
    with a ``_first`` / ``_second`` suffix.

    Args:
        df_mart: Match table containing ``first`` and ``second`` columns.
        df_team: Team table containing ``team_id`` and ``pokemon_id_1`` ..
            ``pokemon_id_6``.

    Returns:
        A new DataFrame with the rows and index of ``df_mart`` plus 12 extra
        columns. Unknown team ids yield NaN in the added columns.

    Raises:
        pandas.errors.MergeError: If ``team_id`` is not unique in ``df_team``.
    """
    pokemon_cols = [f"pokemon_id_{slot}" for slot in range(1, 7)]
    roster = df_team[["team_id"] + pokemon_cols]

    parts = [df_mart]
    for side in ("first", "second"):
        # Only the key column is needed on the left side of the lookup.
        merged = pd.merge(df_mart[[side]], roster,
                          left_on=side, right_on="team_id",
                          how="left", validate="many_to_one")
        added = merged[pokemon_cols].add_suffix(f"_{side}")
        # A left merge keeps the left row order but resets the index, and
        # concat aligns on index, so restore the original one first.
        added.index = df_mart.index
        parts.append(added)

    return pd.concat(parts, axis=1)

In [1]:
# Add the per-team pokemon id columns to both the train and the test matches.
df_train = merge_team(df_train, df_team)
df_test = merge_team(df_test, df_team)

df_train.shape, df_test.shape

In [1]:
def merge_pokemon(df_mart, df_pokemon):
    """Attach the detail columns of each of the 12 pokemon in a match.

    For every id column ``pokemon_id_{slot}_{team}`` (slot 1-6, team
    ``first`` / ``second``) the matching row of ``df_pokemon`` is looked up
    and its detail columns are appended as ``{detail}_{slot}_{team}``.

    Args:
        df_mart: Match table that already holds the 12 pokemon id columns.
        df_pokemon: Pokemon table containing ``pokemon_id`` and the detail
            columns listed in ``pokemon_detail_cols``.

    Returns:
        A new DataFrame with the rows and index of ``df_mart`` plus
        12 x 10 detail columns. Unknown ids yield NaN in the added columns.

    Raises:
        pandas.errors.MergeError: If ``pokemon_id`` is not unique in
            ``df_pokemon``.
    """
    pokemon_detail_cols = ['Type_1', 'Type_2', 'HP', 'Attack', 'Defense',
                           'Sp_Atk', 'Sp_Def', 'Speed', 'Generation', 'Legendary']
    details = df_pokemon[['pokemon_id'] + pokemon_detail_cols]

    parts = [df_mart]
    for team in ('first', 'second'):
        for slot in range(1, 7):
            suffix = f"_{slot}_{team}"  # e.g. '_1_first'
            id_col = f"pokemon_id{suffix}"
            # Merge only the key column; the rest of df_mart is not needed here.
            merged = pd.merge(df_mart[[id_col]], details,
                              left_on=id_col, right_on='pokemon_id',
                              how='left', validate='many_to_one')
            added = merged[pokemon_detail_cols].add_suffix(suffix)
            # The merge resets the index and concat aligns on it, so restore
            # the caller's index to keep rows paired correctly.
            added.index = df_mart.index
            parts.append(added)

    return pd.concat(parts, axis=1)

In [1]:
# Add the per-pokemon detail columns to both the train and the test matches.
df_train = merge_pokemon(df_train, df_pokemon)
df_test = merge_pokemon(df_test, df_pokemon)

df_train.shape, df_test.shape

# ノードとエッジ、ラベルを定義
- GNNではノードとエッジという２つの要素によってグラフ構造が与えられます。
    - 今回は以下で構造をもたせます。
    - ノード：ポケモン（firstチーム6体、secondチーム6体の合計12体）
    - エッジ：ポケモン同士の対戦組み合わせ（firstチーム6体とsecondチーム6体の全組み合わせ 6x6 = 36組み合わせ）


- ノードとエッジにはそれぞれ特徴量をもたせられます。
    - 今回は簡単に以下のような特徴量をもたせています。
    - ノード：各ポケモンのステータス値（標準化済み） -> 8特徴量
    - エッジ：対戦するポケモン同士のスピード値の差(first - second) -> 1特徴量
    
    
- ラベルはターゲットの値。GNNでは、大きくタスクは「ノード」のラベルを当てるものと、「グラフ構造」のラベルを当てるものの２種類があるが、今回は後者。
- 各グラフ構造（＝対戦カード）でどっちが勝ったのかを当てる

In [1]:
def build_nodes(sample: pd.DataFrame):
    """Build the node feature matrix for one match.

    Nodes are ordered first-team slots 1-6 followed by second-team slots 1-6.
    Feature columns are, in order: Speed, HP, Attack, Defense, Sp_Atk,
    Sp_Def, Generation, Legendary.

    Args:
        sample: One match record. The callers in this notebook pass a single
            row (e.g. ``df.iloc[0]`` or a row from ``iterrows``), despite the
            annotation.

    Returns:
        Float tensor of shape [num_nodes, num_node_features].
    """
    node_keys = [f"{num}_{team}" for team in ['first', 'second'] for num in range(1, 7)]
    float_stats = ["Speed", "HP", "Attack", "Defense", "Sp_Atk", "Sp_Def", "Generation"]

    feature_rows = [
        sample[[f"{stat}_{key}" for key in node_keys]].astype(float).values
        for stat in float_stats
    ]
    # The Legendary flag is cast through int before joining the float stack.
    feature_rows.append(
        sample[[f"Legendary_{key}" for key in node_keys]].astype(int).values
    )

    features = np.vstack(feature_rows).T  # [num_nodes, num_node_features]
    return torch.tensor(features, dtype=torch.float)


In [1]:
# Sanity check on one sample:
# node features of the first match (12 pokemon x 8 features).
x = build_nodes(df_train.iloc[0])
print(x)

In [1]:
def build_edges(sample: pd.DataFrame):
    """Build the edge index and edge features for one match.

    One directed edge is created from every first-team node (0-5) to every
    second-team node (6-11), 36 edges in total. The single edge feature is
    the Speed difference ``first - second``.

    Args:
        sample: One match record holding the ``Speed_{num}_{team}`` values;
            the callers in this notebook pass a single row.

    Returns:
        Tuple ``(edge_index, edge_attr)``: a long tensor of shape
        [2, num_edges] and a float tensor of shape [num_edges, 1].
    """
    speed_cols = [f"Speed_{num}_{team}" for team in ['first', 'second'] for num in range(1, 7)]
    speeds = sample[speed_cols].astype(float).values

    # (source, target) node pairs: source in first team, target in second team.
    pairs = [[src, dst] for src in range(6) for dst in range(6, 12)]
    diffs = [speeds[src] - speeds[dst] for src, dst in pairs]

    edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(np.array(diffs).reshape(-1, 1), dtype=torch.float)
    return edge_index, edge_attr


In [1]:
# Sanity check on one sample:
# the edge index (which nodes each edge connects) and the edge feature (speed difference).
edge_index, edge_attr  = build_edges(df_train.iloc[0])
print("edge index:", edge_index)
print("edge attr:", edge_attr)

In [1]:
def build_labels(sample: pd.DataFrame):
    """Return the ``target`` value of one match as a long tensor.

    Args:
        sample: One match record exposing a ``target`` entry.
    """
    label = sample.target
    return torch.tensor(label, dtype=torch.long)

In [1]:
# Sanity check on one sample: label of the first match.
y = build_labels(df_train.iloc[0])
print("y:", y)

In [1]:
# The same sample bundled into a single torch_geometric Data object.
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
data

In [1]:
def build_data_list(df_input: pd.DataFrame, istrain=True):
    """Convert every row of ``df_input`` into a graph ``Data`` object.

    Args:
        df_input: Match table after the team and pokemon merges.
        istrain: When True the label is read from the row via
            ``build_labels``; otherwise the placeholder ``0`` is stored.

    Returns:
        List of ``Data`` objects, one per row, in row order.
    """
    def _row_to_graph(row):
        node_x = build_nodes(row)
        e_index, e_attr = build_edges(row)
        label = build_labels(row) if istrain else 0
        return Data(x=node_x, edge_index=e_index, edge_attr=e_attr, y=label)

    return [_row_to_graph(row) for _, row in df_input.iterrows()]

# Config Definition

In [1]:
# Note: the config definition is split up; more keys ('optimizer',
# 'early_stopping') are added further down, after the model is built.
config = {
   'loss_fn':  nn.CrossEntropyLoss(),
    'batch_size': 64,
    'node_hidden_channels': 8,  # width of the node embeddings in SimpleGCN
    'edge_hidden_channels': 4,  # width of the edge embeddings in SimpleGCN
    'dropout_rate': 0.05,
}

# SplitとDataLoader
- 今回はサンプルなのでCVではなく、train-valid splitな1fold分しか分けてません。（分割はGroupKFoldで実施）


In [1]:
# Split with GroupKFold, grouping by the `first` team id so that a given
# first team never appears in both the train and the validation part.
fold_num = 6
kf = GroupKFold(n_splits=fold_num)

# Only the first of the `fold_num` folds is used: the loop breaks right after
# assigning it, giving a single train/validation split.
for i, (train_idx, test_idx) in enumerate(kf.split(df_train, df_train['target'], groups=df_train["first"])):
    df_tr = df_train.iloc[train_idx]
    df_va = df_train.iloc[test_idx]
    break

In [1]:
df_tr.shape, df_va.shape, df_test.shape

In [1]:
# Convert each split into a list of graph Data objects.
train_data_list = build_data_list(df_tr)
valid_data_list = build_data_list(df_va)
test_data_list = build_data_list(df_test, istrain=False)  # test rows get the placeholder label 0

In [1]:
# Mini-batch loaders; only the training loader shuffles, so validation and
# test predictions keep the row order of their source frames.
train_loader = DataLoader(train_data_list, batch_size=config['batch_size'], shuffle=True)
valid_loader = DataLoader(valid_data_list, batch_size=config['batch_size'], shuffle=False)
test_loader = DataLoader(test_data_list, batch_size=config['batch_size'], shuffle=False)

# Modeling

In [1]:
class EarlyStopping:
    """Stop training when the monitored score stops improving.

    Call the instance once per epoch with the epoch score. Whenever the score
    improves (ties count as improvement), the model's ``state_dict`` is saved
    to ``model_path`` and the patience counter resets. After ``patience``
    consecutive epochs without improvement, ``early_stop`` becomes True.

    Args:
        patience: Number of consecutive non-improving epochs to tolerate.
        mode: ``"min"`` when lower scores are better (e.g. a loss); any other
            value means higher is better.
        delta: Minimum amount by which the score must beat the best score to
            count as an improvement. The default ``0.`` reproduces plain
            "not worse than the best" behaviour.
    """

    def __init__(self, patience=7, mode="min", delta=0.):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # `np.inf` rather than `np.Inf`: the capitalised alias was removed in NumPy 2.0.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register the score of one epoch and checkpoint on improvement."""
        if not np.isfinite(epoch_score):
            # NaN/inf must never become the reference score: every later
            # comparison against NaN is False, which would disable stopping.
            self._count_no_improvement()
            return

        # Internally, larger is always better.
        score = float(epoch_score)
        if self.mode == "min":
            score = -score

        if self.best_score is None or score >= self.best_score + self.delta:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0
        else:
            self._count_no_improvement()

    def _count_no_improvement(self):
        """Advance the patience counter and raise the stop flag when exhausted."""
        self.counter += 1
        print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, epoch_score, model, model_path):
        """Save ``model.state_dict()`` to ``model_path`` if the score is finite."""
        # `x in [..., np.nan]` cannot detect a computed NaN (NaN != NaN), so
        # test finiteness explicitly.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [1]:
class SimpleGCN(nn.Module):
    """Graph-level classifier built from two edge-conditioned ``NNConv`` layers.

    Node and edge features are first embedded with linear layers. After the
    two convolutions each node is mapped to class scores, dropout is applied,
    and the scores are mean-pooled per graph and passed through softmax.

    Args:
        num_node_features: Size of each raw node feature vector.
        num_edge_features: Size of each raw edge feature vector.
        node_hidden_channels: Width of the node embeddings.
        edge_hidden_channels: Width of the edge embeddings.
        num_classes: Number of output classes.
        dropout_rate: Dropout probability applied to the per-node class scores.
    """

    def __init__(self,
                 num_node_features: int,
                 num_edge_features: int,
                 node_hidden_channels: int,
                 edge_hidden_channels: int,
                 num_classes: int,
                 dropout_rate: float):
        super().__init__()

        def make_conv():
            # The inner linear layer maps an edge embedding to the flattened
            # (hidden x hidden) weight matrix NNConv uses for that edge.
            weight_net = nn.Linear(edge_hidden_channels,
                                   node_hidden_channels * node_hidden_channels)
            return NNConv(in_channels=node_hidden_channels,
                          out_channels=node_hidden_channels,
                          nn=weight_net)

        # Layers are created in a fixed order so seeded initialization is reproducible.
        self.node_encoder = nn.Linear(num_node_features, node_hidden_channels)
        self.edge_encoder = nn.Linear(num_edge_features, edge_hidden_channels)
        self.conv1 = make_conv()
        self.conv2 = make_conv()
        self.dropout = nn.Dropout(dropout_rate)
        # NOTE: `bn` is registered but not used in `forward`.
        self.bn = nn.BatchNorm1d(node_hidden_channels)
        self.linear = nn.Linear(node_hidden_channels, num_classes)

    def forward(self, data):
        """Return per-graph class probabilities of shape [num_graphs, num_classes]."""
        node_h = self.node_encoder(data.x)  # [num_nodes, node_hidden_channels]
        edge_h = self.edge_encoder(data.edge_attr)  # [num_edges, edge_hidden_channels]

        for conv in (self.conv1, self.conv2):
            node_h = F.relu(conv(node_h, data.edge_index, edge_h))  # [num_nodes, node_hidden_channels]

        node_scores = self.dropout(self.linear(node_h))  # [num_nodes, num_classes]
        graph_scores = pyg_nn.global_mean_pool(node_scores, data.batch)

        # Softmax output: downstream code reads column 1 as a probability.
        return F.softmax(graph_scores, dim=1)


In [1]:
# Instantiate the model; the input feature sizes are read from the first
# training graph, and the task is a 2-class problem.
model = SimpleGCN(num_node_features=train_data_list[0].x.shape[1],
                  num_edge_features=train_data_list[0].edge_attr.shape[1],
                  node_hidden_channels=config['node_hidden_channels'],
                  edge_hidden_channels=config['edge_hidden_channels'],
                  num_classes=2,
                  dropout_rate=config['dropout_rate'])
model.to(device);

In [1]:
# Second half of the config: these entries need the instantiated model.
config['optimizer'] = torch.optim.Adam(model.parameters(), lr=1e-3)
config['early_stopping'] = EarlyStopping(patience=50, mode="min")  # monitors the validation loss
# Module-level aliases; train_step / eval_step read `optimizer` and `loss_fn` as globals.
optimizer = config['optimizer']
loss_fn = config['loss_fn']
es = config['early_stopping']


In [1]:
def train_step(model, data_loader, config):
    """Run one training epoch over ``data_loader``.

    Reads the notebook globals ``device``, ``epoch`` and ``EPOCH`` (the last
    two only for the progress-bar caption).

    Args:
        model: Model to optimise. It is expected to return class
            probabilities (the notebook's ``SimpleGCN`` ends with softmax).
        data_loader: Loader yielding batched graphs whose ``y`` holds the labels.
        config: Settings dict. ``config['loss_fn']`` and
            ``config['optimizer']`` are used when present; otherwise the
            globals ``loss_fn`` / ``optimizer`` are used.
    """
    criterion = config['loss_fn'] if 'loss_fn' in config else loss_fn
    step_optimizer = config['optimizer'] if 'optimizer' in config else optimizer

    model.train()
    scores, targets = [], []
    loss_sum, n_batches = 0.0, 0
    with tqdm(total=len(data_loader), unit="batch") as pbar:
        pbar.set_description(f"[train] Epoch {epoch+1}/{EPOCH}")
        for data in data_loader:
            step_optimizer.zero_grad()
            data = data.to(device)
            target = data.y
            output = model(data)
            # nn.CrossEntropyLoss applies log-softmax itself and expects
            # logits. Feeding it probabilities softmaxes twice; feeding it
            # log-probabilities gives the true negative log-likelihood,
            # because log_softmax(log p) == log p when p sums to 1.
            loss = criterion(torch.log(output.clamp_min(1e-12)), target)
            loss.backward()
            step_optimizer.step()

            loss_sum += loss.item()
            n_batches += 1
            # Keep the positive-class probability: ROC AUC is a ranking
            # metric and loses information on hard argmax labels.
            scores.append(output[:, 1].detach().cpu().numpy())
            targets.append(target.detach().cpu().numpy())
            pbar.set_postfix(tr_loss=loss_sum / n_batches)
            pbar.update(1)

        # Compute AUC once per epoch instead of re-scoring all accumulated
        # predictions after every batch.
        if targets:
            y_true = np.hstack(targets)
            # roc_auc_score raises when only one class is present.
            auc = (roc_auc_score(y_true, np.hstack(scores))
                   if np.unique(y_true).size > 1 else float("nan"))
            pbar.set_postfix(tr_loss=loss_sum / n_batches, auc=auc)

def eval_step(model, data_loader, config):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Reads the notebook globals ``device``, ``epoch`` and ``EPOCH`` (the last
    two only for the progress-bar caption).

    Args:
        model: Model to evaluate. It is expected to return class
            probabilities (the notebook's ``SimpleGCN`` ends with softmax).
        data_loader: Loader yielding batched graphs whose ``y`` holds the labels.
        config: Settings dict. ``config['loss_fn']`` is used when present;
            otherwise the global ``loss_fn`` is used.

    Returns:
        Tuple ``(val_loss, model)``: the mean of the per-batch losses and the
        model that was passed in.
    """
    criterion = config['loss_fn'] if 'loss_fn' in config else loss_fn

    model.eval()
    scores, targets, losses = [], [], []
    with tqdm(total=len(data_loader), unit="batch") as pbar:
        pbar.set_description(f"[eval]  Epoch {epoch+1}/{EPOCH}")
        with torch.no_grad():
            for data in data_loader:
                data = data.to(device)
                target = data.y
                output = model(data)
                # nn.CrossEntropyLoss expects logits; passing
                # log-probabilities gives the true negative log-likelihood
                # (log_softmax(log p) == log p when p sums to 1) instead of
                # a double softmax.
                loss = criterion(torch.log(output.clamp_min(1e-12)), target)
                losses.append(loss.item())
                # Positive-class probability, so the AUC is a ranking metric.
                scores.append(output[:, 1].cpu().numpy())
                targets.append(target.cpu().numpy())
                pbar.set_postfix(va_loss=sum(losses) / len(losses))
                pbar.update(1)

        # Compute AUC once per epoch; roc_auc_score raises when only one
        # class is present, so fall back to NaN in that case.
        if targets:
            y_true = np.hstack(targets)
            auc = (roc_auc_score(y_true, np.hstack(scores))
                   if np.unique(y_true).size > 1 else float("nan"))
            pbar.set_postfix(va_loss=sum(losses) / len(losses), auc=auc)

    val_loss = np.array(losses).mean()
    return val_loss, model

def inference_step(model, data_loader):
    """Predict on every batch of ``data_loader`` with gradients disabled.

    Reads the notebook global ``device``.

    Args:
        model: Trained model; put into eval mode here.
        data_loader: Loader yielding batched graphs.

    Returns:
        Tuple ``(outputs, labels)``: the raw model outputs stacked row-wise,
        and the argmax class of each row.
    """
    model.eval()
    batch_outputs, batch_labels = [], []
    with tqdm(total=len(data_loader), unit="batch") as pbar, torch.no_grad():
        pbar.set_description("[inference]")
        for data in data_loader:
            output = model(data.to(device))
            batch_outputs.append(output.cpu().numpy())
            batch_labels.append(torch.argmax(output, dim=1).cpu().numpy())
            pbar.update(1)

    return np.vstack(batch_outputs), np.hstack(batch_labels)

In [1]:
%%time
# Upper bound on the number of epochs; early stopping normally ends training sooner.
EPOCH = 1000

# `epoch` and `EPOCH` are read as globals inside train_step / eval_step.
for epoch in progress_bar(range(EPOCH)):
    train_step(model, train_loader, config)
    val_loss, model = eval_step(model, valid_loader, config)

    # Checkpoints the model whenever the validation loss improves.
    es(val_loss, model, model_path=f"./model_{yyyymmdd_hhmm}.pth")
    if es.early_stop:
        print("Early stopping")
        break

## Inference

In [1]:
# Reload the checkpoint written by early stopping (best validation loss)
# before predicting on the test set.
model.load_state_dict(torch.load(f'./model_{yyyymmdd_hhmm}.pth'))
test_proba, test_preds = inference_step(model, test_loader)
test_proba

# Submission

In [1]:
df_sub.head(3)

In [1]:
# Submit the model output for class index 1 as the prediction.
df_sub['target'] = test_proba[:,1]

In [1]:
df_sub.head()

In [1]:
# Write the submission CSV (without the index column), named after the
# notebook id and the run timestamp.
filename = f"./{NOTE_ID}_{yyyymmdd_hhmm}_df_sub.csv"
df_sub.to_csv(filename, index=None)
