In [None]:
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Add the parent of the current working directory to the import path so the
# project-local packages imported below (`configs`, `src`) can be resolved.
# NOTE(review): presumably '..' is the project root when the notebook is run
# from its own folder — confirm.
sys.path.append(os.path.abspath('..'))
# NOTE(review): the star import hides where names come from; DIR_INPUT (used when
# loading the CSVs) is presumably defined in configs.config — confirm, and
# consider importing the needed names explicitly.
from configs.config import *
from src.util import Validation

## 1. データ読み込み

In [None]:
# Load the train/test metadata tables from the competition input directory.
meta_dir = os.path.join(DIR_INPUT, 'atmaCup22_metadata')
df_train = pd.read_csv(os.path.join(meta_dir, 'train_meta.csv'))
df_test = pd.read_csv(os.path.join(meta_dir, 'test_meta.csv'))

print(f"Train: {len(df_train):,} samples")
print(f"Test:  {len(df_test):,} samples")

# Build the grouping key used for group-aware CV: "<quarter>_<session>".
session_as_text = df_train['session'].astype(str)
df_train['group'] = df_train['quarter'] + '_' + session_as_text

print(f"\nUnique groups: {df_train['group'].nunique()}")
print(f"Unique players: {df_train['label_id'].nunique()}")

## 2. GroupKFold でのリークチェック

In [None]:
# Create the group-based validator (5 splits) via the project helper.
validator = Validation.create_validator(n_splits=5, method='group')

# Targets and group keys handed to the leak check as plain arrays.
train_labels = df_train['label_id'].values
train_groups = df_train['group'].values

# Run the group-leak check; the returned mapping exposes a 'has_leak' entry.
result = Validation.check_group_leak(
    validator=validator,
    X=df_train,
    y=train_labels,
    groups=train_groups,
    verbose=True,
)

# Banner-framed verdict.
print(f"\n{'='*80}")
print(f"Has Leak: {result['has_leak']}")
print(f"{'='*80}")

## 3. CV統計情報の取得

In [None]:
# Collect per-fold CV statistics for the validator created earlier.
train_labels = df_train['label_id'].values
train_groups = df_train['group'].values
stats_df = Validation.get_cv_statistics(
    validator=validator,
    X=df_train,
    y=train_labels,
    groups=train_groups,
)

print("\nCV Statistics:")
display(stats_df)

# Averages across folds, rounded to whole numbers for readability.
print("\nSummary:")
print(f"  Average train samples: {stats_df['train_samples'].mean():.0f}")
print(f"  Average valid samples: {stats_df['valid_samples'].mean():.0f}")
print(f"  Average train groups: {stats_df['train_groups'].mean():.0f}")
print(f"  Average valid groups: {stats_df['valid_groups'].mean():.0f}")

## 4. StratifiedGroupKFold との比較（参考）

In [None]:
# Create the stratified, group-aware validator (5 shuffled splits, fixed seed).
stratified_options = {
    'method': 'stratified_group',
    'n_splits': 5,
    'shuffle': True,
    'random_state': 42,
}
validator_stratified = Validation.create_validator(**stratified_options)

# Run the same group-leak check as for the plain group validator, for comparison.
train_labels = df_train['label_id'].values
train_groups = df_train['group'].values
result_stratified = Validation.check_group_leak(
    validator=validator_stratified,
    X=df_train,
    y=train_labels,
    groups=train_groups,
    verbose=True,
)

# Banner-framed verdict.
print(f"\n{'='*80}")
print(f"Has Leak (StratifiedGroupKFold): {result_stratified['has_leak']}")
print(f"{'='*80}")

## 5. まとめ

- **GroupKFold**: リークなし、推奨
- **StratifiedGroupKFold**: リークなし（グループ制約が優先）、層化は部分的

このコンペでは **GroupKFold** を推奨します。