# VAEP 성능 분석 노트북

이 노트북은 VAEP(Valuing Actions by Estimating Probabilities) 모델의 성능을 분석합니다.

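## 주요 분석 항목:
1. 데이터 로딩 및 입력 파일 존재 확인
2. 모델 구성 및 성능 메트릭 (Best Validation Loss, 피처 구성)
3. 경기별 VAEP 통계 및 분포
4. 상위/하위 선수 분석
5. 변수 간 상관관계 분석
6. 효율성 분석 (이벤트당 VAEP, 경기당 이벤트 수)
7. 요약 통계

참고: 포지션별 VAEP 분포와 이벤트 타입별 VAEP 기여도 분석은 이 노트북에 아직 포함되어 있지 않습니다.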

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Apply the seaborn style FIRST. sns.set_style() writes its own 'font.family'
# and 'font.sans-serif' entries into rcParams, so calling it after the font
# assignment would silently discard the font selected below.
sns.set_style('whitegrid')

# Font and figure defaults
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (12, 6)

# Pandas display options
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.6f}'.format)

print("라이브러리 로드 완료")

## 1. 데이터 로딩

In [None]:
# Directory layout, resolved relative to the parent of the working directory
BASE_DIR = Path('..')
processed_dir = BASE_DIR / 'data' / 'processed'
models_dir = BASE_DIR / 'models'
results_dir = BASE_DIR / 'data' / 'vaep_results'
wyscout_dir = BASE_DIR / 'data' / 'wyscout'

# Display label -> expected input file. Later cells look files up by these labels.
files = {
    '학습 데이터': processed_dir / 'vaep_train_events.csv',
    '평가 데이터': processed_dir / 'vaep_eval_events_england.csv',
    '모델 설정': models_dir / 'vaep_config.json',
    '경기별 VAEP': results_dir / 'player_match_vaep_england.csv',
    '시즌별 VAEP': results_dir / 'player_season_vaep_england.csv',
    '선수 정보': wyscout_dir / 'players.json'
}

# Report, per file, whether it exists and (if so) its size in KB
print("파일 존재 확인:")
for name, path in files.items():
    if path.exists():
        status, size = "✓", f"{path.stat().st_size / 1024:.1f} KB"
    else:
        status, size = "✗", "N/A"
    print(f"  {status} {name:15s}: {size}")

In [None]:
# Load every input that exists on disk. Anything missing stays None so that
# later cells can skip the corresponding analysis instead of failing.
player_match_vaep = None
player_season_vaep = None
eval_df = None
model_config = None
players_df = None

if files['경기별 VAEP'].exists():
    player_match_vaep = pd.read_csv(files['경기별 VAEP'])
    print(f"경기별 VAEP 로드 완료: {len(player_match_vaep)} 레코드")

if files['시즌별 VAEP'].exists():
    player_season_vaep = pd.read_csv(files['시즌별 VAEP'])
    print(f"시즌별 VAEP 로드 완료: {len(player_season_vaep)} 선수")

if files['평가 데이터'].exists():
    eval_df = pd.read_csv(files['평가 데이터'])
    print(f"평가 데이터 로드 완료: {len(eval_df)} 이벤트")

if files['모델 설정'].exists():
    # Explicit UTF-8 (same as the players file below) so that decoding does
    # not depend on the platform's default locale encoding.
    with open(files['모델 설정'], 'r', encoding='utf-8') as f:
        model_config = json.load(f)
    print("모델 설정 로드 완료")

if files['선수 정보'].exists():
    with open(files['선수 정보'], 'r', encoding='utf-8') as f:
        players_data = json.load(f)
    players_df = pd.DataFrame(players_data)
    print(f"선수 정보 로드 완료: {len(players_df)} 선수")

## 2. 모델 성능 분석

In [None]:
# Print the model configuration and a breakdown of the input features by
# name prefix. An empty or missing config is reported as absent.
if model_config:
    print("=" * 60)
    print("모델 구성 정보")
    print("=" * 60)
    print(f"입력 차원: {model_config.get('input_dim', 'N/A')}")
    print(f"히든 레이어: {model_config.get('hidden_dims', 'N/A')}")
    print(f"Horizon: {model_config.get('horizon', 'N/A')}")

    # Test against None rather than truthiness: a loss of exactly 0.0 is a
    # valid value and must not be reported as missing. The label is printed
    # in both cases so the line is always self-explanatory.
    best_val_loss = model_config.get('best_val_loss')
    if best_val_loss is None:
        print("Best Validation Loss: N/A")
    else:
        print(f"Best Validation Loss: {float(best_val_loss):.6f}")

    # Feature counts
    if 'feature_names' in model_config:
        feature_names = model_config['feature_names']

        # Group features by their name prefix
        event_features = [f for f in feature_names if f.startswith('event_')]
        subevent_features = [f for f in feature_names if f.startswith('subevent_')]
        tag_features = [f for f in feature_names if f.startswith('tag_')]
        period_features = [f for f in feature_names if f.startswith('period_')]
        # Everything that matches none of the known prefixes
        known_prefixes = ('event_', 'subevent_', 'tag_', 'period_')
        other_features = [f for f in feature_names if not f.startswith(known_prefixes)]

        print(f"\n피처 구성:")
        print(f"  - 이벤트 타입: {len(event_features)}개")
        print(f"  - 서브이벤트 타입: {len(subevent_features)}개")
        print(f"  - 태그: {len(tag_features)}개")
        print(f"  - 피리어드: {len(period_features)}개")
        print(f"  - 기타 (위치, 거리 등): {len(other_features)}개")
        print(f"  - 총 피처 수: {len(feature_names)}개")
else:
    print("모델 설정 파일이 없습니다.")

## 3. VAEP 통계 분석

In [None]:
# Summary statistics of the match-level VAEP table
if player_match_vaep is None:
    print("경기별 VAEP 데이터가 없습니다.")
else:
    print("=" * 60, "경기별 VAEP 통계", "=" * 60, sep="\n")

    # Table size and number of distinct players / matches
    print("\n기본 통계:")
    print(f"  - 총 레코드 수: {len(player_match_vaep):,}")
    print(f"  - 고유 선수 수: {player_match_vaep['playerId'].nunique():,}")
    print(f"  - 고유 경기 수: {player_match_vaep['matchId'].nunique():,}")

    # Distribution of the raw per-match VAEP values
    print("\nVAEP 분포:")
    vaep_values = player_match_vaep['vaep']
    for stat_label, stat_value in (
        ("평균", vaep_values.mean()),
        ("표준편차", vaep_values.std()),
        ("중앙값", vaep_values.median()),
        ("최소값", vaep_values.min()),
        ("최대값", vaep_values.max()),
        ("25% 분위수", vaep_values.quantile(0.25)),
        ("75% 분위수", vaep_values.quantile(0.75)),
    ):
        print(f"  - {stat_label}: {stat_value:.6f}")

    # Distribution of the per-90 values
    print("\nVAEP per 90 분포:")
    per90_values = player_match_vaep['vaep_per90']
    for stat_label, stat_value in (
        ("평균", per90_values.mean()),
        ("표준편차", per90_values.std()),
        ("중앙값", per90_values.median()),
    ):
        print(f"  - {stat_label}: {stat_value:.6f}")

In [None]:
# 2x2 overview figure of the match-level VAEP table
if player_match_vaep is not None:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (ax_vaep, ax_per90), (ax_events, ax_minutes) = axes

    # Top-left: histogram of VAEP with mean and median markers
    vaep_values = player_match_vaep['vaep']
    vaep_mean = vaep_values.mean()
    vaep_median = vaep_values.median()
    ax_vaep.hist(vaep_values, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
    ax_vaep.axvline(vaep_mean, color='red', linestyle='--', label=f"Mean: {vaep_mean:.4f}")
    ax_vaep.axvline(vaep_median, color='green', linestyle='--', label=f"Median: {vaep_median:.4f}")
    ax_vaep.set_xlabel('VAEP')
    ax_vaep.set_ylabel('Frequency')
    ax_vaep.set_title('VAEP Distribution (Match Level)')
    ax_vaep.legend()

    # Top-right: histogram of VAEP per 90 with mean marker
    per90_values = player_match_vaep['vaep_per90']
    per90_mean = per90_values.mean()
    ax_per90.hist(per90_values, bins=50, edgecolor='black', alpha=0.7, color='coral')
    ax_per90.axvline(per90_mean, color='red', linestyle='--', label=f"Mean: {per90_mean:.4f}")
    ax_per90.set_xlabel('VAEP per 90')
    ax_per90.set_ylabel('Frequency')
    ax_per90.set_title('VAEP per 90 Distribution')
    ax_per90.legend()

    # Bottom-left: number of events against VAEP
    ax_events.scatter(player_match_vaep['num_events'], vaep_values, alpha=0.5, s=20)
    ax_events.set_xlabel('Number of Events')
    ax_events.set_ylabel('VAEP')
    ax_events.set_title('Events vs VAEP')

    # Bottom-right: minutes played against VAEP
    ax_minutes.scatter(player_match_vaep['minutes_played'], vaep_values, alpha=0.5, s=20, color='green')
    ax_minutes.set_xlabel('Minutes Played')
    ax_minutes.set_ylabel('VAEP')
    ax_minutes.set_title('Playing Time vs VAEP')

    plt.tight_layout()
    plt.show()

## 4. 상위 선수 분석

In [None]:
# Table of the top-N players by season VAEP per 90.
# top_n, display_cols and (when player info is loaded) player_names are left
# in the notebook namespace for the cells that follow.
if player_season_vaep is not None:
    top_n = 20

    print("=" * 80, f"시즌 VAEP per 90 상위 {top_n}명", "=" * 80, sep="\n")

    top_players = player_season_vaep.nlargest(top_n, 'season_vaep_per90_avg').copy()

    # Columns shown when no player names are available
    display_cols = ['playerId', 'matches_played', 'season_vaep_total',
                    'season_vaep_per90_avg', 'season_vaep_per_match', 'minutes_played', 'num_events']

    if players_df is not None:
        # Map player ids ('wyId') to short names and show the name first
        short_names = players_df.set_index('wyId')['shortName']
        player_names = short_names.to_dict()
        top_players['player_name'] = top_players['playerId'].map(player_names)
        display_cols = ['player_name'] + display_cols

    # Keep only the columns that are actually present in the table
    available_cols = [c for c in display_cols if c in top_players.columns]
    display(top_players[available_cols].reset_index(drop=True))

In [None]:
# Table of the bottom-N players by season VAEP per 90.
# Reads top_n, display_cols and player_names from the notebook namespace.
if player_season_vaep is not None:
    print("=" * 80, f"시즌 VAEP per 90 하위 {top_n}명", "=" * 80, sep="\n")

    bottom_players = player_season_vaep.nsmallest(top_n, 'season_vaep_per90_avg').copy()

    # Attach readable names when player info was loaded
    if players_df is not None:
        bottom_players = bottom_players.assign(
            player_name=bottom_players['playerId'].map(player_names)
        )

    # Keep only the columns that are actually present in the table
    available_cols = [c for c in display_cols if c in bottom_players.columns]
    display(bottom_players[available_cols].reset_index(drop=True))

In [None]:
# Side-by-side horizontal bar charts of the 15 best and 15 worst players by
# season VAEP per 90. Requires both the season table and the player info.
if player_season_vaep is not None and players_df is not None:
    # Build the id -> short name lookup here so this cell does not fail with a
    # NameError when the ranking-table cell above has not been executed.
    player_names = players_df.set_index('wyId')['shortName'].to_dict()

    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    # Top 15 bar chart
    top_15 = player_season_vaep.nlargest(15, 'season_vaep_per90_avg').copy()
    top_15['player_name'] = top_15['playerId'].map(player_names)

    # One colour per plotted row, so the gradient spans the bars actually
    # drawn even when the table holds fewer than 15 players.
    colors = plt.cm.RdYlGn(np.linspace(0.8, 0.3, len(top_15)))
    bars = axes[0].barh(range(len(top_15)), top_15['season_vaep_per90_avg'].values, color=colors)
    axes[0].set_yticks(range(len(top_15)))
    # Fall back to the numeric id for players without a name entry
    axes[0].set_yticklabels(top_15['player_name'].fillna(top_15['playerId'].astype(str)))
    axes[0].set_xlabel('VAEP per 90')
    axes[0].set_title('Top 15 Players by VAEP per 90')
    axes[0].invert_yaxis()

    # Value labels just past the end of each bar
    for bar, val in zip(bars, top_15['season_vaep_per90_avg']):
        axes[0].text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
                    f'{val:.4f}', va='center', fontsize=9)

    # Bottom 15 bar chart
    bottom_15 = player_season_vaep.nsmallest(15, 'season_vaep_per90_avg').copy()
    bottom_15['player_name'] = bottom_15['playerId'].map(player_names)

    colors = plt.cm.RdYlGn(np.linspace(0.2, 0.7, len(bottom_15)))
    bars = axes[1].barh(range(len(bottom_15)), bottom_15['season_vaep_per90_avg'].values, color=colors)
    axes[1].set_yticks(range(len(bottom_15)))
    axes[1].set_yticklabels(bottom_15['player_name'].fillna(bottom_15['playerId'].astype(str)))
    axes[1].set_xlabel('VAEP per 90')
    axes[1].set_title('Bottom 15 Players by VAEP per 90')
    axes[1].invert_yaxis()

    plt.tight_layout()
    plt.show()

## 5. 상관관계 분석

In [None]:
# Correlation heatmap over the numeric season-level columns, followed by a
# ranked list of each variable's correlation with VAEP per 90.
if player_season_vaep is not None:
    print("=" * 60, "변수 간 상관관계", "=" * 60, sep="\n")

    # Restrict to numeric columns
    numeric_cols = player_season_vaep.select_dtypes(include=[np.number]).columns.tolist()

    # Pairwise correlation matrix
    numeric_view = player_season_vaep[numeric_cols]
    corr_matrix = numeric_view.corr()

    # Heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
        linewidths=0.5,
        square=True,
    )
    plt.title('Correlation Matrix of VAEP Metrics')
    plt.tight_layout()
    plt.show()

    # Correlation of every other variable with VAEP per 90, strongest first
    target_col = 'season_vaep_per90_avg'
    if target_col in numeric_cols:
        print("\nVAEP per 90과의 상관관계:")
        vaep_corr = corr_matrix[target_col].sort_values(ascending=False)
        for column_name, coefficient in vaep_corr.items():
            if column_name == target_col:
                continue  # skip the self-correlation entry
            print(f"  {column_name}: {coefficient:.4f}")

## 6. 효율성 분석

In [None]:
# Derive per-event and per-match efficiency ratios on a copy of the season
# table (season_df) and print their summary statistics.
if player_season_vaep is not None:
    season_df = player_season_vaep.copy()

    # Treat zero denominators as missing: dividing by 0 would yield +/-inf,
    # which turns the mean/std/max below into inf/NaN and makes a histogram
    # of the column fail. NaN rows are simply skipped by those aggregations.
    event_counts = season_df['num_events'].replace(0, np.nan)
    match_counts = season_df['matches_played'].replace(0, np.nan)
    season_df['vaep_per_event'] = season_df['season_vaep_total'] / event_counts
    season_df['events_per_match'] = season_df['num_events'] / match_counts

    print("=" * 60)
    print("효율성 지표 분석")
    print("=" * 60)

    print(f"\n이벤트당 VAEP:")
    print(f"  - 평균: {season_df['vaep_per_event'].mean():.6f}")
    print(f"  - 표준편차: {season_df['vaep_per_event'].std():.6f}")
    print(f"  - 최대: {season_df['vaep_per_event'].max():.6f}")
    print(f"  - 최소: {season_df['vaep_per_event'].min():.6f}")

    print(f"\n경기당 이벤트 수:")
    print(f"  - 평균: {season_df['events_per_match'].mean():.1f}")
    print(f"  - 표준편차: {season_df['events_per_match'].std():.1f}")
    print(f"  - 최대: {season_df['events_per_match'].max():.1f}")
    print(f"  - 최소: {season_df['events_per_match'].min():.1f}")

In [None]:
# Efficiency figure: histogram of VAEP per event, and events per match
# against VAEP per 90. Reads season_df from the notebook namespace.
if player_season_vaep is not None:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Histogram binning needs a finite value range, so drop +/-inf (from a
    # division by zero events) and NaN before plotting; the mean marker is
    # computed on the same finite values so it matches what is drawn.
    per_event = season_df['vaep_per_event'].replace([np.inf, -np.inf], np.nan).dropna()
    per_event_mean = per_event.mean()

    # Distribution of VAEP per event
    axes[0].hist(per_event, bins=40, edgecolor='black', alpha=0.7, color='purple')
    axes[0].axvline(per_event_mean, color='red', linestyle='--',
                   label=f"Mean: {per_event_mean:.6f}")
    axes[0].set_xlabel('VAEP per Event')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('VAEP per Event Distribution')
    axes[0].legend()

    # Events per match vs VAEP per 90, coloured by matches played
    scatter = axes[1].scatter(season_df['events_per_match'], season_df['season_vaep_per90_avg'],
                             c=season_df['matches_played'], cmap='viridis', alpha=0.6, s=50)
    axes[1].set_xlabel('Events per Match')
    axes[1].set_ylabel('VAEP per 90')
    axes[1].set_title('Activity vs VAEP (colored by matches played)')
    plt.colorbar(scatter, ax=axes[1], label='Matches Played')

    plt.tight_layout()
    plt.show()

## 7. 요약 통계

In [None]:
# Final summary: model info, data volume, VAEP per 90 range and sign split.
print("=" * 80)
print(" " * 25 + "VAEP 분석 요약")
print("=" * 80)

if model_config:
    print(f"\n[모델 정보]")
    print(f"  - 입력 차원: {model_config.get('input_dim', 'N/A')}")
    # Test against None rather than truthiness so that a loss of exactly 0.0
    # is printed as a value instead of "N/A".
    best_val_loss = model_config.get('best_val_loss')
    if best_val_loss is None:
        print("  - 최적 검증 손실: N/A")
    else:
        print(f"  - 최적 검증 손실: {float(best_val_loss):.6f}")

if player_season_vaep is not None:
    print(f"\n[데이터 규모]")
    print(f"  - 분석 대상 선수 수: {len(player_season_vaep)}명")
    # Summing matches_played over players counts player appearances (one match
    # is counted once per participating player), so it is labelled as such.
    print(f"  - 총 출전 횟수 (선수-경기 합계): {player_season_vaep['matches_played'].sum():.0f}")
    # The real number of matches comes from the match-level table, if loaded.
    if player_match_vaep is not None and 'matchId' in player_match_vaep.columns:
        print(f"  - 총 경기 수: {player_match_vaep['matchId'].nunique():,}")
    print(f"  - 총 이벤트 수: {player_season_vaep['num_events'].sum():,.0f}")

    per90 = player_season_vaep['season_vaep_per90_avg']
    print(f"\n[VAEP 분포]")
    print(f"  - 평균 VAEP per 90: {per90.mean():.6f}")
    print(f"  - 최고 VAEP per 90: {per90.max():.6f}")
    print(f"  - 최저 VAEP per 90: {per90.min():.6f}")

    # Sign split. Negatives are counted explicitly: "total - positive" would
    # also include players whose value is exactly 0 or missing (NaN).
    positive_vaep = int((per90 > 0).sum())
    negative_vaep = int((per90 < 0).sum())
    total_players = len(player_season_vaep)
    neutral_vaep = total_players - positive_vaep - negative_vaep
    pct_base = max(total_players, 1)  # avoid dividing by zero on an empty table
    print(f"\n[VAEP 부호 분포]")
    print(f"  - 양수 VAEP 선수: {positive_vaep}명 ({positive_vaep / pct_base * 100:.1f}%)")
    print(f"  - 음수 VAEP 선수: {negative_vaep}명 ({negative_vaep / pct_base * 100:.1f}%)")
    if neutral_vaep:
        print(f"  - 0 또는 결측 VAEP 선수: {neutral_vaep}명 ({neutral_vaep / pct_base * 100:.1f}%)")

print("\n" + "=" * 80)
print(" " * 25 + "분석 완료")
print("=" * 80)