# Model Training and Evaluation

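Train the baseline, XGBoost, and neural-network models, combine the XGBoost and neural-network models into a weighted ensemble, evaluate the ensemble on the held-out test set, and save all four models.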


In [None]:
import sys
from pathlib import Path

# The project root is taken to be two levels above the current working
# directory (the same anchor the data/ and models/ paths in this notebook use).
project_root = Path().resolve().parent.parent

# The project imports below are package-qualified (`from src.<...> import ...`),
# so the directory that CONTAINS `src` has to be importable; having only
# `<root>/src` on sys.path does not make `src` itself resolvable as a package.
# The root is inserted first so that the original `<root>/src` entry still ends
# up at position 0, exactly where it was before.
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))

from src.data.preprocess import load_match_data, extract_team_compositions, train_val_test_split
from src.features.team_composition import TeamCompositionFeatureExtractor
from src.models.baseline import BaselineModel
from src.models.xgboost_model import XGBoostModel
from src.models.neural_net import NeuralNetModel
from src.models.ensemble import EnsembleModel
import numpy as np

# --- Data preparation ---------------------------------------------------------
# Both input artifacts live under <project root>/data, where the project root is
# two levels above the current working directory. Resolve that directory once.
data_dir = Path().resolve().parent.parent / "data"

# Load the raw match table; extract_team_compositions returns three values,
# bound here to the two per-match team lists and the target `y`.
data_path = data_dir / "raw" / "synthetic_matches.csv"
df = load_match_data(data_path)
team1_list, team2_list, y = extract_team_compositions(df)

# Hero embeddings are optional: when the file is absent the extractor is built
# with embeddings=None. Say so explicitly, so a run without embeddings is
# visible in the notebook output instead of happening silently.
embeddings_path = data_dir / "embeddings" / "hero_embeddings.npy"
if embeddings_path.exists():
    embeddings = np.load(embeddings_path)
else:
    embeddings = None
    print(f"Hero embeddings not found at {embeddings_path}; continuing with embeddings=None")

# Build the feature matrix: one feature vector per (team1, team2) pair.
extractor = TeamCompositionFeatureExtractor(embeddings=embeddings)
X = np.array(
    [
        extractor.extract_feature_vector(team1, team2)
        for team1, team2 in zip(team1_list, team2_list)
    ]
)

# Partition features and targets into train / validation / test subsets
# (split proportions are whatever train_val_test_split defaults to).
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# --- Model training -----------------------------------------------------------
# Hyperparameters used in this cell, named in one place.
N_ESTIMATORS = 100
EPOCHS = 20
XGBOOST_WEIGHT = 0.6
NEURAL_NET_WEIGHT = 0.4

# The baseline is trained on the training split only.
baseline = BaselineModel()
baseline.train(X_train, y_train)

# XGBoost and the neural network are additionally handed the validation split.
# NOTE(review): the variable name `xgboost` would shadow the `xgboost` package
# if that package were ever imported in this notebook.
xgboost = XGBoostModel(n_estimators=N_ESTIMATORS)
xgboost.train(X_train, y_train, X_val, y_val)

# input_dim is the size of the second axis of the full feature matrix X.
neural_net = NeuralNetModel(input_dim=X.shape[1], epochs=EPOCHS)
neural_net.train(X_train, y_train, X_val, y_val)

# Wrap the two trained models in an ensemble with fixed weights.
# The baseline is not part of the ensemble.
ensemble = EnsembleModel(
    xgboost_model=xgboost,
    neural_net_model=neural_net,
    xgboost_weight=XGBOOST_WEIGHT,
    neural_net_weight=NEURAL_NET_WEIGHT,
)

# --- Test-set evaluation (ensemble only) --------------------------------------
from src.utils.metrics import calculate_metrics

# Predictions for the test split.
test_pred = ensemble.predict(X_test)

# predict_proba's result is indexed as a 2-D array and column 1 is kept
# (presumably the positive-class probability -- confirm against EnsembleModel).
class_probabilities = ensemble.predict_proba(X_test)
test_proba = class_probabilities[:, 1]

# calculate_metrics returns a mapping; the "accuracy" and "f1" entries are reported.
metrics = calculate_metrics(y_test, test_pred, test_proba)

print(f"Test Accuracy: {metrics['accuracy']:.4f}")
print(f"Test F1: {metrics['f1']:.4f}")

# --- Persist trained models ---------------------------------------------------
# All artifacts go under <project root>/models, where the project root is two
# levels above the current working directory.
models_dir = Path().resolve().parent.parent / "models"

# Create the output directory up front. The save() implementations are not
# visible from this notebook, so do not rely on them to create missing parent
# directories; on a fresh checkout the first save could otherwise fail.
# exist_ok=True makes re-running the cell harmless.
models_dir.mkdir(parents=True, exist_ok=True)

baseline.save(models_dir / "baseline.pkl")
xgboost.save(models_dir / "xgboost_model.pkl")
neural_net.save(models_dir / "neural_net_model.pt")
# The ensemble target has no file extension (possibly a directory -- confirm
# against EnsembleModel.save).
ensemble.save(models_dir / "ensemble")
