In [1]:
from clickhouse_driver import Client
from typing import Generator, Any
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
import warnings
import logging as log
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder as SklearnLabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFECV

warnings.filterwarnings("ignore")
log.basicConfig(level=log.INFO)

# ------------------------------
# DATA LOADING AND VALIDATION
# ------------------------------
def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw") -> Generator[dict[str, Any], None, None]:
    for filename in os.listdir(path_to_games_raw_dir):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            pass

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])

        assert len(team_players) == 2
        for t_id, p_ids in team_players.items():
            assert len(set(p_ids)) == 5

        team_ids = list(team_players.keys())
        rounds = []
        for r in game["rounds"]:
            assert r["round"] is not None
            assert r["ct"] in team_ids
            assert r["terrorists"] in team_ids
            assert r["winner_team"] in team_ids
            rounds.append(r["round"])
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid = []
    game_begin_at_valid = []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

# ------------------------------
# X, y PREPARATION
# ------------------------------
def get_player_id_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())    
    X.extend(sorted(team_players[t1_id])) 
    X.extend(sorted(team_players[t2_id]))    
    return X

def get_team_id_X_for_game(path_to_games_raw_dir: str, game_id: int):    
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    team_ids = sorted({p["team"]["id"] for p in game["players"]})
    assert len(team_ids) == 2
    return team_ids

def get_game_info_X_for_game(path_to_games_raw_dir: str, game_id: int):    
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    timestamp = parse(game["begin_at"]).timestamp()
    map_id = int(game["map"]["id"])
    league_id = int(game["match"]["league"]["id"])
    serie_id = int(game["match"]["serie"]["id"])
    serie_tier = game["match"]["serie"].get("tier", "unknown")
    tournament_id = int(game["match"]["tournament"]["id"])
    return [timestamp, map_id, league_id, serie_id, serie_tier, tournament_id]

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id:0, t2_id:0}
    for r in game["rounds"]:
       team_win_count[r["winner_team"]] += 1 
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X_game_info, X_team_id, X_player_id, y = [], [], [], []
    for game_id in game_ids:
        try:
            X_player_id.append(get_player_id_X_for_game(path_to_games_raw, game_id))
            X_team_id.append(get_team_id_X_for_game(path_to_games_raw, game_id))
            X_game_info.append(get_game_info_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except Exception as e:
            log.warning(f"Skipping game {game_id} due to error: {e}")
            continue
    return (np.array(X_game_info, dtype=object),
            np.array(X_team_id),
            np.array(X_player_id),
            np.array(y, dtype=int))

# ------------------------------
# CUSTOM TRANSFORMERS
# ------------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.array(X)[:, self.columns]

class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = SklearnLabelEncoder()
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.encoder.fit(X.ravel().astype(object))
        return self
    def transform(self, X):
        X = np.asarray(X).copy()
        flat = X.ravel().astype(object)
        mask = np.isin(flat, self.encoder.classes_)
        flat_result = np.full(flat.shape, -1, dtype=int)
        flat_result[mask] = self.encoder.transform(flat[mask])
        return flat_result.reshape(X.shape)

class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j < 5 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j == 0 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)

class GameInfoEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.array(X, dtype=object)
        self.scaler = MinMaxScaler().fit(X[:, [0]])
        self.ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore').fit(X[:, 1:])
        return self
    def transform(self, X):
        X = np.array(X, dtype=object)
        ts_scaled = self.scaler.transform(X[:, [0]])
        cat_ohe = self.ohe.transform(X[:, 1:])
        return sparse.hstack([ts_scaled, cat_ohe]).tocsr()

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
import numpy as np

from joblib import Parallel, delayed

# —Ñ—É–Ω–∫—Ü–∏—è –¥–ª—è –æ–¥–Ω–æ–≥–æ –ø—Ä–∏–∑–Ω–∞–∫–∞
def perm_importance_worker(estimator, X_val, y_val, i, baseline, random_seed):
    col = X_val[:, i].copy()
    if sparse.issparse(col):
        col = col.toarray().ravel()
    rng = np.random.RandomState(random_seed)
    rng.shuffle(col)

    X_perm = X_val.copy()
    if sparse.issparse(X_perm):
        X_perm = X_perm.tolil()
        X_perm[:, i] = col.reshape(-1, 1)
        X_perm = X_perm.tocsr()
    else:
        X_perm[:, i] = col

    perm_score = roc_auc_score(y_val, estimator.predict_proba(X_perm)[:, 1])
    return baseline - perm_score


class SparsePermutationImportanceFeatureSelector:
    def __init__(self, estimator=None, scoring=roc_auc_score, random_state=42, n_jobs=-1):
        self.estimator = estimator if estimator else LogisticRegression(max_iter=1000, random_state=random_state)
        self.scoring = scoring
        self.random_state = random_state
        self.feature_importances_ = None
        self.n_jobs = n_jobs  # —á–∏—Å–ª–æ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω—ã—Ö –ø—Ä–æ—Ü–µ—Å—Å–æ–≤

    def fit(self, X, y):
        if not sparse.issparse(X):
            X = sparse.csr_matrix(X)

        mask = np.arange(X.shape[1])
        iteration = 0
        best_score = -np.inf
        best_mask = mask.copy()
        n_splits = 10

        while True:
            iteration += 1
            print(f"\n=== –ò—Ç–µ—Ä–∞—Ü–∏—è {iteration} ===")
            X_subset = X[:, mask]
            print(f"–¢–µ–∫—É—â–∏–π —Ä–∞–∑–º–µ—Ä –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: {X_subset.shape[1]}")

            tscv = TimeSeriesSplit(n_splits=n_splits)
            fold_importances_list = []
            fold_scores = []

            for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_subset)):
                print(f"\n--- Fold {fold_idx + 1}/{n_splits} ---")
                X_train, X_val = X_subset[train_idx], X_subset[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                self.estimator.fit(X_train, y_train)
                baseline = self.scoring(y_val, self.estimator.predict_proba(X_val)[:, 1])
                fold_scores.append(baseline)
                print(f"ROC-AUC –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–∏: {baseline:.4f}")

                # –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–µ –≤—ã—á–∏—Å–ª–µ–Ω–∏–µ permutation importance –ø–æ –ø—Ä–∏–∑–Ω–∞–∫–∞–º
                fold_importances = Parallel(n_jobs=self.n_jobs)(
                    delayed(perm_importance_worker)(self.estimator, X_val, y_val, i, baseline, self.random_state + fold_idx)
                    for i in range(X_subset.shape[1])
                )
                fold_importances = np.array(fold_importances)
                print(f"–°—Ä–µ–¥–Ω—è—è –≤–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ fold {fold_idx + 1}: {np.mean(fold_importances):.6f}")
                fold_importances_list.append(fold_importances)

            mean_importances = np.mean(fold_importances_list, axis=0)
            avg_score = np.mean(fold_scores)
            print(f"\n–°—Ä–µ–¥–Ω–∏–π ROC-AUC –ø–æ –≤—Å–µ–º —Ñ–æ–ª–¥–∞–º: {avg_score:.4f}")
            print(f"–°—Ä–µ–¥–Ω—è—è –≤–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ –≤—Å–µ–º —Ñ–æ–ª–¥–∞–º: {np.mean(mean_importances):.6f}")

            if avg_score < best_score:
                print(f"–°—Ä–µ–¥–Ω—è—è –º–µ—Ç—Ä–∏–∫–∞ —É–ø–∞–ª–∞ —Å {best_score:.4f} –¥–æ {avg_score:.4f}. –û—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º —Ä–µ–∫—É—Ä—Å–∏—é.")
                mask = best_mask
                break

            best_score = avg_score
            best_mask = mask.copy()

            positive_mask = mean_importances > 0
            n_positive = np.sum(positive_mask)
            print(f"–ü—Ä–∏–∑–Ω–∞–∫–æ–≤ —Å –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–π –≤–∞–∂–Ω–æ—Å—Ç—å—é: {n_positive} –∏–∑ {len(mean_importances)}")

            if n_positive == X_subset.shape[1]:
                print("–í—Å–µ –æ—Å—Ç–∞–≤—à–∏–µ—Å—è –ø—Ä–∏–∑–Ω–∞–∫–∏ –∏–º–µ—é—Ç –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—É—é –≤–∞–∂–Ω–æ—Å—Ç—å. –û—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º —Ä–µ–∫—É—Ä—Å–∏—é.")
                break

            mask = mask[positive_mask]
            print(f"–£–¥–∞–ª—è–µ–º –ø—Ä–∏–∑–Ω–∞–∫–∏ —Å –Ω—É–ª–µ–≤–æ–π –∏–ª–∏ –æ—Ç—Ä–∏—Ü–∞—Ç–µ–ª—å–Ω–æ–π –≤–∞–∂–Ω–æ—Å—Ç—å—é, –æ—Å—Ç–∞–≤–ª—è–µ–º {len(mask)} –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")

        self.feature_importances_ = np.zeros(X.shape[1])
        self.feature_importances_[mask] = 1
        print("\n–ü–µ—Ä–µ—Å—Ç–∞–Ω–æ–≤–æ—á–Ω–∞—è –≤–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤—ã—á–∏—Å–ª–µ–Ω–∞!")
        print(f"–û—Å—Ç–∞–≤–ª–µ–Ω–æ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: {np.sum(self.feature_importances_)} –∏–∑ {X.shape[1]}")
        return self

    def transform(self, X):
        mask = self.feature_importances_ > 0
        print(f"–í—ã–±—Ä–∞–Ω—ã –ø—Ä–∏–∑–Ω–∞–∫–∏ —Å –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ–π –≤–∞–∂–Ω–æ—Å—Ç—å—é: {np.sum(mask)} –∏–∑ {len(mask)}")
        if not sparse.issparse(X):
            X = sparse.csr_matrix(X)
        return X[:, mask]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)

def postprocess_player_features(X: np.ndarray) -> np.ndarray:
    """
    X: np.ndarray, shape (n_samples, 10)
       –ø–µ—Ä–≤—ã–µ 5 —Å—Ç–æ–ª–±—Ü–æ–≤ ‚Äî –∏–≥—Ä–æ–∫–∏ –∫–æ–º–∞–Ω–¥—ã 1
       —Å–ª–µ–¥—É—é—â–∏–µ 5 —Å—Ç–æ–ª–±—Ü–æ–≤ ‚Äî –∏–≥—Ä–æ–∫–∏ –∫–æ–º–∞–Ω–¥—ã 2
    """
    X_processed = []

    for i in range(X.shape[0]):
        team1 = np.sort(X[i, :5])  # –æ—Ç—Å–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –∫–æ–º–∞–Ω–¥—ã 1
        team2 = np.sort(X[i, 5:])  # –æ—Ç—Å–æ—Ä—Ç–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –∫–æ–º–∞–Ω–¥—ã 2

        team1_mean = team1.mean()
        team2_mean = team2.mean()
        mean_diff = team1_mean - team2_mean

        # –ø–æ–ø–∞—Ä–Ω—ã–µ —Ä–∞–∑–Ω–∏—Ü—ã –≤—Å–µ—Ö –∏–≥—Ä–æ–∫–æ–≤ –∫–æ–º–∞–Ω–¥—ã 1 –ø—Ä–æ—Ç–∏–≤ –∫–æ–º–∞–Ω–¥—ã 2
        pairwise_diff = []
        for j in range(5):
            for k in range(5):
                pairwise_diff.append(team1[j] - team2[k])

        # —Å–æ–±–∏—Ä–∞–µ–º –Ω–æ–≤—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏: sorted –∫–æ–º–∞–Ω–¥—ã + —Å—Ä–µ–¥–Ω–∏–µ + —Ä–∞–∑–Ω–∏—Ü–∞ —Å—Ä–µ–¥–Ω–∏—Ö + –ø–æ–ø–∞—Ä–Ω—ã–µ —Ä–∞–∑–Ω–∏—Ü—ã
        new_row = np.concatenate([team1, team2, [team1_mean, team2_mean, mean_diff], pairwise_diff])
        X_processed.append(new_row)

    return np.array(X_processed, dtype=float)
  
class PlayerStatSumFeatureExtractor(BaseEstimator, TransformerMixin):
    """Cumulative sum of a player stat across previous games."""
    def __init__(self, path_to_games_raw_dir: str, game_ids: list[int], key: str = "kills"):
        self.path_to_games_raw_dir = path_to_games_raw_dir
        self.game_ids = game_ids
        self.key = key
        self.cumulative_dict = {}
        self.X_train = None

    def _get_player_stat(self, game_id: int) -> dict:
        with open(os.path.join(self.path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)
        return {p["player"]["id"]: int(p.get(self.key) or 0) for p in game["players"]}

    def fit(self, X, y=None):
        self.X_train = np.zeros((X.shape[0], 10), dtype=int)
        for i, player_ids in enumerate(X):
            # Current cumulative values
            self.X_train[i, :] = [self.cumulative_dict.get(p_id, 0) for p_id in player_ids]
            # Update cumulative_dict with this game's stats
            game_stats = self._get_player_stat(self.game_ids[i])
            for p_id in player_ids:
                self.cumulative_dict[p_id] = self.cumulative_dict.get(p_id, 0) + game_stats.get(p_id, 0)        
        return self

    def transform(self, X):
        # –ï—Å–ª–∏ –ø–µ—Ä–µ–¥–∞–Ω X_train (–æ–±—É—á–∞—é—â–∏–µ –¥–∞–Ω–Ω—ã–µ), –∏—Å–ø–æ–ª—å–∑—É–µ–º —Å–æ—Ö—Ä–∞–Ω—ë–Ω–Ω–æ–µ cumulative
        if self.X_train is not None and X.shape[0] == self.X_train.shape[0]:
            return postprocess_player_features(self.X_train)
        
        # –î–ª—è –Ω–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö —Å—á–∏—Ç–∞–µ–º cumulative –Ω–∞ –ª–µ—Ç—É (–∏—Å–ø–æ–ª—å–∑—É–µ–º —Å–ª–æ–≤–∞—Ä—å)
        X_cumulative_new = np.array([[self.cumulative_dict.get(p_id, 0) for p_id in row] for row in X], dtype=int)
        return postprocess_player_features(X_cumulative_new)


class PlayerStatMeanFeatureExtractor(BaseEstimator, TransformerMixin):
    """Mean of a player stat across previous games, —Å post-processing."""
    def __init__(self, path_to_games_raw_dir: str, game_ids: list[int], key: str = "kills"):
        self.path_to_games_raw_dir = path_to_games_raw_dir
        self.game_ids = game_ids
        self.key = key
        self.cumulative_dict = {}
        self.count_dict = {}
        self.X_train = None

    def _get_player_stat(self, game_id: int) -> dict:
        with open(os.path.join(self.path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)
        return {p["player"]["id"]: int(p.get(self.key) or 0) for p in game["players"]}

    def fit(self, X, y=None):
        self.X_train = np.zeros((X.shape[0], 10), dtype=float)
        for i, player_ids in enumerate(X):
            self.X_train[i, :] = [
                self.cumulative_dict.get(p_id, 0) / max(self.count_dict.get(p_id, 1), 1)
                for p_id in player_ids
            ]
            game_stats = self._get_player_stat(self.game_ids[i])
            for p_id in player_ids:
                self.cumulative_dict[p_id] = self.cumulative_dict.get(p_id, 0) + game_stats.get(p_id, 0)
                self.count_dict[p_id] = self.count_dict.get(p_id, 0) + 1        
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape[0] == self.X_train.shape[0]:
            return postprocess_player_features(self.X_train)
        X_mean_new = np.array([
            [self.cumulative_dict.get(p_id, 0) / max(self.count_dict.get(p_id, 1), 1) for p_id in row]
            for row in X
        ], dtype=float)
        return postprocess_player_features(X_mean_new)


class PlayerStatPerRoundFeatureExtractor(BaseEstimator, TransformerMixin):
    """Mean stat per round for a player across previous games, —Å post-processing."""
    def __init__(self, path_to_games_raw_dir: str, game_ids: list[int], key: str = "kills"):
        self.path_to_games_raw_dir = path_to_games_raw_dir
        self.game_ids = game_ids
        self.key = key
        self.cumulative_dict = {}
        self.count_dict = {}
        self.X_train = None

    def _get_player_stat_per_round(self, game_id: int) -> tuple[dict, int]:
        with open(os.path.join(self.path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)
        max_round = max(r["round"] for r in game["rounds"])
        player_stats = {p["player"]["id"]: int(p.get(self.key) or 0) for p in game["players"]}
        return player_stats, max_round

    def fit(self, X, y=None):
        self.X_train = np.zeros((X.shape[0], 10), dtype=float)
        for i, player_ids in enumerate(X):
            game_stats, max_round = self._get_player_stat_per_round(self.game_ids[i])
            self.X_train[i, :] = [
                self.cumulative_dict.get(p_id, 0) / max(self.count_dict.get(p_id, 1), 1)
                for p_id in player_ids
            ]
            for p_id in player_ids:
                self.cumulative_dict[p_id] = self.cumulative_dict.get(p_id, 0) + game_stats.get(p_id, 0)
                self.count_dict[p_id] = self.count_dict.get(p_id, 0) + max_round       
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape[0] == self.X_train.shape[0]:
            return postprocess_player_features(self.X_train)
        X_per_round_new = np.array([
            [self.cumulative_dict.get(p_id, 0) / max(self.count_dict.get(p_id, 1), 1) for p_id in row]
            for row in X
        ], dtype=float)
        return postprocess_player_features(X_per_round_new)

# --------------------------
# FeatureUnion —Å –±–µ–∑–æ–ø–∞—Å–Ω—ã–º –≤—ã—Ö–æ–¥–æ–º
# --------------------------
class SafeFeatureUnion(FeatureUnion):
    def transform(self, X):
        Xs = []
        for name, trans in self.transformer_list:
            X_trans = trans.transform(X)
            if not sparse.issparse(X_trans):
                X_trans = np.array(X_trans)
            Xs.append(X_trans)
        if any(sparse.issparse(x) for x in Xs):
            Xs = [sparse.csr_matrix(x) if not sparse.issparse(x) else x for x in Xs]
            return sparse.hstack(Xs).tocsr()
        return np.hstack(Xs)


In [2]:
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

In [None]:
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100
N_SPLITS = 10
RANDOM_STATE = 42

PLAYER_STATS_SUM = ["kills", "deaths", "assists", "headshots", "flash_assists"]
PLAYER_STATS_MEAN = [
    "kills", "deaths", "assists", "headshots", "flash_assists",
    "first_kills_diff", "k_d_diff", "adr", "kast", "rating"
]

TIMESTAMP_COL = [0]
GAME_INFO_COL = [1, 2, 3, 4, 5]
TEAM_ID_COL = [6, 7]
PLAYER_ID_COL = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

BASE_ESTIMATOR = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X_game_info, X_team_id, X_player_id, y = [], [], [], []
    for game_id in game_ids:
        try:
            X_player_id.append(get_player_id_X_for_game(path_to_games_raw, game_id))
            X_team_id.append(get_team_id_X_for_game(path_to_games_raw, game_id))
            X_game_info.append(get_game_info_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except Exception as e:
            log.warning(f"Skipping game {game_id} due to error: {e}")
            continue
    return (
        np.array(X_game_info, dtype=object),
        np.array(X_team_id),
        np.array(X_player_id),
        np.array(y, dtype=int),
    )


game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

# ======================================================
# –ü–ê–ô–ü–õ–ê–ô–ù
# ======================================================
pipeline = Pipeline([
    ('features', SafeFeatureUnion(
        transformer_list=[
            # –ò–≥—Ä–æ–∫–∏
            ('players', Pipeline([
                ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                ('le', LabelEncoder()),
                ('bag', PlayerBagEncoder())
            ])),

            # –ö–æ–º–∞–Ω–¥—ã
            ('teams', Pipeline([
                ('selector', ColumnSelector(columns=TEAM_ID_COL)),
                ('le', LabelEncoder()),
                ('bag', TeamBagEncoder())
            ])),

            # –ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –º–∞—Ç—á–µ
            ('game_info', Pipeline([
                ('selector', ColumnSelector(columns=TIMESTAMP_COL + GAME_INFO_COL)),
                ('encoder', GameInfoEncoder())
            ])),

            # –°—É–º–º–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏
            *[
                (f"{key}_sum", Pipeline([
                    ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                    (f"{key}_sum", PlayerStatSumFeatureExtractor(
                        path_to_games_raw_dir=PATH_TO_GAMES_RAW,
                        game_ids=game_ids_train,
                        key=key
                    )),
                    ('scaler', MinMaxScaler())
                ]))
                for key in PLAYER_STATS_SUM
            ],

            # –°—Ä–µ–¥–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏
            *[
                (f"{key}_mean", Pipeline([
                    ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                    (f"{key}_mean", PlayerStatMeanFeatureExtractor(
                        path_to_games_raw_dir=PATH_TO_GAMES_RAW,
                        game_ids=game_ids_train,
                        key=key
                    )),
                    ('scaler', MinMaxScaler())
                ]))
                for key in PLAYER_STATS_MEAN
            ],

            # –°—Ä–µ–¥–Ω–∏–µ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∏ –Ω–∞ —Ä–∞—É–Ω–¥
            *[
                (f"{key}_per_round", Pipeline([
                    ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                    (f"{key}_per_round", PlayerStatPerRoundFeatureExtractor(
                        path_to_games_raw_dir=PATH_TO_GAMES_RAW,
                        game_ids=game_ids_train,
                        key=key
                    )),
                    ('scaler', MinMaxScaler())
                ]))
                for key in PLAYER_STATS_SUM
            ]
        ]
    )),

    # –ü–µ—Ä–µ—Å—Ç–∞–Ω–æ–≤–æ—á–Ω–∞—è –≤–∞–∂–Ω–æ—Å—Ç—å
    ('perm_selector', SparsePermutationImportanceFeatureSelector(
        estimator=BASE_ESTIMATOR,
        scoring=roc_auc_score,
        n_jobs=-1
    )),

    # RFECV
    ('rfecv', RFECV(
        estimator=BASE_ESTIMATOR,
        step=1,
        cv=TimeSeriesSplit(n_splits=N_SPLITS),
        scoring='roc_auc',
        min_features_to_select=1,
        verbose=2,
        n_jobs=-1
    )),

    # –§–∏–Ω–∞–ª—å–Ω—ã–π –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä
    ('clf', BASE_ESTIMATOR)
])

# ======================================================
# –ó–ê–ì–†–£–ó–ö–ê –î–ê–ù–ù–´–•
# ======================================================
print("\nüì• –ó–∞–≥—Ä—É–∂–∞–µ–º –æ–±—É—á–∞—é—â—É—é –∏ —Ç–µ—Å—Ç–æ–≤—É—é –≤—ã–±–æ—Ä–∫–∏...")
X_game_info_train, X_team_id_train, X_player_id_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_game_info_test, X_team_id_test, X_player_id_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

X_train = np.c_[X_game_info_train, X_team_id_train, X_player_id_train]
X_test = np.c_[X_game_info_test, X_team_id_test, X_player_id_test]

# ======================================================
# –û–ë–£–ß–ï–ù–ò–ï
# ======================================================
print("\nüöÄ –û–±—É—á–µ–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞...")
pipeline.fit(X_train, y_train)

# ======================================================
# –ü–†–ï–î–°–ö–ê–ó–ê–ù–ò–Ø
# ======================================================
print("\nüéØ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ...")
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# ======================================================
# –†–ï–ó–£–õ–¨–¢–ê–¢–´
# ======================================================
print("\n‚úÖ –†–ï–ó–£–õ–¨–¢–ê–¢–´ –ú–û–î–ï–õ–ò:")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall   : {recall_score(y_test, y_pred):.4f}")
print(f"F1-score : {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC  : {roc_auc_score(y_test, y_proba):.4f}")

print("\n–ú–∞—Ç—Ä–∏—Ü–∞ –æ—à–∏–±–æ–∫:")
print(confusion_matrix(y_test, y_pred))



üì• –ó–∞–≥—Ä—É–∂–∞–µ–º –æ–±—É—á–∞—é—â—É—é –∏ —Ç–µ—Å—Ç–æ–≤—É—é –≤—ã–±–æ—Ä–∫–∏...

üöÄ –û–±—É—á–µ–Ω–∏–µ –ø–∞–π–ø–ª–∞–π–Ω–∞...

=== –ò—Ç–µ—Ä–∞—Ü–∏—è 1 ===
–¢–µ–∫—É—â–∏–π —Ä–∞–∑–º–µ—Ä –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: 10171

--- Fold 1/10 ---
ROC-AUC –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–∏: 0.5555
