In [1]:
from clickhouse_driver import Client
from typing import Generator, Any
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
import warnings
import logging as log
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder as SklearnLabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFECV

warnings.filterwarnings("ignore")
log.basicConfig(level=log.INFO)

# ------------------------------
# DATA LOADING AND VALIDATION
# ------------------------------
def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw") -> Generator[dict[str, Any], None, None]:
    for filename in os.listdir(path_to_games_raw_dir):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            pass

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])

        assert len(team_players) == 2
        for t_id, p_ids in team_players.items():
            assert len(set(p_ids)) == 5

        team_ids = list(team_players.keys())
        rounds = []
        for r in game["rounds"]:
            assert r["round"] is not None
            assert r["ct"] in team_ids
            assert r["terrorists"] in team_ids
            assert r["winner_team"] in team_ids
            rounds.append(r["round"])
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid = []
    game_begin_at_valid = []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

# ------------------------------
# X, y PREPARATION
# ------------------------------
def get_player_id_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())    
    X.extend(sorted(team_players[t1_id])) 
    X.extend(sorted(team_players[t2_id]))    
    return X

def get_team_id_X_for_game(path_to_games_raw_dir: str, game_id: int):    
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    team_ids = sorted({p["team"]["id"] for p in game["players"]})
    assert len(team_ids) == 2
    return team_ids

def get_game_info_X_for_game(path_to_games_raw_dir: str, game_id: int):    
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    timestamp = parse(game["begin_at"]).timestamp()
    map_id = int(game["map"]["id"])
    league_id = int(game["match"]["league"]["id"])
    serie_id = int(game["match"]["serie"]["id"])
    serie_tier = game["match"]["serie"].get("tier", "unknown")
    tournament_id = int(game["match"]["tournament"]["id"])
    return [timestamp, map_id, league_id, serie_id, serie_tier, tournament_id]

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id:0, t2_id:0}
    for r in game["rounds"]:
       team_win_count[r["winner_team"]] += 1 
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X_game_info, X_team_id, X_player_id, y = [], [], [], []
    for game_id in game_ids:
        try:
            X_player_id.append(get_player_id_X_for_game(path_to_games_raw, game_id))
            X_team_id.append(get_team_id_X_for_game(path_to_games_raw, game_id))
            X_game_info.append(get_game_info_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except Exception as e:
            log.warning(f"Skipping game {game_id} due to error: {e}")
            continue
    return (np.array(X_game_info, dtype=object),
            np.array(X_team_id),
            np.array(X_player_id),
            np.array(y, dtype=int))

# ------------------------------
# CUSTOM TRANSFORMERS
# ------------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.array(X)[:, self.columns]

class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = SklearnLabelEncoder()
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.encoder.fit(X.ravel().astype(object))
        return self
    def transform(self, X):
        X = np.asarray(X).copy()
        flat = X.ravel().astype(object)
        mask = np.isin(flat, self.encoder.classes_)
        flat_result = np.full(flat.shape, -1, dtype=int)
        flat_result[mask] = self.encoder.transform(flat[mask])
        return flat_result.reshape(X.shape)

class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j < 5 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j == 0 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)

class GameInfoEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.array(X, dtype=object)
        self.scaler = MinMaxScaler().fit(X[:, [0]])
        self.ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore').fit(X[:, 1:])
        return self
    def transform(self, X):
        X = np.array(X, dtype=object)
        ts_scaled = self.scaler.transform(X[:, [0]])
        cat_ohe = self.ohe.transform(X[:, 1:])
        return sparse.hstack([ts_scaled, cat_ohe]).tocsr()

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
import numpy as np

from joblib import Parallel, delayed

# функция для одного признака
def perm_importance_worker(estimator, X_val, y_val, i, baseline, random_seed):
    col = X_val[:, i].copy()
    if sparse.issparse(col):
        col = col.toarray().ravel()
    rng = np.random.RandomState(random_seed)
    rng.shuffle(col)

    X_perm = X_val.copy()
    if sparse.issparse(X_perm):
        X_perm = X_perm.tolil()
        X_perm[:, i] = col.reshape(-1, 1)
        X_perm = X_perm.tocsr()
    else:
        X_perm[:, i] = col

    perm_score = roc_auc_score(y_val, estimator.predict_proba(X_perm)[:, 1])
    return baseline - perm_score


class SparsePermutationImportanceFeatureSelector:
    def __init__(self, estimator=None, scoring=roc_auc_score, random_state=42, n_jobs=-1):
        self.estimator = estimator if estimator else LogisticRegression(max_iter=1000, random_state=random_state)
        self.scoring = scoring
        self.random_state = random_state
        self.feature_importances_ = None
        self.n_jobs = n_jobs  # число параллельных процессов

    def fit(self, X, y):
        if not sparse.issparse(X):
            X = sparse.csr_matrix(X)

        mask = np.arange(X.shape[1])
        iteration = 0
        best_score = -np.inf
        best_mask = mask.copy()
        n_splits = 10

        while True:
            iteration += 1
            print(f"\n=== Итерация {iteration} ===")
            X_subset = X[:, mask]
            print(f"Текущий размер признаков: {X_subset.shape[1]}")

            tscv = TimeSeriesSplit(n_splits=n_splits)
            fold_importances_list = []
            fold_scores = []

            for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_subset)):
                print(f"\n--- Fold {fold_idx + 1}/{n_splits} ---")
                X_train, X_val = X_subset[train_idx], X_subset[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                self.estimator.fit(X_train, y_train)
                baseline = self.scoring(y_val, self.estimator.predict_proba(X_val)[:, 1])
                fold_scores.append(baseline)
                print(f"ROC-AUC на валидации: {baseline:.4f}")

                # параллельное вычисление permutation importance по признакам
                fold_importances = Parallel(n_jobs=self.n_jobs)(
                    delayed(perm_importance_worker)(self.estimator, X_val, y_val, i, baseline, self.random_state + fold_idx)
                    for i in range(X_subset.shape[1])
                )
                fold_importances = np.array(fold_importances)
                print(f"Средняя важность признаков в fold {fold_idx + 1}: {np.mean(fold_importances):.6f}")
                fold_importances_list.append(fold_importances)

            mean_importances = np.mean(fold_importances_list, axis=0)
            avg_score = np.mean(fold_scores)
            print(f"\nСредний ROC-AUC по всем фолдам: {avg_score:.4f}")
            print(f"Средняя важность признаков по всем фолдам: {np.mean(mean_importances):.6f}")

            if avg_score < best_score:
                print(f"Средняя метрика упала с {best_score:.4f} до {avg_score:.4f}. Останавливаем рекурсию.")
                mask = best_mask
                break

            best_score = avg_score
            best_mask = mask.copy()

            positive_mask = mean_importances > 0
            n_positive = np.sum(positive_mask)
            print(f"Признаков с положительной важностью: {n_positive} из {len(mean_importances)}")

            if n_positive == X_subset.shape[1]:
                print("Все оставшиеся признаки имеют положительную важность. Останавливаем рекурсию.")
                break

            mask = mask[positive_mask]
            print(f"Удаляем признаки с нулевой или отрицательной важностью, оставляем {len(mask)} признаков")

        self.feature_importances_ = np.zeros(X.shape[1])
        self.feature_importances_[mask] = 1
        print("\nПерестановочная важность признаков вычислена!")
        print(f"Оставлено признаков: {np.sum(self.feature_importances_)} из {X.shape[1]}")
        return self

    def transform(self, X):
        mask = self.feature_importances_ > 0
        print(f"Выбраны признаки с положительной важностью: {np.sum(mask)} из {len(mask)}")
        if not sparse.issparse(X):
            X = sparse.csr_matrix(X)
        return X[:, mask]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
    
class PlayerStatSumFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, path_to_games_raw_dir: str, game_ids: list[int], key: str = "kills"):
        self.path_to_games_raw_dir = path_to_games_raw_dir
        self.game_ids = game_ids
        self.key = key
        self.cumulative_dict = {}

    def _get_player_stat(self, game_id: int) -> dict:
        with open(os.path.join(self.path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)
        return {p["player"]["id"]: int(p.get(self.key) or 0) for p in game["players"]}

    def fit(self, X, y=None):
        n_games = X.shape[0]
        X_cumulative = np.zeros((n_games, 10), dtype=int)
        for i in range(n_games):
            game_id = self.game_ids[i]
            player_ids = X[i]
            player_stat_game = self._get_player_stat(game_id)
            row_stat = []
            for p_id in player_ids:
                self.cumulative_dict[p_id] = self.cumulative_dict.get(p_id, 0) + player_stat_game.get(p_id, 0)
                row_stat.append(self.cumulative_dict[p_id])
            X_cumulative[i, :] = row_stat
        self.X_cumulative_train = X_cumulative
        return self

    def transform(self, X):
        n_games = X.shape[0]
        if hasattr(self, "X_cumulative_train") and self.X_cumulative_train.shape[0] == n_games:
            X_cumulative = self.X_cumulative_train
        else:
            X_cumulative = np.zeros((n_games, 10), dtype=int)
            for i in range(n_games):
                X_cumulative[i, :] = np.array([self.cumulative_dict.get(p_id, 0) for p_id in X[i]])
        first5_sorted = np.sort(X_cumulative[:, :5], axis=1)
        last5_sorted = np.sort(X_cumulative[:, 5:], axis=1)
        x = np.hstack([first5_sorted, last5_sorted])
        mean_first5 = np.mean(first5_sorted, axis=1).reshape(-1, 1)
        mean_last5 = np.mean(last5_sorted, axis=1).reshape(-1, 1)
        diff_means = mean_first5 - mean_last5
        x = np.hstack([x, mean_first5, mean_last5, diff_means])
        pairwise_diffs = np.zeros((n_games, 25))
        for row_id in range(x.shape[0]):
            c = 0
            for i in range(5):
                for j in range(5):
                    pairwise_diffs[row_id, c] = first5_sorted[row_id, i] - last5_sorted[row_id, j]
                    c += 1
        x = np.hstack([x, pairwise_diffs])
        return x

class PlayerStatMeanFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, path_to_games_raw_dir: str, game_ids: list[int], key: str = "kills"):
        self.path_to_games_raw_dir = path_to_games_raw_dir
        self.game_ids = game_ids
        self.key = key
        self.cumulative_dict = {}  # stores cumulative sum
        self.count_dict = {}       # stores counts for mean calculation

    def _get_player_stat(self, game_id: int) -> dict:
        with open(os.path.join(self.path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)
        return {p["player"]["id"]: int(p.get(self.key) or 0) for p in game["players"]}

    def fit(self, X, y=None):
        n_games = X.shape[0]
        X_mean = np.zeros((n_games, 10), dtype=float)
        for i in range(n_games):
            game_id = self.game_ids[i]
            player_ids = X[i]
            player_stat_game = self._get_player_stat(game_id)
            row_stat = []
            for p_id in player_ids:
                self.cumulative_dict[p_id] = self.cumulative_dict.get(p_id, 0) + player_stat_game.get(p_id, 0)
                self.count_dict[p_id] = self.count_dict.get(p_id, 0) + 1
                row_stat.append(self.cumulative_dict[p_id] / self.count_dict[p_id])
            X_mean[i, :] = row_stat
        self.X_mean_train = X_mean
        return self

    def transform(self, X):
        n_games = X.shape[0]
        if hasattr(self, "X_mean_train") and self.X_mean_train.shape[0] == n_games:
            X_mean = self.X_mean_train
        else:
            X_mean = np.zeros((n_games, 10), dtype=float)
            for i in range(n_games):
                X_mean[i, :] = np.array([
                    self.cumulative_dict.get(p_id, 0) / max(self.count_dict.get(p_id, 1), 1) 
                    for p_id in X[i]
                ])
        first5_sorted = np.sort(X_mean[:, :5], axis=1)
        last5_sorted = np.sort(X_mean[:, 5:], axis=1)
        x = np.hstack([first5_sorted, last5_sorted])
        mean_first5 = np.mean(first5_sorted, axis=1).reshape(-1, 1)
        mean_last5 = np.mean(last5_sorted, axis=1).reshape(-1, 1)
        diff_means = mean_first5 - mean_last5
        x = np.hstack([x, mean_first5, mean_last5, diff_means])
        pairwise_diffs = np.zeros((n_games, 25))
        for row_id in range(x.shape[0]):
            c = 0
            for i in range(5):
                for j in range(5):
                    pairwise_diffs[row_id, c] = first5_sorted[row_id, i] - last5_sorted[row_id, j]
                    c += 1
        x = np.hstack([x, pairwise_diffs])
        return x


In [None]:
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

TIMESTAMP_COL = [0]
GAME_INFO_COL = [1, 2, 3, 4, 5]
TEAM_ID_COL = [6, 7]
PLAYER_ID_COL = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

pipeline = Pipeline([
    ('features', FeatureUnion(
        transformer_list=[
            # Игроки
            ('players', Pipeline([
                ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                ('le', LabelEncoder()),
                ('bag', PlayerBagEncoder())
            ])),
            
            # Команды
            ('teams', Pipeline([
                ('selector', ColumnSelector(columns=TEAM_ID_COL)),
                ('le', LabelEncoder()),
                ('bag', TeamBagEncoder())
            ])),

            # Информация о матче
            ('game_info', Pipeline([
                ('selector', ColumnSelector(columns=TIMESTAMP_COL + GAME_INFO_COL)),
                ('transform', GameInfoEncoder())  
            ])),

            # Суммированные базовые статистики игроков
            *[
                (f"{key}_sum", Pipeline([
                    ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                    (f"{key}_sum", PlayerStatSumFeatureExtractor(
                        path_to_games_raw_dir=PATH_TO_GAMES_RAW,
                        game_ids=game_ids_train,
                        key=key
                    )),
                    ('scaler', MinMaxScaler())
                ]))
                for key in ["kills", "deaths", "assists", "headshots", "flash_assists"]
            ],

            # Средние статистики игроков + расширенные показатели
            *[
                (f"{key}_mean", Pipeline([
                    ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
                    (f"{key}_mean", PlayerStatMeanFeatureExtractor(
                        path_to_games_raw_dir=PATH_TO_GAMES_RAW,
                        game_ids=game_ids_train,
                        key=key
                    )),
                    ('scaler', MinMaxScaler())
                ]))
                for key in [
                    "kills", "deaths", "assists", "headshots", "flash_assists",
                    "first_kills_diff", "k_d_diff", "adr", "kast", "rating"
                ]
            ]
        ]
    )),

    # Перестановочная важность признаков
    ('perm_selector', SparsePermutationImportanceFeatureSelector(
        estimator=LogisticRegression(solver="liblinear", random_state=42),
        scoring=roc_auc_score,
        n_jobs=-1
    )),

    # RFECV для отбора признаков
    ('rfecv', RFECV(
        estimator=LogisticRegression(solver="liblinear", random_state=42),
        step=1,
        cv=TimeSeriesSplit(n_splits=10),
        scoring='roc_auc',
        min_features_to_select=1,
        verbose=2,
        n_jobs=-1
    )),

    # Финальный классификатор
    ('clf', LogisticRegression(solver="liblinear", random_state=42))
])


# ======================================================
# ОБУЧЕНИЕ И ОЦЕНКА
# ======================================================
print("\n📥 Загружаем обучающую и тестовую выборки...")
X_game_info_train, X_team_id_train, X_player_id_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_game_info_test, X_team_id_test, X_player_id_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# объединяем все части X
X_train = np.c_[X_game_info_train, X_team_id_train, X_player_id_train]
X_test = np.c_[X_game_info_test, X_team_id_test, X_player_id_test]

print("\n🚀 Обучение пайплайна...")
pipeline.fit(X_train, y_train)

print("\n🎯 Предсказание на тестовой выборке...")
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# ======================================================
# РЕЗУЛЬТАТЫ
# ======================================================
print("\n✅ РЕЗУЛЬТАТЫ МОДЕЛИ:")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall   : {recall_score(y_test, y_pred):.4f}")
print(f"F1-score : {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC  : {roc_auc_score(y_test, y_proba):.4f}")

print("\nМатрица ошибок:")
print(confusion_matrix(y_test, y_pred))



📥 Загружаем обучающую и тестовую выборки...

🚀 Обучение пайплайна...

=== Итерация 1 ===
Текущий размер признаков: 9981

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6972
