In [1]:
from clickhouse_driver import Client
from typing import Generator, Any
import os
import json
from tqdm.notebook import tqdm
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
import warnings
import logging as log
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder as SklearnLabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

warnings.filterwarnings("ignore")
log.basicConfig(level=log.INFO)

# ------------------------------
# DATA LOADING AND VALIDATION
# ------------------------------
def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw") -> Generator[dict[str, Any], None, None]:
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            pass

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])

        assert len(team_players) == 2
        for t_id, p_ids in team_players.items():
            assert len(set(p_ids)) == 5

        team_ids = list(team_players.keys())
        rounds = []
        for r in game["rounds"]:
            assert r["round"] is not None
            assert r["ct"] in team_ids
            assert r["terrorists"] in team_ids
            assert r["winner_team"] in team_ids
            rounds.append(r["round"])
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid = []
    game_begin_at_valid = []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

# ------------------------------
# X, y PREPARATION
# ------------------------------
def get_player_id_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())    
    X.extend(sorted(team_players[t1_id])) 
    X.extend(sorted(team_players[t2_id]))    
    return X

def get_team_id_X_for_game(path_to_games_raw_dir: str, game_id: int):    
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    team_ids = sorted({p["team"]["id"] for p in game["players"]})
    assert len(team_ids) == 2
    return team_ids

def get_game_info_X_for_game(path_to_games_raw_dir: str, game_id: int):    
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    timestamp = parse(game["begin_at"]).timestamp()
    map_id = int(game["map"]["id"])
    league_id = int(game["match"]["league"]["id"])
    serie_id = int(game["match"]["serie"]["id"])
    serie_tier = game["match"]["serie"].get("tier", "unknown")
    tournament_id = int(game["match"]["tournament"]["id"])
    return [timestamp, map_id, league_id, serie_id, serie_tier, tournament_id]

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id:0, t2_id:0}
    for r in game["rounds"]:
       team_win_count[r["winner_team"]] += 1 
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X_game_info, X_team_id, X_player_id, y = [], [], [], []
    for game_id in tqdm(game_ids):
        try:
            X_player_id.append(get_player_id_X_for_game(path_to_games_raw, game_id))
            X_team_id.append(get_team_id_X_for_game(path_to_games_raw, game_id))
            X_game_info.append(get_game_info_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except Exception as e:
            log.warning(f"Skipping game {game_id} due to error: {e}")
            continue
    return (np.array(X_game_info, dtype=object),
            np.array(X_team_id),
            np.array(X_player_id),
            np.array(y, dtype=int))

# ------------------------------
# CUSTOM TRANSFORMERS
# ------------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.array(X)[:, self.columns]

class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = SklearnLabelEncoder()
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.encoder.fit(X.ravel().astype(object))
        return self
    def transform(self, X):
        X = np.asarray(X).copy()
        flat = X.ravel().astype(object)
        mask = np.isin(flat, self.encoder.classes_)
        flat_result = np.full(flat.shape, -1, dtype=int)
        flat_result[mask] = self.encoder.transform(flat[mask])
        return flat_result.reshape(X.shape)

class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j < 5 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j == 0 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)

class GameInfoTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.array(X, dtype=object)
        self.scaler = MinMaxScaler().fit(X[:, [0]])
        self.ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore').fit(X[:, 1:])
        return self
    def transform(self, X):
        X = np.array(X, dtype=object)
        ts_scaled = self.scaler.transform(X[:, [0]])
        cat_ohe = self.ohe.transform(X[:, 1:])
        return sparse.hstack([ts_scaled, cat_ohe]).tocsr()

# ------------------------------
# PLAYER STAT FEATURE EXTRACTOR
# ------------------------------
class PlayerStatFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, path_to_games_raw_dir: str, game_ids: list[int], key: str = "kills"):
        self.path_to_games_raw_dir = path_to_games_raw_dir
        self.game_ids = game_ids
        self.key = key
        self.cumulative_dict = {}

    def _get_player_stat(self, game_id: int) -> dict:
        with open(os.path.join(self.path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)
        return {p["player"]["id"]: int(p.get(self.key) or 0) for p in game["players"]}

    def fit(self, X, y=None):
        n_games = X.shape[0]
        X_cumulative = np.zeros((n_games, 10), dtype=int)
        for i in range(n_games):
            game_id = self.game_ids[i]
            player_ids = X[i]
            player_stat_game = self._get_player_stat(game_id)
            row_stat = []
            for p_id in player_ids:
                self.cumulative_dict[p_id] = self.cumulative_dict.get(p_id, 0) + player_stat_game.get(p_id, 0)
                row_stat.append(self.cumulative_dict[p_id])
            X_cumulative[i, :] = row_stat
        self.X_cumulative_train = X_cumulative
        return self

    def transform(self, X):
        n_games = X.shape[0]
        if hasattr(self, "X_cumulative_train") and self.X_cumulative_train.shape[0] == n_games:
            X_cumulative = self.X_cumulative_train
        else:
            X_cumulative = np.zeros((n_games, 10), dtype=int)
            for i in range(n_games):
                X_cumulative[i, :] = np.array([self.cumulative_dict.get(p_id, 0) for p_id in X[i]])
        first5_sorted = np.sort(X_cumulative[:, :5], axis=1)
        last5_sorted = np.sort(X_cumulative[:, 5:], axis=1)
        x = np.hstack([first5_sorted, last5_sorted])
        mean_first5 = np.mean(first5_sorted, axis=1).reshape(-1, 1)
        mean_last5 = np.mean(last5_sorted, axis=1).reshape(-1, 1)
        diff_means = mean_first5 - mean_last5
        x = np.hstack([x, mean_first5, mean_last5, diff_means])
        pairwise_diffs = np.zeros((n_games, 25))
        for row_id in range(x.shape[0]):
            c = 0
            for i in range(5):
                for j in range(5):
                    pairwise_diffs[row_id, c] = first5_sorted[row_id, i] - last5_sorted[row_id, j]
                    c += 1
        x = np.hstack([x, pairwise_diffs])
        return x
    
class RecursiveFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, val_size: int = 1000):
        self.selected_features_mask_ = None
        self.logit_ = None
        self.val_size = val_size

    def fit(self, X, y):
        # Automatically take last val_size samples for validation
        X_train, X_val = X[:-self.val_size].copy(), X[-self.val_size:].copy()
        y_val, y_val = y[:-self.val_size].copy(), y[-self.val_size:].copy()

        iteration = 0

        while True:
            iteration += 1
            print(f"\n=== Итерация {iteration} ===")
            print(f"Текущие признаки: {X_train.shape[1]}")

            logit = LogisticRegression(
                solver="liblinear",
                penalty="l1",
                random_state=13,                
            )
            logit.fit(X_train, y_train)

            y_pred_proba = logit.predict_proba(X_val)[:, 1]
            roc_auc = roc_auc_score(y_val, y_pred_proba)
            print(f"ROC-AUC на валидации: {roc_auc:.4f}")

            non_zero_mask = logit.coef_.flatten() != 0
            n_non_zero = non_zero_mask.sum()
            print(f"Признаков с ненулевым коэффициентом: {n_non_zero}")

            if n_non_zero == X_train.shape[1]:
                print("Все оставшиеся признаки важны. Останавливаем рекурсию.")
                break

            X_train = X_train[:, non_zero_mask]
            X_val = X_val[:, non_zero_mask]
            print(f"Удаляем неважные признаки, оставляем {n_non_zero} признаков")

        self.selected_features_mask_ = non_zero_mask
        self.logit_ = logit
        return self

    def transform(self, X):
        return X[:, self.selected_features_mask_]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)

    
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

# ------------------------------
# COLUMN INDEXES
# ------------------------------
TIMESTAMP_COL = [0]
GAME_INFO_COL = [1, 2, 3, 4, 5]
TEAM_ID_COL = [6, 7]
PLAYER_ID_COL = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

players_pipeline = Pipeline([
    ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
    ('le', LabelEncoder()),
    ('bag', PlayerBagEncoder())
])

teams_pipeline = Pipeline([
    ('selector', ColumnSelector(columns=TEAM_ID_COL)),
    ('le', LabelEncoder()),
    ('bag', TeamBagEncoder())
])

game_info_pipeline = Pipeline([
    ('selector', ColumnSelector(columns=TIMESTAMP_COL + GAME_INFO_COL)),  
    ('transform', GameInfoTransformer())
])

# статистические пайплайны для разных метрик игроков
stat_pipelines = [
    (key, Pipeline([
        ('selector', ColumnSelector(columns=PLAYER_ID_COL)),
        (key, PlayerStatFeatureExtractor(
            path_to_games_raw_dir="data/games_raw",
            game_ids=game_ids_train,
            key=key
        )),
        ('scaler', MinMaxScaler())
    ]))
    for key in ["kills", "deaths", "assists", "headshots", "flash_assists"]
]

# основной pipeline с FeatureUnion
pipeline = Pipeline([
    ('features', FeatureUnion(
        transformer_list=[
            ('players', players_pipeline),
            ('teams', teams_pipeline),
            ('game_info', game_info_pipeline),
        ] + stat_pipelines  # динамически добавляем все статистические пайплайны
    )),
    
])

X_game_info_train, X_team_id_train, X_player_id_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_game_info_test, X_team_id_test, X_player_id_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

X_train = np.c_[X_game_info_train, X_team_id_train, X_player_id_train]
X_test = np.c_[X_game_info_test, X_team_id_test, X_player_id_test]

pipeline.fit(X_train, y_train)


# # Predict probabilities and classes
# y_pred_proba = pipeline.predict_proba(X_test_full)[:, 1]
# y_pred = pipeline.predict(X_test_full)

# # Metrics
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# fscore = f1_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, y_pred_proba)

# # Confusion matrix
# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# print(f"Accuracy : {accuracy:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"Recall   : {recall:.4f}")
# print(f"F1-score : {fscore:.4f}")
# print(f"ROC-AUC  : {roc_auc:.4f}")
# print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")


  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

0,1,2
,steps,"[('features', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('players', ...), ('teams', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,columns,"[8, 9, ...]"

0,1,2
,columns,"[6, 7]"

0,1,2
,columns,"[0, 1, ...]"

0,1,2
,columns,"[8, 9, ...]"

0,1,2
,path_to_games_raw_dir,'data/games_raw'
,game_ids,"[1396, 1397, ...]"
,key,'kills'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,columns,"[8, 9, ...]"

0,1,2
,path_to_games_raw_dir,'data/games_raw'
,game_ids,"[1396, 1397, ...]"
,key,'deaths'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,columns,"[8, 9, ...]"

0,1,2
,path_to_games_raw_dir,'data/games_raw'
,game_ids,"[1396, 1397, ...]"
,key,'assists'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,columns,"[8, 9, ...]"

0,1,2
,path_to_games_raw_dir,'data/games_raw'
,game_ids,"[1396, 1397, ...]"
,key,'headshots'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,columns,"[8, 9, ...]"

0,1,2
,path_to_games_raw_dir,'data/games_raw'
,game_ids,"[1396, 1397, ...]"
,key,'flash_assists'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [2]:
X_transformed_train = pipeline.transform(X_train)
X_transformed_test = pipeline.transform(X_test)

In [3]:
iteration = 0

while True:
    iteration += 1
    print(f"\n=== Итерация {iteration} ===")
    print(f"Текущие признаки: {X_transformed_train.shape[1]}")

    logit = LogisticRegression(
        solver="liblinear",
        penalty="l1",
        random_state=13,                
    )
    logit.fit(X_transformed_train, y_train)

    non_zero_mask = logit.coef_.flatten() != 0
    n_non_zero = non_zero_mask.sum()
    print(f"Признаков с ненулевым коэффициентом: {n_non_zero}")

    if n_non_zero == X_transformed_train.shape[1]:
        print("Все оставшиеся признаки важны. Останавливаем рекурсию.")
        break

    X_transformed_train = X_transformed_train[:, non_zero_mask]
    X_transformed_test = X_transformed_test[:, non_zero_mask]



=== Итерация 1 ===
Текущие признаки: 9449
Признаков с ненулевым коэффициентом: 3331

=== Итерация 2 ===
Текущие признаки: 3331
Признаков с ненулевым коэффициентом: 3245

=== Итерация 3 ===
Текущие признаки: 3245
Признаков с ненулевым коэффициентом: 3199

=== Итерация 4 ===
Текущие признаки: 3199
Признаков с ненулевым коэффициентом: 3194

=== Итерация 5 ===
Текущие признаки: 3194
Признаков с ненулевым коэффициентом: 3178

=== Итерация 6 ===
Текущие признаки: 3178
Признаков с ненулевым коэффициентом: 3146

=== Итерация 7 ===
Текущие признаки: 3146
Признаков с ненулевым коэффициентом: 3140

=== Итерация 8 ===
Текущие признаки: 3140
Признаков с ненулевым коэффициентом: 3136

=== Итерация 9 ===
Текущие признаки: 3136
Признаков с ненулевым коэффициентом: 3115

=== Итерация 10 ===
Текущие признаки: 3115
Признаков с ненулевым коэффициентом: 3101

=== Итерация 11 ===
Текущие признаки: 3101
Признаков с ненулевым коэффициентом: 3086

=== Итерация 12 ===
Текущие признаки: 3086
Признаков с ненулев

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from scipy import sparse
import numpy as np

# Логистическая регрессия
estimator = LogisticRegression(solver="liblinear", random_state=42)

# TimeSeriesSplit CV
tscv = TimeSeriesSplit(n_splits=10)

# RFECV
selector = RFECV(
    estimator=estimator,
    step=1,  # сколько признаков удаляем за итерацию
    cv=tscv,
    scoring='roc_auc',  # просто строка
    min_features_to_select=1,
    verbose=2,
    n_jobs=-1  # использовать все ядра
)



# Обучаем
selector.fit(X_transformed_train, y_train)

# Признаки, которые остались
selected_features = selector.support_
print(f"Выбрано признаков: {np.sum(selected_features)} из {X_transformed_train.shape[1]}")

# Трансформируем X
X_transformed_train = selector.transform(X_transformed_train)
X_transformed_test = selector.transform(X_transformed_test)


Fitting estimator with 3081 features.
Fitting estimator with 3081 features.
Fitting estimator with 3081 features.
Fitting estimator with 3081 features.
Fitting estimator with 3081 features.
Fitting estimator with 3081 features.
Fitting estimator with 3081 features.
Fitting estimator with 3080 features.
Fitting estimator with 3080 features.
Fitting estimator with 3079 features.
Fitting estimator with 3080 features.Fitting estimator with 3080 features.

Fitting estimator with 3078 features.
Fitting estimator with 3080 features.
Fitting estimator with 3081 features.
Fitting estimator with 3077 features.
Fitting estimator with 3079 features.
Fitting estimator with 3076 features.
Fitting estimator with 3075 features.
Fitting estimator with 3079 features.
Fitting estimator with 3074 features.
Fitting estimator with 3073 features.
Fitting estimator with 3078 features.
Fitting estimator with 3079 features.
Fitting estimator with 3072 features.
Fitting estimator with 3071 features.
Fitting esti

In [5]:
estimator = LogisticRegression(solver="liblinear", random_state=42)
estimator.fit(X_transformed_train, y_train)
y_pred_proba = estimator.predict_proba(X_transformed_test)[:, 1]
y_pred = estimator.predict(X_transformed_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fscore = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {fscore:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}")
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")


Accuracy : 0.6300
Precision: 0.6852
Recall   : 0.6491
F1-score : 0.6667
ROC-AUC  : 0.6932
TP: 37, TN: 26, FP: 17, FN: 20


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from scipy import sparse

iteration = 0

while True:
    iteration += 1
    print(f"\n=== Итерация {iteration} ===")
    print(f"Текущие признаки: {X_transformed_train.shape[1]}")
    
    logit = LogisticRegression(solver='liblinear', penalty="l1", random_state=42)
    logit.fit(X_transformed_train, y_train)
    
    y_pred_proba = logit.predict_proba(X_transformed_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC-AUC на тесте: {roc_auc:.4f}")
    
    non_zero_mask = logit.coef_.flatten() != 0
    n_non_zero = non_zero_mask.sum()
    print(f"Признаков с ненулевым коэффициентом: {n_non_zero}")
    
    if n_non_zero == X_transformed_train.shape[1]:
        print("Все оставшиеся признаки важны. Останавливаем рекурсию.")
        break
    
    X_transformed_train = X_transformed_train[:, non_zero_mask]
    X_transformed_test = X_transformed_test[:, non_zero_mask]
    print(f"Удаляем неважные признаки, оставляем {n_non_zero} признаков")



=== Итерация 1 ===
Текущие признаки: 9449
ROC-AUC на тесте: 0.7099
Признаков с ненулевым коэффициентом: 3342
Удаляем неважные признаки, оставляем 3342 признаков

=== Итерация 2 ===
Текущие признаки: 3342
ROC-AUC на тесте: 0.7103
Признаков с ненулевым коэффициентом: 3242
Удаляем неважные признаки, оставляем 3242 признаков

=== Итерация 3 ===
Текущие признаки: 3242
ROC-AUC на тесте: 0.7103
Признаков с ненулевым коэффициентом: 3234
Удаляем неважные признаки, оставляем 3234 признаков

=== Итерация 4 ===
Текущие признаки: 3234
ROC-AUC на тесте: 0.7103
Признаков с ненулевым коэффициентом: 3230
Удаляем неважные признаки, оставляем 3230 признаков

=== Итерация 5 ===
Текущие признаки: 3230
ROC-AUC на тесте: 0.7103
Признаков с ненулевым коэффициентом: 3217
Удаляем неважные признаки, оставляем 3217 признаков

=== Итерация 6 ===
Текущие признаки: 3217
ROC-AUC на тесте: 0.7099
Признаков с ненулевым коэффициентом: 3210
Удаляем неважные признаки, оставляем 3210 признаков

=== Итерация 7 ===
Текущие 

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, make_scorer

iteration = 0
tscv = TimeSeriesSplit(n_splits=10)
roc_scorer = make_scorer(roc_auc_score, needs_proba=True)

param_grid = {
    'C': np.linspace(.01, 1, 10),
    'penalty': ['l1', 'l2']
}

logit = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000)
grid = GridSearchCV(logit, param_grid, scoring=roc_scorer, cv=tscv, n_jobs=-1, verbose=2)
grid.fit(X_transformed_train, y_train)

best_logit = grid.best_estimator_
print(f"Лучшие параметры: {grid.best_params_}")

y_pred_proba = best_logit.predict_proba(X_transformed_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l2; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s
[CV] END .................................C=0.01, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.3s
[CV] END .................................C=0.01, penalty=l1; total time=   0.3s
[CV] END .................................C=0.01, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.2s
[CV] END .................................C=0.

In [7]:
roc_auc

0.6056711546307629

In [3]:
X_transformed_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2126500 stored elements and shape (37974, 9449)>

In [3]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed

# функция для одного признака
def perm_importance_worker(estimator, X_val, y_val, i, baseline, random_seed):
    col = X_val[:, i].copy()
    if sparse.issparse(col):
        col = col.toarray().ravel()
    rng = np.random.RandomState(random_seed)
    rng.shuffle(col)

    X_perm = X_val.copy()
    if sparse.issparse(X_perm):
        X_perm = X_perm.tolil()
        X_perm[:, i] = col.reshape(-1, 1)
        X_perm = X_perm.tocsr()
    else:
        X_perm[:, i] = col

    perm_score = roc_auc_score(y_val, estimator.predict_proba(X_perm)[:, 1])
    return baseline - perm_score


class SparsePermutationImportanceFeatureSelector:
    def __init__(self, estimator=None, scoring=roc_auc_score, random_state=42, n_jobs=-1):
        self.estimator = estimator if estimator else LogisticRegression(max_iter=1000, random_state=random_state)
        self.scoring = scoring
        self.random_state = random_state
        self.feature_importances_ = None
        self.n_jobs = n_jobs  # число параллельных процессов

    def fit(self, X, y):
        if not sparse.issparse(X):
            X = sparse.csr_matrix(X)

        mask = np.arange(X.shape[1])
        iteration = 0
        best_score = -np.inf
        best_mask = mask.copy()
        n_splits = 10

        while True:
            iteration += 1
            print(f"\n=== Итерация {iteration} ===")
            X_subset = X[:, mask]
            print(f"Текущий размер признаков: {X_subset.shape[1]}")

            tscv = TimeSeriesSplit(n_splits=n_splits)
            fold_importances_list = []
            fold_scores = []

            for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_subset)):
                print(f"\n--- Fold {fold_idx + 1}/{n_splits} ---")
                X_train, X_val = X_subset[train_idx], X_subset[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                self.estimator.fit(X_train, y_train)
                baseline = self.scoring(y_val, self.estimator.predict_proba(X_val)[:, 1])
                fold_scores.append(baseline)
                print(f"ROC-AUC на валидации: {baseline:.4f}")

                # параллельное вычисление permutation importance по признакам
                fold_importances = Parallel(n_jobs=self.n_jobs)(
                    delayed(perm_importance_worker)(self.estimator, X_val, y_val, i, baseline, self.random_state + fold_idx)
                    for i in tqdm(range(X_subset.shape[1]), desc="Features")
                )
                fold_importances = np.array(fold_importances)
                print(f"Средняя важность признаков в fold {fold_idx + 1}: {np.mean(fold_importances):.6f}")
                fold_importances_list.append(fold_importances)

            mean_importances = np.mean(fold_importances_list, axis=0)
            avg_score = np.mean(fold_scores)
            print(f"\nСредний ROC-AUC по всем фолдам: {avg_score:.4f}")
            print(f"Средняя важность признаков по всем фолдам: {np.mean(mean_importances):.6f}")

            if avg_score < best_score:
                print(f"Средняя метрика упала с {best_score:.4f} до {avg_score:.4f}. Останавливаем рекурсию.")
                mask = best_mask
                break

            best_score = avg_score
            best_mask = mask.copy()

            positive_mask = mean_importances > 0
            n_positive = np.sum(positive_mask)
            print(f"Признаков с положительной важностью: {n_positive} из {len(mean_importances)}")

            if n_positive == X_subset.shape[1]:
                print("Все оставшиеся признаки имеют положительную важность. Останавливаем рекурсию.")
                break

            mask = mask[positive_mask]
            print(f"Удаляем признаки с нулевой или отрицательной важностью, оставляем {len(mask)} признаков")

        self.feature_importances_ = np.zeros(X.shape[1])
        self.feature_importances_[mask] = 1
        print("\nПерестановочная важность признаков вычислена!")
        print(f"Оставлено признаков: {np.sum(self.feature_importances_)} из {X.shape[1]}")
        return self

    def transform(self, X):
        mask = self.feature_importances_ > 0
        print(f"Выбраны признаки с положительной важностью: {np.sum(mask)} из {len(mask)}")
        if not sparse.issparse(X):
            X = sparse.csr_matrix(X)
        return X[:, mask]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)


In [4]:
sel = SparsePermutationImportanceFeatureSelector()
sel.fit(X_transformed_train, y_train)


=== Итерация 1 ===
Текущий размер признаков: 9601

--- Fold 1/10 ---
ROC-AUC на валидации: 0.5698


Features: 100%|██████████| 9601/9601 [03:36<00:00, 44.34it/s]


Средняя важность признаков в fold 1: 0.000005

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6158


Features: 100%|██████████| 9601/9601 [03:44<00:00, 42.85it/s]


Средняя важность признаков в fold 2: 0.000013

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6161


Features: 100%|██████████| 9601/9601 [03:30<00:00, 45.58it/s]


Средняя важность признаков в fold 3: 0.000019

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6303


Features: 100%|██████████| 9601/9601 [03:30<00:00, 45.64it/s]


Средняя важность признаков в fold 4: 0.000020

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6348


Features: 100%|██████████| 9601/9601 [03:34<00:00, 44.68it/s]


Средняя важность признаков в fold 5: 0.000026

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6370


Features: 100%|██████████| 9601/9601 [03:41<00:00, 43.31it/s]


Средняя важность признаков в fold 6: 0.000022

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6673


Features: 100%|██████████| 9601/9601 [03:31<00:00, 45.35it/s]


Средняя важность признаков в fold 7: 0.000034

--- Fold 8/10 ---
ROC-AUC на валидации: 0.6642


Features: 100%|██████████| 9601/9601 [03:29<00:00, 45.84it/s]


Средняя важность признаков в fold 8: 0.000028

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6709


Features: 100%|██████████| 9601/9601 [03:35<00:00, 44.54it/s]


Средняя важность признаков в fold 9: 0.000040

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6588


Features: 100%|██████████| 9601/9601 [03:45<00:00, 42.50it/s]


Средняя важность признаков в fold 10: 0.000044

Средний ROC-AUC по всем фолдам: 0.6365
Средняя важность признаков по всем фолдам: 0.000025
Признаков с положительной важностью: 2200 из 9601
Удаляем признаки с нулевой или отрицательной важностью, оставляем 2200 признаков

=== Итерация 2 ===
Текущий размер признаков: 2200

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6061


Features: 100%|██████████| 2200/2200 [00:41<00:00, 53.30it/s]


Средняя важность признаков в fold 1: 0.000087

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6377


Features: 100%|██████████| 2200/2200 [00:40<00:00, 53.93it/s]


Средняя важность признаков в fold 2: 0.000090

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6603


Features: 100%|██████████| 2200/2200 [00:39<00:00, 55.13it/s]


Средняя важность признаков в fold 3: 0.000136

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6576


Features: 100%|██████████| 2200/2200 [00:39<00:00, 55.13it/s]


Средняя важность признаков в fold 4: 0.000130

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6406


Features: 100%|██████████| 2200/2200 [00:40<00:00, 54.87it/s]


Средняя важность признаков в fold 5: 0.000130

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6593


Features: 100%|██████████| 2200/2200 [00:39<00:00, 55.04it/s]


Средняя важность признаков в fold 6: 0.000109

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6788


Features: 100%|██████████| 2200/2200 [00:40<00:00, 54.79it/s]


Средняя важность признаков в fold 7: 0.000138

--- Fold 8/10 ---
ROC-AUC на валидации: 0.6809


Features: 100%|██████████| 2200/2200 [00:40<00:00, 54.84it/s]


Средняя важность признаков в fold 8: 0.000127

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6843


Features: 100%|██████████| 2200/2200 [00:38<00:00, 57.47it/s]


Средняя важность признаков в fold 9: 0.000174

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6644


Features: 100%|██████████| 2200/2200 [00:38<00:00, 57.78it/s]


Средняя важность признаков в fold 10: 0.000177

Средний ROC-AUC по всем фолдам: 0.6570
Средняя важность признаков по всем фолдам: 0.000130
Признаков с положительной важностью: 1702 из 2200
Удаляем признаки с нулевой или отрицательной важностью, оставляем 1702 признаков

=== Итерация 3 ===
Текущий размер признаков: 1702

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6224


Features: 100%|██████████| 1702/1702 [00:22<00:00, 74.36it/s]


Средняя важность признаков в fold 1: 0.000126

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6472


Features: 100%|██████████| 1702/1702 [00:22<00:00, 74.01it/s]


Средняя важность признаков в fold 2: 0.000134

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6649


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.83it/s]


Средняя важность признаков в fold 3: 0.000190

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6652


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.82it/s]


Средняя важность признаков в fold 4: 0.000174

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6508


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.31it/s]


Средняя важность признаков в fold 5: 0.000169

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6689


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.97it/s]


Средняя важность признаков в fold 6: 0.000158

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6879


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.07it/s]


Средняя важность признаков в fold 7: 0.000188

--- Fold 8/10 ---
ROC-AUC на валидации: 0.6930


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.52it/s]


Средняя важность признаков в fold 8: 0.000181

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6956


Features: 100%|██████████| 1702/1702 [00:22<00:00, 74.31it/s]


Средняя важность признаков в fold 9: 0.000250

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6688


Features: 100%|██████████| 1702/1702 [00:23<00:00, 73.25it/s]


Средняя важность признаков в fold 10: 0.000231

Средний ROC-AUC по всем фолдам: 0.6665
Средняя важность признаков по всем фолдам: 0.000180
Признаков с положительной важностью: 1553 из 1702
Удаляем признаки с нулевой или отрицательной важностью, оставляем 1553 признаков

=== Итерация 4 ===
Текущий размер признаков: 1553

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6241


Features: 100%|██████████| 1553/1553 [00:17<00:00, 89.21it/s]


Средняя важность признаков в fold 1: 0.000154

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6476


Features: 100%|██████████| 1553/1553 [00:17<00:00, 89.66it/s]


Средняя важность признаков в fold 2: 0.000156

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6732


Features: 100%|██████████| 1553/1553 [00:17<00:00, 87.92it/s]


Средняя важность признаков в fold 3: 0.000222

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6672


Features: 100%|██████████| 1553/1553 [00:17<00:00, 87.41it/s]


Средняя важность признаков в fold 4: 0.000197

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6522


Features: 100%|██████████| 1553/1553 [00:18<00:00, 85.32it/s]


Средняя важность признаков в fold 5: 0.000181

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6730


Features: 100%|██████████| 1553/1553 [00:19<00:00, 80.86it/s]


Средняя важность признаков в fold 6: 0.000179

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6903


Features: 100%|██████████| 1553/1553 [00:19<00:00, 79.56it/s]


Средняя важность признаков в fold 7: 0.000207

--- Fold 8/10 ---
ROC-AUC на валидации: 0.7023


Features: 100%|██████████| 1553/1553 [00:18<00:00, 82.39it/s]


Средняя важность признаков в fold 8: 0.000219

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6970


Features: 100%|██████████| 1553/1553 [00:19<00:00, 81.26it/s]


Средняя важность признаков в fold 9: 0.000259

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6718


Features: 100%|██████████| 1553/1553 [00:18<00:00, 81.88it/s]


Средняя важность признаков в fold 10: 0.000241

Средний ROC-AUC по всем фолдам: 0.6699
Средняя важность признаков по всем фолдам: 0.000202
Признаков с положительной важностью: 1501 из 1553
Удаляем признаки с нулевой или отрицательной важностью, оставляем 1501 признаков

=== Итерация 5 ===
Текущий размер признаков: 1501

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6271


Features: 100%|██████████| 1501/1501 [00:15<00:00, 94.62it/s]


Средняя важность признаков в fold 1: 0.000171

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6448


Features: 100%|██████████| 1501/1501 [00:16<00:00, 91.99it/s]


Средняя важность признаков в fold 2: 0.000154

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6769


Features: 100%|██████████| 1501/1501 [00:15<00:00, 94.33it/s]


Средняя важность признаков в fold 3: 0.000238

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6675


Features: 100%|██████████| 1501/1501 [00:16<00:00, 93.43it/s] 


Средняя важность признаков в fold 4: 0.000205

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6561


Features: 100%|██████████| 1501/1501 [00:15<00:00, 95.76it/s]


Средняя важность признаков в fold 5: 0.000205

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6760


Features: 100%|██████████| 1501/1501 [00:15<00:00, 95.62it/s] 


Средняя важность признаков в fold 6: 0.000189

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6921


Features: 100%|██████████| 1501/1501 [00:16<00:00, 93.65it/s]


Средняя важность признаков в fold 7: 0.000205

--- Fold 8/10 ---
ROC-AUC на валидации: 0.7032


Features: 100%|██████████| 1501/1501 [00:16<00:00, 92.68it/s]


Средняя важность признаков в fold 8: 0.000215

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6980


Features: 100%|██████████| 1501/1501 [00:16<00:00, 91.10it/s]


Средняя важность признаков в fold 9: 0.000267

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6717


Features: 100%|██████████| 1501/1501 [00:16<00:00, 93.63it/s]


Средняя важность признаков в fold 10: 0.000244

Средний ROC-AUC по всем фолдам: 0.6713
Средняя важность признаков по всем фолдам: 0.000209
Признаков с положительной важностью: 1484 из 1501
Удаляем признаки с нулевой или отрицательной важностью, оставляем 1484 признаков

=== Итерация 6 ===
Текущий размер признаков: 1484

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6277


Features: 100%|██████████| 1484/1484 [00:15<00:00, 97.59it/s] 


Средняя важность признаков в fold 1: 0.000175

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6471


Features: 100%|██████████| 1484/1484 [00:15<00:00, 95.44it/s] 


Средняя важность признаков в fold 2: 0.000164

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6767


Features: 100%|██████████| 1484/1484 [00:15<00:00, 96.39it/s] 


Средняя важность признаков в fold 3: 0.000237

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6674


Features: 100%|██████████| 1484/1484 [00:15<00:00, 96.42it/s] 


Средняя важность признаков в fold 4: 0.000207

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6568


Features: 100%|██████████| 1484/1484 [00:15<00:00, 95.76it/s] 


Средняя важность признаков в fold 5: 0.000197

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6763


Features: 100%|██████████| 1484/1484 [00:15<00:00, 95.53it/s] 


Средняя важность признаков в fold 6: 0.000182

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6920


Features: 100%|██████████| 1484/1484 [00:15<00:00, 95.81it/s]


Средняя важность признаков в fold 7: 0.000205

--- Fold 8/10 ---
ROC-AUC на валидации: 0.7036


Features: 100%|██████████| 1484/1484 [00:15<00:00, 96.54it/s] 


Средняя важность признаков в fold 8: 0.000234

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6987


Features: 100%|██████████| 1484/1484 [00:15<00:00, 96.73it/s] 


Средняя важность признаков в fold 9: 0.000271

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6729


Features: 100%|██████████| 1484/1484 [00:15<00:00, 94.80it/s] 


Средняя важность признаков в fold 10: 0.000268

Средний ROC-AUC по всем фолдам: 0.6719
Средняя важность признаков по всем фолдам: 0.000214
Признаков с положительной важностью: 1473 из 1484
Удаляем признаки с нулевой или отрицательной важностью, оставляем 1473 признаков

=== Итерация 7 ===
Текущий размер признаков: 1473

--- Fold 1/10 ---
ROC-AUC на валидации: 0.6271


Features: 100%|██████████| 1473/1473 [00:15<00:00, 95.66it/s] 


Средняя важность признаков в fold 1: 0.000179

--- Fold 2/10 ---
ROC-AUC на валидации: 0.6467


Features: 100%|██████████| 1473/1473 [00:15<00:00, 95.42it/s] 


Средняя важность признаков в fold 2: 0.000162

--- Fold 3/10 ---
ROC-AUC на валидации: 0.6770


Features: 100%|██████████| 1473/1473 [00:15<00:00, 95.25it/s] 


Средняя важность признаков в fold 3: 0.000247

--- Fold 4/10 ---
ROC-AUC на валидации: 0.6666


Features: 100%|██████████| 1473/1473 [00:15<00:00, 95.61it/s] 


Средняя важность признаков в fold 4: 0.000208

--- Fold 5/10 ---
ROC-AUC на валидации: 0.6565


Features: 100%|██████████| 1473/1473 [00:15<00:00, 94.44it/s] 


Средняя важность признаков в fold 5: 0.000211

--- Fold 6/10 ---
ROC-AUC на валидации: 0.6769


Features: 100%|██████████| 1473/1473 [00:15<00:00, 92.96it/s]


Средняя важность признаков в fold 6: 0.000194

--- Fold 7/10 ---
ROC-AUC на валидации: 0.6935


Features: 100%|██████████| 1473/1473 [00:15<00:00, 96.39it/s] 


Средняя важность признаков в fold 7: 0.000216

--- Fold 8/10 ---
ROC-AUC на валидации: 0.7034


Features: 100%|██████████| 1473/1473 [00:15<00:00, 93.24it/s] 


Средняя важность признаков в fold 8: 0.000226

--- Fold 9/10 ---
ROC-AUC на валидации: 0.6984


Features: 100%|██████████| 1473/1473 [00:15<00:00, 94.70it/s]


Средняя важность признаков в fold 9: 0.000281

--- Fold 10/10 ---
ROC-AUC на валидации: 0.6724


Features: 100%|██████████| 1473/1473 [00:15<00:00, 94.52it/s] 


Средняя важность признаков в fold 10: 0.000276

Средний ROC-AUC по всем фолдам: 0.6719
Средняя важность признаков по всем фолдам: 0.000220
Средняя метрика упала с 0.6719 до 0.6719. Останавливаем рекурсию.

Перестановочная важность признаков вычислена!
Оставлено признаков: 1484.0 из 9601


<__main__.SparsePermutationImportanceFeatureSelector at 0x7a88782a40a0>

In [5]:
X_sel_train = sel.transform(X_transformed_train)
X_sel_test = sel.transform(X_transformed_test)

Выбраны признаки с положительной важностью: 1484 из 9601
Выбраны признаки с положительной важностью: 1484 из 9601


In [6]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from scipy import sparse
import numpy as np

# Логистическая регрессия
estimator = LogisticRegression(solver="liblinear", random_state=42)

# TimeSeriesSplit CV
tscv = TimeSeriesSplit(n_splits=10)

# RFECV
selector = RFECV(
    estimator=estimator,
    step=1,  # сколько признаков удаляем за итерацию
    cv=tscv,
    scoring='roc_auc',  # просто строка
    min_features_to_select=1,
    verbose=2,
    n_jobs=-1  # использовать все ядра
)



# Обучаем
selector.fit(X_sel_train, y_train)

# Признаки, которые остались
selected_features = selector.support_
print(f"Выбрано признаков: {np.sum(selected_features)} из {X_sel_train.shape[1]}")

# Трансформируем X
X_sel_train = selector.transform(X_sel_train)
X_sel_test = selector.transform(X_sel_test)


Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1484 features.
Fitting estimator with 1483 features.
Fitting estimator with 1482 features.
Fitting estimator with 1483 features.
Fitting estimator with 1481 features.
Fitting estimator with 1483 features.
Fitting estimator with 1480 features.
Fitting estimator with 1479 features.
Fitting estimator with 1482 features.
Fitting estimator with 1483 features.
Fitting estimator with 1478 features.
Fitting estimator with 1477 features.
Fitting estimator with 1482 features.
Fitting estimator with 1481 features.
Fitting estimator with 1476 features.
Fitting estimator with 1483 features.
Fitting estimator with 1475 features.
Fitting estimator with 1483 features.
Fitting estimator with 1474 features.
Fitting esti

In [7]:
estimator = LogisticRegression(solver="liblinear")
estimator.fit(X_sel_train, y_train)

# Predict probabilities and classes
y_pred_proba = estimator.predict_proba(X_sel_test)[:, 1]
y_pred = estimator.predict(X_sel_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fscore = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {fscore:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}")
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

Accuracy : 0.7200
Precision: 0.7636
Recall   : 0.7368
F1-score : 0.7500
ROC-AUC  : 0.7448
TP: 42, TN: 30, FP: 13, FN: 15


In [None]:
# Accuracy : 0.6700
# Precision: 0.7222
# Recall   : 0.6842
# F1-score : 0.7027
# ROC-AUC  : 0.7281
# TP: 39, TN: 28, FP: 15, FN: 18

# Accuracy : 0.6900
# Precision: 0.7097
# Recall   : 0.7719
# F1-score : 0.7395
# ROC-AUC  : 0.7399
# TP: 44, TN: 25, FP: 18, FN: 13

# Accuracy : 0.7200
# Precision: 0.7636
# Recall   : 0.7368
# F1-score : 0.7500
# ROC-AUC  : 0.7448
# TP: 42, TN: 30, FP: 13, FN: 15

In [30]:
X_sel_train.shape

(37974, 1335)

In [None]:

tscv = TimeSeriesSplit(n_splits=10)
X_sel_train, y_train

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer, roc_auc_score
from scipy import sparse



# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=10)

# Логистическая регрессия
logit = LogisticRegression(solver='liblinear', random_state=42)

# Сетка параметров
param_grid = {
    'C': np.linspace(.01, 1, 100),
    'penalty': ['l1', 'l2']
}

# Метрика ROC-AUC
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# GridSearchCV с TimeSeriesSplit
grid_search = GridSearchCV(
    estimator=logit,
    param_grid=param_grid,
    scoring=roc_auc_scorer,
    cv=tscv,
    n_jobs=-1,
    verbose=2
)

# Обучаем
grid_search.fit(X_sel_train, y_train)

# Лучшие параметры
print("Best params:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

# Лучший обученный логистический классификатор
best_logit = grid_search.best_estimator_


Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.2s
[CV] END .................................C=0.01, penalty=l1; total time=   0.2s
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.1s
[CV] END .................................C=0.01, penalty=l2; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.4s
[CV] END .................................C=0.01, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l2; total time=   0.3s
[CV] END .................................C=

In [32]:
roc_auc_score(y_test, best_logit.predict_proba(X_sel_test)[:, 1])

0.6073031415748673

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Initialize logistic regression
clf = LogisticRegression()

# Fit on training data
clf.fit(X_prepared_train, y_train)

# Predict probabilities for the positive class
y_pred_proba = clf.predict_proba(X_prepared_test)[:, 1]

# Compute ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC on test set:", auc)


ROC-AUC on test set: 0.6960424316605466


array([[1452687000.0, 1, 4164, ..., 17811, 17833, 17834],
       [1452689520.0, 3, 4164, ..., 17811, 17833, 17834],
       [1452696300.0, 3, 4164, ..., 17499, 17500, 17543],
       ...,
       [1649845117.0, 8, 4163, ..., 17501, 17543, 19666],
       [1649849354.0, 2, 4163, ..., 17501, 17543, 19666],
       [1649849445.0, 7, 4163, ..., 21433, 21439, 25439]],
      shape=(37974, 18), dtype=object)

In [16]:
import os
import json
from typing import Generator
from collections import defaultdict
from dateutil.parser import parse
from tqdm.notebook import tqdm
import numpy as np
import warnings
from scipy import sparse
import logging as log
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore")
log.basicConfig(level=log.INFO)

# ------------------------------
# DATA LOADING AND VALIDATION
# ------------------------------
def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw") -> Generator[dict, None, None]:
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except Exception as e:
            log.warning(f"Skipping {filename}: {e}")

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])
        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for t_id, p_ids in team_players.items():
            assert len(set(p_ids)) == 5
        rounds = game["rounds"]
        assert min([r["round"] for r in rounds]) == 1
        assert max([r["round"] for r in rounds]) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()


# ------------------------------
# FEATURE EXTRACTION
# ------------------------------
def get_player_id_X_for_game(path: str, game_id: int):
    with open(os.path.join(path, f"{game_id}.json"), "r") as f:
        game = json.load(f)
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1, t2 = sorted(team_players.keys())
    return sorted(team_players[t1]) + sorted(team_players[t2])

def get_team_id_X_for_game(path: str, game_id: int):
    with open(os.path.join(path, f"{game_id}.json"), "r") as f:
        game = json.load(f)
    team_ids = sorted({p["team"]["id"] for p in game["players"]})
    return team_ids

def get_game_info_X_for_game(path: str, game_id: int):
    with open(os.path.join(path, f"{game_id}.json"), "r") as f:
        game = json.load(f)
    timestamp = parse(game["begin_at"]).timestamp()
    map_id = int(game["map"]["id"])
    league_id = int(game["match"]["league"]["id"])
    serie_id = int(game["match"]["serie"]["id"])
    serie_tier = game["match"]["serie"].get("tier", "unknown")
    tournament_id = int(game["match"]["tournament"]["id"])
    return [timestamp, map_id, league_id, serie_id, serie_tier, tournament_id]

def get_y_for_game(path: str, game_id: int):
    with open(os.path.join(path, f"{game_id}.json"), "r") as f:
        game = json.load(f)
    t1, t2 = np.unique([p["team"]["id"] for p in game["players"]])
    wins = {t1: 0, t2: 0}
    for r in game["rounds"]:
        wins[r["winner_team"]] += 1
    return int(wins[t1] > wins[t2])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X_game_info, X_team_id, X_player_id, y = [], [], [], []
    for game_id in tqdm(game_ids):
        try:
            X_player_id.append(get_player_id_X_for_game(path_to_games_raw, game_id))
            X_team_id.append(get_team_id_X_for_game(path_to_games_raw, game_id))
            X_game_info.append(get_game_info_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except Exception as e:
            log.warning(f"Skipping game {game_id}: {e}")
    return (
        np.array(X_game_info, dtype=object),
        np.array(X_team_id),
        np.array(X_player_id),
        np.array(y, dtype=int),
    )


# ------------------------------
# ENCODERS
# ------------------------------
class LabelEncoderPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, separate_cols=None, together_cols=None):
        self.separate_cols = separate_cols or []
        self.together_cols = together_cols or []
        self.encoders_sep_ = {}
        self.encoders_together_ = []

    def fit(self, X, y=None):
        X = np.asarray(X)
        for col in self.separate_cols:
            le = LabelEncoder()
            le.fit(X[:, col])
            self.encoders_sep_[col] = le
        for cols in self.together_cols:
            if isinstance(cols, int): cols = [cols]
            le = LabelEncoder()
            le.fit(X[:, cols].ravel())
            self.encoders_together_.append((cols, le))
        return self

    def transform(self, X):
        X = np.asarray(X).copy()
        for col, le in self.encoders_sep_.items():
            mask = np.isin(X[:, col], le.classes_)
            X_col = X[:, col].astype(object)
            X_col[mask] = le.transform(X_col[mask])
            X_col[~mask] = -1
            X[:, col] = X_col
        for cols, le in self.encoders_together_:
            if isinstance(cols, int): cols = [cols]
            flat = X[:, cols].ravel().astype(object)
            mask = np.isin(flat, le.classes_)
            flat_result = np.full(flat.shape, -1, dtype=int)
            flat_result[mask] = le.transform(flat[mask])
            X[:, cols] = flat_result.reshape(X.shape[0], len(cols))
        return X.astype(int)


class TeamBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}
        log.info(f"TeamBagEncoder: {len(self.d)} unique teams found.")
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j == 0 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)


class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = np.asarray(X)
        self.d = {val: idx for idx, val in enumerate(np.unique(X.flatten()))}        
        log.info(f"PlayerBagEncoder: {len(self.d)} unique players found.")
        return self
    def transform(self, X):
        X = np.asarray(X)
        rows, cols, data = [], [], []
        for i, row in enumerate(X):
            for j, val in enumerate(row):
                if val in self.d:
                    rows.append(i)
                    cols.append(self.d[val])
                    data.append(1 if j < 5 else -1)
        return sparse.csr_matrix((data, (rows, cols)), shape=(X.shape[0], len(self.d)), dtype=int)




In [None]:
# ------------------------------
# MAIN PIPELINE / TRAINING
# ------------------------------
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

# Load features
X_game_info_train, X_team_id_train, X_player_id_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_game_info_test, X_team_id_test, X_player_id_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# Label encode player IDs
le_players = LabelEncoderPreprocessor(together_cols=[list(range(X_player_id_train.shape[1]))])
le_players.fit(X_player_id_train)
X_player_id_train_enc = le_players.transform(X_player_id_train)
X_player_id_test_enc = le_players.transform(X_player_id_test)

le_teams = LabelEncoderPreprocessor(together_cols=[list(range(X_team_id_train.shape[1]))])
le_players.fit(X_team_id_train)
X_team_id_train_enc = le_players.transform(X_team_id_train)
X_team_id_test_enc = le_players.transform(X_team_id_test)

# Create bag-of-players sparse matrices
player_encoder = PlayerBagEncoder()
player_encoder.fit(X_player_id_train_enc)
X_train_players_bag = player_encoder.transform(X_player_id_train_enc)
X_test_players_bag = player_encoder.transform(X_player_id_test_enc)



In [None]:
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score



In [15]:
auc

0.726438188494492

In [3]:
X_player_id_train

array([[17525, 17539, 17540, ..., 17811, 17833, 17834],
       [17525, 17539, 17540, ..., 17811, 17833, 17834],
       [17520, 17523, 17555, ..., 17499, 17500, 17543],
       ...,
       [20678, 20684, 20692, ..., 17501, 17543, 19666],
       [20678, 20684, 20692, ..., 17501, 17543, 19666],
       [20569, 25589, 27555, ..., 21433, 21439, 25439]], shape=(37974, 10))

In [4]:
X_player_id_test

array([[18715, 18717, 24818, 27088, 29632, 17728, 17730, 20370, 21440,
        23684],
       [18169, 18338, 20565, 23742, 24106, 17580, 17727, 24103, 26620,
        26672],
       [17685, 20392, 24713, 27743, 28430, 17561, 17563, 24243, 29152,
        30743],
       [18715, 18717, 24818, 27088, 29632, 17728, 17730, 20370, 21440,
        23684],
       [17685, 20392, 24713, 27743, 28430, 17561, 17563, 24243, 29152,
        30743],
       [17512, 17527, 17528, 17585, 20561, 17592, 17656, 17657, 17714,
        17725],
       [19670, 21427, 21721, 24852, 31864, 17581, 17748, 21071, 30260,
        30769],
       [17685, 20392, 24713, 27743, 28430, 17561, 17563, 24243, 29152,
        30743],
       [17512, 17527, 17528, 17585, 20561, 17592, 17656, 17657, 17714,
        17725],
       [19670, 21427, 21721, 24852, 31864, 17581, 17748, 21071, 30260,
        30769],
       [20509, 23967, 23969, 27110, 33150, 18308, 25345, 28461, 33570,
        36298],
       [20509, 23967, 23969, 27110, 33150, 

TypeError: object of type 'int' has no len()

In [14]:
X_player_id_train

array([[17525, 17539, 17540, ..., 17811, 17833, 17834],
       [17525, 17539, 17540, ..., 17811, 17833, 17834],
       [17520, 17523, 17555, ..., 17499, 17500, 17543],
       ...,
       [20678, 20684, 20692, ..., 17501, 17543, 19666],
       [20678, 20684, 20692, ..., 17501, 17543, 19666],
       [20569, 25589, 27555, ..., 21433, 21439, 25439]], shape=(37974, 10))