In [None]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin, clone
from scipy import sparse
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import hashlib
import pickle

# ----------------------- Data loading -----------------------
def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    """Yield the parsed JSON content of every readable file in a directory.

    Files that cannot be opened, decoded as UTF-8 or parsed as JSON are
    skipped silently (best effort). Progress is reported through ``tqdm``.

    Args:
        path_to_games_raw_dir: Directory whose entries are loaded one by one.

    Yields:
        The object returned by ``json.load`` for each readable file.
    """
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        file_path = os.path.join(path_to_games_raw_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
        except (OSError, ValueError):
            # OSError: unreadable entry (e.g. a sub-directory);
            # ValueError: invalid JSON or invalid UTF-8.
            continue
        # Yield outside the try block: a handler wrapped around ``yield``
        # would also intercept GeneratorExit when the consumer stops early,
        # and the generator would then fail with
        # "generator ignored GeneratorExit".
        yield game

def validate_game(game: dict) -> bool:
    """Return True when a raw game record has everything the pipeline needs.

    A record is accepted only if all of the following hold:

    * ``id``, the league/serie/tournament ids and the map id are
      int-convertible, and ``begin_at`` is accepted by ``dateutil``'s
      ``parse``;
    * ``players`` references exactly two teams, each with five distinct
      player ids;
    * every round has a non-null ``round`` number and its ``ct``,
      ``terrorists`` and ``winner_team`` are one of the two teams;
    * round numbers start at 1 and reach at least 16.

    Missing keys or malformed values make the record invalid instead of
    raising.
    """
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        if len(team_players) != 2:
            return False
        if any(len(set(p_ids)) != 5 for p_ids in team_players.values()):
            return False

        team_ids = set(team_players)
        rounds = []
        for r in game["rounds"]:
            if r["round"] is None:
                return False
            if not {r["ct"], r["terrorists"], r["winner_team"]} <= team_ids:
                return False
            rounds.append(r["round"])

        # min()/max() raise ValueError on an empty round list, which is
        # reported as "invalid" by the handler below.
        return min(rounds) == 1 and max(rounds) >= 16
    except Exception:
        # Any structural problem means "not usable". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate
        # so a long validation run can still be interrupted.
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    """Return the ids of all valid games, ordered chronologically.

    Every file yielded by ``generate_game_raw`` is checked with
    ``validate_game``; the ids of the accepted games are returned sorted by
    their ``begin_at`` timestamp.

    Games sharing the same timestamp are ordered by numeric id, so the result
    (and any train/test split derived from it) does not depend on the order
    in which the directory happens to be listed.

    Args:
        path_to_games_raw_dir: Directory containing the raw game JSON files.

    Returns:
        Game ids, oldest game first.
    """
    valid_games = []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            # validate_game has already confirmed that both conversions work.
            valid_games.append((parse(game["begin_at"]), int(game["id"]), game["id"]))

    valid_games.sort(key=lambda item: item[:2])
    return [game_id for _, _, game_id in valid_games]

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    """Build the raw feature matrix and labels for the given games.

    For each id the file ``<game_id>.json`` is loaded from
    ``path_to_games_raw``. A feature row is
    ``[team1_id, team2_id, *sorted(team1 players), *sorted(team2 players)]``
    where team1 is the team with the smaller id. The label is 1 when team1
    won more rounds than team2, otherwise 0.

    Games that cannot be read or do not have the expected structure are
    skipped (best effort).

    Args:
        path_to_games_raw: Directory containing the raw game JSON files.
        game_ids: Ids of the games to load, in the desired row order.

    Returns:
        Tuple ``(X, y)`` of numpy arrays with one entry per loaded game.
    """
    X, y = [], []
    for game_id in tqdm(game_ids):
        file_path = os.path.join(path_to_games_raw, f"{game_id}.json")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)

            team_players = defaultdict(list)
            for p in game["players"]:
                team_players[p["team"]["id"]].append(p["player"]["id"])
            t1_id, t2_id = sorted(team_players.keys())
            features = [t1_id, t2_id] + sorted(team_players[t1_id]) + sorted(team_players[t2_id])

            team_win_count = {t1_id: 0, t2_id: 0}
            for r in game["rounds"]:
                team_win_count[r["winner_team"]] += 1
            label = int(team_win_count[t1_id] > team_win_count[t2_id])
        except (OSError, ValueError, KeyError, TypeError):
            continue

        # Append only once both parts are known: a failure while counting
        # rounds must not leave a feature row without its label.
        X.append(features)
        y.append(label)
    return np.array(X), np.array(y)

# ----------------------- Column selector -----------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns of X."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; present for the scikit-learn transformer API."""
        return self

    def transform(self, X):
        """Return ``X`` (coerced to an ndarray) restricted to ``self.columns``."""
        as_array = np.asarray(X)
        selected = as_array[:, self.columns]
        return selected

# ----------------------- Bag encoders -----------------------
class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-players encoder.

    Each distinct id seen during ``fit`` gets one output column. In
    ``transform`` the ids in the first half of a row contribute +1 to their
    column and the ids in the second half contribute -1; ids unseen during
    ``fit`` are ignored.
    """

    def __init__(self):
        self.player_dict = None

    def fit(self, X, y=None):
        """Assign a column index to every distinct id in ``X``."""
        known_players = np.unique(np.asarray(X).flatten())
        self.player_dict = dict(zip(known_players, range(len(known_players))))
        return self

    def transform(self, X):
        """Encode ``X`` as a sparse CSR matrix of +1 / -1 entries."""
        X = np.asarray(X)
        lookup = self.player_dict
        row_idx, col_idx, values = [], [], []

        for sample_no, roster in enumerate(X):
            half = len(roster) // 2
            for position, player in enumerate(roster):
                column = lookup.get(player)
                if column is None:
                    # Id was not present when the encoder was fitted.
                    continue
                row_idx.append(sample_no)
                col_idx.append(column)
                values.append(-1 if position >= half else 1)

        out_shape = (X.shape[0], len(lookup))
        return sparse.csr_matrix((values, (row_idx, col_idx)), shape=out_shape, dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-teams encoder.

    Each distinct id seen during ``fit`` gets one output column. In
    ``transform`` the id in the first position of a row contributes +1 and
    every other position contributes -1; ids unseen during ``fit`` are
    ignored.
    """

    def __init__(self):
        self.team_dict = None

    def fit(self, X, y=None):
        """Assign a column index to every distinct id in ``X``."""
        known_teams = np.unique(np.asarray(X).flatten())
        self.team_dict = dict(zip(known_teams, range(len(known_teams))))
        return self

    def transform(self, X):
        """Encode ``X`` as a sparse CSR matrix of +1 / -1 entries."""
        X = np.asarray(X)
        lookup = self.team_dict
        row_idx, col_idx, values = [], [], []

        for sample_no, pairing in enumerate(X):
            for slot, team in enumerate(pairing):
                column = lookup.get(team)
                if column is None:
                    # Id was not present when the encoder was fitted.
                    continue
                row_idx.append(sample_no)
                col_idx.append(column)
                values.append(-1 if slot else 1)

        out_shape = (X.shape[0], len(lookup))
        return sparse.csr_matrix((values, (row_idx, col_idx)), shape=out_shape, dtype=int)

from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import shuffle
from scipy import sparse
import numpy as np
from sklearn.metrics import roc_auc_score

class RecursiveTimeSeriesPermutationSelector(BaseEstimator, TransformerMixin):
    """Recursive feature elimination driven by permutation importance.

    In every iteration a clone of ``estimator`` is fitted on each
    ``TimeSeriesSplit`` training fold and scored (ROC AUC) on the matching
    validation fold. The importance of a feature is the drop in ROC AUC after
    its validation column is shuffled, averaged over the folds. Features
    whose average importance is not positive are removed and the procedure
    repeats.

    The recursion stops when no feature would be removed, when every
    remaining feature would be removed, or when the mean ROC AUC stops
    improving; in the last case the feature set of the previous iteration is
    restored.

    Parameters:
        estimator: Classifier exposing ``fit`` and ``predict_proba``.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_repeats: Number of shuffles per feature and fold.
        random_state: Seed of the generator that derives all shuffle seeds.

    Attributes:
        mask_: Boolean mask over the input columns, set by ``fit``.
    """

    def __init__(self, estimator, n_splits=10, n_repeats=1, random_state=42):
        self.estimator = estimator
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.random_state = random_state
        self.mask_ = None

    def _perm_feature_score(self, est, X_val, y_val, i, rng):
        """Return the mean ROC AUC of ``est`` with column ``i`` shuffled.

        ``rng`` is a ``numpy.random.RandomState`` that supplies one shuffle
        seed per repeat.
        """
        # Densify once; every repeat only overwrites column ``i``.
        X_perm = X_val.toarray() if sparse.issparse(X_val) else X_val.copy()
        original_column = X_perm[:, i].copy()
        perm_scores = []
        for _ in range(self.n_repeats):
            # Upper bound stays inside int32 so the call also works where
            # numpy's default integer is 32 bits wide.
            seed = rng.randint(0, 2**31 - 1)
            X_perm[:, i] = shuffle(original_column, random_state=seed)
            y_pred = est.predict_proba(X_perm)[:, 1]
            perm_scores.append(roc_auc_score(y_val, y_pred))
        return np.mean(perm_scores)

    def fit(self, X, y):
        """Select features; the result is stored in ``mask_``."""
        if sparse.issparse(X):
            X = X.tocsr()
        else:
            X = np.asarray(X)
        y = np.asarray(y)

        mask_all = np.ones(X.shape[1], dtype=bool)
        rng = np.random.RandomState(self.random_state)
        previous_mean_score = -np.inf
        previous_mask = mask_all.copy()

        iteration = 0
        while True:
            iteration += 1
            print(f"Iteration {iteration}: {mask_all.sum()} features")
            importances_accum = np.zeros(mask_all.sum())
            X_selected = X[:, mask_all]
            tscv = TimeSeriesSplit(n_splits=self.n_splits)

            iteration_scores = []

            for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_selected), 1):
                X_train, X_val = X_selected[train_idx], X_selected[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                est = clone(self.estimator)
                est.fit(X_train, y_train)

                y_pred_orig = est.predict_proba(X_val)[:, 1]
                score_orig = roc_auc_score(y_val, y_pred_orig)
                iteration_scores.append(score_orig)

                print(f"  Fold {fold_idx}/{self.n_splits} - Base ROC AUC: {score_orig:.4f}")

                # Draw one seed per feature in the parent process. Handing the
                # shared generator to the workers would give each task a copy
                # in the same state, i.e. identical shuffles for all features.
                task_seeds = rng.randint(0, 2**31 - 1, size=X_selected.shape[1])
                perm_scores = Parallel(n_jobs=-1)(
                    delayed(self._perm_feature_score)(
                        est, X_val, y_val, i, np.random.RandomState(task_seeds[i])
                    )
                    for i in range(X_selected.shape[1])
                )

                importances_accum += score_orig - np.array(perm_scores)

            mean_iteration_score = np.mean(iteration_scores)
            print(f"Iteration {iteration} - Mean base ROC AUC: {mean_iteration_score:.4f}")

            # Stop if the last elimination step did not improve the score and
            # fall back to the feature set that produced the better score.
            if mean_iteration_score <= previous_mean_score:
                print("Mean ROC AUC did not improve after feature elimination. Stopping recursion.")
                mask_all = previous_mask
                break

            importances_avg = importances_accum / self.n_splits
            mask_iteration = importances_avg > 0
            if mask_iteration.sum() == mask_all.sum():
                print("No features removed this iteration. Stopping recursion.")
                break
            if not mask_iteration.any():
                # Removing everything would leave nothing to fit on.
                print("All remaining features have non-positive importance. Stopping recursion.")
                break

            previous_mean_score = mean_iteration_score
            previous_mask = mask_all.copy()
            idx_remaining = np.where(mask_all)[0]
            mask_all[idx_remaining[~mask_iteration]] = False

        self.mask_ = mask_all
        print(f"Final selected features: {mask_all.sum()} / {mask_all.size}")
        return self

    def transform(self, X):
        """Return ``X`` restricted to the selected columns."""
        if self.mask_ is None:
            raise RuntimeError("You must fit before calling transform.")
        return X[:, self.mask_] if sparse.issparse(X) else np.asarray(X)[:, self.mask_]

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the reduced ``X``."""
        return self.fit(X, y).transform(X)



# ----------------------- Metrics -----------------------
def get_metrics(y_true, y_pred, y_proba):
    """Compute binary-classification metrics as plain Python numbers.

    Args:
        y_true: True labels (0 or 1).
        y_pred: Predicted labels (0 or 1).
        y_proba: Predicted score/probability of the positive class.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc`` as floats and the confusion-matrix counts ``tp``, ``tn``,
        ``fp`` and ``fn`` as ints.
    """
    # Pin the label set so the matrix is always 2x2; otherwise the 4-way
    # unpacking fails when only one class occurs in y_true and y_pred.
    tn, fp, fn, tp = map(int, confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel())

    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred)),
        "f1_score": float(f1_score(y_true, y_pred)),
        "auc": float(roc_auc_score(y_true, y_proba)),
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }
    
import os
import argparse
from dotenv import load_dotenv

def get_settings():
    """Load pipeline settings from a .env file and environment variables.

    The optional ``--env_file`` command-line argument (default ``.env``)
    names the file to load with ``load_dotenv``; a missing file only produces
    a warning. Settings are then read from the environment:

    * ``PATH_TO_GAMES_RAW_DIR`` (default ``"data/games_raw"``)
    * ``TEST_SIZE`` (default ``100``, converted to ``int``)
    * ``PATH_TO_ML_RESULTS_DIR`` (default ``"data/ml"``)

    The results directory is created if it does not exist.

    Returns:
        Dict with the three settings listed above.

    Raises:
        ValueError: If ``TEST_SIZE`` is set to a non-integer value.
    """
    parser = argparse.ArgumentParser(description="Load ML pipeline settings")
    parser.add_argument(
        "--env_file",
        type=str,
        default=".env",
        help="Path to .env file containing environment variables"
    )
    # parse_known_args tolerates arguments this parser does not define (such
    # as those a notebook kernel or a test runner leaves in sys.argv);
    # parse_args would terminate the process on them.
    args, _unknown = parser.parse_known_args()

    # Load .env file
    if os.path.exists(args.env_file):
        load_dotenv(args.env_file)
    else:
        print(f"Warning: .env file '{args.env_file}' not found. Using defaults or env vars.")

    # Read settings from environment variables or set defaults
    settings = {
        "PATH_TO_GAMES_RAW_DIR": os.getenv("PATH_TO_GAMES_RAW_DIR", "data/games_raw"),
        "TEST_SIZE": int(os.getenv("TEST_SIZE", 100)),
        "PATH_TO_ML_RESULTS_DIR": os.getenv("PATH_TO_ML_RESULTS_DIR", "data/ml")
    }

    # Ensure results directory exists
    os.makedirs(settings["PATH_TO_ML_RESULTS_DIR"], exist_ok=True)

    return settings

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

class RecursiveL1Selector(BaseEstimator, TransformerMixin):
    """Recursive feature selection with L1-regularised logistic regression.

    In every iteration an L1 ``LogisticRegression`` is fitted on each
    ``TimeSeriesSplit`` training fold of the currently selected features. A
    feature survives when its coefficient is non-zero in at least half of the
    folds. The procedure repeats on the survivors until an iteration removes
    nothing (or would remove everything).

    Parameters:
        C: Inverse regularisation strength of the L1 models.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Attributes:
        features_mask_: Boolean mask over the *original* input columns, set
            by ``fit``.
    """

    def __init__(self, C=1.0, n_splits=10):
        self.C = C
        self.n_splits = n_splits

    def fit(self, X, y):
        """Run the recursive selection and store ``features_mask_``."""
        # If X is sparse, convert it to a dense array.
        X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        y = np.asarray(y)

        # The mask always refers to the original columns, so that transform()
        # can be applied to data with the full feature set.
        global_mask = np.ones(X_dense.shape[1], dtype=bool)

        while True:
            X_sel = X_dense[:, global_mask]
            tscv = TimeSeriesSplit(n_splits=self.n_splits)
            masks = []

            for train_idx, _ in tscv.split(X_sel):
                model = LogisticRegression(
                    solver="liblinear",
                    penalty="l1",
                    C=self.C,
                    max_iter=1000,
                    random_state=42
                )
                model.fit(X_sel[train_idx], y[train_idx])
                masks.append(model.coef_[0] != 0)

            majority_mask = np.vstack(masks).mean(axis=0) >= 0.5

            # Converged, or the next step would leave no feature at all.
            if majority_mask.all() or not majority_mask.any():
                break

            # majority_mask is relative to the current survivors; map it back
            # to positions in the original feature space.
            global_mask[np.flatnonzero(global_mask)[~majority_mask]] = False

        self.features_mask_ = global_mask
        return self

    def transform(self, X):
        """Return a dense array holding only the selected columns of ``X``."""
        X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        return X_dense[:, self.features_mask_]



# ----------------------- Load data -----------------------
# Directory with one raw JSON file per game and the number of most recent
# games held out for the final test.
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

# get_game_ids returns the valid game ids ordered by begin_at, so slicing off
# the tail gives a chronological split: the last TEST_SIZE games form the
# test set, everything before them the training set.
game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# ----------------------- Pipeline -----------------------
# Column layout produced by get_X_y: columns 0-1 hold the two team ids, the
# remaining columns hold the player ids of both teams.
team_cols = [0, 1]
player_cols = list(range(2, X_train.shape[1]))

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import numpy as np

# ----------------------- Pipeline -----------------------
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import numpy as np
from scipy import sparse

# ----------------------- Corrected RecursiveL1Selector -----------------------
class RecursiveL1Selector(BaseEstimator, TransformerMixin):
    """Feature selector keeping the columns every L1 fold model uses.

    One L1-regularised ``LogisticRegression`` is fitted per
    ``TimeSeriesSplit`` training fold; a feature is kept only if its
    coefficient is non-zero in all of them.

    Parameters:
        C: Inverse regularisation strength of the L1 models.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Attributes:
        features_mask_: Boolean mask over the input columns, set by ``fit``.
    """

    def __init__(self, C=1.0, n_splits=10):
        self.C = C
        self.n_splits = n_splits
        self.features_mask_ = None

    def fit(self, X, y):
        """Fit one L1 model per fold and intersect their non-zero masks."""
        dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        surviving = np.ones(dense.shape[1], dtype=bool)

        splitter = TimeSeriesSplit(n_splits=self.n_splits)
        for fit_rows, _ in splitter.split(dense):
            l1_model = LogisticRegression(
                solver="liblinear",
                penalty="l1",
                C=self.C,
                max_iter=1000,
                random_state=42
            )
            l1_model.fit(dense[fit_rows], y[fit_rows])
            # Intersection: keep only features that survived in every fold.
            surviving &= l1_model.coef_[0] != 0

        self.features_mask_ = surviving
        return self

    def transform(self, X):
        """Return a dense array holding only the selected columns of ``X``."""
        dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        return dense[:, self.features_mask_]

# ----------------------- Pipeline with RFECV -----------------------
# Stages: signed bag encoding of teams and players -> L1-based feature
# selection -> RFECV on the surviving features -> final logistic regression.
full_pipeline_with_rfecv = Pipeline([
    ("features", FeatureUnion([
        ("team_bag", Pipeline([
            ("select_teams", ColumnSelector(team_cols)),
            ("team_encoder", TeamBagEncoder())
        ])),
        ("player_bag", Pipeline([
            ("select_players", ColumnSelector(player_cols)),
            ("player_encoder", PlayerBagEncoder())
        ]))
    ])),
    ("recursive_l1", RecursiveL1Selector(n_splits=10)),
    ("rfecv", RFECV(
        estimator=LogisticRegression(
            solver="liblinear",
            max_iter=1000,
            random_state=42
        ),
        step=1,
        cv=TimeSeriesSplit(n_splits=5),  # fewer folds to speed things up
        scoring="roc_auc",
        n_jobs=-1
    )),
    ("logit", LogisticRegression(
        solver="liblinear",
        max_iter=1000,
        random_state=42
    ))
])

# ----------------------- Grid search over C -----------------------
# Only the regularisation strength of the L1 selector is tuned.
param_grid = {
    "recursive_l1__C": np.linspace(0.1, 1.0, 10)  # C from 0.1 to 1.0, 10 values
}

tscv = TimeSeriesSplit(n_splits=10)

# NOTE(review): both GridSearchCV and the inner RFECV request n_jobs=-1;
# nested parallelism may oversubscribe the CPU - confirm this is intended.
grid_search = GridSearchCV(
    full_pipeline_with_rfecv,
    param_grid,
    scoring="roc_auc",
    cv=tscv,
    n_jobs=-1,
    verbose=2
)

# ----------------------- Training -----------------------
grid_search.fit(X_train, y_train)

print(f"Лучший параметр C: {grid_search.best_params_['recursive_l1__C']}")
print(f"Лучший ROC-AUC на TimeSeriesSplit: {grid_search.best_score_:.4f}")

# ----------------------- Predictions on the test set -----------------------
best_pipeline = grid_search.best_estimator_
y_test_pred = best_pipeline.predict(X_test)
y_test_proba = best_pipeline.predict_proba(X_test)[:, 1]

# Report the held-out metrics of the best pipeline.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
print("\nМетрики на тесте после L1 + RFECV:")
for k, v in metrics.items():
    print(f"{k}: {v}")





# # ----------------------- Predict & metrics -----------------------
# y_test_pred = pipeline.predict(X_test)
# y_test_proba = pipeline.predict_proba(X_test)[:, 1]
# metrics = get_metrics(y_test, y_test_pred, y_test_proba)

# # ----------------------- Hash & save -----------------------
# game_ids_bytes = json.dumps(game_ids, sort_keys=True).encode("utf-8")
# game_ids_hash = hashlib.md5(game_ids_bytes).hexdigest()

# with open(f"{game_ids_hash}.json", "w") as f:
#     json.dump(metrics, f, indent=4)

# with open(f"{game_ids_hash}.pickle", "wb") as f:
#     pickle.dump(pipeline, f)

# print(f"Metrics saved to {game_ids_hash}.json")
# print(f"Pipeline saved to {game_ids_hash}.pickle")


  0%|          | 0/56493 [00:00<?, ?it/s]

100%|██████████| 56493/56493 [00:16<00:00, 3375.21it/s]
100%|██████████| 37974/37974 [00:07<00:00, 5023.54it/s]
100%|██████████| 100/100 [00:00<00:00, 4439.03it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits





[CV] END ................................recursive_l1__C=0.1; total time=   0.3s
[CV] END ................................recursive_l1__C=0.1; total time=   1.9s
[CV] END ................................recursive_l1__C=0.1; total time=   4.7s
[CV] END ................................recursive_l1__C=0.2; total time=   0.5s




[CV] END ................................recursive_l1__C=0.1; total time=   7.3s




[CV] END ................................recursive_l1__C=0.2; total time=   3.8s




[CV] END ................................recursive_l1__C=0.1; total time=  11.2s




[CV] END ................................recursive_l1__C=0.2; total time=   6.8s




[CV] END ................................recursive_l1__C=0.1; total time=  14.6s


In [None]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# ----------------------- Data loading -----------------------
def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    """Yield the parsed JSON content of every readable file in a directory.

    Files that cannot be opened, decoded as UTF-8 or parsed as JSON are
    skipped silently (best effort). Progress is reported through ``tqdm``.

    Args:
        path_to_games_raw_dir: Directory whose entries are loaded one by one.

    Yields:
        The object returned by ``json.load`` for each readable file.
    """
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        file_path = os.path.join(path_to_games_raw_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
        except (OSError, ValueError):
            # OSError: unreadable entry (e.g. a sub-directory);
            # ValueError: invalid JSON or invalid UTF-8.
            continue
        # Yield outside the try block: a handler wrapped around ``yield``
        # would also intercept GeneratorExit when the consumer stops early,
        # and the generator would then fail with
        # "generator ignored GeneratorExit".
        yield game

def validate_game(game: dict) -> bool:
    """Return True when a raw game record has everything the pipeline needs.

    A record is accepted only if all of the following hold:

    * ``id``, the league/serie/tournament ids and the map id are
      int-convertible, and ``begin_at`` is accepted by ``dateutil``'s
      ``parse``;
    * ``players`` references exactly two teams, each with five distinct
      player ids;
    * every round has a non-null ``round`` number and its ``ct``,
      ``terrorists`` and ``winner_team`` are one of the two teams;
    * round numbers start at 1 and reach at least 16.

    Missing keys or malformed values make the record invalid instead of
    raising.
    """
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        if len(team_players) != 2:
            return False
        if any(len(set(p_ids)) != 5 for p_ids in team_players.values()):
            return False

        team_ids = set(team_players)
        rounds = []
        for r in game["rounds"]:
            if r["round"] is None:
                return False
            if not {r["ct"], r["terrorists"], r["winner_team"]} <= team_ids:
                return False
            rounds.append(r["round"])

        # min()/max() raise ValueError on an empty round list, which is
        # reported as "invalid" by the handler below.
        return min(rounds) == 1 and max(rounds) >= 16
    except Exception:
        # Any structural problem means "not usable". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate
        # so a long validation run can still be interrupted.
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    """Return the ids of all valid games, ordered chronologically.

    Every file yielded by ``generate_game_raw`` is checked with
    ``validate_game``; the ids of the accepted games are returned sorted by
    their ``begin_at`` timestamp.

    Games sharing the same timestamp are ordered by numeric id, so the result
    (and any train/test split derived from it) does not depend on the order
    in which the directory happens to be listed.

    Args:
        path_to_games_raw_dir: Directory containing the raw game JSON files.

    Returns:
        Game ids, oldest game first.
    """
    valid_games = []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            # validate_game has already confirmed that both conversions work.
            valid_games.append((parse(game["begin_at"]), int(game["id"]), game["id"]))

    valid_games.sort(key=lambda item: item[:2])
    return [game_id for _, _, game_id in valid_games]

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    """Build the raw feature matrix and labels for the given games.

    For each id the file ``<game_id>.json`` is loaded from
    ``path_to_games_raw``. A feature row is
    ``[team1_id, team2_id, *sorted(team1 players), *sorted(team2 players)]``
    where team1 is the team with the smaller id. The label is 1 when team1
    won more rounds than team2, otherwise 0.

    Games that cannot be read or do not have the expected structure are
    skipped (best effort).

    Args:
        path_to_games_raw: Directory containing the raw game JSON files.
        game_ids: Ids of the games to load, in the desired row order.

    Returns:
        Tuple ``(X, y)`` of numpy arrays with one entry per loaded game.
    """
    X, y = [], []
    for game_id in tqdm(game_ids):
        file_path = os.path.join(path_to_games_raw, f"{game_id}.json")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)

            team_players = defaultdict(list)
            for p in game["players"]:
                team_players[p["team"]["id"]].append(p["player"]["id"])
            t1_id, t2_id = sorted(team_players.keys())
            features = [t1_id, t2_id] + sorted(team_players[t1_id]) + sorted(team_players[t2_id])

            team_win_count = {t1_id: 0, t2_id: 0}
            for r in game["rounds"]:
                team_win_count[r["winner_team"]] += 1
            label = int(team_win_count[t1_id] > team_win_count[t2_id])
        except (OSError, ValueError, KeyError, TypeError):
            continue

        # Append only once both parts are known: a failure while counting
        # rounds must not leave a feature row without its label.
        X.append(features)
        y.append(label)
    return np.array(X), np.array(y)

# ----------------------- Column selector -----------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns of X."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; present for the scikit-learn transformer API."""
        return self

    def transform(self, X):
        """Return ``X`` (coerced to an ndarray) restricted to ``self.columns``."""
        as_array = np.asarray(X)
        selected = as_array[:, self.columns]
        return selected

# ----------------------- Bag encoders -----------------------
class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-players encoder.

    Each distinct id seen during ``fit`` gets one output column. In
    ``transform`` the ids in the first half of a row contribute +1 to their
    column and the ids in the second half contribute -1; ids unseen during
    ``fit`` are ignored.
    """

    def __init__(self):
        self.player_dict = None

    def fit(self, X, y=None):
        """Assign a column index to every distinct id in ``X``."""
        known_players = np.unique(np.asarray(X).flatten())
        self.player_dict = dict(zip(known_players, range(len(known_players))))
        return self

    def transform(self, X):
        """Encode ``X`` as a sparse CSR matrix of +1 / -1 entries."""
        X = np.asarray(X)
        lookup = self.player_dict
        row_idx, col_idx, values = [], [], []

        for sample_no, roster in enumerate(X):
            half = len(roster) // 2
            for position, player in enumerate(roster):
                column = lookup.get(player)
                if column is None:
                    # Id was not present when the encoder was fitted.
                    continue
                row_idx.append(sample_no)
                col_idx.append(column)
                values.append(-1 if position >= half else 1)

        out_shape = (X.shape[0], len(lookup))
        return sparse.csr_matrix((values, (row_idx, col_idx)), shape=out_shape, dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-teams encoder.

    Each distinct id seen during ``fit`` gets one output column. In
    ``transform`` the id in the first position of a row contributes +1 and
    every other position contributes -1; ids unseen during ``fit`` are
    ignored.
    """

    def __init__(self):
        self.team_dict = None

    def fit(self, X, y=None):
        """Assign a column index to every distinct id in ``X``."""
        known_teams = np.unique(np.asarray(X).flatten())
        self.team_dict = dict(zip(known_teams, range(len(known_teams))))
        return self

    def transform(self, X):
        """Encode ``X`` as a sparse CSR matrix of +1 / -1 entries."""
        X = np.asarray(X)
        lookup = self.team_dict
        row_idx, col_idx, values = [], [], []

        for sample_no, pairing in enumerate(X):
            for slot, team in enumerate(pairing):
                column = lookup.get(team)
                if column is None:
                    # Id was not present when the encoder was fitted.
                    continue
                row_idx.append(sample_no)
                col_idx.append(column)
                values.append(-1 if slot else 1)

        out_shape = (X.shape[0], len(lookup))
        return sparse.csr_matrix((values, (row_idx, col_idx)), shape=out_shape, dtype=int)

# ----------------------- Recursive L1 Selector -----------------------
class RecursiveL1Selector(BaseEstimator, TransformerMixin):
    """Recursive feature selection with L1-regularised logistic regression.

    In every iteration an L1 ``LogisticRegression`` is fitted on each
    ``TimeSeriesSplit`` training fold of the currently selected features. A
    feature survives when its coefficient is non-zero in at least half of the
    folds. The procedure repeats on the survivors until an iteration removes
    nothing or the ``min_features`` limit is reached.

    Parameters:
        C: Inverse regularisation strength of the L1 models.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        min_features: Lower bound on the number of selected features; a
            pruning step that would go below it is not applied.

    Attributes:
        features_mask_: Boolean mask over the original input columns, set by
            ``fit``.
    """

    def __init__(self, C=0.5, n_splits=10, min_features=1):
        self.C = C
        self.n_splits = n_splits
        self.min_features = min_features
        self.features_mask_ = None

    def fit(self, X, y):
        """Run the recursive selection and store ``features_mask_``."""
        X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        y = np.asarray(y)
        mask_all = np.ones(X_dense.shape[1], dtype=bool)
        iteration = 0

        while True:
            iteration += 1
            print(f"L1 Iteration {iteration}: {mask_all.sum()} features remaining")
            # Slice the surviving columns once per iteration, not per fold.
            X_current = X_dense[:, mask_all]
            tscv = TimeSeriesSplit(n_splits=self.n_splits)
            masks = []

            for train_idx, _ in tscv.split(X_current):
                model = LogisticRegression(solver="liblinear", penalty="l1",
                                           C=self.C, max_iter=1000, random_state=42)
                model.fit(X_current[train_idx], y[train_idx])
                masks.append(model.coef_[0] != 0)

            majority_mask = np.vstack(masks).mean(axis=0) >= 0.5

            # Stop when nothing would be removed, or when the step would
            # leave fewer than min_features (in that case it is not applied).
            if majority_mask.all() or majority_mask.sum() < self.min_features:
                break

            # majority_mask is relative to the current survivors; map it back
            # to positions in the original feature space.
            mask_all[np.flatnonzero(mask_all)[~majority_mask]] = False

            if mask_all.sum() <= self.min_features:
                break

        self.features_mask_ = mask_all
        print(f"Final selected features after Recursive L1: {mask_all.sum()}")
        return self

    def transform(self, X):
        """Return a dense array holding only the selected columns of ``X``."""
        X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        return X_dense[:, self.features_mask_]

# ----------------------- Metrics -----------------------
def get_metrics(y_true, y_pred, y_proba):
    """Compute binary-classification metrics as plain Python numbers.

    Args:
        y_true: True labels (0 or 1).
        y_pred: Predicted labels (0 or 1).
        y_proba: Predicted score/probability of the positive class.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc`` as floats and the confusion-matrix counts ``tp``, ``tn``,
        ``fp`` and ``fn`` as ints.
    """
    acc = float(accuracy_score(y_true, y_pred))
    prec = float(precision_score(y_true, y_pred))
    rec = float(recall_score(y_true, y_pred))
    f1 = float(f1_score(y_true, y_pred))
    auc = float(roc_auc_score(y_true, y_proba))
    # Pin the label set so the matrix is always 2x2; otherwise the 4-way
    # unpacking fails when only one class occurs in y_true and y_pred.
    tn, fp, fn, tp = map(int, confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel())
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1_score": f1, "auc": auc,
            "tp": tp, "tn": tn, "fp": fp, "fn": fn}

# ----------------------- Load data -----------------------
# Directory with one raw JSON file per game and the number of most recent
# games held out for testing.
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

# get_game_ids returns ids ordered by begin_at, so the last TEST_SIZE games
# form the test set and everything before them the training set.
game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# Column layout produced by get_X_y: columns 0-1 hold the two team ids, the
# remaining columns hold the player ids of both teams.
team_cols = [0, 1]
player_cols = list(range(2, X_train.shape[1]))

# Best configuration found so far across the sweep over C.
best_auc = -np.inf
best_C = None
best_model = None
best_l1_selector = None
best_rfecv = None
best_encoder = None

# Manual sweep over the regularisation strength of the L1 selector.
for C in np.linspace(0.1, 1.0, 10):
    print(f"\n=== Запуск с C = {C:.2f} ===")

    # ----------------------- Feature encoding -----------------------
    # NOTE(review): the encoder does not depend on C, yet it is rebuilt and
    # refitted in every iteration.
    full_encoder = FeatureUnion([
        ("team_bag", Pipeline([
            ("select_teams", ColumnSelector(team_cols)),
            ("team_encoder", TeamBagEncoder())
        ])),
        ("player_bag", Pipeline([
            ("select_players", ColumnSelector(player_cols)),
            ("player_encoder", PlayerBagEncoder())
        ]))
    ])

    X_train_encoded = full_encoder.fit_transform(X_train)
    X_test_encoded = full_encoder.transform(X_test)

    # ----------------------- Step 1: Recursive L1 -----------------------
    l1_selector = RecursiveL1Selector(C=C, n_splits=10)
    l1_selector.fit(X_train_encoded, y_train)
    X_train_l1 = l1_selector.transform(X_train_encoded)
    X_test_l1 = l1_selector.transform(X_test_encoded)

    print(f"Отобрано признаков после L1: {X_train_l1.shape[1]}")

    # ----------------------- Step 2: RFECV -----------------------
    # Removes one feature per step, scored by ROC AUC on time-ordered folds.
    rfecv = RFECV(
        estimator=LogisticRegression(solver="liblinear", max_iter=1000, random_state=42),
        step=1,
        cv=TimeSeriesSplit(n_splits=10),
        scoring="roc_auc",
        n_jobs=-1,
        verbose=2,
    )

    rfecv.fit(X_train_l1, y_train)
    X_train_rfecv = rfecv.transform(X_train_l1)
    X_test_rfecv = rfecv.transform(X_test_l1)

    print(f"Отобрано признаков после RFECV: {X_train_rfecv.shape[1]}")

    # ----------------------- Step 3: Final Logistic Regression -----------------------
    final_model = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
    final_model.fit(X_train_rfecv, y_train)

    y_test_pred = final_model.predict(X_test_rfecv)
    y_test_proba = final_model.predict_proba(X_test_rfecv)[:, 1]

    metrics = get_metrics(y_test, y_test_pred, y_test_proba)
    print(f"\nМетрики на тесте для C={C:.2f}:")
    for k, v in metrics.items():
        print(f"{k}: {v}")

    # ----------------------- Keep the best model -----------------------
    # NOTE(review): C is chosen by AUC on the test set, so the reported best
    # AUC is not an unbiased estimate - consider selecting on a validation
    # split instead.
    if metrics["auc"] > best_auc:
        best_auc = metrics["auc"]
        best_C = C
        best_model = final_model
        best_l1_selector = l1_selector
        best_rfecv = rfecv
        best_encoder = full_encoder

print(f"\n=== Лучшая модель для C={best_C:.2f} с AUC={best_auc:.4f} ===")




100%|██████████| 56493/56493 [00:16<00:00, 3476.84it/s]
100%|██████████| 37974/37974 [00:07<00:00, 5073.52it/s]
100%|██████████| 100/100 [00:00<00:00, 4297.40it/s]



=== Запуск с C = 0.10 ===




L1 Iteration 1: 5800 features remaining
L1 Iteration 2: 187 features remaining
L1 Iteration 3: 183 features remaining
Final selected features after Recursive L1: 183
Отобрано признаков после L1: 183
Fitting estimator with 183 features.
Fitting estimator with 182 features.
Fitting estimator with 183 features.
Fitting estimator with 181 features.
Fitting estimator with 183 features.
Fitting estimator with 180 features.
Fitting estimator with 182 features.
Fitting estimator with 179 features.
Fitting estimator with 183 features.
Fitting estimator with 182 features.
Fitting estimator with 178 features.
Fitting estimator with 177 features.
Fitting estimator with 181 features.
Fitting estimator with 183 features.
Fitting estimator with 181 features.
Fitting estimator with 176 features.
Fitting estimator with 183 features.
Fitting estimator with 180 features.
Fitting estimator with 175 features.
Fitting estimator with 174 features.
Fitting estimator with 180 features.
Fitting estimator with 1



L1 Iteration 1: 5800 features remaining
L1 Iteration 2: 376 features remaining
L1 Iteration 3: 355 features remaining
L1 Iteration 4: 353 features remaining
Final selected features after Recursive L1: 353
Отобрано признаков после L1: 353
Fitting estimator with 353 features.
Fitting estimator with 353 features.
Fitting estimator with 353 features.
Fitting estimator with 353 features.
Fitting estimator with 353 features.
Fitting estimator with 353 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 350 features.
Fitting estimator with 351 features.
Fitting estimator with 349 features.
Fitting estimator with 350 features.
Fitting estimator with 348 features.
Fitting estimator with 352 features.
Fitting estimator with 347 features.
Fitting estimator with 349 features.
Fitting estimator with



L1 Iteration 1: 5800 features remaining
L1 Iteration 2: 513 features remaining
L1 Iteration 3: 479 features remaining
L1 Iteration 4: 475 features remaining
L1 Iteration 5: 473 features remaining
Final selected features after Recursive L1: 473
Отобрано признаков после L1: 473
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 473 features.
Fitting estimator with 472 features.
Fitting estimator with 471 features.
Fitting estimator with 472 features.
Fitting estimator with 472 features.
Fitting estimator with 470 features.
Fitting estimator with 469 features.
Fitting estimator with 471 features.
Fitting estimator with 468 features.
Fitting estimator with 472 features.
Fitting estimator with 467 features.
Fitting estimator with 470 features.
Fitting estimator wi



L1 Iteration 1: 5800 features remaining
L1 Iteration 2: 641 features remaining
L1 Iteration 3: 617 features remaining
L1 Iteration 4: 615 features remaining
L1 Iteration 5: 613 features remaining
L1 Iteration 6: 612 features remaining
Final selected features after Recursive L1: 612
Отобрано признаков после L1: 612
Fitting estimator with 612 features.
Fitting estimator with 612 features.
Fitting estimator with 612 features.
Fitting estimator with 612 features.
Fitting estimator with 612 features.
Fitting estimator with 612 features.
Fitting estimator with 611 features.
Fitting estimator with 612 features.
Fitting estimator with 612 features.
Fitting estimator with 610 features.
Fitting estimator with 611 features.
Fitting estimator with 609 features.
Fitting estimator with 608 features.
Fitting estimator with 611 features.
Fitting estimator with 610 features.
Fitting estimator with 611 features.
Fitting estimator with 607 features.
Fitting estimator with 609 features.
Fitting estimator 

  0%|          | 0/56493 [00:00<?, ?it/s]

100%|██████████| 56493/56493 [00:16<00:00, 3434.61it/s]
100%|██████████| 37974/37974 [00:07<00:00, 5148.92it/s]
100%|██████████| 100/100 [00:00<00:00, 4749.47it/s]


AUC на тесте: 0.7419
accuracy: 0.66
precision: 0.6949152542372882
recall: 0.7192982456140351
f1_score: 0.7068965517241379
auc: 0.7419420644634843
tp: 41
tn: 25
fp: 18
fn: 16




In [None]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# ----------------------- Data loading -----------------------
def generate_game_raw(path_to_games_raw_dir="data/games_raw"):
    """Yield the parsed JSON content of each readable file in a directory.

    Every entry returned by ``os.listdir`` is opened as UTF-8 text and parsed
    with ``json.load``. Entries that cannot be opened or parsed are skipped
    (best effort), so one broken file does not stop the iteration.

    Args:
        path_to_games_raw_dir: Directory whose entries are read.

    Yields:
        The object produced by ``json.load`` for each readable file.
    """
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        file_path = os.path.join(path_to_games_raw_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
        except (OSError, ValueError):
            # OSError: unreadable entry (directory, permissions, ...).
            # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        # Yield outside the try block so that GeneratorExit / KeyboardInterrupt
        # raised at the yield point is never swallowed by the error handler.
        yield game

def validate_game(game):
    """Return True when a raw game record has the structure used downstream.

    A record is accepted when:
      * ``id`` and the league / serie / tournament / map ids convert to int
        and ``begin_at`` is parseable as a date;
      * its players belong to exactly two teams with five distinct player
        ids each;
      * every round has a round number and its ``ct``, ``terrorists`` and
        ``winner_team`` are one of the two teams;
      * the smallest round number is 1 and the largest is at least 16.

    Any missing key, wrong type or unparseable value makes the record
    invalid (``False``) instead of raising.

    Args:
        game: Parsed game record (expected to be a nested dict).

    Returns:
        bool: whether the record passed every check.
    """
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])
        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        if len(team_players) != 2:
            return False
        for p_ids in team_players.values():
            if len(set(p_ids)) != 5:
                return False
        team_ids = list(team_players.keys())
        rounds = []
        for r in game["rounds"]:
            if r["round"] is None or r["ct"] not in team_ids or r["terrorists"] not in team_ids or r["winner_team"] not in team_ids:
                return False
            rounds.append(r["round"])
        # min()/max() raise ValueError on an empty round list, which is
        # reported as "invalid" by the handler below.
        if min(rounds) != 1 or max(rounds) < 16:
            return False
        return True
    except (KeyError, IndexError, TypeError, ValueError, OverflowError, AttributeError):
        # Only structural problems mark the record as invalid; interrupts and
        # unrelated programming errors are allowed to propagate.
        return False

def get_game_ids(path_to_games_raw_dir="data/games_raw"):
    """Return the ids of all valid games ordered by their start time.

    Each record produced by ``generate_game_raw`` is checked with
    ``validate_game``; the ids of accepted records are then reordered by the
    argsort of their parsed ``begin_at`` values.

    Args:
        path_to_games_raw_dir: Directory containing the raw game files.

    Returns:
        list: game ids, earliest ``begin_at`` first.
    """
    accepted = [
        (game["id"], parse(game["begin_at"]))
        for game in generate_game_raw(path_to_games_raw_dir)
        if validate_game(game)
    ]
    ids = [game_id for game_id, _ in accepted]
    start_times = [begin_at for _, begin_at in accepted]
    chronological = np.argsort(start_times)
    return np.array(ids)[chronological].tolist()

def get_X_y(path_to_games_raw, game_ids):
    """Build the feature matrix and labels for the given games.

    For each id the file ``<game_id>.json`` is read from
    ``path_to_games_raw``. The feature row is
    ``[t1_id, t2_id, <sorted t1 player ids>, <sorted t2 player ids>]`` where
    ``t1_id < t2_id``. The label is 1 when team ``t1_id`` won strictly more
    rounds than team ``t2_id`` and 0 otherwise.

    Games that cannot be read or do not have the expected structure are
    skipped entirely, so ``X`` and ``y`` always have the same length.

    Args:
        path_to_games_raw: Directory with one JSON file per game.
        game_ids: Iterable of game ids to load, in the desired row order.

    Returns:
        tuple: ``(X, y)`` as numpy arrays.
    """
    X, y = [], []
    for game_id in tqdm(game_ids):
        file_path = os.path.join(path_to_games_raw, f"{game_id}.json")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
            team_players = defaultdict(list)
            for p in game["players"]:
                team_players[p["team"]["id"]].append(p["player"]["id"])
            # Unpacking fails with ValueError unless there are exactly 2 teams.
            t1_id, t2_id = sorted(team_players)
            features = [t1_id, t2_id] + sorted(team_players[t1_id]) + sorted(team_players[t2_id])
            team_win_count = {t1_id: 0, t2_id: 0}
            for r in game["rounds"]:
                team_win_count[r["winner_team"]] += 1
            label = int(team_win_count[t1_id] > team_win_count[t2_id])
        except (OSError, ValueError, KeyError, IndexError, TypeError):
            # Unreadable / malformed game: skip it as a whole.
            continue
        # Append only after both parts were computed; appending the features
        # first would leave an unlabeled row behind if the label step failed.
        X.append(features)
        y.append(label)
    return np.array(X), np.array(y)

# ----------------------- Column selector -----------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps a fixed set of columns of its input."""

    def __init__(self, columns):
        # Column index (or indices) forwarded to numpy indexing in transform.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Coerce ``X`` with ``np.asarray`` and return ``X[:, columns]``."""
        data = np.asarray(X)
        return data[:, self.columns]

# ----------------------- Bag encoders -----------------------
class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-ids encoder for rows of player ids.

    ``fit`` assigns one output column to every distinct value seen in ``X``.
    ``transform`` writes +1 for values found in the first half of a row's
    positions and -1 for values found in the second half; values that were
    not seen during ``fit`` are ignored.
    """

    def fit(self, X, y=None):
        """Index every distinct value of ``X`` (in ``np.unique`` order)."""
        known_players = np.unique(np.asarray(X).flatten())
        self.player_dict = dict(zip(known_players, range(len(known_players))))
        return self

    def transform(self, X):
        """Return the signed sparse encoding of ``X`` as an int CSR matrix."""
        X = np.asarray(X)
        shape = (X.shape[0], len(self.player_dict))
        row_idx, col_idx, signs = [], [], []
        for i, row in enumerate(X):
            half = len(row) // 2
            for j, player in enumerate(row):
                column = self.player_dict.get(player)
                if column is None:
                    # Unknown id: contributes nothing to the encoding.
                    continue
                row_idx.append(i)
                col_idx.append(column)
                signs.append(1 if j < half else -1)
        return sparse.csr_matrix((signs, (row_idx, col_idx)), shape=shape, dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-ids encoder for rows of team ids.

    ``fit`` assigns one output column to every distinct value seen in ``X``.
    ``transform`` writes +1 for the value in the first position of a row and
    -1 for the values in every other position; values that were not seen
    during ``fit`` are ignored.
    """

    def fit(self, X, y=None):
        """Index every distinct value of ``X`` (in ``np.unique`` order)."""
        known_teams = np.unique(np.asarray(X).flatten())
        self.team_dict = dict(zip(known_teams, range(len(known_teams))))
        return self

    def transform(self, X):
        """Return the signed sparse encoding of ``X`` as an int CSR matrix."""
        X = np.asarray(X)
        shape = (X.shape[0], len(self.team_dict))
        row_idx, col_idx, signs = [], [], []
        for i, row in enumerate(X):
            for position, team in enumerate(row):
                column = self.team_dict.get(team)
                if column is None:
                    # Unknown id: contributes nothing to the encoding.
                    continue
                row_idx.append(i)
                col_idx.append(column)
                signs.append(-1 if position != 0 else 1)
        return sparse.csr_matrix((signs, (row_idx, col_idx)), shape=shape, dtype=int)

# ----------------------- Recursive L1 Selector -----------------------
class RecursiveL1Selector(BaseEstimator, TransformerMixin):
    """Feature selector that repeatedly prunes features with L1 logistic regression.

    In every round an L1-penalised ``LogisticRegression`` (liblinear) is fitted
    on the training part of each split produced by ``cv``. A feature survives
    the round when its coefficient is non-zero in at least half of the splits.
    Rounds are repeated on the surviving features until a round removes
    nothing.

    Parameters:
        C: Inverse regularisation strength passed to ``LogisticRegression``.
        cv: Splitter object exposing ``split(X)`` (e.g. ``TimeSeriesSplit``).
            Required at ``fit`` time.

    Attributes:
        features_mask_: Boolean mask over the input columns; ``None`` until
            ``fit`` has been called.
    """

    def __init__(self, C=1, cv=None):
        self.C = C
        self.cv = cv
        # Kept for backward compatibility with code that reads the attribute
        # before fitting; transform() treats None as "not fitted".
        self.features_mask_ = None

    def fit(self, X, y):
        """Learn the mask of features that survive recursive L1 pruning.

        Args:
            X: Dense array-like or scipy sparse matrix, one row per sample.
            y: Binary target, one entry per row of ``X``.

        Returns:
            self

        Raises:
            ValueError: if no ``cv`` splitter was configured.
        """
        if self.cv is None:
            raise ValueError(
                "RecursiveL1Selector needs a cross-validation splitter; "
                "pass cv=<splitter with a split(X) method>."
            )
        # Sparse input stays sparse (CSR) so the full matrix is never
        # densified; liblinear accepts both representations.
        data = X.tocsr() if sparse.issparse(X) else np.asarray(X)
        # Allow plain lists as targets: they are indexed with index arrays.
        y = np.asarray(y)
        mask_all = np.ones(data.shape[1], dtype=bool)
        while True:
            # Original column indices of the features still in play.
            active = np.flatnonzero(mask_all)
            X_active = data[:, active]
            masks = []
            for train_idx, _ in self.cv.split(X_active):
                model = LogisticRegression(solver="liblinear", penalty="l1",
                                           C=self.C, max_iter=1000, random_state=42)
                model.fit(X_active[train_idx], y[train_idx])
                masks.append(model.coef_[0] != 0)
            # Keep a feature when it is non-zero in at least half of the folds.
            majority_mask = np.vstack(masks).mean(axis=0) >= 0.5
            if majority_mask.all():
                # Nothing was removed in this round: the selection is stable.
                break
            mask_all[active[~majority_mask]] = False
        self.features_mask_ = mask_all
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a dense array.

        Raises:
            NotFittedError: if ``fit`` has not been called yet. Indexing with
                the unset mask (``None``) would otherwise silently add an axis
                instead of selecting columns.
        """
        if self.features_mask_ is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "RecursiveL1Selector is not fitted yet; call fit before transform."
            )
        if sparse.issparse(X):
            # Select the columns first so only the kept part is densified.
            return X.tocsr()[:, np.flatnonzero(self.features_mask_)].toarray()
        return np.asarray(X)[:, self.features_mask_]

# ----------------------- Metrics -----------------------
def get_metrics(y_true, y_pred, y_proba):
    """Collect binary-classification metrics into a plain dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Scores passed to ``roc_auc_score``.

    Returns:
        dict with float entries ``accuracy``, ``precision``, ``recall``,
        ``f1_score``, ``auc`` and int confusion-matrix counts ``tp``, ``tn``,
        ``fp``, ``fn``.
    """
    scores = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred)),
        "recall": float(recall_score(y_true, y_pred)),
        "f1_score": float(f1_score(y_true, y_pred)),
        "auc": float(roc_auc_score(y_true, y_proba)),
    }
    # ravel() of the confusion matrix is unpacked as tn, fp, fn, tp.
    tn, fp, fn, tp = (int(count) for count in confusion_matrix(y_true, y_pred).ravel())
    scores.update({"tp": tp, "tn": tn, "fp": fp, "fn": fn})
    return scores

# ----------------------- Load data -----------------------
# Directory read by get_game_ids / get_X_y ("<game_id>.json" per game).
PATH_TO_GAMES_RAW = "data/games_raw"
# Number of games taken from the end of the ordered id list as the test set.
TEST_SIZE = 100

# game_ids is ordered by begin_at, so the split below holds out the latest games.
game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# Row layout produced by get_X_y: columns 0-1 are team ids, the rest player ids.
team_cols = [0, 1]
player_cols = list(range(2, X_train.shape[1]))

# Splitter handed to RecursiveL1Selector for its per-fold fits.
tscv = TimeSeriesSplit(n_splits=10)
# NOTE(review): RFECV is imported here but not used anywhere in this cell.
from sklearn.feature_selection import RFECV
# ----------------------- Pipeline -----------------------
# Steps: encode teams and players into signed bag features (concatenated by
# FeatureUnion) -> recursive L1 feature selection -> logistic regression.
pipeline = Pipeline([
    ("encoder", FeatureUnion([
        ("team_bag", Pipeline([
            ("select_teams", ColumnSelector(team_cols)),
            ("team_encoder", TeamBagEncoder())
        ])),
        ("player_bag", Pipeline([
            ("select_players", ColumnSelector(player_cols)),
            ("player_encoder", PlayerBagEncoder())
        ]))
    ])),
    ("l1", RecursiveL1Selector(C=1, cv=tscv)),    
    ("clf", LogisticRegression(solver="liblinear", C=1, max_iter=1000, random_state=42))
])

# ----------------------- Fit pipeline -----------------------
pipeline.fit(X_train, y_train)

# Hard labels and the probability of class 1 (second predict_proba column).
y_test_pred = pipeline.predict(X_test)
y_test_proba = pipeline.predict_proba(X_test)[:, 1]

# Print the test AUC first, then every metric returned by get_metrics.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
print(f"AUC на тесте: {metrics['auc']:.4f}")
for k, v in metrics.items():
    print(f"{k}: {v}")


In [4]:
# Recompute and print the test metrics from the existing
# y_test / y_test_pred / y_test_proba variables.
metrics = get_metrics(y_test, y_test_pred, y_test_proba)
print(f"AUC на тесте: {metrics['auc']:.4f}")
for k, v in metrics.items():
    print(f"{k}: {v}")

AUC на тесте: 0.7419
accuracy: 0.66
precision: 0.6949152542372882
recall: 0.7192982456140351
f1_score: 0.7068965517241379
auc: 0.7419420644634843
tp: 41
tn: 25
fp: 18
fn: 16


In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit

# ----------------------- Setup -----------------------
# Sweep over C: for every candidate value run the recursive, fold-majority
# L1 selection and record the mean validation AUC of the last round.
# NOTE(review): X_transformed_train is not defined in this cell; presumably
# the encoded training matrix from an earlier cell — confirm.
X_sel_orig = X_transformed_train.copy()
y_sel = y_train.copy()

C_values = np.linspace(0.1, 1.0, 10)
n_splits = 10
results = []

for C in C_values:
    print(f"\n=== C = {C:.2f} ===")
    # Every C starts again from the full feature set.
    X_sel = X_sel_orig.copy()
    iteration = 1
    scores = []
    num_features = []

    while True:
        tscv = TimeSeriesSplit(n_splits=n_splits)
        aucs = []
        masks = []

        # --- Evaluate on the folds ---
        for train_idx, val_idx in tscv.split(X_sel):
            X_train_fold, X_val_fold = X_sel[train_idx], X_sel[val_idx]
            y_train_fold, y_val_fold = y_sel[train_idx], y_sel[val_idx]

            model = LogisticRegression(
                solver="liblinear",
                penalty="l1",
                C=C,
                max_iter=1000,
                random_state=42
            )
            model.fit(X_train_fold, y_train_fold)
            y_val_proba = model.predict_proba(X_val_fold)[:, 1]
            aucs.append(roc_auc_score(y_val_fold, y_val_proba))

            # mask of non-zero coefficients for the current fold
            masks.append(model.coef_[0] != 0)

        # --- combine the masks across folds ---
        # keep the features that are non-zero in at least half of the folds
        mask_array = np.vstack(masks)
        majority_mask = mask_array.mean(axis=0) >= 0.5

        # mean AUC over the folds of this round
        mean_auc = np.mean(aucs)
        scores.append(mean_auc)
        num_features.append(majority_mask.sum())

        print(f"[{iteration}] Осталось признаков: {majority_mask.sum()} | mean AUC: {mean_auc:.4f}")

        # stop when the round removed nothing
        if majority_mask.sum() == X_sel.shape[1]:
            print("Удаление завершено — все оставшиеся признаки значимы на валидации.")
            break

        # keep only the selected features for the next round
        X_sel = X_sel[:, majority_mask]
        iteration += 1

    # final_auc is the mean AUC of the last (converged) round for this C.
    results.append({
        "C": C,
        "final_features": X_sel.shape[1],
        "final_auc": mean_auc,
        "num_features_history": num_features,
        "auc_history": scores
    })

# ----------------------- Pick the best C -----------------------
# The best C is the one with the highest final-round mean AUC.
best_result = max(results, key=lambda r: r["final_auc"])
best_C = best_result["C"]
best_auc = best_result["final_auc"]
best_num_features = best_result["final_features"]

print(f"\nЛучший C: {best_C:.2f}")
print(f"Средний AUC на TimeSeriesSplit: {best_auc:.4f}")
print(f"Количество признаков: {best_num_features}")



=== C = 0.10 ===
[1] Осталось признаков: 187 | mean AUC: 0.6054
[2] Осталось признаков: 183 | mean AUC: 0.5944
[3] Осталось признаков: 183 | mean AUC: 0.5948
Удаление завершено — все оставшиеся признаки значимы на валидации.

=== C = 0.20 ===
[1] Осталось признаков: 376 | mean AUC: 0.6250
[2] Осталось признаков: 355 | mean AUC: 0.6216
[3] Осталось признаков: 353 | mean AUC: 0.6222
[4] Осталось признаков: 353 | mean AUC: 0.6222
Удаление завершено — все оставшиеся признаки значимы на валидации.

=== C = 0.30 ===
[1] Осталось признаков: 513 | mean AUC: 0.6317
[2] Осталось признаков: 479 | mean AUC: 0.6321
[3] Осталось признаков: 475 | mean AUC: 0.6331
[4] Осталось признаков: 473 | mean AUC: 0.6333
[5] Осталось признаков: 473 | mean AUC: 0.6335
Удаление завершено — все оставшиеся признаки значимы на валидации.

=== C = 0.40 ===
[1] Осталось признаков: 641 | mean AUC: 0.6335
[2] Осталось признаков: 617 | mean AUC: 0.6377
[3] Осталось признаков: 615 | mean AUC: 0.6382
[4] Осталось признаков

In [11]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# ----------------------- Final feature selection -----------------------
# majority_mask from the last iteration of the best C:
# if mask_array/majority_mask was saved for the best C, it could be used;
# otherwise it is recomputed on all data for the best C.

# Refit the model on all training data with the best C to obtain the final mask.
# NOTE(review): this mask comes from a single L1 fit on the full training set,
# not from the recursive fold-majority procedure, although the heading printed
# below says "recursive" — confirm this is intended.
final_model_mask = LogisticRegression(
    solver="liblinear",
    penalty="l1",
    C=best_C,
    max_iter=1000,
    random_state=42
)
final_model_mask.fit(X_sel_orig, y_sel)
majority_mask = final_model_mask.coef_[0] != 0

X_train_final = X_sel_orig[:, majority_mask]
# NOTE(review): assumes `pipeline` exposes transform() and yields the same
# columns as X_sel_orig — TODO confirm which pipeline object is live here.
X_test_final = pipeline.transform(X_test)[:, majority_mask]

# ----------------------- Final model -----------------------
final_model = LogisticRegression(
    solver="liblinear",
    penalty="l1",
    C=best_C,
    max_iter=1000,
    random_state=42
)
final_model.fit(X_train_final, y_train)

# ----------------------- Test predictions -----------------------
y_test_pred = final_model.predict(X_test_final)
y_test_proba = final_model.predict_proba(X_test_final)[:, 1]

# ----------------------- Metrics -----------------------
# NOTE(review): roc_auc_score is not imported in this cell; it relies on an
# import made by an earlier cell.
accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_proba)
f1 = f1_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)

print("\nМетрики на тесте после рекурсивного L1-отбора:")
print(f"Accuracy : {accuracy:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")





Метрики на тесте после рекурсивного L1-отбора:
Accuracy : 0.6800
ROC-AUC  : 0.7028
F1-score : 0.6923
Precision: 0.7660
Recall   : 0.6316


  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

NameError: name 'RecursiveL1Selector' is not defined

In [None]:
# Recursive L1 selection without cross-validation: refit an L1 logistic
# regression on the surviving features until a fit zeroes out no coefficient.
# mask_all tracks the survivors in terms of the ORIGINAL column indices.
# Start with all features
mask_all = np.ones(X_train.shape[1], dtype=bool)  # all True initially
X_train_selected = X_train.copy()

iteration = 0
while True:
    iteration += 1
    print(f"Iteration {iteration}: {X_train_selected.shape[1]} features")

    # Fit L1 logistic regression (C is left at the estimator's default)
    logit = LogisticRegression(solver="liblinear", penalty="l1", random_state=42, max_iter=1000)
    logit.fit(X_train_selected, y_train)

    # Mask of non-zero coefficients for current iteration
    # (indexed relative to the columns of X_train_selected)
    mask_iteration = logit.coef_.flatten() != 0
    n_selected = mask_iteration.sum()
    print(f"Selected {n_selected} features this iteration")

    # Map iteration mask back to original feature indices
    idx_remaining = np.where(mask_all)[0]  # indices of remaining features in original
    mask_all[idx_remaining[~mask_iteration]] = False  # mark removed features as False

    # Stop if no more features were removed
    if n_selected == X_train_selected.shape[1]:
        print("No more features removed. Recursive selection finished.")
        break

    # Keep only selected features for next iteration
    X_train_selected = X_train_selected[:, mask_iteration]

print(f"Final selected features: {mask_all.sum()} / {mask_all.size}")


Iteration 1: 4507 features
Selected 1797 features this iteration
Iteration 2: 1797 features
Selected 1779 features this iteration
Iteration 3: 1779 features
Selected 1769 features this iteration
Iteration 4: 1769 features
Selected 1756 features this iteration
Iteration 5: 1756 features
Selected 1754 features this iteration
Iteration 6: 1754 features
Selected 1743 features this iteration
Iteration 7: 1743 features
Selected 1740 features this iteration
Iteration 8: 1740 features
Selected 1735 features this iteration
Iteration 9: 1735 features
Selected 1731 features this iteration
Iteration 10: 1731 features
Selected 1730 features this iteration
Iteration 11: 1730 features
Selected 1725 features this iteration
Iteration 12: 1725 features
Selected 1724 features this iteration
Iteration 13: 1724 features
Selected 1722 features this iteration
Iteration 14: 1722 features
Selected 1722 features this iteration
No more features removed. Recursive selection finished.
Final selected features: 1722

In [None]:
# Restrict both matrices to the columns kept in mask_all. This overwrites
# X_train / X_test in place, so the cell must not be run twice.
X_train = X_train[:, mask_all]
X_test = X_test[:, mask_all]

In [26]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score
import numpy as np

# ================== Define TimeSeriesSplit ==================
tscv = TimeSeriesSplit(n_splits=10)

# ================== Define hyperparameter grid ==================
# 10 values of C x 2 penalties x 1 solver = 20 candidates.
param_grid = {
    'C': np.linspace(.01, 1, 10),   # regularization strength
    'penalty': ['l1', 'l2'],        # L1 or L2 regularization
    'solver': ['liblinear']          # liblinear supports L1 & L2
}

# ================== AUC scorer ==================
# NOTE(review): auc_scorer is not used below — GridSearchCV is given
# scoring="roc_auc" instead. Also, `needs_proba` appears to be deprecated in
# recent scikit-learn releases in favour of `response_method`; verify against
# the installed version.
auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# ================== GridSearchCV ==================
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid,
    scoring="roc_auc",
    cv=tscv,
    n_jobs=-1,
    verbose=2
)

# ================== Fit on training data ==================
# NOTE(review): the variable fitted is X_train, not X_train_bag as the
# original comment suggested — confirm it holds the encoded feature matrix.
grid_search.fit(X_train, y_train)

# ================== Results ==================
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated AUC:", grid_search.best_score_)

# Final model with best parameters
best_logit = grid_search.best_estimator_

# Evaluate on test set
y_test_pred = best_logit.predict(X_test)
y_test_proba = best_logit.predict_proba(X_test)[:, 1]

# Compute metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def print_metrics(y_true, y_pred, y_proba, label="Test"):
    """Print a short binary-classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Scores passed to ``roc_auc_score``.
        label: Name shown in the report header.
    """
    # All metrics are computed before anything is printed.
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_proba)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    report_lines = [
        f"===== {label} Metrics =====",
        f"Accuracy : {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall   : {rec:.4f}",
        f"F1-score : {f1:.4f}",
        f"AUC      : {auc:.4f}",
        f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}\n",
    ]
    print("\n".join(report_lines))

# Report the test-set metrics of the best grid-search model.
print_metrics(y_test, y_test_pred, y_test_proba)


Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ...............C=0.01, penalty=l2, so

In [27]:
# Re-print the report from the existing y_test / y_test_pred / y_test_proba.
print_metrics(y_test, y_test_pred, y_test_proba)

===== Test Metrics =====
Accuracy : 0.6900
Precision: 0.7955
Recall   : 0.6140
F1-score : 0.6931
AUC      : 0.7285
TP: 35, TN: 34, FP: 9, FN: 22



  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Baseline experiment: label-encode players, build the bag-of-players
# features and fit a default logistic regression; evaluate on the last
# TEST_SIZE games.
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

# Load and split game IDs
game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

# Create X and y
X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# Full pipeline: encoding + bag-of-players + logistic regression
# NOTE(review): PlayerLabelEncoder is not defined in this cell; it must come
# from an earlier cell.
full_pipeline = Pipeline([
    ('player_label', PlayerLabelEncoder()),
    ('player_bag', PlayerBagEncoder()),
    ('logit', LogisticRegression(solver="liblinear", random_state=13))
])

# Fit pipeline on training data
full_pipeline.fit(X_train, y_train)

# Predict on training and test data
# (the training predictions are not used by the metrics below)
y_train_pred = full_pipeline.predict(X_train)
y_test_pred = full_pipeline.predict(X_test)

# Predict probabilities (for AUC)
y_train_proba = full_pipeline.predict_proba(X_train)[:, 1]
y_test_proba = full_pipeline.predict_proba(X_test)[:, 1]

# ================= Metrics =================

# Accuracy
accuracy = accuracy_score(y_test, y_test_pred)

# Precision, Recall, F1-score
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
fscore = f1_score(y_test, y_test_pred)

# AUC
auc = roc_auc_score(y_test, y_test_proba)

# Confusion matrix (ravel() is unpacked as tn, fp, fn, tp)
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()

# Print results
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {fscore:.4f}")
print(f"AUC      : {auc:.4f}")
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")


  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Accuracy : 0.6100
Precision: 0.6552
Recall   : 0.6667
F1-score : 0.6609
AUC      : 0.5000
TP: 38, TN: 23, FP: 20, FN: 19


In [None]:
# Display the shape of X_train_bag (defined outside this cell).
X_train_bag.shape

(37974, 4436)

In [None]:
log