In [None]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

import warnings

warnings.filterwarnings("ignore")

# ----------------------- Data loading -----------------------
def generate_game_raw(path_to_games_raw_dir="data/games_raw"):
    """Yield the decoded JSON content of every readable file in a directory.

    Loading is best-effort: entries that cannot be opened or that do not
    contain valid JSON are skipped silently.

    Args:
        path_to_games_raw_dir: Directory whose entries are read one by one.

    Yields:
        The object returned by ``json.load`` for each readable file.
    """
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        file_path = os.path.join(path_to_games_raw_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
        except (OSError, ValueError):
            # OSError: unreadable entry (e.g. a sub-directory).
            # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        # Yield outside the try block: a handler wrapped around ``yield``
        # would swallow the GeneratorExit raised when the consumer stops
        # early, making generator.close() fail with RuntimeError.
        yield game

def validate_game(game):
    """Return True when a raw game record passes all structural checks.

    A record is accepted only if:
      * the game, league, serie, tournament and map ids convert to ``int``
        and ``begin_at`` can be parsed as a date;
      * its players belong to exactly two teams with five distinct
        players each;
      * every round has a round number and its ``ct``, ``terrorists`` and
        ``winner_team`` all refer to one of the two teams;
      * round numbers start at 1 and reach at least 16.

    Any missing key, wrong type or unparsable value counts as invalid.

    Args:
        game: Decoded JSON object describing one game.

    Returns:
        bool: True if the record is usable, False otherwise.
    """
    try:
        int(game["id"])
        parse(game["begin_at"])
        match = game["match"]
        int(match["league"]["id"])
        int(match["serie"]["id"])
        int(match["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        if len(team_players) != 2:
            return False
        if any(len(set(p_ids)) != 5 for p_ids in team_players.values()):
            return False

        team_ids = tuple(team_players)
        rounds = []
        for r in game["rounds"]:
            if r["round"] is None:
                return False
            if any(r[side] not in team_ids for side in ("ct", "terrorists", "winner_team")):
                return False
            rounds.append(r["round"])
        # min()/max() raise ValueError on an empty list, which is treated
        # as "invalid" by the handler below.
        return min(rounds) == 1 and max(rounds) >= 16
    except Exception:
        # Malformed data of any kind means "not valid". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate so
        # a long directory scan can still be interrupted.
        return False

def get_game_ids(path_to_games_raw_dir="data/games_raw"):
    """Return the ids of all valid games ordered by their ``begin_at`` time.

    Args:
        path_to_games_raw_dir: Directory with the raw game JSON files.

    Returns:
        list: Ids of the games accepted by ``validate_game``, sorted by the
        parsed ``begin_at`` value in ascending order.
    """
    accepted = [
        (game["id"], parse(game["begin_at"]))
        for game in generate_game_raw(path_to_games_raw_dir)
        if validate_game(game)
    ]
    ids = [game_id for game_id, _ in accepted]
    start_times = [begin_at for _, begin_at in accepted]
    chronological_order = np.argsort(start_times)
    return np.array(ids)[chronological_order].tolist()

def _game_to_sample(game):
    """Convert one raw game record into a ``(feature_row, label)`` pair."""
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    # Unpacking raises ValueError unless there are exactly two teams.
    t1_id, t2_id = sorted(team_players)
    row = [t1_id, t2_id] + sorted(team_players[t1_id]) + sorted(team_players[t2_id])
    team_win_count = {t1_id: 0, t2_id: 0}
    for r in game["rounds"]:
        team_win_count[r["winner_team"]] += 1
    return row, int(team_win_count[t1_id] > team_win_count[t2_id])


def get_X_y(path_to_games_raw, game_ids):
    """Build the feature matrix and target vector for the given games.

    Each game is read from ``<path_to_games_raw>/<game_id>.json``. A row is
    ``[t1_id, t2_id, <sorted players of t1>, <sorted players of t2>]`` where
    ``t1`` is the team with the smaller id; the label is 1 when ``t1`` won
    more rounds than ``t2`` and 0 otherwise.

    Games that cannot be read or converted are skipped (best-effort).

    Args:
        path_to_games_raw: Directory with the raw game JSON files.
        game_ids: Iterable of game ids, processed in the given order.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays with one entry per loaded game.
    """
    X, y = [], []
    for game_id in tqdm(game_ids):
        file_path = os.path.join(path_to_games_raw, f"{game_id}.json")
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
            row, label = _game_to_sample(game)
        except (OSError, ValueError, LookupError, TypeError):
            # Missing/unreadable file, invalid JSON, or a malformed record.
            continue
        # Append both only after the whole sample has been built, so a
        # failure while computing the label can never leave X and y with
        # different lengths.
        X.append(row)
        y.append(label)
    return np.array(X), np.array(y)

# ----------------------- Column selector -----------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns.

    Args:
        columns: Column index (or indices) used to index the second axis
            of the input.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X[:, columns]`` after converting ``X`` to an array."""
        return np.asarray(X)[:, self.columns]

# ----------------------- Bag encoders -----------------------
class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    """Encode rows of player ids as a signed sparse bag-of-ids matrix.

    Every id seen during ``fit`` gets its own column. In ``transform`` an id
    found in the first half of a row contributes ``+1`` and an id found in
    the second half contributes ``-1``; ids unknown to ``fit`` are ignored.
    """

    def fit(self, X, y=None):
        """Assign a column index to each distinct id present in ``X``."""
        known_ids = np.unique(np.asarray(X).flatten())
        self.player_dict = dict(zip(known_ids, range(len(known_ids))))
        return self

    def transform(self, X):
        """Return a CSR matrix of shape ``(n_rows, n_known_ids)``."""
        matrix = np.asarray(X)
        lookup = self.player_dict.get
        row_idx, col_idx, signs = [], [], []
        for i, players in enumerate(matrix):
            first_half = len(players) // 2
            for j, player in enumerate(players):
                column = lookup(player)
                if column is None:
                    # Id not seen during fit: contributes nothing.
                    continue
                row_idx.append(i)
                col_idx.append(column)
                signs.append(-1 if j >= first_half else 1)
        out_shape = (matrix.shape[0], len(self.player_dict))
        return sparse.csr_matrix((signs, (row_idx, col_idx)), shape=out_shape, dtype=int)

class TeamBagEncoder(BaseEstimator, TransformerMixin):
    """Encode rows of team ids as a signed sparse bag-of-ids matrix.

    Every id seen during ``fit`` gets its own column. In ``transform`` the
    id in the first position of a row contributes ``+1`` and ids in all
    other positions contribute ``-1``; ids unknown to ``fit`` are ignored.
    """

    def fit(self, X, y=None):
        """Assign a column index to each distinct id present in ``X``."""
        known_ids = np.unique(np.asarray(X).flatten())
        self.team_dict = dict(zip(known_ids, range(len(known_ids))))
        return self

    def transform(self, X):
        """Return a CSR matrix of shape ``(n_rows, n_known_ids)``."""
        matrix = np.asarray(X)
        lookup = self.team_dict.get
        row_idx, col_idx, signs = [], [], []
        for i, teams in enumerate(matrix):
            for j, team in enumerate(teams):
                column = lookup(team)
                if column is None:
                    # Id not seen during fit: contributes nothing.
                    continue
                row_idx.append(i)
                col_idx.append(column)
                signs.append(-1 if j != 0 else 1)
        out_shape = (matrix.shape[0], len(self.team_dict))
        return sparse.csr_matrix((signs, (row_idx, col_idx)), shape=out_shape, dtype=int)

# ----------------------- Recursive L1 Selector -----------------------
class RecursiveL1Selector(BaseEstimator, TransformerMixin):
    """Iterative feature selection with L1-penalised logistic regression.

    On every iteration an L1 logistic regression is fitted on the training
    part of each CV split, using only the currently kept features. A
    feature survives the iteration if its coefficient is non-zero in at
    least half of the splits. Iterations repeat until no feature is
    dropped.

    Args:
        C: Inverse regularisation strength passed to ``LogisticRegression``.
        cv: Cross-validation splitter exposing ``split(X)``. Required at
            fit time.

    Attributes:
        features_mask_: Boolean mask over the input columns kept by
            ``fit``; ``None`` until the selector has been fitted.
    """

    def __init__(self, C=1, cv=None):
        self.C = C
        self.cv = cv
        self.features_mask_ = None

    def fit(self, X, y):
        """Learn which columns of ``X`` to keep.

        Args:
            X: Dense or scipy-sparse feature matrix (densified internally).
            y: Target values, one per row of ``X``.

        Returns:
            self

        Raises:
            ValueError: If ``cv`` is None or if the selection removes
                every feature.
        """
        if self.cv is None:
            raise ValueError(
                "RecursiveL1Selector needs a cross-validation splitter: "
                "pass cv=<object with a split() method>."
            )
        X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        # Positional indexing with the fold indices must work for lists and
        # for pandas Series (where y[idx] would be label-based).
        y = np.asarray(y)
        mask_all = np.ones(X_dense.shape[1], dtype=bool)
        iteration = 0
        while True:
            iteration += 1
            print(f"Итерация отбора признаков: {iteration}")
            print(f"Отобрано: {mask_all.sum()}")
            # Slice the surviving columns once per iteration, not per fold.
            X_selected = X_dense[:, mask_all]
            masks = []
            for train_idx, _ in self.cv.split(X_selected):
                model = LogisticRegression(solver="liblinear", penalty="l1",
                                           C=self.C, max_iter=1000, random_state=42)
                model.fit(X_selected[train_idx], y[train_idx])
                masks.append(model.coef_[0] != 0)
            majority_mask = np.vstack(masks).mean(axis=0) >= 0.5
            if majority_mask.all():
                # Nothing was dropped in this iteration: converged.
                break
            # majority_mask is relative to the currently kept columns; map
            # it back to positions in the full column space.
            mask_all[np.flatnonzero(mask_all)[~majority_mask]] = False
            if not mask_all.any():
                # Without this check the next iteration would try to fit a
                # model on zero columns.
                raise ValueError(
                    "RecursiveL1Selector eliminated every feature; "
                    "use a larger C (weaker regularisation)."
                )
        self.features_mask_ = mask_all
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit`` (dense).

        Raises:
            NotFittedError: If called before ``fit``.
        """
        if self.features_mask_ is None:
            # Indexing with None would silently add an axis instead of
            # selecting columns, so fail loudly.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "RecursiveL1Selector is not fitted yet; call fit() first."
            )
        X_dense = X.toarray() if sparse.issparse(X) else np.asarray(X)
        return X_dense[:, self.features_mask_]
    
class PlayerEloEncoder(BaseEstimator, TransformerMixin):
    """Replace player ids with Elo ratings and derive team-level features.

    ``fit`` walks through the games in the given order, records each
    player's rating *before* the game and then updates the ratings from the
    outcome. Each row is split in the middle: the first half is team 1, the
    second half is team 2 (five players each for a 10-column input).

    ``transform`` returns the recorded pre-game ratings when it is given the
    very matrix used in ``fit`` (so training features do not contain
    post-game information) and the final ratings for any other input.

    Args:
        k_factor: Step size of the Elo update.
        base_elo: Rating assigned to a player that has not been seen yet.
    """

    def __init__(self, k_factor=32, base_elo=1000):
        self.k_factor = k_factor
        self.base_elo = base_elo
        self.elo_dict_ = {}
        self.X_elo_train_ = None
        self.X_shape_ = None
        self.X_train_ = None

    def _expected_score(self, rating_a, rating_b):
        """Return the Elo-expected score of side A against side B."""
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

    def _augment_X(self, row):
        """Build the feature vector for one game from its players' ratings.

        Args:
            row: 1-D array of ratings, team 1 in the first half and team 2
                in the second half.

        Returns:
            Float array laid out as
            ``[mean1, mean2, mean1, -mean2, mean1 - mean2]`` followed by all
            pairwise differences ``x1[i] - x2[j]`` of the per-team sorted
            ratings (``i`` varies slowest).
        """
        row = np.asarray(row, dtype=float)
        half = len(row) // 2
        x1 = np.sort(row[:half])
        x2 = np.sort(row[half:])
        mean1 = np.mean(x1)
        mean2 = np.mean(x2)
        # NOTE(review): the third and fourth entries repeat mean1 / mean2
        # (up to sign). They are kept so the output width stays unchanged.
        summary = [mean1, mean2, mean1, -mean2, mean1 - mean2]
        pairwise = np.subtract.outer(x1, x2).ravel()
        return np.concatenate([summary, pairwise]).astype(float)

    def fit(self, X, y):
        """Replay the games in order and learn the players' Elo ratings.

        Args:
            X: Matrix of player ids, one game per row, in the order the
                games should be replayed.
            y: Outcomes; ``1`` means team 1 (first half of the row) won.

        Returns:
            self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Start from a clean slate: re-fitting the same instance must not
        # reuse ratings learned by a previous fit.
        self.elo_dict_ = {}
        X_elo = []

        for row, outcome in tqdm(zip(X, y), total=X.shape[0], desc="Fitting PlayerEloEncoder"):
            half = len(row) // 2
            elos_before = [self.elo_dict_.get(pid, self.base_elo) for pid in row]
            X_elo.append(elos_before)

            avg1 = np.mean(elos_before[:half])
            avg2 = np.mean(elos_before[half:])
            exp1 = self._expected_score(avg1, avg2)
            score1 = 1 if outcome == 1 else 0
            score2 = 1 - score1

            # Every player of a team receives the same update, computed
            # from the team-average expectation.
            for pid in row[:half]:
                self.elo_dict_[pid] = self.elo_dict_.get(pid, self.base_elo) + self.k_factor * (score1 - exp1)
            for pid in row[half:]:
                self.elo_dict_[pid] = self.elo_dict_.get(pid, self.base_elo) + self.k_factor * (score2 - (1 - exp1))

        self.X_elo_train_ = np.array(X_elo, dtype=float)
        self.X_shape_ = X.shape
        # Keep a copy of the ids so transform() can recognise the training
        # matrix by content and not merely by shape.
        self.X_train_ = X.copy()
        return self

    def transform(self, X):
        """Map player ids to ratings and expand them with ``_augment_X``.

        Args:
            X: Matrix of player ids, one game per row.

        Returns:
            2-D float array with one augmented feature row per game.
        """
        X = np.asarray(X)
        X_train = getattr(self, "X_train_", None)
        is_training_matrix = (
            self.X_elo_train_ is not None
            and X_train is not None
            and X.shape == self.X_shape_
            and np.array_equal(X, X_train)
        )
        if is_training_matrix:
            # Pre-game ratings recorded during fit.
            X_out = np.copy(self.X_elo_train_)
        else:
            X_out = np.array([[self.elo_dict_.get(pid, self.base_elo) for pid in row] for row in X], dtype=float)

        # Always return the augmented features.
        X_aug = np.array([self._augment_X(row) for row in X_out], dtype=float)
        return X_aug





# ----------------------- Metrics -----------------------
def get_metrics(y_true, y_pred, y_proba):
    """Compute binary-classification metrics as plain Python numbers.

    Labels are expected to be encoded as 0 (negative) and 1 (positive).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Predicted scores/probabilities of the positive class,
            used for the ROC AUC.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
        ``auc`` as floats plus the confusion-matrix counts ``tp``, ``tn``,
        ``fp`` and ``fn`` as ints.
    """
    acc = float(accuracy_score(y_true, y_pred))
    prec = float(precision_score(y_true, y_pred))
    rec = float(recall_score(y_true, y_pred))
    f1 = float(f1_score(y_true, y_pred))
    auc = float(roc_auc_score(y_true, y_proba))
    # Fix the label set so the matrix is always 2x2; otherwise, when only
    # one class occurs in y_true and y_pred, ravel() yields a single value
    # and the four-way unpacking fails.
    tn, fp, fn, tp = map(int, confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel())
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1_score": f1, "auc": auc,
            "tp": tp, "tn": tn, "fp": fp, "fn": fn}

# ----------------------- Load data -----------------------
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

# Ids come back ordered by start time; the last TEST_SIZE games are held out.
game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train = game_ids[:-TEST_SIZE]
game_ids_test = game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

# Row layout produced by get_X_y: two team ids followed by the player ids.
team_cols = [0, 1]
player_cols = list(range(2, X_train.shape[1]))

tscv = TimeSeriesSplit(n_splits=10)

# Feature branch that is currently enabled: Elo ratings of the players.
player_elo_branch = Pipeline(steps=[
    ("select_players", ColumnSelector(player_cols)),
    ("elo_encoder", PlayerEloEncoder()),
])

# Alternative branches, currently disabled:
#   ("team_bag", Pipeline([
#       ("select_teams", ColumnSelector(team_cols)),
#       ("team_encoder", TeamBagEncoder())
#   ])),
#   ("player_bag", Pipeline([
#       ("select_players", ColumnSelector(player_cols)),
#       ("player_encoder", PlayerBagEncoder())
#   ])),
feature_union = FeatureUnion(transformer_list=[
    ("player_elo", player_elo_branch),
])

pipeline = Pipeline(steps=[
    ("encoder", feature_union),
    ("l1", RecursiveL1Selector(C=1, cv=tscv)),
    ("clf", LogisticRegression(solver="liblinear", C=1, random_state=42)),
])

# ----------------------- Fit pipeline -----------------------
pipeline.fit(X_train, y_train)

# Hard predictions and positive-class probabilities for the hold-out games.
y_test_pred = pipeline.predict(X_test)
y_test_proba = pipeline.predict_proba(X_test)[:, 1]

metrics = get_metrics(y_test, y_test_pred, y_test_proba)
print(f"AUC на тесте: {metrics['auc']:.4f}")
for k, v in metrics.items():
    print(f"{k}: {v}")


100%|██████████| 56493/56493 [00:16<00:00, 3460.62it/s]
100%|██████████| 37974/37974 [00:07<00:00, 5050.83it/s]
100%|██████████| 100/100 [00:00<00:00, 4871.94it/s]
100%|██████████| 37974/37974 [00:00<00:00, 38703.58it/s]


Итерация отбора признаков: 1
Отобрано: 10
AUC на тесте: 0.7546
accuracy: 0.69
precision: 0.7166666666666667
recall: 0.7543859649122807
f1_score: 0.7350427350427351
auc: 0.7545899632802937
tp: 43
tn: 26
fp: 17
fn: 14
