In [1]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

# ==============================================================
# 1️⃣ Data Loading & Validation
# ==============================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    """Lazily yield the parsed JSON content of each readable file in a directory.

    Loading is best-effort: an entry that cannot be opened, decoded as UTF-8
    or parsed as JSON is skipped silently so one bad file does not abort the
    whole scan. Any other kind of error is allowed to propagate.

    Args:
        path_to_games_raw_dir: Directory whose entries are loaded one by one.

    Yields:
        The object returned by ``json.load`` for each successfully read file,
        in the order reported by ``os.listdir``.
    """
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        file_path = os.path.join(path_to_games_raw_dir, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                game = json.load(f)
        except (OSError, ValueError):
            # OSError: entry is a directory, vanished, or is not readable.
            # ValueError: base class of json.JSONDecodeError and
            # UnicodeDecodeError, i.e. the file content is malformed.
            continue
        # Yield outside the try/with so the file is already closed while the
        # consumer works on the game, and so an exception thrown into the
        # generator at this point is not swallowed by the handler above.
        yield game

def validate_game(game: dict) -> bool:
    """Return True when a raw game record has the structure the pipeline needs.

    A record is accepted when all of the following hold:

    * ``id``, ``match.league.id``, ``match.serie.id``, ``match.tournament.id``
      and ``map.id`` are convertible with ``int``;
    * ``begin_at`` is accepted by ``parse``;
    * ``players`` references exactly two team ids, each with exactly five
      distinct player ids;
    * every entry of ``rounds`` has a non-None ``round`` and its ``ct``,
      ``terrorists`` and ``winner_team`` are one of the two team ids;
    * the smallest round number is 1 and the largest is at least 16.

    The checks are explicit conditionals rather than ``assert`` statements so
    that they keep working when Python runs with ``-O`` (which strips asserts).

    Args:
        game: Decoded JSON object describing one game.

    Returns:
        True if the record passes every check, False otherwise. Missing keys
        and wrongly typed values yield False instead of raising.
    """
    try:
        int(game["id"])
        parse(game["begin_at"])
        match = game["match"]
        int(match["league"]["id"])
        int(match["serie"]["id"])
        int(match["tournament"]["id"])
        int(game["map"]["id"])

        # Group distinct player ids by team id.
        team_players = defaultdict(set)
        for p in game["players"]:
            team_players[p["team"]["id"]].add(p["player"]["id"])
        if len(team_players) != 2:
            return False
        if any(len(p_ids) != 5 for p_ids in team_players.values()):
            return False

        team_ids = list(team_players)

        rounds = []
        for r in game["rounds"]:
            if r["round"] is None:
                return False
            if (
                r["ct"] not in team_ids
                or r["terrorists"] not in team_ids
                or r["winner_team"] not in team_ids
            ):
                return False
            rounds.append(r["round"])
        if not rounds:
            return False
        return bool(min(rounds) == 1 and max(rounds) >= 16)
    except (KeyError, IndexError, TypeError, ValueError, OverflowError, AttributeError):
        # Malformed structure (missing key, wrong type, unparsable value):
        # the record is simply invalid.
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    """Collect the ids of all valid games, ordered by their start time.

    Every game produced by ``generate_game_raw`` is checked with
    ``validate_game``; the ids of the games that pass are returned sorted by
    the parsed ``begin_at`` value (ascending).

    Args:
        path_to_games_raw_dir: Directory scanned for raw game files.

    Returns:
        Game ids in chronological order of ``begin_at``.
    """
    valid_games = [
        (game["id"], parse(game["begin_at"]))
        for game in generate_game_raw(path_to_games_raw_dir)
        if validate_game(game)
    ]
    ids = [game_id for game_id, _ in valid_games]
    start_times = [begin_at for _, begin_at in valid_games]
    chronological = np.argsort(start_times)
    return np.array(ids)[chronological].tolist()

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    """Build the raw feature matrix and labels for the given games.

    For each game id the file ``<game_id>.json`` is loaded and turned into one
    row ``[map_id, start_ct, *team1_players, *team2_players]`` where

    * team 1 / team 2 are the two team ids in ascending order,
    * each team's player ids are sorted ascending,
    * ``start_ct`` is the ``ct`` value of the round with the smallest round
      number. The round is selected by number rather than by list position
      because nothing guarantees that ``rounds`` is stored in playing order.

    The label is 1 when team 1 won strictly more rounds than team 2, else 0.

    Args:
        path_to_games_raw: Directory containing the ``<game_id>.json`` files.
        game_ids: Ids of the games to load, in the desired row order.

    Returns:
        Tuple ``(X, y)`` of numpy arrays with one row / one label per game.

    Raises:
        OSError: If a game file cannot be opened.
        ValueError: If a game does not reference exactly two teams or has no
            rounds.
        KeyError: If a round's winner is not one of the two teams or an
            expected key is missing.
    """
    X, y = [], []
    for game_id in tqdm(game_ids):
        with open(os.path.join(path_to_games_raw, f"{game_id}.json"), "r", encoding="utf-8") as f:
            game = json.load(f)

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        t1_id, t2_id = sorted(team_players)

        # min() returns the first minimal element, so for files whose rounds
        # are already in order this is exactly game["rounds"][0].
        first_round = min(game["rounds"], key=lambda r: r["round"])
        X.append(
            [int(game["map"]["id"]), int(first_round["ct"])]
            + sorted(team_players[t1_id])
            + sorted(team_players[t2_id])
        )

        team_win_count = {t1_id: 0, t2_id: 0}
        for r in game["rounds"]:
            team_win_count[r["winner_team"]] += 1
        y.append(int(team_win_count[t1_id] > team_win_count[t2_id]))
    return np.array(X), np.array(y)

# ==============================================================
# 2️⃣ Custom Encoders
# ==============================================================

class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Label encoder that maps values unseen during ``fit`` to -1.

    ``fit`` assigns consecutive integer codes (0, 1, ...) to the sorted unique
    values of the input; ``transform`` replaces every element by its code and
    uses -1 for values that were not present at fit time.
    """

    def __init__(self):
        # Mapping value -> integer code; None until fit() has been called.
        self.class_to_index = None

    def fit(self, X, y=None):
        """Learn the value -> code mapping from all elements of ``X``.

        Args:
            X: Array-like of values; it is flattened by ``np.unique``.
            y: Ignored, present for scikit-learn API compatibility.

        Returns:
            self
        """
        uniques = np.unique(X)
        self.class_to_index = {v: i for i, v in enumerate(uniques)}
        return self

    def transform(self, X):
        """Replace each element of ``X`` by its integer code.

        Args:
            X: Array-like of values to encode (may be empty).

        Returns:
            Integer array with the same shape as ``X``; unseen values are -1.

        Raises:
            ValueError: If the encoder has not been fitted yet.
        """
        if self.class_to_index is None:
            raise ValueError("CustomLabelEncoder has not been fitted yet.")
        lookup = self.class_to_index.get
        # otypes is given explicitly: without it np.vectorize probes the first
        # element to infer the output dtype and raises on size-0 input.
        vectorized_map = np.vectorize(lambda x: lookup(x, -1), otypes=[int])
        return vectorized_map(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded version."""
        return self.fit(X, y).transform(X)


class PlayerEloEncoder(BaseEstimator, TransformerMixin):
    """Encode games as features derived from per-player Elo ratings.

    Each input row is read as ``[..., ..., t1 players (cols 2-6),
    t2 players (cols 7-11)]``. ``fit`` walks the rows in order, records the
    features computed from the ratings *before* the game, and then updates the
    ratings with the game outcome. Rows are therefore presumably expected in
    chronological order — confirm against the caller.

    ``fit_transform`` returns those pre-game features. This matters: falling
    back to ``fit(X, y).transform(X)`` would describe every training game with
    ratings that already contain that game's own result (target leakage).
    ``transform`` uses the ratings as they stand after ``fit`` and never
    updates them.

    Args:
        k: Elo K-factor, the maximum rating change per game.
        initial_elo: Rating assigned to a player seen for the first time.
    """

    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        # player id -> current rating; unknown players start at initial_elo.
        self.player_elos = defaultdict(lambda: self.initial_elo)
        # Pre-game features of the rows passed to the last fit() call.
        self.X_train = None

    def fit(self, X, y):
        """Replay the games in order and learn the player ratings.

        Args:
            X: 2-D array whose columns 2-6 / 7-11 hold the player ids of
                team 1 / team 2.
            y: Outcome per row; 1 if team 1 won, 0 otherwise.

        Returns:
            self
        """
        # A fit must start from a clean state so that refitting the same
        # instance does not accumulate ratings from a previous fit.
        self.player_elos = defaultdict(lambda: self.initial_elo)
        feats = []
        for i in range(X.shape[0]):
            t1, t2 = X[i, 2:7], X[i, 7:12]
            res = y[i]
            # Features first, update second: the row only sees past games.
            feats.append(self._augment_row(t1, t2))
            self._update_elos(t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return the leak-free pre-game features.

        Args:
            X: Training rows, see ``fit``.
            y: Training outcomes, see ``fit`` (required despite the default).
            **fit_params: Accepted for scikit-learn API compatibility, unused.

        Returns:
            Array with one feature row per input row, computed from the
            ratings as they were before the corresponding game.
        """
        return self.fit(X, y).X_train

    def transform(self, X):
        """Compute features for ``X`` from the current ratings (no update).

        Args:
            X: 2-D array with the same column layout as in ``fit``.

        Returns:
            Array with one feature row per input row.
        """
        features = []
        for row in X:
            t1, t2 = row[2:7], row[7:12]
            features.append(self._augment_row(t1, t2))
        return np.array(features)

    def _augment_row(self, t1, t2):
        """Build the feature vector for one game.

        Layout: sorted team-1 ratings, sorted team-2 ratings, team-1 mean,
        team-2 mean, mean difference, then every pairwise difference
        ``t1[i] - t2[j]`` in row-major order.
        """
        t1e = np.sort([self.player_elos[p] for p in t1])
        t2e = np.sort([self.player_elos[p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])

    def _update_elos(self, t1, t2, res):
        """Apply one Elo update based on the teams' mean ratings.

        The expected score of team 1 is ``1 / (1 + 10 ** ((t2 - t1) / 400))``;
        every team-1 player gains ``k * (res - expected)`` and every team-2
        player loses the same amount.
        """
        t1avg = np.mean([self.player_elos[p] for p in t1])
        t2avg = np.mean([self.player_elos[p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg) / 400))
        delta = self.k * (res - exp)
        for p in t1:
            self.player_elos[p] += delta
        for p in t2:
            self.player_elos[p] -= delta


class PlayerMapEloEncoder(BaseEstimator, TransformerMixin):
    """Encode games as features derived from per-map, per-player Elo ratings.

    Each input row is read as ``[map (col 0), ..., t1 players (cols 2-6),
    t2 players (cols 7-11)]``. A separate rating is kept for every
    (map, player) pair. ``fit`` walks the rows in order, records the features
    computed from the ratings *before* the game, then updates the ratings with
    the outcome. Rows are therefore presumably expected in chronological
    order — confirm against the caller.

    ``fit_transform`` returns those pre-game features. Falling back to
    ``fit(X, y).transform(X)`` would describe every training game with ratings
    that already contain that game's own result (target leakage).
    ``transform`` uses the ratings as they stand after ``fit`` and never
    updates them.

    Args:
        k: Elo K-factor, the maximum rating change per game.
        initial_elo: Rating assigned to a (map, player) pair seen for the
            first time.
    """

    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        # map -> player id -> current rating; unknown pairs start at initial_elo.
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        # Pre-game features of the rows passed to the last fit() call.
        self.X_train = None

    def fit(self, X, y):
        """Replay the games in order and learn the per-map ratings.

        Args:
            X: 2-D array; column 0 is the map, columns 2-6 / 7-11 hold the
                player ids of team 1 / team 2.
            y: Outcome per row; 1 if team 1 won, 0 otherwise.

        Returns:
            self
        """
        # A fit must start from a clean state so that refitting the same
        # instance does not accumulate ratings from a previous fit.
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        feats = []
        for i in range(X.shape[0]):
            m = X[i, 0]
            t1, t2 = X[i, 2:7], X[i, 7:12]
            res = y[i]
            # Features first, update second: the row only sees past games.
            feats.append(self._augment_row(m, t1, t2))
            self._update_elos(m, t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return the leak-free pre-game features.

        Args:
            X: Training rows, see ``fit``.
            y: Training outcomes, see ``fit`` (required despite the default).
            **fit_params: Accepted for scikit-learn API compatibility, unused.

        Returns:
            Array with one feature row per input row, computed from the
            ratings as they were before the corresponding game.
        """
        return self.fit(X, y).X_train

    def transform(self, X):
        """Compute features for ``X`` from the current ratings (no update).

        Args:
            X: 2-D array with the same column layout as in ``fit``.

        Returns:
            Array with one feature row per input row.
        """
        features = []
        for row in X:
            m = row[0]
            t1, t2 = row[2:7], row[7:12]
            features.append(self._augment_row(m, t1, t2))
        return np.array(features)

    def _augment_row(self, m, t1, t2):
        """Build the feature vector for one game played on map ``m``.

        Layout: sorted team-1 ratings, sorted team-2 ratings, team-1 mean,
        team-2 mean, mean difference, then every pairwise difference
        ``t1[i] - t2[j]`` in row-major order.
        """
        t1e = np.sort([self.player_elos[m][p] for p in t1])
        t2e = np.sort([self.player_elos[m][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])

    def _update_elos(self, m, t1, t2, res):
        """Apply one Elo update to the ratings of map ``m``.

        The expected score of team 1 is ``1 / (1 + 10 ** ((t2 - t1) / 400))``
        using the teams' mean ratings; every team-1 player gains
        ``k * (res - expected)`` and every team-2 player loses the same amount.
        """
        t1avg = np.mean([self.player_elos[m][p] for p in t1])
        t2avg = np.mean([self.player_elos[m][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg) / 400))
        delta = self.k * (res - exp)
        for p in t1:
            self.player_elos[m][p] += delta
        for p in t2:
            self.player_elos[m][p] -= delta


class PlayerMapStartEloEncoder(BaseEstimator, TransformerMixin):
    """Encode games as Elo features keyed by map, column-1 value and player.

    Each input row is read as ``[map (col 0), start_ct (col 1),
    t1 players (cols 2-6), t2 players (cols 7-11)]``. A separate rating is
    kept for every (map, start_ct, player) triple. ``start_ct`` is presumably
    the id of the team that starts on the CT side — confirm against the code
    that builds ``X``.

    ``fit`` walks the rows in order, records the features computed from the
    ratings *before* the game, then updates the ratings with the outcome.
    Rows are therefore presumably expected in chronological order — confirm
    against the caller.

    ``fit_transform`` returns those pre-game features. Falling back to
    ``fit(X, y).transform(X)`` would describe every training game with ratings
    that already contain that game's own result (target leakage).
    ``transform`` uses the ratings as they stand after ``fit`` and never
    updates them.

    Args:
        k: Elo K-factor, the maximum rating change per game.
        initial_elo: Rating assigned to a triple seen for the first time.
    """

    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        # map -> start_ct -> player id -> current rating.
        self.player_elos = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        )
        # Pre-game features of the rows passed to the last fit() call.
        self.X_train = None

    def fit(self, X, y):
        """Replay the games in order and learn the ratings.

        Args:
            X: 2-D array; column 0 is the map, column 1 the start_ct value,
                columns 2-6 / 7-11 hold the player ids of team 1 / team 2.
            y: Outcome per row; 1 if team 1 won, 0 otherwise.

        Returns:
            self
        """
        # A fit must start from a clean state so that refitting the same
        # instance does not accumulate ratings from a previous fit.
        self.player_elos = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        )
        feats = []
        for i in range(X.shape[0]):
            m, start_ct = X[i, 0], X[i, 1]
            t1, t2 = X[i, 2:7], X[i, 7:12]
            res = y[i]
            # Features first, update second: the row only sees past games.
            feats.append(self._augment_row(m, start_ct, t1, t2))
            self._update_elos(m, start_ct, t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``(X, y)`` and return the leak-free pre-game features.

        Args:
            X: Training rows, see ``fit``.
            y: Training outcomes, see ``fit`` (required despite the default).
            **fit_params: Accepted for scikit-learn API compatibility, unused.

        Returns:
            Array with one feature row per input row, computed from the
            ratings as they were before the corresponding game.
        """
        return self.fit(X, y).X_train

    def transform(self, X):
        """Compute features for ``X`` from the current ratings (no update).

        Args:
            X: 2-D array with the same column layout as in ``fit``.

        Returns:
            Array with one feature row per input row.
        """
        features = []
        for row in X:
            m, start_ct = row[0], row[1]
            t1, t2 = row[2:7], row[7:12]
            features.append(self._augment_row(m, start_ct, t1, t2))
        return np.array(features)

    def _augment_row(self, m, start_ct, t1, t2):
        """Build the feature vector for one game.

        Layout: sorted team-1 ratings, sorted team-2 ratings, team-1 mean,
        team-2 mean, mean difference, then every pairwise difference
        ``t1[i] - t2[j]`` in row-major order.
        """
        t1e = np.sort([self.player_elos[m][start_ct][p] for p in t1])
        t2e = np.sort([self.player_elos[m][start_ct][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])

    def _update_elos(self, m, start_ct, t1, t2, res):
        """Apply one Elo update to the ratings of ``(m, start_ct)``.

        The expected score of team 1 is ``1 / (1 + 10 ** ((t2 - t1) / 400))``
        using the teams' mean ratings; every team-1 player gains
        ``k * (res - expected)`` and every team-2 player loses the same amount.
        """
        t1avg = np.mean([self.player_elos[m][start_ct][p] for p in t1])
        t2avg = np.mean([self.player_elos[m][start_ct][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg) / 400))
        delta = self.k * (res - exp)
        for p in t1:
            self.player_elos[m][start_ct][p] += delta
        for p in t2:
            self.player_elos[m][start_ct][p] -= delta
        
from sklearn.preprocessing import LabelEncoder as SklearnLabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import sparse
import numpy as np

# --------------------------------------------------------------
# Player Label Encoder
# --------------------------------------------------------------
class PlayerIDLabelEncoder(BaseEstimator, TransformerMixin):
    """Replace the player ids in columns 2-11 by consecutive integer codes.

    Codes are learned by a scikit-learn ``LabelEncoder`` fitted on all player
    ids seen in ``fit``. Players that were not seen at fit time are encoded
    as -1. All other columns are passed through unchanged.
    """

    def __init__(self):
        self.encoder = SklearnLabelEncoder()
        # Set of player ids known to the encoder (empty until fit()).
        self.classes_set = set()

    def fit(self, X, y=None):
        """Learn the player id -> code mapping from columns 2-11 of ``X``.

        Args:
            X: 2-D array whose columns 2-11 contain player ids.
            y: Ignored, present for scikit-learn API compatibility.

        Returns:
            self
        """
        X_players = X[:, 2:12].flatten()
        self.encoder.fit(X_players)
        self.classes_set = set(self.encoder.classes_)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with columns 2-11 label-encoded.

        Args:
            X: 2-D array with the same column layout as in ``fit``; may have
                zero rows.

        Returns:
            Copy of ``X`` in which each known player id is replaced by its
            code and each unknown player id by -1.
        """
        X_transformed = X.copy()
        X_players = X[:, 2:12]

        # classes_ is the sorted array of known ids; it is absent before fit,
        # in which case every player counts as unseen.
        classes = getattr(self.encoder, "classes_", None)
        if classes is None or len(classes) == 0 or X_players.size == 0:
            codes = np.full(X_players.shape, -1, dtype=int)
        else:
            # One vectorized binary search instead of one LabelEncoder call
            # per element. For a known id the insertion position in the sorted
            # classes is exactly its label code.
            positions = np.searchsorted(classes, X_players)
            # Ids larger than every known id land one past the end; clamp so
            # the equality check below can index safely.
            positions = np.minimum(positions, len(classes) - 1)
            is_known = classes[positions] == X_players
            codes = np.where(is_known, positions, -1)  # unseen players -> -1

        X_transformed[:, 2:12] = codes
        return X_transformed


# --------------------------------------------------------------
# Player Bag Encoder (works with label-encoded players)
# --------------------------------------------------------------
class PlayerBagEncoder(BaseEstimator, TransformerMixin):
    """Signed bag-of-players encoder producing a sparse matrix.

    Every distinct value found in columns 2-11 at fit time gets its own output
    column. For each row, the entry of a player is +1 when the player sits in
    one of the first five player slots and -1 when in one of the remaining
    slots. Values not seen during ``fit`` contribute nothing.
    """

    def fit(self, X, y=None):
        """Assign an output column to each distinct value in columns 2-11.

        Args:
            X: 2-D array-like whose columns 2-11 contain player identifiers.
            y: Ignored, present for scikit-learn API compatibility.

        Returns:
            self
        """
        known_players = np.unique(np.asarray(X)[:, 2:12].flatten())
        # player value -> column index, in sorted order of the values
        self.d = dict(zip(known_players, range(len(known_players))))
        return self

    def transform(self, X):
        """Encode ``X`` as a CSR matrix of +1 / -1 player indicators.

        Args:
            X: 2-D array-like with the same column layout as in ``fit``.

        Returns:
            ``scipy.sparse.csr_matrix`` of integer dtype with one row per
            input row and one column per player known from ``fit``.
        """
        X = np.asarray(X)
        column_of = self.d
        row_idx, col_idx, signs = [], [], []
        for r, record in enumerate(X):
            # Only the ten player slots are relevant.
            for slot, player in enumerate(record[2:12]):
                if player not in column_of:
                    continue
                row_idx.append(r)
                col_idx.append(column_of[player])
                # First five slots are encoded positively, the rest negatively.
                signs.append(-1 if slot >= 5 else 1)
        return sparse.csr_matrix(
            (signs, (row_idx, col_idx)),
            shape=(X.shape[0], len(column_of)),
            dtype=int,
        )
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Directory holding one "<game_id>.json" file per game.
PATH_TO_GAMES_RAW = "data/games_raw"
# Number of most recent games held out as the test set.
# NOTE: must be > 0; with 0 the slice game_ids[:-0] below would be empty.
TEST_SIZE = 100

# get_game_ids returns the valid game ids ordered by begin_at, so slicing off
# the tail gives a time-based split: train on older games, test on the latest.
game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)
# Sanity check: one label per feature row.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# Four feature branches computed on the same raw rows and concatenated
# column-wise by FeatureUnion: three Elo variants (each min-max scaled) and a
# sparse signed bag of label-encoded players.
pipeline = FeatureUnion([
    ('player_elo', Pipeline([
        ('elo', PlayerEloEncoder(k=30)),
        ('scaler', MinMaxScaler())
    ])),
    ('player_map_elo', Pipeline([
        ('map_elo', PlayerMapEloEncoder(k=30)),
        ('scaler', MinMaxScaler())
    ])),
    ('player_map_start_elo', Pipeline([
        ('start_elo', PlayerMapStartEloEncoder(k=30)),
        ('scaler', MinMaxScaler())
    ])),
    ('player_bag', Pipeline([
        ('label', PlayerIDLabelEncoder()),  # player ids -> integer codes, unseen -> -1
        ('bag', PlayerBagEncoder())  # integer codes -> sparse +1/-1 indicators
    ]))
])

# Fit every branch on the training games only, then apply the fitted
# transformers to the held-out games.
X_transformed_train = pipeline.fit_transform(X_train, y_train)
X_transformed_test  = pipeline.transform(X_test)

print("X_transformed_train shape:", X_transformed_train.shape)
print("X_transformed_test shape:", X_transformed_test.shape)

  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]