In [None]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion

# ==============================================================
# 1️⃣ Data Loading & Validation
# ==============================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            continue

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for p_ids in team_players.values():
            assert len(set(p_ids)) == 5

        rounds = [r["round"] for r in game["rounds"] if r["round"] is not None]
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())
    X.append(int(game["map"]["id"]))              # 0: map_id
    X.append(int(game["rounds"][0]["ct"]))        # 1: start_ct_team_id
    X.extend([t1_id, t2_id])                      # 2-3: team ids
    X.extend(sorted(team_players[t1_id]))         # 4-8: team1 player ids
    X.extend(sorted(team_players[t2_id]))         # 9-13: team2 player ids
    return X

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id: 0, t2_id: 0}
    for r in game["rounds"]:
        team_win_count[r["winner_team"]] += 1
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X, y = [], []
    for game_id in tqdm(game_ids):
        try:
            x_row = get_X_for_game(path_to_games_raw, game_id)
            y_row = get_y_for_game(path_to_games_raw, game_id)
            X.append(x_row)
            y.append(y_row)
        except:
            continue
    X_arr = np.array(X, dtype=int)
    y_arr = np.array(y, dtype=int)
    assert X_arr.shape[0] == y_arr.shape[0], f"Mismatched shapes: X={X_arr.shape}, y={y_arr.shape}"
    return X_arr, y_arr

# ==============================================================
# 2️⃣ Custom Transformers
# ==============================================================

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X): return X[:, self.columns]

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[:, col])
            self.encoders[col] = le
        return self
    def transform(self, X):
        X_trans = X.copy()
        for col, le in self.encoders.items():
            X_trans[:, col] = le.transform(X_trans[:, col])
        return X_trans.astype(int)

import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion

# ==============================================================
# 1️⃣ Data Loading & Validation
# ==============================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            continue

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for p_ids in team_players.values():
            assert len(set(p_ids)) == 5

        rounds = [r["round"] for r in game["rounds"] if r["round"] is not None]
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())
    X.append(int(game["map"]["id"]))              # 0: map_id
    X.append(int(game["rounds"][0].get("ct", 0))) # 1: start_ct_team_id
    X.extend([t1_id, t2_id])                      # 2-3: team ids
    X.extend(sorted(team_players[t1_id]))         # 4-8: team1 player ids
    X.extend(sorted(team_players[t2_id]))         # 9-13: team2 player ids
    return X

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    team_ids = sorted(set(p["team"]["id"] for p in game["players"]))
    t1_id, t2_id = team_ids
    team_win_count = {t1_id: 0, t2_id: 0}
    for r in game["rounds"]:
        winner = r.get("winner_team")
        if winner in team_win_count:
            team_win_count[winner] += 1
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X, y = [], []
    for game_id in tqdm(game_ids):
        try:
            x_row = get_X_for_game(path_to_games_raw, game_id)
            y_row = get_y_for_game(path_to_games_raw, game_id)
            X.append(x_row)
            y.append(y_row)
        except:
            continue
    X_arr = np.array(X, dtype=int)
    y_arr = np.array(y, dtype=int)
    assert X_arr.shape[0] == y_arr.shape[0], f"Mismatched shapes: X={X_arr.shape}, y={y_arr.shape}"
    return X_arr, y_arr

# ==============================================================
# 2️⃣ Custom Transformers
# ==============================================================

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X): return X[:, self.columns]

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[:, col])
            self.encoders[col] = le
        return self
    def transform(self, X):
        X_trans = X.copy()
        for col, le in self.encoders.items():
            X_trans[:, col] = le.transform(X_trans[:, col])
        return X_trans.astype(int)

class PlayerEloEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: self.initial_elo)
        self.X_train = None

    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            t1, t2 = X[i, :5], X[i, 5:]
            res = y[i]
            feats.append(self._augment_row(X[i]))
            self._update_elos(t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        features = []
        for row in X:
            features.append(self._augment_row(row))
        return np.array(features)

    def _augment_row(self, row):
        t1, t2 = row[:5], row[5:]
        t1e = np.sort([self.player_elos[p] for p in t1])
        t2e = np.sort([self.player_elos[p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])

    def _update_elos(self, t1, t2, res):
        t1avg = np.mean([self.player_elos[p] for p in t1])
        t2avg = np.mean([self.player_elos[p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg)/400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[p] += delta
        for p in t2: self.player_elos[p] -= delta


class PlayerMapEloEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        self.X_train = None

    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            m = X[i,0]
            t1, t2 = X[i,1:6], X[i,6:11]
            res = y[i]
            feats.append(self._augment_row(m, X[i]))
            self._update_elos(m, t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        features = []
        for row in X:
            m = row[0]
            features.append(self._augment_row(m, row))
        return np.array(features)

    def _augment_row(self, m, row):
        t1, t2 = row[1:6], row[6:11]
        t1e = np.sort([self.player_elos[m][p] for p in t1])
        t2e = np.sort([self.player_elos[m][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None,:]).flatten()
        return np.concatenate([t1e, t2e, [t1avg,t2avg,diff], pairdiff])

    def _update_elos(self, m, t1, t2, res):
        t1avg = np.mean([self.player_elos[m][p] for p in t1])
        t2avg = np.mean([self.player_elos[m][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg)/400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[m][p] += delta
        for p in t2: self.player_elos[m][p] -= delta


class PlayerMapStartEloEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo)))
        self.X_train = None

    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            m, start_ct = X[i,0], X[i,1]
            t1, t2 = X[i,2:7], X[i,7:12]
            res = y[i]
            feats.append(self._augment_row(m, start_ct, X[i]))
            self._update_elos(m, start_ct, t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        features = []
        for row in X:
            m, start_ct = row[0], row[1]
            features.append(self._augment_row(m, start_ct, row))
        return np.array(features)

    def _augment_row(self, m, start_ct, row):
        t1, t2 = row[2:7], row[7:12]
        t1e = np.sort([self.player_elos[m][start_ct][p] for p in t1])
        t2e = np.sort([self.player_elos[m][start_ct][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:,None] - t2e[None,:]).flatten()
        return np.concatenate([t1e, t2e, [t1avg,t2avg,diff], pairdiff])

    def _update_elos(self, m, start_ct, t1, t2, res):
        t1avg = np.mean([self.player_elos[m][start_ct][p] for p in t1])
        t2avg = np.mean([self.player_elos[m][start_ct][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg)/400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[m][start_ct][p] += delta
        for p in t2: self.player_elos[m][start_ct][p] -= delta




In [8]:
import os
import json
import numpy as np
from collections import defaultdict
from tqdm import tqdm

import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion

# ==============================================================
# 1️⃣ Data Loading & Validation
# ==============================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            continue

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for p_ids in team_players.values():
            assert len(set(p_ids)) == 5

        rounds = [r["round"] for r in game["rounds"] if r["round"] is not None]
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X_map_id = []
    X_start_ct_id = []
    X_team_id = []
    X_player_id = []
    y = []

    for game_id in tqdm(game_ids):
        try:
            # Load game JSON
            with open(os.path.join(path_to_games_raw, f"{game_id}.json"), "r", encoding="utf-8") as f:
                game = json.load(f)

            # Organize players by team
            team_players = defaultdict(list)
            for p in game["players"]:
                team_players[p["team"]["id"]].append(p["player"]["id"])
            t1_id, t2_id = sorted(team_players.keys())

            # Map ID
            X_map_id.append(int(game["map"]["id"]))

            # Start CT team ID
            X_start_ct_id.append(int(game["rounds"][0]["ct"]))

            # Team IDs
            X_team_id.append([t1_id, t2_id])

            # Player IDs (10 columns: 5 per team)
            player_ids = sorted(team_players[t1_id]) + sorted(team_players[t2_id])
            X_player_id.append(player_ids)

            # Target (winner)
            team_win_count = {t1_id: 0, t2_id: 0}
            for r in game["rounds"]:
                team_win_count[r["winner_team"]] += 1
            y.append(int(team_win_count[t1_id] > team_win_count[t2_id]))

        except Exception:
            continue

    # Convert to numpy arrays
    X_map_id = np.array(X_map_id, dtype=int)
    X_start_ct_id = np.array(X_start_ct_id, dtype=int)
    X_team_id = np.array(X_team_id, dtype=int)         # shape: (n_games, 2)
    X_player_id = np.array(X_player_id, dtype=int)     # shape: (n_games, 10)
    y = np.array(y, dtype=int)

    return X_map_id, X_start_ct_id, X_team_id, X_player_id, y

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.class_to_index = None

    def fit(self, X, y=None):        
        uniques = np.unique(X.flatten())
        self.class_to_index = dict(zip(uniques, range(len(uniques))))
        return self

    def transform(self, X):        
        if self.class_to_index is None:
            raise ValueError("CustomLabelEncoder has not been fitted yet.")
        vectorized_map = np.vectorize(lambda x: self.class_to_index.get(x, -1))
        return vectorized_map(X)

    def fit_transform(self, X, y=None):        
        return self.fit(X, y).transform(X)
    



In [None]:
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
X_map_id, X_start_ct_id, X_team_id, X_player_id, y = get_X_y(PATH_TO_GAMES_RAW, game_ids)
X_map_id_train = X_map_id[:-TEST_SIZE]


X_map_id_train      = X_map_id[:-TEST_SIZE]
X_team_id_train      = np.c_[X_start_ct_id[:-TEST_SIZE], X_team_id[:-TEST_SIZE]]
X_player_id_train    = X_player_id[:-TEST_SIZE]
y_train             = y[:-TEST_SIZE]


X_map_id_test       = X_map_id[-TEST_SIZE:]
X_team_id_test      = np.c_[X_start_ct_id[-TEST_SIZE:], X_team_id[-TEST_SIZE:]]
X_player_id_test     = X_player_id[-TEST_SIZE:]
y_test              = y[-TEST_SIZE:]


  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/38370 [00:00<?, ?it/s]

In [None]:
uniques = np.unique(X_map_id_train.flatten())
d = dict(zip(uniques, range(len(uniques))))

array([ 1,  2,  3,  4,  5,  6,  7,  8, 20, 28, 29, 30, 31])

In [10]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion

# ==============================================================
# 1️⃣ Data Loading & Validation
# ==============================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            continue

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for p_ids in team_players.values():
            assert len(set(p_ids)) == 5

        rounds = [r["round"] for r in game["rounds"] if r["round"] is not None]
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())
    X.append(int(game["map"]["id"]))               # 0: map_id
    X.append(int(game["rounds"][0].get("ct", 0)))  # 1: start_ct_team_id
    X.extend([t1_id, t2_id])                       # 2-3: team ids
    X.extend(sorted(team_players[t1_id]))          # 4-8: team1 player ids
    X.extend(sorted(team_players[t2_id]))          # 9-13: team2 player ids
    return X

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    team_ids = sorted(set(p["team"]["id"] for p in game["players"]))
    t1_id, t2_id = team_ids
    team_win_count = {t1_id: 0, t2_id: 0}
    for r in game["rounds"]:
        winner = r.get("winner_team")
        if winner in team_win_count:
            team_win_count[winner] += 1
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X, y = [], []
    for game_id in tqdm(game_ids):
        try:
            x_row = get_X_for_game(path_to_games_raw, game_id)
            y_row = get_y_for_game(path_to_games_raw, game_id)
            X.append(x_row)
            y.append(y_row)
        except:
            continue
    X_arr = np.array(X, dtype=int)
    y_arr = np.array(y, dtype=int)
    assert X_arr.shape[0] == y_arr.shape[0], f"Mismatched shapes: X={X_arr.shape}, y={y_arr.shape}"
    return X_arr, y_arr

# ==============================================================
# 2️⃣ Custom Transformers
# ==============================================================

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X): return X[:, self.columns]

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[:, col])
            self.encoders[col] = le
        return self
    def transform(self, X):
        X_trans = X.copy()
        for col, le in self.encoders.items():
            X_trans[:, col] = le.transform(X_trans[:, col])
        return X_trans.astype(int)

# ==============================================================
# 3️⃣ Separate ELO Encoders with X_train caching
# ==============================================================

class PlayerEloEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: self.initial_elo)
        self.X_train = None

    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            t1, t2 = X[i, :5], X[i, 5:]
            res = y[i]
            feats.append(self._augment_row(X[i]))
            self._update_elos(t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        return np.array([self._augment_row(row) for row in X])

    def _augment_row(self, row):
        t1, t2 = row[:5], row[5:]
        t1e = np.sort([self.player_elos[p] for p in t1])
        t2e = np.sort([self.player_elos[p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])

    def _update_elos(self, t1, t2, res):
        t1avg = np.mean([self.player_elos[p] for p in t1])
        t2avg = np.mean([self.player_elos[p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg)/400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[p] += delta
        for p in t2: self.player_elos[p] -= delta

class PlayerMapEloEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        self.X_train = None

    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            m = X[i,0]
            t1, t2 = X[i,1:6], X[i,6:11]
            res = y[i]
            feats.append(self._augment_row(m, X[i]))
            self._update_elos(m, t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        return np.array([self._augment_row(row[0], row) for row in X])

    def _augment_row(self, m, row):
        t1, t2 = row[1:6], row[6:11]
        t1e = np.sort([self.player_elos[m][p] for p in t1])
        t2e = np.sort([self.player_elos[m][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None,:]).flatten()
        return np.concatenate([t1e, t2e, [t1avg,t2avg,diff], pairdiff])

    def _update_elos(self, m, t1, t2, res):
        t1avg = np.mean([self.player_elos[m][p] for p in t1])
        t2avg = np.mean([self.player_elos[m][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg)/400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[m][p] += delta
        for p in t2: self.player_elos[m][p] -= delta

class PlayerMapStartEloEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo)))
        self.X_train = None

    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            m, start_ct = X[i,0], X[i,1]
            t1, t2 = X[i,2:7], X[i,7:12]
            res = y[i]
            feats.append(self._augment_row(m, start_ct, X[i]))
            self._update_elos(m, start_ct, t1, t2, res)
        self.X_train = np.array(feats)
        return self

    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        return np.array([self._augment_row(row[0], row[1], row) for row in X])

    def _augment_row(self, m, start_ct, row):
        t1, t2 = row[2:7], row[7:12]
        t1e = np.sort([self.player_elos[m][start_ct][p] for p in t1])
        t2e = np.sort([self.player_elos[m][start_ct][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:,None] - t2e[None,:]).flatten()
        return np.concatenate([t1e, t2e, [t1avg,t2avg,diff], pairdiff])

    def _update_elos(self, m, start_ct, t1, t2, res):
        t1avg = np.mean([self.player_elos[m][start_ct][p] for p in t1])
        t2avg = np.mean([self.player_elos[m][start_ct][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg)/400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[m][start_ct][p] += delta
        for p in t2: self.player_elos[m][start_ct][p] -= delta

# ==============================================================
# 4️⃣ Pipeline Construction
# ==============================================================

MAP_ID_COL_IDX = [0]
START_CT_TEAM_ID_COL_IDX = [1]
TEAM_IDS_COL_IDX = [2, 3]
PLAYER_IDS_COL_IDX = list(range(4,14))

player_elo_pipeline = Pipeline([
    ('select_players', ColumnSelector(columns=PLAYER_IDS_COL_IDX)),
    ('label_encode_players', MultiColumnLabelEncoder(columns=list(range(len(PLAYER_IDS_COL_IDX))))),
    ('player_elo', PlayerEloEncoder())
])

player_map_elo_pipeline = Pipeline([
    ('select_map_players', ColumnSelector(columns=MAP_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('label_encode', MultiColumnLabelEncoder(columns=list(range(len(MAP_ID_COL_IDX + PLAYER_IDS_COL_IDX))))),
    ('player_map_elo', PlayerMapEloEncoder())
])

player_map_start_elo_pipeline = Pipeline([
    ('select_map_start_players', ColumnSelector(columns=MAP_ID_COL_IDX + START_CT_TEAM_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('label_encode', MultiColumnLabelEncoder(columns=list(range(len(MAP_ID_COL_IDX + START_CT_TEAM_ID_COL_IDX + PLAYER_IDS_COL_IDX))))),
    ('player_map_start_elo', PlayerMapStartEloEncoder())
])

elo_feature_union = FeatureUnion([
    ('player_elo', player_elo_pipeline),
    ('player_map_elo', player_map_elo_pipeline),
    ('player_map_start_elo', player_map_start_elo_pipeline)
])

# ==============================================================
# 5️⃣ Load data, fit, transform
# ==============================================================

PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)




  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/38270 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
X_train

array([[     1,   3284,   3216, ...,  17811,  17833,  17834],
       [     3,   3216,   3216, ...,  17811,  17833,  17834],
       [     3,   3228,   3210, ...,  17499,  17500,  17543],
       ...,
       [     2, 126377, 125802, ...,  17501,  17543,  19666],
       [     7, 129501, 126709, ...,  21433,  21439,  25439],
       [     7, 125751,   5793, ...,  20370,  21440,  23684]],
      shape=(38270, 14))

In [12]:
y_train

array([1, 1, 0, ..., 0, 1, 0], shape=(38270,))

In [13]:
X_test

array([[     2,   3310,   3310, ...,  24103,  26620,  26672],
       [     7, 129444, 129444, ...,  24243,  29152,  30743],
       [     7, 130578,   3310, ...,  24103,  26620,  26672],
       ...,
       [     8, 130596,   3249, ...,  17799,  18732,  19214],
       [    31,   3240,   3240, ...,  40581,  40582,  40583],
       [    31, 126738, 126151, ...,  23032,  23035,  23562]],
      shape=(100, 14))

In [14]:
y_test

array([1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0])

In [4]:
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/38270 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
X_train

array([[     1,   3284,   3216, ...,  17811,  17833,  17834],
       [     3,   3216,   3216, ...,  17811,  17833,  17834],
       [     3,   3228,   3210, ...,  17499,  17500,  17543],
       ...,
       [     2, 126377, 125802, ...,  17501,  17543,  19666],
       [     7, 129501, 126709, ...,  21433,  21439,  25439],
       [     7, 125751,   5793, ...,  20370,  21440,  23684]],
      shape=(38266, 14))

In [6]:
y_train

array([1, 1, 0, ..., 0, 1, 0], shape=(38266,))

In [7]:
X_test

array([[     2,   3310,   3310, ...,  24103,  26620,  26672],
       [     7, 129444, 129444, ...,  24243,  29152,  30743],
       [     7, 130578,   3310, ...,  24103,  26620,  26672],
       ...,
       [     8, 130596,   3249, ...,  17799,  18732,  19214],
       [    31,   3240,   3240, ...,  40581,  40582,  40583],
       [    31, 126738, 126151, ...,  23032,  23035,  23562]],
      shape=(100, 14))

In [8]:
y_test

array([1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0])

In [None]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline

# =========================================================
# Data Utilities
# =========================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            continue

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])
        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for p_ids in team_players.values():
            assert len(set(p_ids)) == 5
        rounds = [r["round"] for r in game["rounds"]]
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw"):
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())
    X = [
        int(game["map"]["id"]),
        int(game["rounds"][0]["ct"]),
        t1_id,
        t2_id
    ]
    X.extend(sorted(team_players[t1_id]))
    X.extend(sorted(team_players[t2_id]))
    return X

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id: 0, t2_id: 0}
    for r in game["rounds"]:
        team_win_count[r["winner_team"]] += 1
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X, y = [], []
    for game_id in tqdm(game_ids):
        try:
            X.append(get_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except:
            continue
    return np.array(X, dtype=int), np.array(y, dtype=int)

# =========================================================
# Transformers
# =========================================================

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[:, self.columns]

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[:, col])
            self.encoders[col] = le
        return self
    def transform(self, X):
        X_trans = X.copy()
        for col, le in self.encoders.items():
            X_trans[:, col] = le.transform(X_trans[:, col])
        return X_trans.astype(int)

# =========================================================
# ELO Encoders
# =========================================================

class PlayerEloEncoder(BaseEstimator, TransformerMixin):    
    def __init__(self, k=30, initial_elo=1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: self.initial_elo)
        self.X_train = None
    def fit(self, X, y):
        X_features = []
        for i in range(X.shape[0]):
            team1_players, team2_players = X[i, :5], X[i, 5:]
            result = y[i]
            feat_row = self._augment_row(X[i])
            X_features.append(feat_row)
            self._update_elos(team1_players, team2_players, result)
        self.X_train = np.array(X_features)
        return self
    def transform(self, X):        
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        features = [self._augment_row(X[i]) for i in range(X.shape[0])]
        return np.array(features)
    def _augment_row(self, row):        
        team1_players, team2_players = row[:5], row[5:]
        t1_sorted = np.sort([self.player_elos[p] for p in team1_players])
        t2_sorted = np.sort([self.player_elos[p] for p in team2_players])
        t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
        t_diff = t1_avg - t2_avg
        pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()
        return np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
    def _update_elos(self, team1_players, team2_players, result):        
        t1_avg = np.mean([self.player_elos[p] for p in team1_players])
        t2_avg = np.mean([self.player_elos[p] for p in team2_players])
        expected = 1 / (1 + 10 ** ((t2_avg - t1_avg) / 400))
        delta = self.k * (result - expected)
        for p in team1_players:
            self.player_elos[p] += delta
        for p in team2_players:
            self.player_elos[p] -= delta

class PlayerMapEloEncoder(PlayerEloEncoder):    
    def __init__(self, k=30, initial_elo=1500):
        super().__init__(k, initial_elo)
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
    def fit(self, X, y):
        X_features = []
        for i in range(X.shape[0]):
            map_id = X[i, 0]
            team1_players, team2_players = X[i, 1:6], X[i, 6:11]
            result = y[i]
            feat_row = self._augment_row(map_id, X[i])
            X_features.append(feat_row)
            self._update_elos(map_id, team1_players, team2_players, result)
        self.X_train = np.array(X_features)
        return self
    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        features = [self._augment_row(X[i, 0], X[i]) for i in range(X.shape[0])]
        return np.array(features)
    def _augment_row(self, map_id, row):        
        team1_players, team2_players = row[1:6], row[6:11]
        t1_sorted = np.sort([self.player_elos[map_id][p] for p in team1_players])
        t2_sorted = np.sort([self.player_elos[map_id][p] for p in team2_players])
        t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
        t_diff = t1_avg - t2_avg
        pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()
        return np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
    def _update_elos(self, map_id, team1_players, team2_players, result):        
        t1_avg = np.mean([self.player_elos[map_id][p] for p in team1_players])
        t2_avg = np.mean([self.player_elos[map_id][p] for p in team2_players])
        expected = 1 / (1 + 10 ** ((t2_avg - t1_avg) / 400))
        delta = self.k * (result - expected)
        for p in team1_players:
            self.player_elos[map_id][p] += delta
        for p in team2_players:
            self.player_elos[map_id][p] -= delta

class PlayerMapStartEloEncoder(PlayerMapEloEncoder):
    def __init__(self, k=30, initial_elo=1500):
        super().__init__(k, initial_elo)
        self.player_elos = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo)))
    def fit(self, X, y):
        X_features = []
        for i in range(X.shape[0]):
            map_id, start_ct_id = X[i, 0], X[i, 1]
            team1_players, team2_players = X[i, 2:7], X[i, 7:12]
            result = y[i]
            feat_row = self._augment_row(map_id, start_ct_id, X[i])
            X_features.append(feat_row)
            self._update_elos(map_id, start_ct_id, team1_players, team2_players, result)
        self.X_train = np.array(X_features)
        return self
    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        features = [self._augment_row(X[i, 0], X[i, 1], X[i]) for i in range(X.shape[0])]
        return np.array(features)
    def _augment_row(self, map_id, start_ct_id, row):        
        team1_players, team2_players = row[2:7], row[7:12]
        t1_sorted = np.sort([self.player_elos[map_id][start_ct_id][p] for p in team1_players])
        t2_sorted = np.sort([self.player_elos[map_id][start_ct_id][p] for p in team2_players])
        t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
        t_diff = t1_avg - t2_avg
        pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()
        return np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
    def _update_elos(self, map_id, start_ct_id, team1_players, team2_players, result):        
        t1_avg = np.mean([self.player_elos[map_id][start_ct_id][p] for p in team1_players])
        t2_avg = np.mean([self.player_elos[map_id][start_ct_id][p] for p in team2_players])
        expected = 1 / (1 + 10 ** ((t2_avg - t1_avg) / 400))
        delta = self.k * (result - expected)
        for p in team1_players:
            self.player_elos[map_id][start_ct_id][p] += delta
        for p in team2_players:
            self.player_elos[map_id][start_ct_id][p] -= delta

# =========================================================
# Column Index Config
# =========================================================

MAP_ID_COL_IDX = [0]
START_CT_TEAM_ID_COL_IDX = [1]
TEAM_IDS_COL_IDX = [2, 3]
PLAYER_IDS_COL_IDX = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
TEAM_ALL_COLS = START_CT_TEAM_ID_COL_IDX + TEAM_IDS_COL_IDX

# =========================================================
# Pipelines
# =========================================================

player_elo_pipeline = Pipeline([
    ('label_encode_players', MultiColumnLabelEncoder(columns=PLAYER_IDS_COL_IDX)),
    ('select_players', ColumnSelector(columns=PLAYER_IDS_COL_IDX)),
    ('player_elo', PlayerEloEncoder())
])

player_map_elo_pipeline = Pipeline([
    ('label_encode', MultiColumnLabelEncoder(columns=MAP_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('select_map_players', ColumnSelector(columns=MAP_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('player_map_elo', PlayerMapEloEncoder())
])

player_map_start_elo_pipeline = Pipeline([
    ('label_encode', MultiColumnLabelEncoder(columns=MAP_ID_COL_IDX + TEAM_ALL_COLS + PLAYER_IDS_COL_IDX)),
    ('select_map_start_players', ColumnSelector(columns=MAP_ID_COL_IDX + START_CT_TEAM_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('player_map_start_elo', PlayerMapStartEloEncoder())
])

# =========================================================
# Load and Run
# =========================================================

PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]
X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)

player_elo_pipeline.fit(X_train, y_train)
player_map_elo_pipeline.fit(X_train, y_train)
player_map_start_elo_pipeline.fit(X_train, y_train)

X_train_elo = player_elo_pipeline.transform(X_train)
X_test_elo = player_elo_pipeline.transform(X_test)

X_train_map_elo = player_map_elo_pipeline.transform(X_train)
X_test_map_elo = player_map_elo_pipeline.transform(X_test)

X_train_map_start_elo = player_map_start_elo_pipeline.transform(X_train)
X_test_map_start_elo = player_map_start_elo_pipeline.transform(X_test)


100%|██████████| 56493/56493 [00:16<00:00, 3359.73it/s]
 68%|██████▊   | 25846/38270 [00:11<00:05, 2234.28it/s]

In [None]:
PATH_TO_GAMES_RAW = "data/games_raw"
TEST_SIZE = 100

game_ids = get_game_ids(PATH_TO_GAMES_RAW)
game_ids_train, game_ids_test = game_ids[:-TEST_SIZE], game_ids[-TEST_SIZE:]

X_train, y_train = get_X_y(PATH_TO_GAMES_RAW, game_ids_train)
X_test, y_test = get_X_y(PATH_TO_GAMES_RAW, game_ids_test)





  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [3]:
X_train

array([[     1,   3284,   3216, ...,  17811,  17833,  17834],
       [     3,   3216,   3216, ...,  17811,  17833,  17834],
       [     3,   3228,   3210, ...,  17499,  17500,  17543],
       ...,
       [     8, 125802, 125802, ...,  17501,  17543,  19666],
       [     2, 126377, 125802, ...,  17501,  17543,  19666],
       [     7, 129501, 126709, ...,  21433,  21439,  25439]],
      shape=(37974, 14))

In [7]:
import os
import json
from dateutil.parser import parse
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)

# ==============================================================
# 1️⃣ Data Loading & Validation
# ==============================================================

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            continue

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for p_ids in team_players.values():
            assert len(set(p_ids)) == 5

        team_ids = list(team_players.keys())
        rounds = [r["round"] for r in game["rounds"] if r["round"] is not None]
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid, game_begin_at_valid = [], []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())
    X.append(int(game["map"]["id"]))              # 0: map_id
    X.append(int(game["rounds"][0]["ct"]))        # 1: start_ct_team_id
    X.extend([t1_id, t2_id])                      # 2-3: team ids
    X.extend(sorted(team_players[t1_id]))         # 4-8: team1 player ids
    X.extend(sorted(team_players[t2_id]))         # 9-13: team2 player ids
    return X

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id: 0, t2_id: 0}
    for r in game["rounds"]:
        team_win_count[r["winner_team"]] += 1
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X, y = [], []
    for game_id in tqdm(game_ids):
        try:
            X.append(get_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except:
            continue
    return np.array(X, dtype=int), np.array(y, dtype=int)

# ==============================================================
# 2️⃣ Custom Transformers
# ==============================================================

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None): return self
    def transform(self, X): return X[:, self.columns]

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}
    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[:, col])
            self.encoders[col] = le
        return self
    def transform(self, X):
        X_trans = X.copy()
        for col, le in self.encoders.items():
            X_trans[:, col] = le.transform(X_trans[:, col])
        return X_trans.astype(int)

# ==============================================================
# 3️⃣ ELO Encoders
# ==============================================================

class PlayerEloEncoder(BaseEstimator, TransformerMixin):    
    def __init__(self, k=30, initial_elo=1500):
        self.k, self.initial_elo = k, initial_elo
        self.player_elos = defaultdict(lambda: self.initial_elo)
        self.X_train = None
    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            t1, t2 = X[i, :5], X[i, 5:]
            res = y[i]
            f = self._augment_row(X[i])
            feats.append(f)
            self._update_elos(t1, t2, res)
        self.X_train = np.array(feats)
        return self
    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        return np.array([self._augment_row(row) for row in X])
    def _augment_row(self, row):
        t1, t2 = row[:5], row[5:]
        t1e, t2e = np.sort([self.player_elos[p] for p in t1]), np.sort([self.player_elos[p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])
    def _update_elos(self, t1, t2, res):
        t1avg, t2avg = np.mean([self.player_elos[p] for p in t1]), np.mean([self.player_elos[p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg) / 400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[p] += delta
        for p in t2: self.player_elos[p] -= delta

class PlayerMapEloEncoder(PlayerEloEncoder):
    def __init__(self, k=30, initial_elo=1500):
        super().__init__(k, initial_elo)
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            m = X[i, 0]
            t1, t2 = X[i, 1:6], X[i, 6:11]
            res = y[i]
            f = self._augment_row(m, X[i])
            feats.append(f)
            self._update_elos(m, t1, t2, res)
        self.X_train = np.array(feats)
        return self
    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        return np.array([self._augment_row(row[0], row) for row in X])
    def _augment_row(self, m, row):
        t1, t2 = row[1:6], row[6:11]
        t1e, t2e = np.sort([self.player_elos[m][p] for p in t1]), np.sort([self.player_elos[m][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])
    def _update_elos(self, m, t1, t2, res):
        t1avg, t2avg = np.mean([self.player_elos[m][p] for p in t1]), np.mean([self.player_elos[m][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg) / 400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[m][p] += delta
        for p in t2: self.player_elos[m][p] -= delta

class PlayerMapStartEloEncoder(PlayerEloEncoder):
    def __init__(self, k=30, initial_elo=1500):
        super().__init__(k, initial_elo)
        self.player_elos = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo)))
    def fit(self, X, y):
        feats = []
        for i in range(X.shape[0]):
            m, start_ct = X[i, 0], X[i, 1]
            t1, t2 = X[i, 2:7], X[i, 7:12]
            res = y[i]
            f = self._augment_row(m, start_ct, X[i])
            feats.append(f)
            self._update_elos(m, start_ct, t1, t2, res)
        self.X_train = np.array(feats)
        return self
    def transform(self, X):
        if self.X_train is not None and X.shape == self.X_train.shape:
            return self.X_train
        return np.array([self._augment_row(row[0], row[1], row) for row in X])
    def _augment_row(self, m, start_ct, row):
        t1, t2 = row[2:7], row[7:12]
        t1e, t2e = np.sort([self.player_elos[m][start_ct][p] for p in t1]), np.sort([self.player_elos[m][start_ct][p] for p in t2])
        t1avg, t2avg = np.mean(t1e), np.mean(t2e)
        diff = t1avg - t2avg
        pairdiff = (t1e[:, None] - t2e[None, :]).flatten()
        return np.concatenate([t1e, t2e, [t1avg, t2avg, diff], pairdiff])
    def _update_elos(self, m, start_ct, t1, t2, res):
        t1avg, t2avg = np.mean([self.player_elos[m][start_ct][p] for p in t1]), np.mean([self.player_elos[m][start_ct][p] for p in t2])
        exp = 1 / (1 + 10 ** ((t2avg - t1avg) / 400))
        delta = self.k * (res - exp)
        for p in t1: self.player_elos[m][start_ct][p] += delta
        for p in t2: self.player_elos[m][start_ct][p] -= delta

# ==============================================================
# 4️⃣ Pipeline Construction
# ==============================================================

from sklearn.pipeline import Pipeline, FeatureUnion

# Column indices
MAP_ID_COL_IDX = [0]
START_CT_TEAM_ID_COL_IDX = [1]
TEAM_IDS_COL_IDX = [2, 3]
PLAYER_IDS_COL_IDX = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
TEAM_ALL_COLS = START_CT_TEAM_ID_COL_IDX + TEAM_IDS_COL_IDX

# -------------------------------
# Player Elo pipeline
# -------------------------------
player_elo_pipeline = Pipeline([
    ('select_players', ColumnSelector(columns=PLAYER_IDS_COL_IDX)),
    ('label_encode_players', MultiColumnLabelEncoder(columns=list(range(len(PLAYER_IDS_COL_IDX))))),
    ('player_elo', PlayerEloEncoder())
])

# -------------------------------
# Player + Map Elo pipeline
# -------------------------------
player_map_elo_pipeline = Pipeline([
    ('select_map_players', ColumnSelector(columns=MAP_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('label_encode', MultiColumnLabelEncoder(columns=list(range(len(MAP_ID_COL_IDX + PLAYER_IDS_COL_IDX))))),
    ('player_map_elo', PlayerMapEloEncoder())
])

# -------------------------------
# Player + Map + Start Team Elo pipeline
# -------------------------------
player_map_start_elo_pipeline = Pipeline([
    ('select_map_start_players', ColumnSelector(columns=MAP_ID_COL_IDX + START_CT_TEAM_ID_COL_IDX + PLAYER_IDS_COL_IDX)),
    ('label_encode', MultiColumnLabelEncoder(columns=list(range(len(MAP_ID_COL_IDX + START_CT_TEAM_ID_COL_IDX + PLAYER_IDS_COL_IDX))))),
    ('player_map_start_elo', PlayerMapStartEloEncoder())
])

# -------------------------------
# Combine all pipelines
# -------------------------------
elo_feature_union = FeatureUnion([
    ('player_elo', player_elo_pipeline),
    ('player_map_elo', player_map_elo_pipeline),
    ('player_map_start_elo', player_map_start_elo_pipeline)
])

# Fit and transform
elo_feature_union.fit(X_train, y_train)
X_train_all = elo_feature_union.transform(X_train)
X_test_all = elo_feature_union.transform(X_test)

print("Final Combined Feature Shapes:")
print("Train:", X_train_all.shape)
print("Test:", X_test_all.shape)


IndexError: index 38266 is out of bounds for axis 0 with size 38266

In [5]:
# --- Map map_id to integer indices ---
uniques = np.unique(X_train[:, 0])
map_dict = dict(zip(uniques, range(len(uniques))))
for i in range(X_train.shape[0]):
    X_train[i, 0] = map_dict.get(X_train[i, 0], -1)
for i in range(X_test.shape[0]):
    X_test[i, 0] = map_dict.get(X_test[i, 0], -1)

# --- Map team IDs (start_ct, t1_id, t2_id) to integer indices ---
uniques = np.unique(X_train[:, [1, 2, 3]].flatten())
team_dict = dict(zip(uniques, range(len(uniques))))
for i in range(X_train.shape[0]):
    for j in [1, 2, 3]:
        X_train[i, j] = team_dict.get(X_train[i, j], -1)
for i in range(X_test.shape[0]):
    for j in [1, 2, 3]:
        X_test[i, j] = team_dict.get(X_test[i, j], -1)

# --- Map player IDs (columns 4-13) to integer indices ---
uniques = np.unique(X_train[:, 4:14].flatten())
player_dict = dict(zip(uniques, range(len(uniques))))
for i in range(X_train.shape[0]):
    for j in range(4, 14):
        X_train[i, j] = player_dict.get(X_train[i, j], -1)
for i in range(X_test.shape[0]):
    for j in range(4, 14):
        X_test[i, j] = player_dict.get(X_test[i, j], -1)


In [6]:
# 1️⃣ General player Elo
player_elo_extractor = PlayerEloExtractor(k=30, initial_elo=1500)
player_elo_extractor.fit(X_train, y_train)
X_train_elo = player_elo_extractor.transform(X_train)
X_test_elo = player_elo_extractor.transform(X_test)
print("PlayerEloExtractor:", X_train_elo.shape, X_test_elo.shape)

# 2️⃣ Player Elo per map
player_map_elo_extractor = PlayerMapEloExtractor(k=30, initial_elo=1500)
player_map_elo_extractor.fit(X_train, y_train)
X_train_map_elo = player_map_elo_extractor.transform(X_train)
X_test_map_elo = player_map_elo_extractor.transform(X_test)
print("PlayerMapEloExtractor:", X_train_map_elo.shape, X_test_map_elo.shape)

# 3️⃣ Player Elo per map & starting CT side
player_map_start_elo_extractor = PlayerMapStartEloExtractor(initial_elo=1500)
player_map_start_elo_extractor.fit(X_train, y_train)
X_train_map_start_elo = player_map_start_elo_extractor.transform(X_train)
X_test_map_start_elo = player_map_start_elo_extractor.transform(X_test)
print("PlayerMapStartEloExtractor:", X_train_map_start_elo.shape, X_test_map_start_elo.shape)

# ------------------------------
# Train Logistic Regression (example with PlayerMapStartElo features)
# ------------------------------
clf, metrics = train_and_evaluate_logit(
    X_train_map_start_elo, y_train,
    X_test_map_start_elo, y_test
)

PlayerEloExtractor: (37974, 38) (100, 38)
PlayerMapEloExtractor: (37974, 38) (100, 38)
PlayerMapStartEloExtractor: (37974, 38) (100, 38)
Logistic Regression performance on test set:
Auc: 0.5000
Accuracy: 0.5700
Precision: 0.5700
Recall: 1.0000
F1: 0.7261
Confusion matrix: TP=57, TN=0, FP=43, FN=0


In [None]:
player_columns = list(range(4, 14))
unique_players = np.unique(np.concatenate([X_train[:, player_columns], X_test[:, player_columns]]))
player_dict = {pid: idx for idx, pid in enumerate(unique_players)}

for arr in [X_train, X_test]:
    for i in range(arr.shape[0]):
        for j in player_columns:
            arr[i, j] = player_dict.get(arr[i, j], -1)

# Compute Elo features
elo_extractor = PlayerEloExtractor(k=30, initial_elo=1500)
elo_extractor.fit(X_train, y_train)
X_train_elo = elo_extractor.transform(X_train)
X_test_elo = elo_extractor.transform(X_test)

print("Training Elo features shape:", X_train_elo.shape)
print("Test Elo features shape:", X_test_elo.shape)

# Train and evaluate Logistic Regression
clf, metrics = train_and_evaluate_logit(X_train_elo, y_train, X_test_elo, y_test)


  0%|          | 0/56493 [00:00<?, ?it/s]

  0%|          | 0/37974 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

IndexError: index 10 is out of bounds for axis 1 with size 10

In [5]:
X_train

array([[17525, 17539, 17540, ..., 17811, 17833, 17834],
       [17525, 17539, 17540, ..., 17811, 17833, 17834],
       [17520, 17523, 17555, ..., 17499, 17500, 17543],
       ...,
       [20678, 20684, 20692, ..., 17501, 17543, 19666],
       [20678, 20684, 20692, ..., 17501, 17543, 19666],
       [20569, 25589, 27555, ..., 21433, 21439, 25439]], shape=(37974, 10))

In [None]:


# ------------------------------
# Data Loading & Utilities
# ------------------------------

def generate_game_raw(path_to_games_raw_dir: str = "data/games_raw"):
    for filename in tqdm(os.listdir(path_to_games_raw_dir)):
        try:
            with open(os.path.join(path_to_games_raw_dir, filename), "r", encoding="utf-8") as f:
                yield json.load(f)
        except:
            pass

def validate_game(game: dict) -> bool:
    try:
        int(game["id"])
        parse(game["begin_at"])
        int(game["match"]["league"]["id"])
        int(game["match"]["serie"]["id"])
        int(game["match"]["tournament"]["id"])
        int(game["map"]["id"])

        team_players = defaultdict(list)
        for p in game["players"]:
            team_players[p["team"]["id"]].append(p["player"]["id"])
        assert len(team_players) == 2
        for t_id, p_ids in team_players.items():
            assert len(set(p_ids)) == 5

        team_ids = list(team_players.keys())
        rounds = []
        for r in game["rounds"]:
            assert r["round"] is not None
            assert r["ct"] in team_ids
            assert r["terrorists"] in team_ids
            assert r["winner_team"] in team_ids
            rounds.append(r["round"])
        assert min(rounds) == 1
        assert max(rounds) >= 16
        return True
    except:
        return False

def get_game_ids(path_to_games_raw_dir: str = "data/games_raw") -> list[int]:
    game_ids_valid = []
    game_begin_at_valid = []
    for game in generate_game_raw(path_to_games_raw_dir):
        if validate_game(game):
            game_ids_valid.append(game["id"])
            game_begin_at_valid.append(parse(game["begin_at"]))
    return np.array(game_ids_valid)[np.argsort(game_begin_at_valid)].tolist()

def get_X_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    X = []
    team_players = defaultdict(list)
    for p in game["players"]:
        team_players[p["team"]["id"]].append(p["player"]["id"])
    t1_id, t2_id = sorted(team_players.keys())
    X.append(int(game["map"]["id"]))
    X.append(int(game["rounds"][0]["ct"]))
    X.extend([t1_id, t2_id])
    X.extend(sorted(team_players[t1_id]))
    X.extend(sorted(team_players[t2_id]))
    return X

def get_y_for_game(path_to_games_raw_dir: str, game_id: int):
    with open(os.path.join(path_to_games_raw_dir, f"{game_id}.json"), "r", encoding="utf-8") as f:
        game = json.load(f)
    t1_id, t2_id = np.unique([p["team"]["id"] for p in game["players"]])
    team_win_count = {t1_id:0, t2_id:0}
    for r in game["rounds"]:
        team_win_count[r["winner_team"]] += 1
    return int(team_win_count[t1_id] > team_win_count[t2_id])

def get_X_y(path_to_games_raw: str, game_ids: list[int]):
    X, y = [], []
    for game_id in tqdm(game_ids):
        try:
            X.append(get_X_for_game(path_to_games_raw, game_id))
            y.append(get_y_for_game(path_to_games_raw, game_id))
        except:
            continue
    return np.array(X, dtype=int), np.array(y, dtype=int)

# ------------------------------
# Player Elo Extractor
# ------------------------------

class PlayerEloExtractor(BaseEstimator, TransformerMixin):
    """
    Elo features per match:
    - Sorted team1 Elo (5)
    - Sorted team2 Elo (5)
    - Team averages (2)
    - Team Elo difference (1)
    - Pairwise differences (25)
    Total: 38 features
    """
    def __init__(self, k: int = 30, initial_elo: int = 1500):
        self.k = k
        self.initial_elo = initial_elo
        self.player_elos = defaultdict(lambda: self.initial_elo)
        self.X_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        X_train = []

        for i in range(X.shape[0]):
            team1_players, team2_players = X[i, :5], X[i, 5:10]
            result = y[i]

            # Sorted Elo
            t1_sorted = np.sort([self.player_elos[p] for p in team1_players])
            t2_sorted = np.sort([self.player_elos[p] for p in team2_players])

            # Team averages & diff
            t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
            t_diff = t1_avg - t2_avg

            # Pairwise differences
            pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()

            # Concatenate features
            feat_row = np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
            X_train.append(feat_row)

            # Update Elo after game
            self._update_elos(team1_players, team2_players, result)

        self.X_train = np.array(X_train)
        return self

    def transform(self, X: np.ndarray):
        if X.shape == self.X_train.shape:
            return self.X_train
        
        features = []

        for i in range(X.shape[0]):
            team1_players, team2_players = X[i, :5], X[i, 5:10]

            t1_sorted = np.sort([self.player_elos[p] for p in team1_players])
            t2_sorted = np.sort([self.player_elos[p] for p in team2_players])

            t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
            t_diff = t1_avg - t2_avg

            pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()

            feat_row = np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
            features.append(feat_row)

        return np.array(features)

    def _update_elos(self, team1_players, team2_players, result):
        t1_avg = np.mean([self.player_elos[p] for p in team1_players])
        t2_avg = np.mean([self.player_elos[p] for p in team2_players])
        expected = 1 / (1 + 10 ** ((t2_avg - t1_avg) / 400))
        delta1 = self.k * (result - expected)
        delta2 = self.k * ((1 - result) - (1 - expected))

        for p in team1_players:
            self.player_elos[p] += delta1
        for p in team2_players:
            self.player_elos[p] += delta2
            
from collections import defaultdict
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class PlayerMapEloExtractor(BaseEstimator, TransformerMixin):
    """
    Elo features per match with per-map Elo:
    - Sorted team1 Elo (5)
    - Sorted team2 Elo (5)
    - Team averages (2)
    - Team Elo difference (1)
    - Pairwise differences (25)
    Total: 38 features
    """
    def __init__(self, k: int = 30, initial_elo: int = 1500):
        self.k = k
        self.initial_elo = initial_elo
        # player_elos[map_id][player_id] = elo
        self.player_elos = defaultdict(lambda: defaultdict(lambda: self.initial_elo))
        self.X_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        X_train = []

        for i in range(X.shape[0]):
            map_id = X[i, 0]  # assume first column is map_id
            team1_players, team2_players = X[i, 1:6], X[i, 6:11]
            result = y[i]

            # Sorted Elo (per map)
            t1_sorted = np.sort([self.player_elos[map_id][p] for p in team1_players])
            t2_sorted = np.sort([self.player_elos[map_id][p] for p in team2_players])

            # Team averages & difference
            t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
            t_diff = t1_avg - t2_avg

            # Pairwise differences
            pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()

            # Combine features
            feat_row = np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
            X_train.append(feat_row)

            # Update Elo per map
            self._update_elos(map_id, team1_players, team2_players, result)

        self.X_train = np.array(X_train)
        return self

    def transform(self, X: np.ndarray):
        if X.shape == self.X_train.shape:
            return self.X_train
        
        features = []

        for i in range(X.shape[0]):
            map_id = X[i, 0]
            team1_players, team2_players = X[i, 1:6], X[i, 6:11]

            t1_sorted = np.sort([self.player_elos[map_id][p] for p in team1_players])
            t2_sorted = np.sort([self.player_elos[map_id][p] for p in team2_players])

            t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
            t_diff = t1_avg - t2_avg

            pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()

            feat_row = np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
            features.append(feat_row)

        return np.array(features)

    def _update_elos(self, map_id, team1_players, team2_players, result):
        t1_avg = np.mean([self.player_elos[map_id][p] for p in team1_players])
        t2_avg = np.mean([self.player_elos[map_id][p] for p in team2_players])
        expected = 1 / (1 + 10 ** ((t2_avg - t1_avg) / 400))
        delta1 = self.k * (result - expected)
        delta2 = self.k * ((1 - result) - (1 - expected))

        for p in team1_players:
            self.player_elos[map_id][p] += delta1
        for p in team2_players:
            self.player_elos[map_id][p] += delta2
            
from collections import defaultdict
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from collections import defaultdict
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class PlayerMapStartEloExtractor(BaseEstimator, TransformerMixin):
    """
    Elo features per match using per-map and starting CT-side Elo (no updates)
    Features per match:
    - Sorted team1 Elo (5)
    - Sorted team2 Elo (5)
    - Team averages (2)
    - Team Elo difference (1)
    - Pairwise differences (25)
    Total: 38 features
    Column mapping based on get_X_for_game:
    0: map_id
    1: start_ct_team_id
    2-3: t1_id, t2_id
    4-8: team1 player ids
    9-13: team2 player ids
    """
    def __init__(self, initial_elo: int = 1500):
        self.initial_elo = initial_elo
        # Elo stored per map and per starting CT side
        self.player_elos = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: self.initial_elo)))
        self.X_train = None

    def fit(self, X: np.ndarray, y: np.ndarray = None):
        X_train = []

        for i in range(X.shape[0]):
            map_id = X[i, 0]
            start_ct_id = X[i, 1]
            team1_players, team2_players = X[i, 4:9], X[i, 9:14]

            # Sorted starting Elo using map_id and start_ct_id
            t1_sorted = np.sort([self.player_elos[map_id][start_ct_id][p] for p in team1_players])
            t2_sorted = np.sort([self.player_elos[map_id][start_ct_id][p] for p in team2_players])

            t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
            t_diff = t1_avg - t2_avg
            pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()

            feat_row = np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
            X_train.append(feat_row)

        self.X_train = np.array(X_train)
        return self

    def transform(self, X: np.ndarray):
        features = []

        for i in range(X.shape[0]):
            map_id = X[i, 0]
            start_ct_id = X[i, 1]
            team1_players, team2_players = X[i, 4:9], X[i, 9:14]

            t1_sorted = np.sort([self.player_elos[map_id][start_ct_id][p] for p in team1_players])
            t2_sorted = np.sort([self.player_elos[map_id][start_ct_id][p] for p in team2_players])

            t1_avg, t2_avg = np.mean(t1_sorted), np.mean(t2_sorted)
            t_diff = t1_avg - t2_avg
            pairwise_diff = (t1_sorted[:, None] - t2_sorted[None, :]).flatten()

            feat_row = np.concatenate([t1_sorted, t2_sorted, [t1_avg, t2_avg, t_diff], pairwise_diff])
            features.append(feat_row)

        return np.array(features)




from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)



from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)

def train_and_evaluate_logit(X_train, y_train, X_test, y_test, random_state: int = 42):
    """
    Train Logistic Regression on training data and evaluate on test data.
    
    Returns:
        clf: trained LogisticRegression model
        metrics: dict of evaluation metrics
    """
    # Train
    clf = LogisticRegression(solver="liblinear", random_state=random_state)
    clf.fit(X_train, y_train)

    # Predict
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    # Compute metrics
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    metrics = {
        "auc": roc_auc_score(y_test, y_proba),
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "confusion_matrix": {"TP": tp, "TN": tn, "FP": fp, "FN": fn},
    }

    # Print metrics
    print("Logistic Regression performance on test set:")
    for k, v in metrics.items():
        if k != "confusion_matrix":
            print(f"{k.capitalize()}: {v:.4f}")
    cm = metrics["confusion_matrix"]
    print(f"Confusion matrix: TP={cm['TP']}, TN={cm['TN']}, FP={cm['FP']}, FN={cm['FN']}")

    return clf, metrics



In [3]:
clf, metrics = train_and_evaluate_logit(X_train_elo, y_train, X_test_elo, y_test)


Logistic Regression performance on test set:
Auc: 0.7472
Accuracy: 0.6600
Precision: 0.6667
Recall: 0.8070
F1: 0.7302
Confusion matrix: TP=46, TN=20, FP=23, FN=11
