In [3]:
# Conditional logit-style (team Bradley–Terry) model for player utilities
# Files expected (paths are relative to the notebook's working directory):
#   ../backup/games_rows.csv with columns:
#       id, round_number, team1_player1_id, team1_player2_id,
#       team2_player1_id, team2_player2_id, winning_team (1 or 2), is_playoff_game
#   ../backup/players_rows.csv with columns:
#       id, name, ...
#
# Outputs:
#   A DataFrame with player_id, name, beta_ref (reference-coded),
#   beta_centered (mean-zero), a 0–100 “utility” scale (utility_0_100), and a
#   win probability versus an average player (vs_avg_logit_prob).
#   Also saves player_betas.csv to the current working directory.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# ------------------------
# 1) Load data
# ------------------------
games_path   = "../backup/games_rows.csv"
players_path = "../backup/players_rows.csv"

games   = pd.read_csv(games_path)
players = pd.read_csv(players_path)

# Columns the model needs: the four player slots and the recorded winner.
required_cols = [
    "team1_player1_id","team1_player2_id",
    "team2_player1_id","team2_player2_id",
    "winning_team"
]
missing = [c for c in required_cols if c not in games.columns]
if missing:
    raise ValueError(f"games_rows.csv missing required columns: {missing}")

# The outcome coding below maps every value other than 1 to "Team 2 won".
# Reject anything that is not exactly 1 or 2 (e.g. NaN for a game without a
# recorded result) instead of silently counting it as a Team 1 loss.
invalid_winner = ~games["winning_team"].isin([1, 2])
if invalid_winner.any():
    bad_values = games.loc[invalid_winner, "winning_team"].unique().tolist()
    raise ValueError(
        f"games_rows.csv has {int(invalid_winner.sum())} row(s) whose winning_team "
        f"is not 1 or 2 (found: {bad_values}); remove or fix those games before fitting."
    )

# Outcome: 1 if Team 1 won, else 0
y = (games["winning_team"] == 1).astype(int).values

In [4]:
# ------------------------
# 2) Build the design matrix
#    +1 for players on Team 1, -1 for players on Team 2, 0 otherwise
# ------------------------
player_cols = ["team1_player1_id","team1_player2_id","team2_player1_id","team2_player2_id"]

# Fail early with a clear message instead of a cryptic min()/lookup error later.
if games.empty:
    raise ValueError("games_rows.csv contains no games; cannot build a design matrix.")
if games[player_cols].isna().any().any():
    raise ValueError(f"games_rows.csv has missing player ids in columns: {player_cols}")

# Every player slot of every game stacked into one Series (built once, used
# for both the id universe and the appearance counts).
stacked_ids = pd.concat([games[c] for c in player_cols], ignore_index=True)
all_ids = np.sort(pd.unique(stacked_ids))

# Drop a baseline player for identifiability (here: fewest appearances; tie-break by smallest id)
# NOTE(review): section 3 fits with an L2 penalty, which shrinks the kept
# coefficients toward 0, i.e. toward this baseline player's level, so the
# choice of baseline influences the fitted values.
appearances = (
    stacked_ids
      .value_counts()
      .rename_axis("player_id")
      .reset_index(name="appearances")
)
min_apps = appearances["appearances"].min()
baseline_id = int(appearances.loc[appearances["appearances"] == min_apps, "player_id"].min())

id_to_col = {pid: j for j, pid in enumerate(all_ids)}
X_full = np.zeros((len(games), len(all_ids)), dtype=float)

# Fill by row *position*, not by index label, so the matrix stays aligned with
# `games` even if its index is not 0..n-1. Slots are written in the same order
# as before (Team 1 slots first, then Team 2), so a later slot still
# overwrites an earlier one if an id is repeated within a game.
row_positions = np.arange(len(games))
slot_signs = (1.0, 1.0, -1.0, -1.0)
for slot_col, sign in zip(player_cols, slot_signs):
    # all_ids is sorted and contains every id, so searchsorted is an exact lookup.
    col_positions = np.searchsorted(all_ids, games[slot_col].to_numpy())
    X_full[row_positions, col_positions] = sign

# Remove the baseline player's column; their coefficient is fixed at 0.
keep_mask = all_ids != baseline_id
X = X_full[:, keep_mask]
kept_ids = all_ids[keep_mask]


In [5]:
# ------------------------
# 3) Fit logistic regression
#    (the L2 penalty keeps estimates finite under separation; C sets its strength)
# ------------------------
C_value = 1.0  # inverse penalty strength: bigger C = less shrinkage (e.g. 0.5, 1.0, 2.0, 5.0)

# Estimator settings gathered in one place so they are easy to scan and tweak.
logit_settings = dict(
    penalty="l2",
    C=C_value,
    fit_intercept=True,   # the intercept absorbs any advantage of being listed as Team 1
    solver="lbfgs",
    max_iter=10000,
)
lr = LogisticRegression(**logit_settings)
lr.fit(X, y)

# Binary problem: one intercept and one row of coefficients.
player_coefs_ref = lr.coef_[0]  # one beta per entry of kept_ids; the baseline is implicitly 0
intercept = float(lr.intercept_[0])

In [8]:
# ------------------------
# 4) Assemble results (names + two scales)
# ------------------------
# Reference-coded betas: fitted coefficients for the kept players, followed by
# one extra row for the baseline player, whose beta is 0 by construction.
fitted_rows = pd.DataFrame({"player_id": kept_ids, "beta_ref": player_coefs_ref})
baseline_row = pd.DataFrame({"player_id":[baseline_id], "beta_ref":[0.0]})
results_ref = pd.concat([fitted_rows, baseline_row], ignore_index=True)

# Attach player names (left join, so players without a match keep a missing name).
if not {"id","name"}.issubset(players.columns):
    raise ValueError("players_rows.csv must contain at least 'id' and 'name' columns.")
name_lookup = players[["id","name"]]
results = results_ref.merge(name_lookup, left_on="player_id", right_on="id", how="left")
results = results.drop(columns=["id"])

# Mean-zero version of the betas (nice for MaxDiff-style reporting).
mean_beta = results["beta_ref"].mean()
results["beta_centered"] = results["beta_ref"] - mean_beta

# Optional 0–100 scale: min–max rescaling, so the lowest beta maps to 0 and the
# highest to 100. If the betas do not span a positive range, every player gets 50.
mn = results["beta_centered"].min()
mx = results["beta_centered"].max()
if not (mx > mn):
    results["utility_0_100"] = 50.0
else:
    beta_span = mx - mn
    results["utility_0_100"] = 100 * (results["beta_centered"] - mn) / beta_span

# Convenience column: logistic transform of the centered beta, read as a win
# probability against an "average" player (the model intercept is not included).
def logistic(z):
    """Return the logistic function 1 / (1 + exp(-z))."""
    return 1/(1+np.exp(-z))

results["vs_avg_logit_prob"] = results["beta_centered"].map(logistic)

# Rank players from strongest to weakest, then build a rounded copy for output.
results = results.sort_values("beta_centered", ascending=False).reset_index(drop=True)
display_cols = ["player_id","name","beta_ref","beta_centered","utility_0_100","vs_avg_logit_prob"]
decimals_by_col = {
    "beta_ref": 4,
    "beta_centered": 4,
    "utility_0_100": 1,
    "vs_avg_logit_prob": 3,
}
# DataFrame.round returns a new frame; columns not listed are left untouched.
results_display = results[display_cols].round(decimals_by_col)

# Fit summary
print(f"Baseline (reference) player id: {baseline_id}")
print(f"Intercept (Team 1 advantage, log-odds): {intercept:.4f}")
print(f"Regularization: L2, C={C_value}")

# Persist the rounded table as CSV (relative path: current working directory).
out_path = "player_betas.csv"
results_display.to_csv(out_path, index=False)
print(f"\nSaved results to: {out_path}")



Baseline (reference) player id: 102
Intercept (Team 1 advantage, log-odds): 1.0319
Regularization: L2, C=1.0

Saved results to: player_betas.csv
