In [1]:
import re

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from src.lichess import *

In [2]:
uk = "UserId"
tk = "tournamentId"

In [3]:
tournament_sample_df = pd.read_parquet(
    dz.get_raw_data_path("tournament-sample.parquet")
)

In [4]:
comm_cols = [tk, "stamp", "termination"]
sides = ["White", "Black"]
tournament_fixtures_df = (
    pd.read_parquet(dz.get_raw_data_path("tournament-games.parquet"))
    .assign(
        stamp=lambda df: pd.to_datetime(df["UTCDate"] + " " + df["UTCTime"]),
        termination=lambda df: df["Termination"].str.replace(" ", "_").str.lower(),
    )
    .pipe(
        lambda df: pd.concat(
            df.loc[:, comm_cols].assign(
                UserId=df[side],
                OppId=df[opp],
                Elo=df[f"{side}Elo"].astype(float),
                OppElo=df[f"{opp}Elo"].astype(float),
                Result=df["Result"] if side == sides[0] else df["Result"].str[::-1],
                result=lambda df: df["Result"].pipe(
                    lambda s: np.where(
                        s == "1-0", "win", np.where(s == "0-1", "lose", "draw")
                    )
                ),
                startTime=pd.to_timedelta(df[f"{side}Start"]).dt.seconds,
            )
            for side, opp in zip(sides, sides[::-1])
        )
    )
    .reset_index(drop=True)
    .assign(
        base_points=lambda df: df["Result"].pipe(
            lambda s: np.where(s == "1-0", 2, np.where(s == "0-1", 0, 1))
        ),
        didnt_lose=lambda df: df["base_points"] != 0,
        did_berzerk=lambda df: df["startTime"]
        < tournament_sample_df["clock__limit"].reindex(df["tournamentId"]).values,
    )
)

In [5]:
def get_gdf_base(gid, gdf):
    return gdf.assign(
        berzerked_first=lambda df: df.groupby(uk)["did_berzerk"].transform("first"),
        on_streak=lambda df: df.groupby("UserId")["didnt_lose"]
        .rolling(2)
        .sum()
        .fillna(0)
        .reset_index(level=0, drop=True)
        >= 2,
        points_won=lambda df: df["base_points"] * (1 + df["on_streak"])
        + (df["didnt_lose"] * df["did_berzerk"]),
        current_points=lambda df: df.groupby("UserId")["points_won"].transform(
            "cumsum"
        ),
        current_position_based_on_points=lambda df: [
            df.iloc[:i, :]
            .drop_duplicates(uk, keep="last")["current_points"]
            .pipe(
                lambda s: np.searchsorted(
                    (-s).sort_values(), -df.iloc[i, :]["current_points"]
                )
                + 1
            )
            for i in range(df.shape[0])
        ],
        initial_position_based_on_elo=lambda df: df.groupby(uk)["Elo"]
        .first()
        .rank()
        .reindex(df[uk])
        .values,
    )

In [6]:
ranked_df = pd.concat(
    map(
        lambda gtup: get_gdf_base(gtup[0], gtup[1]),
        tqdm(tournament_fixtures_df.sort_values("stamp").groupby("tournamentId")),
    )
).pipe(lambda df: df.set_index(df.index.rename("fixture_id")))

  0%|          | 0/3683 [00:00<?, ?it/s]

In [7]:
dummy_basis = pd.get_dummies(
    ranked_df.set_index([uk, tk], append=True)[
        ["termination", "result", "did_berzerk", "didnt_lose"]
    ]
).astype(int)

In [8]:
streak_cols = ["result_win", "result_lose", "result_draw", "did_berzerk"]

In [15]:
def get_streak_df(dumk):
    _shifted = dummy_basis.groupby([tk, uk], as_index=False)[dumk].shift()
    _streak_id = (dummy_basis[dumk] != _shifted).groupby([tk, uk]).cumsum()
    _streak_len = (
        _streak_id.to_frame().assign(c=1).groupby([tk, uk, dumk])["c"].cumsum()
    )

    return (
        pd.concat([dummy_basis[dumk], _streak_len], axis=1)
        .pivot_table(index=_shifted.index.names, columns=dumk, values="c")
        .rename(columns=lambda s: f"{dumk}_{s}_streak")
        .fillna(0)
    )

In [16]:
full_streak_df = pd.concat(map(get_streak_df, streak_cols), axis=1)

In [21]:
extended_ranked_df = ranked_df.join(
    dummy_basis.assign(games=1)
    .groupby([tk, uk])
    .cumsum()
    .pipe(lambda db: db - dummy_basis.assign(games=1))
    .reset_index([uk, tk], drop=True)
    .rename(columns=lambda s: f"so_far_{s}")
).join(full_streak_df.reset_index([uk, tk], drop=True))

In [24]:
extended_ranked_df.to_parquet(dz.get_raw_data_path("ranked-games.parquet"))

In [4]:
extended_ranked_df = pd.read_parquet(dz.get_raw_data_path("ranked-games.parquet"))

In [19]:
streak_cols = [
    c for c in extended_ranked_df.columns if c.endswith("_streak") and c != "on_streak"
]

In [20]:
fixed_streaks = extended_ranked_df.pipe(
    lambda df: df.sort_values("stamp").groupby([tk, uk])[streak_cols].shift(1)
)

In [23]:
extended_ranked_df = extended_ranked_df.drop(streak_cols, axis=1).join(
    fixed_streaks.fillna(0)
)

In [5]:
extended_ranked_df.sort_values([uk, tk, "stamp"]).head(20).T.head(40)

game_id,3698829,8871459,3698352,8871004,8870979,3698116,3698053,8870776,3697744,3697577,8870121,3697091,3696918,8869469,8869210,4269642,9442306,4269420,9442104,4269162
tournamentId,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,2aBI4J3v,EEuZoktz,EEuZoktz,EEuZoktz,EEuZoktz,EEuZoktz
stamp,2023-10-01 16:00:28,2023-10-01 16:04:48,2023-10-01 16:11:52,2023-10-01 16:14:58,2023-10-01 16:15:23,2023-10-01 16:16:39,2023-10-01 16:17:56,2023-10-01 16:19:15,2023-10-01 16:24:11,2023-10-01 16:27:44,2023-10-01 16:33:08,2023-10-01 16:38:03,2023-10-01 16:41:34,2023-10-01 16:46:59,2023-10-01 16:53:37,2023-09-24 22:36:30,2023-09-24 22:40:06,2023-09-24 22:42:44,2023-09-24 22:46:24,2023-09-24 22:50:56
termination,normal,time_forfeit,normal,abandoned,normal,normal,normal,normal,normal,normal,normal,normal,normal,time_forfeit,normal,time_forfeit,normal,normal,normal,time_forfeit
UserId,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia,1Cavalodetroia
OppId,C_Tom,Yggdrasil_3,andreypereponkin,Modi_Chanyu,Strong89,syrio1685,DhrubaBarik,laruslarus,Gorky84,K_Karina,Harold77,Khatben,EJ111,Rolin11,Kevin-DT,DennisBee,CaxaRLian,Okomuchy,dodi99,CarlosMVP
Elo,1763.0,1757.0,1757.0,1756.0,1763.0,1762.0,1761.0,1766.0,1765.0,1765.0,1770.0,1764.0,1760.0,1754.0,1761.0,1774.0,1781.0,1780.0,1785.0,1784.0
OppElo,1761.0,1734.0,2203.0,1838.0,2206.0,2105.0,1717.0,2242.0,1732.0,1731.0,1731.0,1877.0,1746.0,1869.0,1656.0,1880.0,2186.0,1713.0,2317.0,1732.0
Result,0-1,2/1-2/1,0-1,1-0,0-1,0-1,1-0,0-1,1/2-1/2,1-0,0-1,0-1,0-1,1-0,0-1,1-0,0-1,1-0,0-1,1-0
result,lose,draw,lose,win,lose,lose,win,lose,draw,win,lose,lose,lose,win,lose,win,lose,win,lose,win
startTime,180.0,180.0,180.0,,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0


In [6]:
opp_cols = [
    "so_far_termination_abandoned",
    "so_far_termination_normal",
    "so_far_termination_time_forfeit",
    "so_far_result_draw",
    "so_far_result_lose",
    "so_far_result_win",
    "so_far_games",
    "result_win_0_streak",
    "result_win_1_streak",
    "result_lose_0_streak",
    "result_lose_1_streak",
    "result_draw_0_streak",
    "result_draw_1_streak",
    "did_berzerk_0_streak",
    "did_berzerk_1_streak",
    "base_points",
    "didnt_lose",
    "did_berzerk",
    "berzerked_first",
    "on_streak",
    "points_won",
    "current_points",
    "current_position_based_on_points",
    "initial_position_based_on_elo",
    "so_far_termination_rules_infraction",
    "current_rank_rate",
    "initial_rank_rate",
    "overachievement_rate",
]

In [7]:
decorated_fixture_df = (
    extended_ranked_df.merge(
        tournament_sample_df.loc[
            :, ["fullName", "startsAt", "finishesAt", "nbPlayers"]
        ],
        left_on="tournamentId",
        right_index=True,
    )
    .assign(
        stage_of_tournament=lambda df: (df["stamp"].astype(int) / 1e6 - df["startsAt"])
        / (df["finishesAt"] - df["startsAt"]),
        elo_diff=lambda df: df["Elo"] - df["OppElo"],
        elo_diff_rate=lambda df: df["elo_diff"] / df["Elo"],
        current_rank_rate=lambda df: df["current_position_based_on_points"]
        / df["nbPlayers"],
        initial_rank_rate=lambda df: df["initial_position_based_on_elo"]
        / df["nbPlayers"],
        overachievement_rate=lambda df: df["initial_rank_rate"]
        - df["current_rank_rate"],
    )
    .pipe(
        lambda df: df.merge(
            df.loc[:, opp_cols + ["stamp", uk]].rename(
                columns={k: f"opposition_{k}" for k in opp_cols} | {uk: "OppId"}
            )
        )
    )
    .drop_duplicates(subset=["stamp", uk])
)

In [8]:
decorated_fixture_df.to_parquet(dz.get_raw_data_path("decorated-fixtures.parquet"))

In [16]:
xy = ["elo_diff_rate", "stage_of_tournament"]

In [22]:
xy = ["underachievement_rate", "stage_of_tournament"]

In [27]:
decorated_fixture_df.pipe(
    lambda df: df.assign(**{k: pd.qcut(df[k].round(2).values, 5) for k in xy})
).assign(
    Win=lambda df: df["Result"] == "1-0",
    berzerkWin=lambda df: df["Win"] & df["did_berzerk"],
).groupby(
    xy[::-1]
).agg(
    count=pd.NamedAgg("UserId", "count"),
    berzerkRate=pd.NamedAgg("did_berzerk", "mean"),
    winRate=pd.NamedAgg("Win", "mean"),
    berzerkAndWin=pd.NamedAgg("berzerkWin", "mean"),
).assign(
    berzerkWinRate=lambda df: df["berzerkAndWin"] / df["berzerkRate"]
).style.background_gradient()  # .pivot_table(index=xy[0], columns=xy[0], values="didBerzerk")

Unnamed: 0_level_0,Unnamed: 1_level_0,count,berzerkRate,winRate,berzerkAndWin,berzerkWinRate
stage_of_tournament,underachievement_rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-0.001, 0.19]","(-0.871, -0.05]",73678,0.040568,0.07545,0.001819,0.044831
"(-0.001, 0.19]","(-0.05, 0.21]",227334,0.054493,0.370266,0.015598,0.286245
"(-0.001, 0.19]","(0.21, 0.44]",240641,0.08019,0.46229,0.03044,0.379593
"(-0.001, 0.19]","(0.44, 0.67]",260069,0.109779,0.511941,0.050333,0.458494
"(-0.001, 0.19]","(0.67, 0.99]",272328,0.161305,0.673016,0.112761,0.699053
"(0.19, 0.37]","(-0.871, -0.05]",174755,0.041538,0.197282,0.005516,0.132801
"(0.19, 0.37]","(-0.05, 0.21]",220676,0.062449,0.393795,0.019223,0.307815
"(0.19, 0.37]","(0.21, 0.44]",220502,0.091745,0.478921,0.037161,0.405042
"(0.19, 0.37]","(0.44, 0.67]",221719,0.135839,0.591041,0.074653,0.549572
"(0.19, 0.37]","(0.67, 0.99]",206880,0.201363,0.708561,0.142078,0.705579
