In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import soccerdata as sd
from pathlib import PosixPath
import pandas as pd
import numpy as np

import os
import cProfile
from tqdm import tqdm
import database_io.dims
import database_io.faks

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from sqlalchemy import func
from sqlalchemy.orm import aliased
import plotly.express as px

In [None]:
db_path = "/home/morten/Develop/packing-report/gde/GDE.db"

engine = create_engine(f'sqlite:///{db_path}', echo=False)
session = Session(engine)

# Short alias for the ORM class queried below.
Games = database_io.dims.Games

# Single-column query: expected_game_result for rows with more than 80
# minutes, a non-negative expected result and version 0.2.
# Multiple criteria passed to one filter() call are combined with AND.
query = session.query(Games.expected_game_result).filter(
    Games.minutes > 80,
    Games.expected_game_result >= 0,
    Games.version == 0.2,
)

results = query.all()

In [None]:
np.mean(results)

In [None]:
px.histogram(pd.DataFrame(results, columns=['expected_game_result']))

Get Data. Train + Test

In [None]:
db_path = "/home/morten/Develop/packing-report/gde/GDE.db"

engine = create_engine(f'sqlite:///{db_path}', echo=False)
session = Session(engine)

# Short alias for the ORM class and the expressions shared by both subqueries.
Games = database_io.dims.Games
game_and_team = (Games.game_id, Games.team_id)
played_after_cutoff = Games.game_date > '2018-06-01'

# Step 1: count rows per (game, team, result) and rank the distinct results of
# each (game, team) pair by that count, most frequent first.
most_frequent_rank = func.row_number().over(
    partition_by=game_and_team,
    order_by=func.count().desc(),
).label('rank')

ranked_subquery = (
    session.query(Games.game_id, Games.team_id, Games.result, most_frequent_rank)
    .filter(played_after_cutoff)
    .group_by(*game_and_team, Games.result)
    .subquery()
)

# Step 2: keep only the top-ranked result per (game, team).
ranked = ranked_subquery.c
result_table = (
    session.query(ranked.game_id, ranked.team_id, ranked.result)
    .filter(ranked.rank == 1)
    .subquery()
)

# Step 3: average elo per (game, team).
# NOTE(review): game_date and home are selected without being grouped or
# aggregated; this relies on the database accepting such "bare" columns
# (the engine above is SQLite) -- confirm they are constant within a group.
elo_table = (
    session.query(
        func.avg(Games.elo).label('avg_elo'),
        Games.team_id,
        Games.game_id,
        Games.game_date,
        Games.home,
    )
    .filter(played_after_cutoff)
    .group_by(*game_and_team)
    .subquery()
)

# Step 4: join result and elo on (game_id, team_id).
same_game_and_team = (
    (result_table.c.game_id == elo_table.c.game_id)
    & (result_table.c.team_id == elo_table.c.team_id)
)

query = session.query(
    result_table.c.game_id,
    result_table.c.team_id,
    result_table.c.result,
    elo_table.c.avg_elo,
    elo_table.c.game_date,
    elo_table.c.home,
).join(elo_table, same_game_and_team)

results = query.all()

In [None]:
# Column labels in the same order as the columns selected by the query.
result_columns = ["game_id", "team_id", "result", "avg_elo", "date", "home"]

# NOTE(review): np.array() over mixed-type rows coerces every value to one
# common dtype; later cells compare `home` against the string "1" and re-parse
# numbers with pd.to_numeric, so they presumably rely on that -- confirm before
# changing how the frame is built.
data = pd.DataFrame(np.array(results), columns=result_columns)

In [None]:
data

In [None]:
# Chronological split on the "date" column: everything up to and including the
# split date is training data, everything after it is test data. Both
# comparisons are spelled out so rows with a missing date land in neither set.
split_date = "2021-07-01"
on_or_before_split = data["date"] <= split_date
after_split = data["date"] > split_date

train_data = data.loc[on_or_before_split].copy()
test_data = data.loc[after_split].copy()

In [None]:
def _pair_with_opponent(team_rows):
    """Self-join per-team rows on game_id and drop the pairs of a team with itself.

    Columns other than ``game_id`` appear twice in the output, suffixed ``_x``
    (left side) and ``_y`` (right side) by the merge.
    """
    paired = team_rows.merge(team_rows, how="outer", on="game_id")
    is_opponent = paired["team_id_x"] != paired["team_id_y"]
    return paired.loc[is_opponent]


train_data = _pair_with_opponent(train_data)
test_data = _pair_with_opponent(test_data)

In [None]:
# Keep only the rows whose left-hand ("_x") team has home == "1", so each game
# is represented once. The flag is compared as a string here.
train_is_home = train_data["home_x"].eq("1")
test_is_home = test_data["home_x"].eq("1")

train_data = train_data.loc[train_is_home]
test_data = test_data.loc[test_is_home]

MOV Regressor analysis

Create Model

In [None]:
def _leading_score(result):
    """Return the integer before the first "-" of a score string such as "2-1"."""
    first_part = result.split("-")[0]
    return int(first_part)


# NOTE(review): both columns read the FIRST number of their own result string.
# That is only the away side's goal count if result_y is written from the
# "_y" team's perspective -- confirm the format stored in Games.result.
train_data["goals_x"] = train_data["result_x"].map(_leading_score)
train_data["goals_y"] = train_data["result_y"].map(_leading_score)

In [None]:
# Convert every column that parses cleanly as numbers; leave the others
# (e.g. score strings, dates) untouched.
# `pd.to_numeric(..., errors='ignore')` is deprecated, so the "ignore"
# behaviour is spelled out with an explicit try/except instead. Working on a
# copy also avoids writing into a filtered slice of an earlier frame.
train_data = train_data.copy()
for column_name in train_data.columns:
    try:
        train_data[column_name] = pd.to_numeric(train_data[column_name])
    except (ValueError, TypeError):
        # Not numeric: keep the column exactly as it was.
        continue

In [None]:
train_data

In [None]:
import pymc as pm

with pm.Model() as game_model:
    # Normal(0, 10) priors on the home intercept and the two elo slopes.
    home_advantage = pm.Normal("home_advantage", mu=0, sigma=10)
    home_elo_diff = pm.Normal("home_elo", mu=0, sigma=10)
    away_elo_diff = pm.Normal("away_elo", mu=0, sigma=10)

    # Elo gap seen from each side ("_x" rows are the home side after filtering).
    elo_gap_home = train_data["avg_elo_x"] - train_data["avg_elo_y"]
    elo_gap_away = train_data["avg_elo_y"] - train_data["avg_elo_x"]

    # Log-rates of the two Poisson goal counts; only the home side gets an intercept.
    theta_h = home_advantage + home_elo_diff * elo_gap_home
    theta_a = away_elo_diff * elo_gap_away

    # Observed goal counts for both sides.
    goals_h = pm.Poisson("goals_h", mu=pm.math.exp(theta_h), observed=train_data["goals_x"])
    goals_a = pm.Poisson("goals_a", mu=pm.math.exp(theta_a), observed=train_data["goals_y"])

    # # observed
    # goal_diff = goals_h - goals_a

In [None]:
# Draw posterior samples with 2000 tuning steps per chain.
# A fixed seed makes the trace reproducible under Restart Kernel -> Run All;
# without it every run yields different posterior draws and downstream scores.
with game_model:
    trace = pm.sample(tune=2000, random_seed=42)

Predict Test Data

In [None]:
test_data

In [None]:
# Convert every column that parses cleanly as numbers; leave the others
# (e.g. score strings, dates) untouched.
# `pd.to_numeric(..., errors='ignore')` is deprecated, so the "ignore"
# behaviour is spelled out with an explicit try/except instead. Working on a
# copy also avoids writing into a filtered slice of an earlier frame.
test_data = test_data.copy()
for column_name in test_data.columns:
    try:
        test_data[column_name] = pd.to_numeric(test_data[column_name])
    except (ValueError, TypeError):
        # Not numeric: keep the column exactly as it was.
        continue

In [None]:
# Seeded generator so the simulated probabilities are reproducible.
SIMULATION_SEED = 42
outcome_rng = np.random.default_rng(SIMULATION_SEED)

# Posterior draws of the first chain (index 0 on the leading axis), pulled out
# once as plain arrays instead of being re-indexed for every game.
home_advantage_draws = np.asarray(trace.posterior.home_advantage[0])
home_elo_draws = np.asarray(trace.posterior.home_elo[0])
away_elo_draws = np.asarray(trace.posterior.away_elo[0])


def simulate_outcome_probabilities(elo_home, elo_away):
    """Simulate one game per posterior draw and return (P(home win), P(draw), P(away win)).

    All three probabilities come from the SAME simulated goal differences, so
    they always sum to 1. Shares are computed with ``mean`` so they stay
    correct for any number of posterior draws (no hard-coded divisor).
    """
    home_rate = np.exp(home_advantage_draws + home_elo_draws * (elo_home - elo_away))
    away_rate = np.exp(away_elo_draws * (elo_away - elo_home))
    goal_diff = outcome_rng.poisson(home_rate) - outcome_rng.poisson(away_rate)
    return np.mean(goal_diff > 0), np.mean(goal_diff == 0), np.mean(goal_diff < 0)


# One simulation per game; reshape keeps the column slicing valid for an empty frame.
outcome_probs = np.array(
    [
        simulate_outcome_probabilities(elo_home, elo_away)
        for elo_home, elo_away in zip(test_data["avg_elo_x"], test_data["avg_elo_y"])
    ],
    dtype=float,
).reshape(-1, 3)

test_data["win_x"] = outcome_probs[:, 0]
test_data["draw"] = outcome_probs[:, 1]
test_data["win_y"] = outcome_probs[:, 2]

In [None]:
# Sanity check: total probability mass assigned to the three outcomes per game.
total_probability = test_data["win_x"].add(test_data["draw"]).add(test_data["win_y"])
test_data["sum"] = total_probability

In [None]:
test_data

In [None]:
def brier_multi(targets, probs):
    """Multi-class Brier score.

    Squares the element-wise difference between ``probs`` and ``targets``,
    sums the squares across axis 1 (one total per row) and returns the mean
    of those row totals.
    """
    squared_error = (probs - targets) ** 2
    per_row_score = np.sum(squared_error, axis=1)
    return np.mean(per_row_score)

# One-hot outcome per test game, read from the "first-second" score string in
# result_x: [1, 0, 0] when the first number is larger, [0, 1, 0] when the two
# are equal, [0, 0, 1] otherwise.
labels = []
for score in test_data.result_x:
    score_parts = score.split("-")
    first_score = int(score_parts[0])
    second_score = int(score_parts[1])
    if first_score > second_score:
        labels.append([1, 0, 0])
    elif first_score == second_score:
        labels.append([0, 1, 0])
    else:
        labels.append([0, 0, 1])

# Predicted probabilities in the same column order as the labels.
probs = [list(row) for row in zip(test_data.win_x, test_data.draw, test_data.win_y)]

In [None]:
brier_multi(np.array(labels), np.array(probs))