In [None]:
import pymc as pm
import numpy as np
import pandas as pd
import arviz as az
import matplotlib.pyplot as plt
import math
import aesara.tensor as at
import seaborn as sb
import matplotlib

In [None]:
# Load the training fixtures; the CSV uses ";" as its field separator.
train_data_all = pd.read_csv(
    "data_acquisition/data_0.3/data_game_values_train.csv", sep=";"
)

In [None]:
# Centering constants that are subtracted from the model inputs (training and
# test alike). Each constant is the average of the home-side and the away-side
# column mean, so that one offset can be shared by both sides.
#
# NOTE: the two column means have to be passed to np.mean as a list. Adding
# them first and calling np.mean on the resulting scalar is a no-op and yields
# the *sum* of the two means, i.e. roughly twice the intended offset.
xg_mean = np.mean(
    [np.mean(train_data_all["home_xG"]), np.mean(train_data_all["away_xG"])]
)
xg_against_mean = np.mean(
    [
        np.mean(train_data_all["home_xg_against"]),
        np.mean(train_data_all["away_xg_against"]),
    ]
)

xt_mean = np.mean(
    [np.mean(train_data_all["home_xT_all"]), np.mean(train_data_all["away_xT_all"])]
)
xt_against_mean = np.mean(
    [
        np.mean(train_data_all["home_xt_all_against"]),
        np.mean(train_data_all["away_xt_all_against"]),
    ]
)

# The form columns are divided by 5 here and wherever they are fed to the
# model, so the offsets are computed on the same scale.
form_for_mean = np.mean(
    [
        np.mean(train_data_all["ha_form_home_for"] / 5),
        np.mean(train_data_all["ha_form_away_for"] / 5),
    ]
)
form_against_mean = np.mean(
    [
        np.mean(train_data_all["ha_form_home_against"] / 5),
        np.mean(train_data_all["ha_form_away_against"] / 5),
    ]
)

In [None]:
def table_pos_to_id(tp):
    """Map a league-table position to a tier id.

    Positions up to and including 6 give 2, positions up to and including 13
    give 1, and everything else (including values that compare False against
    both bounds, such as NaN) gives 0.
    """
    # Bounds are checked from the tightest to the loosest; first match wins.
    for upper_bound, tier_id in ((6, 2), (13, 1)):
        if tp <= upper_bound:
            return tier_id
    return 0


# Label every training fixture with its matchup category: the tier of the home
# side minus the tier of the away side, shifted by 2 so the result lies in 0..4.
home_tiers = [table_pos_to_id(pos) for pos in train_data_all["table_pos_home"]]
away_tiers = [table_pos_to_id(pos) for pos in train_data_all["table_pos_away"]]
matchup_category = np.array(
    [(home - away) + 2 for home, away in zip(home_tiers, away_tiers)]
).reshape(-1, 1)
train_data_all["match_category"] = matchup_category

In [None]:
# Build one independent-Poisson goal model per matchup category (0..4).
# After the loop, models[x] is the model for category x and sizes[x] is the
# number of training fixtures that model was built on. The names
# `independent_poisson`, `train_data`, `features`, `form`, `goals` and
# `elo_diff` stay bound to the values of the last category.
models = []
sizes = []
for x in range(5):
    train_data = train_data_all.loc[train_data_all["match_category"] == x]
    sizes.append(train_data.shape[0])

    # Centred xG / xT inputs, one column per entry. The sigma matrix of
    # `coefs_features` below is laid out for exactly this column order
    # (columns 0, 1, 4, 5 drive the home goals, columns 2, 3, 6, 7 the away
    # goals), so any data passed to "pm_features" later must use it as well.
    features = np.column_stack(
        [
            train_data["home_xG"] - xg_mean,
            train_data["away_xg_against"] - xg_against_mean,
            train_data["away_xG"] - xg_mean,
            train_data["home_xg_against"] - xg_against_mean,
            train_data["home_xT_all"] - xt_mean,
            train_data["away_xt_all_against"] - xt_against_mean,
            train_data["away_xT_all"] - xt_mean,
            train_data["home_xt_all_against"] - xt_against_mean,
        ]
    )

    # Centred form inputs (raw form columns divided by 5).
    form = np.column_stack(
        [
            (train_data["ha_form_home_for"] / 5) - form_for_mean,
            (train_data["ha_form_home_against"] / 5) - form_against_mean,
            (train_data["ha_form_away_for"] / 5) - form_for_mean,
            (train_data["ha_form_away_against"] / 5) - form_against_mean,
        ]
    )

    # Observed goals: column 0 = home, column 1 = away.
    goals = np.column_stack([train_data["home_score"], train_data["away_score"]])

    # Elo difference (home minus away), each rating divided by 1000.
    elo_diff = np.column_stack(
        [(train_data["elo_home"] / 1000) - (train_data["elo_away"] / 1000)]
    )

    with pm.Model() as independent_poisson:
        # Mutable containers so that test inputs can be swapped in later
        # through pm.set_data.
        pm_features = pm.Data("pm_features", features, mutable=True)
        pm_form_diff = pm.Data("pm_form_diff", form, mutable=True)
        pm_goals = pm.Data("pm_goals", goals, mutable=True)
        pm_elo_diff = pm.Data("pm_elo_diff", elo_diff, mutable=True)

        # Row i of sigma belongs to feature column i; a sigma of 0.001 pins
        # that coefficient close to zero, so each feature effectively acts on
        # one of the two goal counts only.
        coefs_features = pm.HalfNormal(
            "coefs_features",
            sigma=[
                [1, 0.001],
                [1, 0.001],
                [0.001, 1],
                [0.001, 1],
                [1, 0.001],
                [1, 0.001],
                [0.001, 1],
                [0.001, 1],
            ],
            shape=(features.shape[1], 2),
        )

        coefs_elo_diff = pm.Normal(
            "coefs_elo_diff", mu=[0.5, -0.5], sigma=[0.2, 0.2], shape=(1, 2)
        )

        coefs_form_diff = pm.Normal("coefs_form_diff", shape=(form.shape[1], 2))

        # NOTE(review): `factor` is part of the model but is not referenced by
        # `log_lam`, which uses the fixed weights 0.1 / 0.4 / 0.5 instead.
        factor = pm.Dirichlet("factor", a=np.ones(3))
        intercepts = pm.Normal("intercepts", shape=2)

        # Log of the Poisson rate, one column per side (home, away).
        log_lam = pm.Deterministic(
            "log_lam",
            intercepts
            + 0.1 * (pm_elo_diff @ coefs_elo_diff)
            + 0.4 * (pm_form_diff @ coefs_form_diff)
            + 0.5 * (pm_features @ coefs_features),
        )

        lam = pm.math.exp(log_lam)

        obs = pm.Poisson("obs", mu=lam, observed=pm_goals)

    models.append(independent_poisson)

In [None]:
# N = train_data.shape[0]
# K = 2
# features = np.swapaxes(np.array([train_data["home_xG"], train_data["away_xg_against"],
#                                  train_data["away_xG"], train_data["home_xg_against"]]), 0, 1)
# # features_a = np.swapaxes(np.array([train_data["away_xG"], train_data["away_xT_only_pos"], train_data["home_xg_against"], train_data["home_xt_only_pos_against"]]), 0, 1)

# gi = np.swapaxes(np.array([train_data["home_gi"], train_data["away_gi"]]), 0, 1)
# # gi_a = np.swapaxes(np.array([train_data["away_gi"]]), 0, 1)

# form = np.swapaxes(np.array([(train_data["form_home"] / 15), (train_data["form_away"] / 15)]), 0, 1)

# goals = np.swapaxes(np.array([train_data["home_score"], train_data["away_score"]]), 0, 1)
# # goals_a = np.swapaxes(np.array([train_data["away_score"]]), 0, 1)

# elo = np.swapaxes(np.array([(train_data["elo_home"] / 1000) - (train_data["elo_away"] / 1000)]), 0, 1)
# # elo_a = np.swapaxes(np.array(), 0, 1)
# M = 10

# with pm.Model() as dependent_normal_1:
#     pm_features = pm.Data("pm_features", features, mutable=True)
#     pm_gi = pm.Data("pm_gi", gi, mutable=True)
#     pm_form = pm.Data("pm_form", form, mutable=True)
#     pm_goals = pm.Data("pm_goals", goals, mutable=True)
#     pm_elo = pm.Data("pm_elo", elo, mutable=True)

#     cov_diag = pm.HalfNormal('cov_diag', shape=K)
#     cov_root = pm.Normal('cov_root', shape=(M, K))
#     cov = pm.Deterministic('cov', cov_root.T @ cov_root + at.diag(cov_diag))

#     coefs_features = pm.Normal('coefs_features', mu=[[0.5, 0], [0.5, 0], [0, 0.5], [0, 0.5]],
#                                                  sigma=[[2, 0.001], [2, 0.001], [0.001, 2], [0.001, 2]], shape=(features.shape[1], 2))
#     coefs_elo = pm.Normal('coefs_elo', mu=[1,1], sigma=[0.5, 0.5], shape=(1,2))
#     coefs_form = pm.Normal('coefs_form', mu=[[1, -0.5], [-0.5, 1]], sigma=[[0.5, 0.05],[0.05, 0.5]], shape=(form.shape[1],2))

#     # factor = pm.Dirichlet("factor", a=np.ones(3))
#     # home_advantage = pm.Normal("home_advantage")
#     intercepts = pm.Normal("intercepts", shape=2)

#     log_lam = pm.MvNormal("log_lam", mu = intercepts + 0.33*(pm_elo @ coefs_elo) +
#                                                        0.33*(pm_form @ coefs_form) +
#                                                        0.33*(pm_features @ coefs_features), cov=cov, shape=(K))

#     lam = pm.math.exp(log_lam)

#     obs = pm.Poisson("obs", mu=lam, observed=pm_goals)

In [None]:
# N = train_data.shape[0]
# K = 2
# features = np.swapaxes(np.array([train_data["home_xG"], train_data["home_xT"],
#                                  train_data["away_xG"], train_data["away_xT"],
#                                  train_data["home_xD"], train_data["home_xK"],
#                                  train_data["away_xD"], train_data["away_xK"]]), 0, 1)
# counts = np.swapaxes(np.array([train_data["home_score"], train_data["away_score"]]), 0, 1)
# P_features = features.shape[1]
# M = 10

# with pm.Model() as train_model:
#     sd_dist = pm.HalfNormal.dist(shape=K)
#     chol, corr, stds = pm.LKJCholeskyCov('chol_cov', n=K, eta=2, sd_dist=sd_dist, compute_corr=True)

#     coefs = pm.Normal('coefs', shape=(P_features, K))

#     intercepts = pm.Normal('intercepts', shape=K)
#     log_lam = pm.MvNormal('log_lam', mu=intercepts + features @ coefs, chol=chol, shape=(N,K))
#     lam = pm.math.exp(log_lam)
#     obs = pm.Poisson('obs', mu=lam, observed=counts)

In [None]:
# `independent_poisson` is re-bound on every pass of the model-building loop,
# so MODEL refers to the model of the last matchup category only.
MODEL = independent_poisson
# MODEL = dependent_normal_1
# MODEL =

In [None]:
# Render the structure of the selected model as a graph.
pm.model_to_graphviz(MODEL)

In [None]:
# Fit every per-category model and keep its posterior draws; traces[i] belongs
# to models[i]. After the loop `trace` and `app` refer to the last model only.
traces = []
for model in models:
    with model:
        # trace = pm.sample(1000, tune=1000, return_inferencedata=True, discard_tuned_samples=True)
        # Variational fit with 50000 optimisation steps ...
        app = pm.fit(50000, progressbar=True)
        # ... followed by 1000 draws from the fitted approximation.
        trace = app.sample(1000)
        traces.append(trace)

In [None]:
# Summary statistics of the coefficients for the first category (traces[0]).
az.summary(
    traces[0].posterior,
    var_names=["coefs_elo_diff", "coefs_form_diff", "coefs_features", "intercepts"],
    kind="stats",
)

# Test

In [None]:
# Score the held-out fixtures one matchup category at a time, always with the
# model and the posterior draws that were fitted for that same category.
test_data_all = pd.read_csv(
    "data_acquisition/data_0.3/data_game_values_test.csv", sep=";"
).dropna()
matchup_category = [
    (table_pos_to_id(home_pos) - table_pos_to_id(away_pos)) + 2
    for home_pos, away_pos in zip(
        test_data_all["table_pos_home"], test_data_all["table_pos_away"]
    )
]
matchup_category = np.array(matchup_category).reshape(len(matchup_category), 1)
test_data_all["match_category"] = matchup_category

all_predictions, orig_score_home, orig_score_away = [], [], []
test_data_container, test_data_sizes = [], []

for x in range(5):
    test_data = test_data_all.loc[test_data_all["match_category"] == x]
    test_data_container.append(test_data)
    test_data_orig_size = test_data.shape[0]
    test_data_sizes.append(test_data_orig_size)

    # Pad the test rows with dummy rows of ones up to the number of training
    # rows of this category, so the inputs keep the row count the model was
    # built with. Consumers cut the padding off again via test_data_sizes[x].
    size_diff = sizes[x] - test_data_orig_size
    if size_diff > 0:
        test_data_fill = pd.DataFrame(
            data=np.ones((size_diff, test_data.shape[1])), columns=test_data.columns
        )
        test_data = pd.concat([test_data, test_data_fill])

    # Same column order as the training design matrix: the learned
    # `coefs_features` rows are tied to column positions, so a different order
    # would multiply inputs by coefficients fitted for another quantity.
    features = np.column_stack(
        [
            test_data["home_xG"] - xg_mean,
            test_data["away_xg_against"] - xg_against_mean,
            test_data["away_xG"] - xg_mean,
            test_data["home_xg_against"] - xg_against_mean,
            test_data["home_xT_all"] - xt_mean,
            test_data["away_xt_all_against"] - xt_against_mean,
            test_data["away_xT_all"] - xt_mean,
            test_data["home_xt_all_against"] - xt_against_mean,
        ]
    )

    form = np.column_stack(
        [
            (test_data["ha_form_home_for"] / 5) - form_for_mean,
            (test_data["ha_form_home_against"] / 5) - form_against_mean,
            (test_data["ha_form_away_for"] / 5) - form_for_mean,
            (test_data["ha_form_away_against"] / 5) - form_against_mean,
        ]
    )

    elo = np.column_stack(
        [(test_data["elo_home"] / 1000) - (test_data["elo_away"] / 1000)]
    )

    # models[x] / traces[x] are the model and posterior of THIS category.
    with models[x]:
        pm.set_data(
            {
                "pm_elo_diff": elo,
                "pm_features": features,
                "pm_form_diff": form,
            }
        )

        sample_res = pm.sample_posterior_predictive(traces[x], predictions=True)
        predictions = sample_res["predictions"]
        all_predictions.append(predictions)

    # Real scores of the unpadded fixtures of this category (whole arrays, so
    # an empty category simply contributes an empty array).
    orig_score_home.append(test_data_container[x]["home_score"].to_numpy())
    orig_score_away.append(test_data_container[x]["away_score"].to_numpy())

In [None]:
# Sanity check: shape of the predicted goals for category 0 after taking the
# first entry along the leading axis (presumably the chain axis - TODO confirm).
all_predictions[0].obs[0].shape

In [None]:
# Turn the predicted goal draws into per-fixture outcome probabilities and line
# them up with the real results and the bookmaker's favourite.
# Outcome codes: 0 = home win, 1 = draw, 2 = away win.
#
# NOTE: np.histogram closes the last bin on the right, so with these edges the
# last bin collects both 5 and 6 goals and draws above 6 are not counted.
GOAL_BIN_EDGES = [0, 1, 2, 3, 4, 5, 6]

all_crosses, all_res, all_pred_home, all_pred_away = [], [], [], []
for x in range(5):
    test_data = test_data_all.loc[test_data_all["match_category"] == x]
    n_games = test_data_sizes[x]
    actual_home = test_data[:n_games].home_score
    actual_away = test_data[:n_games].away_score
    final_scores = [(int(h), int(a)) for h, a in zip(actual_home, actual_away)]
    act_res = [f"{h}:{a}" for h, a in final_scores]

    predictions = all_predictions[x]
    # (draw, fixture, side) -> (fixture, draw, side); the slice removes the
    # padding rows that were appended to reach the training row count.
    goal_draws = np.swapaxes(np.array(predictions.obs[0].values), 0, 1)[:n_games]
    predictions_home = goal_draws[:, :, 0]
    predictions_away = goal_draws[:, :, 1]
    # Number of draws per fixture, taken from the data instead of assuming 1000.
    n_draws = goal_draws.shape[1]
    all_pred_home.append(predictions_home)
    all_pred_away.append(predictions_away)

    game_quotes, outcome_probs = [], []
    most_goals = {"home": [], "away": []}
    for game_idx in range(len(predictions_home)):
        home_hist, bin_edges = np.histogram(predictions_home[game_idx], GOAL_BIN_EDGES)
        away_hist, bin_edges = np.histogram(predictions_away[game_idx], GOAL_BIN_EDGES)

        # Combine the two marginal goal distributions as independent events
        # and sum the joint mass by outcome (i = home goals, j = away goals).
        home, draw, away = 0, 0, 0
        for i in range(len(home_hist)):
            for j in range(len(away_hist)):
                joint = (home_hist[i] / n_draws) * (away_hist[j] / n_draws)
                if j < i:
                    home += joint
                elif j == i:
                    draw += joint
                else:
                    away += joint

        probs = (round(home, 3), round(draw, 3), round(away, 3))
        outcome_probs.append(probs)
        game_quotes.append(f"{probs[0]}-{probs[1]}-{probs[2]}")
        # Most frequent goal count per side and its share of the draws in %.
        most_goals["home"].append(
            f"h: {np.argmax(home_hist)} - {round(100 * np.max(home_hist) / n_draws)}%"
        )
        most_goals["away"].append(
            f"a: {np.argmax(away_hist)} - {round(100 * np.max(away_hist) / n_draws)}%"
        )

    df_res = pd.DataFrame(
        {
            "actual": act_res,
            "predicted": game_quotes,
            "prob goals home": most_goals["home"],
            "prob goals away": most_goals["away"],
        }
    )

    # Bookmaker values per fixture, columns ordered home / draw / away so that
    # the argmax is directly an outcome code.
    bookie_values = test_data_container[x][
        ["bookie_home", "bookie_draw", "bookie_away"]
    ].to_numpy()[:n_games]
    df_cross = pd.DataFrame(
        {
            "actual": [0 if h > a else 1 if h == a else 2 for h, a in final_scores],
            "pred": [np.argmax(probs) for probs in outcome_probs],
            "pred_val": [np.max(probs) for probs in outcome_probs],
            "bookie": bookie_values.argmax(axis=1),
            "bookie_val": bookie_values.max(axis=1),
        }
    )

    all_res.append(df_res)
    all_crosses.append(df_cross)

# Stack the per-category results; the row order is category 0, 1, ..., 4.
df_cross = pd.concat(all_crosses)
df_res = pd.concat(all_res)
all_pred_home = np.concatenate(all_pred_home)
all_pred_away = np.concatenate(all_pred_away)

In [None]:
# First 25 fixtures: real score, outcome probabilities and most likely goals.
df_res.head(25)

In [None]:
# Outcome codes per fixture (0 = home win, 1 = draw, 2 = away win) for the
# real result, the model's pick and the bookmaker's pick.
df_cross

In [None]:
# Text report on the model's picks. Outcome codes in df_cross:
# 0 = home win, 1 = draw, 2 = away win.
outcome_actual = df_cross["actual"].to_numpy()
outcome_pred = df_cross["pred"].to_numpy()
n_rows_total = df_cross.shape[0]
RULE = "---------------------------------------------------------------------------------"


def _tally(mask):
    """Number of True entries of a boolean mask as a plain int."""
    return int(np.count_nonzero(mask))


def _pct(part, whole):
    """`part` as a percentage of `whole`, rounded to two decimals."""
    return round(part / whole * 100, 2)


# --- distribution of predicted vs. real outcomes ---------------------------
print("Verteilung:")
for label, code in (("Home", 0), ("Draw", 1), ("Away", 2)):
    n_predicted = _tally(outcome_pred == code)
    n_observed = _tally(outcome_actual == code)
    print(
        f"Anzahl Predicted {label}: {n_predicted} ({_pct(n_predicted, n_rows_total)}%)",
        end="",
    )
    print(f" | Verteilung Tatsächlich {label}: {_pct(n_observed, n_rows_total)}%")
print(RULE)
print(RULE)

# --- hits and misses --------------------------------------------------------
is_hit = outcome_actual == outcome_pred
is_miss = ~is_hit
right = _tally(is_hit)
wrong = _tally(is_miss)
home_right = _tally(is_hit & (outcome_actual == 0))
draw_right = _tally(is_hit & (outcome_actual == 1))
away_right = right - home_right - draw_right

# Misses grouped by what was predicted, then split by what really happened.
missed_home_pick = is_miss & (outcome_pred == 0)
missed_draw_pick = is_miss & (outcome_pred == 1)
missed_away_pick = is_miss & ~(outcome_pred == 0) & ~(outcome_pred == 1)
home_pred = {
    "act_away": _tally(missed_home_pick & (outcome_actual != 1)),
    "act_draw": _tally(missed_home_pick & (outcome_actual == 1)),
}
draw_pred = {
    "act_away": _tally(missed_draw_pick & (outcome_actual != 0)),
    "act_home": _tally(missed_draw_pick & (outcome_actual == 0)),
}
away_pred = {
    "act_home": _tally(missed_away_pick & (outcome_actual == 0)),
    "act_draw": _tally(missed_away_pick & (outcome_actual != 0)),
}

home_wrong = home_pred["act_away"] + home_pred["act_draw"]
draw_wrong = draw_pred["act_away"] + draw_pred["act_home"]
away_wrong = away_pred["act_home"] + away_pred["act_draw"]

print("Prediction:")
print(
    f"Anzahl Korrekt: {right} ({_pct(right, n_rows_total)}%), Anzahl Falsch: {wrong} ({_pct(wrong, n_rows_total)}%)"
)
print(RULE)
# The percentage per outcome is relative to the fixtures that really ended
# with that outcome.
print(
    f"Anzahl Home Korrekt: {home_right} ({_pct(home_right, _tally(outcome_actual == 0))}%), Anzahl Home Falsch: {home_wrong}"
)
print(f"Home Pred. aber Draw --> {home_pred['act_draw']}")
print(f"Home Pred. aber Away --> {home_pred['act_away']}")
print(
    f"Anzahl Draw Korrekt: {draw_right} ({_pct(draw_right, _tally(outcome_actual == 1))}%), Anzahl Draw Falsch: {draw_wrong}"
)
print(f"Draw Pred. aber Home --> {draw_pred['act_home']}")
print(f"Draw Pred. aber Away --> {draw_pred['act_away']}")
print(
    f"Anzahl Away Korrekt: {away_right} ({_pct(away_right, _tally(outcome_actual == 2))}%), Anzahl Away Falsch: {away_wrong}"
)
print(f"Away Pred. aber Home --> {away_pred['act_home']}")
print(f"Away Pred. aber Draw --> {away_pred['act_draw']}")

In [None]:
# Goal-count histogram of the predicted draws for every fixture, then summed
# over all fixtures to get one overall predicted distribution per side.
goal_bin_edges = [0, 1, 2, 3, 4, 5, 6]
home_histograms = [
    np.histogram(fixture_draws, goal_bin_edges)[0] for fixture_draws in all_pred_home
]
away_histograms = [
    np.histogram(fixture_draws, goal_bin_edges)[0] for fixture_draws in all_pred_away
]

home_cum_hist = np.sum(home_histograms, axis=0)
away_cum_hist = np.sum(away_histograms, axis=0)

In [None]:
# Mean home and away goals in the training data, for comparison with the
# predicted means.
print("observed")
print(np.mean(train_data_all.home_score))
print(np.mean(train_data_all.away_score))

In [None]:
# Mean predicted goals per side: goal count weighted by how often it was drawn
# (home first, then away).
for cum_hist in (home_cum_hist, away_cum_hist):
    goal_values = np.arange(len(cum_hist))
    print(np.dot(goal_values, cum_hist) / np.sum(cum_hist))

In [None]:
# 2x2 figure: top row = goal distribution in the training data, bottom row =
# distribution of the predicted draws; left column = home, right column = away.
his_h, _ = np.histogram(train_data_all.home_score, [0, 1, 2, 3, 4, 5, 6])
his_a, _ = np.histogram(train_data_all.away_score, [0, 1, 2, 3, 4, 5, 6])
fig, (ax1, ax2) = plt.subplots(2, 2)
fig.set_size_inches(10, 10)
fig.suptitle("Overshrinkage Poisson Model", fontsize=16)

# (axes, raw counts, panel title, bar colour), in drawing order.
panel_specs = [
    (ax1[0], his_h, "Home Observed", "lightskyblue"),
    (ax2[0], home_cum_hist, "Home Predicted", "lightcoral"),
    (ax1[1], his_a, "Away Observed", "lightskyblue"),
    (ax2[1], away_cum_hist, "Away Predicted", "lightcoral"),
]
for panel, counts, panel_title, bar_colour in panel_specs:
    # Normalise the counts so that the bars show relative frequencies.
    panel.bar(np.arange(len(counts)), counts / np.sum(counts), color=bar_colour)
    panel.set_title(panel_title)
    panel.set_xlabel("Number of Goals")
    panel.set_ylabel("Probability of observed goal count")
    # Show the y ticks as whole percentages.
    panel.get_yaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x * 100), ",") + "%")
    )
    panel.grid(axis="y")

In [None]:
# Real scores of all test fixtures, gathered category by category so that the
# order matches the rows of all_pred_home / all_pred_away.
actual_home, actual_away = [], []
for x in range(5):
    test_data = test_data_all.loc[test_data_all["match_category"] == x]
    actual_home.append(test_data[: test_data_sizes[x]].home_score)
    actual_away.append(test_data[: test_data_sizes[x]].away_score)
actual_home = np.concatenate(actual_home)
actual_away = np.concatenate(actual_away)

# All score tables are max_goals x max_goals with rows = home goals and
# columns = away goals.
max_goals = int(np.max([np.max(actual_home), np.max(actual_away)])) + 1
# One histogram bin per table row/column. Deriving the edges from max_goals
# keeps the histograms and the tables the same size for any data set.
# NOTE: np.histogram closes the last bin on the right, so the last bin also
# collects draws equal to max_goals.
goal_bin_edges = np.arange(max_goals + 1)
n_pred_games = len(all_pred_home)

# actual: relative frequency of every real score line
res_table_a = np.zeros((max_goals, max_goals))
for h, a in zip(actual_home, actual_away):
    res_table_a[int(h)][int(a)] += 1
res_table_a = res_table_a / test_data_all.shape[0]

# predicted: average joint score distribution over all fixtures
res_table_p = np.zeros((max_goals, max_goals))
# top pred result: how often each score line is the single most likely one
res_table_pr = np.zeros((max_goals, max_goals))

for game_idx in range(n_pred_games):
    home_draws = all_pred_home[game_idx]
    away_draws = all_pred_away[game_idx]
    home_hist, bin_edges = np.histogram(home_draws, goal_bin_edges)
    away_hist, bin_edges = np.histogram(away_draws, goal_bin_edges)

    res_table_pr[np.argmax(home_hist)][np.argmax(away_hist)] += 1

    # Relative frequencies use the real number of draws of this fixture; the
    # outer product combines both sides as independent events.
    res_table_p += np.outer(
        home_hist / len(home_draws), away_hist / len(away_draws)
    )

res_table_p = res_table_p / test_data_all.shape[0]
res_table_pr = res_table_pr / n_pred_games

fig, axes = plt.subplots(1, 2, figsize=(12.5, 5))
fig.suptitle("Overshrinkage Poisson Model")
sb.heatmap(ax=axes[0], data=res_table_a, annot=True, fmt=".2f")
axes[0].set_title("Observed")
sb.heatmap(ax=axes[1], data=res_table_p, annot=True, fmt=".2f")
axes[1].set_title("Predicted")

In [None]:
# Observed outcome shares. Rows of the table are home goals, columns are away
# goals, so: below the diagonal = home win, diagonal = draw, above = away win.
print(np.tril(res_table_a).sum() - np.trace(res_table_a))
print(np.trace(res_table_a))
print(np.triu(res_table_a).sum() - np.trace(res_table_a))

In [None]:
# Predicted outcome shares. Rows of the table are home goals, columns are away
# goals, so: below the diagonal = home win, diagonal = draw, above = away win.
print(np.tril(res_table_p).sum() - np.trace(res_table_p))
print(np.trace(res_table_p))
print(np.triu(res_table_p).sum() - np.trace(res_table_p))

In [None]:
# max_goals = int(np.max([np.max(actual_home), np.max(actual_away)])) + 1
# res_table = np.array([np.zeros(max_goals) for _ in range(max_goals)])

# game_quotes = []
# for game_idx in range(len(predictions_home)):
#     home_hist, bin_edges = np.histogram(predictions_home[game_idx], [0,1,2,3,4,5,6,7])
#     away_hist, bin_edges = np.histogram(predictions_away[game_idx], [0,1,2,3,4,5,6,7])
#     if home_hist.shape[0] < max_goals:
#         home_hist = np.append(home_hist, np.zeros(max_goals - home_hist.shape[0]))
#     if away_hist.shape[0] < max_goals:
#         away_hist = np.append(away_hist, np.zeros(max_goals - away_hist.shape[0]))

#     home_hist = home_hist / 1000
#     away_hist = away_hist / 1000
#     probs = home_hist.reshape(home_hist.shape[0], 1) * away_hist
#     probs = probs
#     res_table += probs


# for x in range(len(res_table)):
#     for y in range(len(res_table[x])):
#         if x == y:
#             res_table[x][y] *= 1.2
#         else:
#             res_table[x][y] *= 0.95

# res_table = res_table / test_data_orig_size
# print(sum(sum(res_table)))
# sb.heatmap(res_table, annot=True)

In [None]:
# game_quotes = []
# most_goals = {"home": [], "away": []}
# for game_idx in range(len(predictions_home)):
#     home_hist, bin_edges = np.histogram(predictions_home[game_idx], [0,1,2,3,4,5,6,7])
#     away_hist, bin_edges = np.histogram(predictions_away[game_idx], [0,1,2,3,4,5,6,7])
#     if home_hist.shape[0] < max_goals:
#         home_hist = np.append(home_hist, np.zeros(max_goals - home_hist.shape[0]))
#     if away_hist.shape[0] < max_goals:
#         away_hist = np.append(away_hist, np.zeros(max_goals - away_hist.shape[0]))
#     home_hist = home_hist / 1000
#     away_hist = away_hist / 1000
#     probs = home_hist.reshape(home_hist.shape[0], 1) * away_hist
#     for x in range(len(probs)):
#         for y in range(len(probs[x])):
#             if x == y:
#                 probs[x][y] *= 1.5
#             else:
#                 probs[x][y] *= (1-0.071428571)
#     home = np.tril(probs).sum()-np.trace(probs)
#     draw = np.trace(probs)
#     away = np.triu(probs).sum()-np.trace(probs)
#     game_quotes.append(f"{home}-{draw}-{away}")
#     most_goals["home"].append(f"h: {np.argmax(home_hist)} - {round(np.max(home_hist)/10)}%")
#     most_goals["away"].append(f"a: {np.argmax(away_hist)} - {round(np.max(away_hist)/10)}%")

# df_res = pd.DataFrame({"actual": act_res, "predicted": game_quotes, "prob goals home" : most_goals['home'], "prob goals away" : most_goals['away']})
# df_cross = pd.DataFrame({"actual": [0 if int(df_res.iloc[x]['actual'].split(':')[0]) > int(df_res.iloc[x]['actual'].split(':')[1])
#                                     else 1 if int(df_res.iloc[x]['actual'].split(':')[0]) == int(df_res.iloc[x]['actual'].split(':')[1])
#                                     else 2
#                                     for x in range(df_res.shape[0])],
#                          "pred":   [np.argmax([float(y) for y in df_res.iloc[x]['predicted'].split('-')])
#                                     for x in range(df_res.shape[0])],
#                          "pred_val": [np.max([float(y) for y in df_res.iloc[x]['predicted'].split('-')])
#                                     for x in range(df_res.shape[0])],
#                          "bookie": [np.argmax([test_data.iloc[x].bookie_home, test_data.iloc[x].bookie_draw, test_data.iloc[x].bookie_away])
#                                     for x in range(test_data_orig_size)],
#                          "bookie_val": [np.max([test_data.iloc[x].bookie_home, test_data.iloc[x].bookie_draw, test_data.iloc[x].bookie_away])
#                                     for x in range(test_data_orig_size)]
#                                     })

In [None]:
# print("Verteilung:")
# print(f"Anzahl Predicted Home: {df_cross[df_cross['pred'] == 0].shape[0]} ({round(df_cross[df_cross['pred'] == 0].shape[0] / df_cross.shape[0] * 100, 2)}%)", end = '')
# print(f" | Verteilung Tatsächlich Home: {round(df_cross[df_cross['actual'] == 0].shape[0] / df_cross.shape[0] * 100, 2)}%")
# print(f"Anzahl Predicted Draw: {df_cross[df_cross['pred'] == 1].shape[0]} ({round(df_cross[df_cross['pred'] == 1].shape[0] / df_cross.shape[0] * 100, 2)}%)", end = '')
# print(f" | Verteilung Tatsächlich Draw: {round(df_cross[df_cross['actual'] == 1].shape[0] / df_cross.shape[0] * 100, 2)}%")
# print(f"Anzahl Predicted Away: {df_cross[df_cross['pred'] == 2].shape[0]} ({round(df_cross[df_cross['pred'] == 2].shape[0] / df_cross.shape[0] * 100, 2)}%)", end = '')
# print(f" | Verteilung Tatsächlich Away: {round(df_cross[df_cross['actual'] == 2].shape[0] / df_cross.shape[0] * 100, 2)}%")
# print("---------------------------------------------------------------------------------")
# print("---------------------------------------------------------------------------------")
# right, wrong, home_right, draw_right, away_right = 0, 0, 0, 0, 0
# home_pred = {'act_away': 0, 'act_draw': 0}
# draw_pred = {'act_away': 0, 'act_home': 0}
# away_pred = {'act_home': 0, 'act_draw': 0}
# for x in range(df_cross.shape[0]):
#     if df_cross.iloc[x]["actual"] != df_cross.iloc[x]["pred"]:
#         if df_cross.iloc[x]["pred"] == 0:
#             if df_cross.iloc[x]["actual"] == 1:
#                 home_pred['act_draw'] += 1
#             else:
#                 home_pred['act_away'] += 1
#         elif df_cross.iloc[x]["pred"] == 1:
#             if df_cross.iloc[x]["actual"] == 0:
#                 draw_pred['act_home'] += 1
#             else:
#                 draw_pred['act_away'] += 1
#         else:
#             if df_cross.iloc[x]["actual"] == 0:
#                 away_pred['act_home'] += 1
#             else:
#                 away_pred['act_draw'] += 1
#         wrong += 1
#     else:
#         if df_cross.iloc[x]["actual"] == 0:
#             home_right += 1
#         elif df_cross.iloc[x]["actual"] == 1:
#             draw_right += 1
#         else:
#             away_right += 1
#         right += 1

# home_wrong = home_pred['act_away'] + home_pred['act_draw']
# draw_wrong = draw_pred['act_away'] + draw_pred['act_home']
# away_wrong = away_pred['act_home'] + away_pred['act_draw']
# print("Prediction:")
# print(f"Anzahl Korrekt: {right} ({round(right / df_cross.shape[0] * 100,2)}%), Anzahl Falsch: {wrong} ({round(wrong / df_cross.shape[0] * 100, 2)}%)")
# print("---------------------------------------------------------------------------------")
# print(f"Anzahl Home Korrekt: {home_right} ({round(home_right / df_cross[df_cross['actual'] == 0].shape[0] * 100, 2)}%), Anzahl Home Falsch: {home_wrong}")
# print(f"Home Pred. aber Draw --> {home_pred['act_draw']}")
# print(f"Home Pred. aber Away --> {home_pred['act_away']}")
# print(f"Anzahl Draw Korrekt: {draw_right} ({round(draw_right / df_cross[df_cross['actual'] == 1].shape[0] * 100, 2)}%), Anzahl Draw Falsch: {draw_wrong}")
# print(f"Draw Pred. aber Home --> {draw_pred['act_home']}")
# print(f"Draw Pred. aber Away --> {draw_pred['act_away']}")
# print(f"Anzahl Away Korrekt: {away_right} ({round(away_right / df_cross[df_cross['actual'] == 2].shape[0] * 100, 2)}%), Anzahl Away Falsch: {away_wrong}")
# print(f"Away Pred. aber Home --> {away_pred['act_home']}")
# print(f"Away Pred. aber Draw --> {away_pred['act_draw']}")

In [None]:
# Calibration check per predicted outcome (0 = home win, 1 = draw, 2 = away
# win): for all fixtures where a source picked an outcome, compare how often
# that pick was right with the average confidence attached to those picks.
# A positive gap means under-confident, a negative gap over-confident.
data_length = df_cross.shape[0]


def _pick_accuracy_and_confidence(frame, pick_col, conf_col, outcome, conf_scale=1.0):
    """Accuracy and mean confidence over the rows where `outcome` was picked.

    Parameters
    ----------
    frame : DataFrame with an "actual" column plus `pick_col` and `conf_col`.
    pick_col : column holding the picked outcome code.
    conf_col : column holding the confidence of that pick.
    outcome : outcome code to evaluate.
    conf_scale : divisor that brings `conf_col` onto the 0..1 scale.

    Returns
    -------
    (accuracy, confidence) as floats; both are NaN when `outcome` was never
    picked, because neither quantity is defined on an empty set of picks.
    """
    picked = frame.loc[frame[pick_col] == outcome]
    if picked.empty:
        return float("nan"), float("nan")
    # Both numbers are computed on the SAME rows (the picks of this outcome);
    # dividing the hits by the number of real occurrences instead would give
    # recall, which cannot be compared with the confidence of the picks.
    accuracy = float((picked["actual"] == outcome).mean())
    confidence = float((picked[conf_col] / conf_scale).mean())
    return accuracy, confidence


# Bookmaker: `bookie_val` is divided by 100 to get a 0..1 confidence.
acc_home, conf_home = _pick_accuracy_and_confidence(df_cross, "bookie", "bookie_val", 0, 100)
acc_draw, conf_draw = _pick_accuracy_and_confidence(df_cross, "bookie", "bookie_val", 1, 100)
acc_away, conf_away = _pick_accuracy_and_confidence(df_cross, "bookie", "bookie_val", 2, 100)
print(acc_home - conf_home)
print(acc_draw - conf_draw)
print(acc_away - conf_away)


# Model: `pred_val` already is a probability.
acc_home, conf_home = _pick_accuracy_and_confidence(df_cross, "pred", "pred_val", 0)
acc_draw, conf_draw = _pick_accuracy_and_confidence(df_cross, "pred", "pred_val", 1)
acc_away, conf_away = _pick_accuracy_and_confidence(df_cross, "pred", "pred_val", 2)
print(acc_home - conf_home)
print(acc_draw - conf_draw)
print(acc_away - conf_away)