In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
import itertools

In [None]:
# Training split of the game-value export; the file is semicolon-delimited.
train_csv_path = "data_acquisition/data_0.3/data_game_values_train.csv"
train_data = pd.read_csv(train_csv_path, sep=";")

In [None]:
# Test split of the same export; this file also carries the bookie_* columns
# that are selected further down.
test_csv_path = "data_acquisition/data_0.3/data_game_values_test_odds.csv"
test_data = pd.read_csv(test_csv_path, sep=";")

In [None]:
# Mean of the home_score column over the training rows.
train_data["home_score"].mean()

In [None]:
# Mean of the away_score column over the training rows.
train_data["away_score"].mean()

In [None]:
# Add the signed Elo gap from each side's point of view to both splits.
for frame in (train_data, test_data):
    frame["elo_diff_home"] = frame["elo_home"] - frame["elo_away"]
    frame["elo_diff_away"] = frame["elo_away"] - frame["elo_home"]

In [None]:
import seaborn as sns

# Tick label -> column for three home-side features. When the notebook is run
# top to bottom these are still the raw values (standardisation happens later).
boxplot_dict = {
    "ELO Diff Home": train_data["elo_diff_home"],
    "Home xG": train_data["home_xG"],
    "Form Home": train_data["ha_form_home_for"],
}
fig = plt.figure(figsize=(4, 5))
# Boxes are drawn from the dict's values so they share one ordering with the
# tick labels taken from its keys.
bplot = sns.boxplot(list(boxplot_dict.values()), color="lightgreen")
bplot.set_xticklabels(boxplot_dict.keys())

In [None]:
# Same plot without the Elo difference: only home xG and home form.
boxplot_dict = {
    "Home xG": train_data["home_xG"],
    "Form Home For": train_data["ha_form_home_for"],
}
fig = plt.figure(figsize=(4, 5))
# Values give the boxes, keys give the matching tick labels.
bplot = sns.boxplot(list(boxplot_dict.values()), color="lightgreen")
bplot.set_xticklabels(boxplot_dict.keys())

In [None]:
# Feature columns of the analysis. Every name listed here is standardised with
# the training mean/std in the loop over `features`, and selected into
# train_data_removed / test_data_removed.
features = [
    "home_xG",
    "away_xG",
    "home_xg_against",
    "away_xg_against",
    "home_xT_all",
    "away_xT_all",
    "home_xt_all_against",
    "away_xt_all_against",
    "ha_form_home_for",
    "ha_form_away_for",
    "ha_form_home_against",
    "ha_form_away_against",
    "elo_diff_home",
    "elo_diff_away",
]

In [None]:
# Z-score every feature column. Centre and scale are taken from the training
# split only and then applied unchanged to the test split.
for column in features:
    center, scale = train_data[column].mean(), train_data[column].std()
    for frame in (train_data, test_data):
        frame[column] = (frame[column] - center) / scale

In [None]:
# Keep only the feature columns plus identifiers/targets; the test split
# additionally keeps the bookmaker columns.
id_and_score_columns = ["game_id", "home_score", "away_score"]
bookmaker_columns = [
    "bookie_home",
    "bookie_draw",
    "bookie_away",
    "bookie_home_odd",
    "bookie_draw_odd",
    "bookie_away_odd",
]

train_data_removed = train_data[features + id_and_score_columns]
test_data_removed = test_data[features + id_and_score_columns + bookmaker_columns]

In [None]:
# Summary statistics of the selected test columns (features were scaled with
# the training mean/std, so they need not be exactly zero-mean/unit-variance).
test_data_removed.describe()

In [None]:
# Bin edges 0..9 give nine bins. np.histogram closes the last bin on both
# sides, so scores of 8 and 9 share the final bar.
goal_bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
share_axis_max = 0.4


def plot_goal_distribution(ax, scores, title, ylabel):
    """Draw the relative frequency of each goal count on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    scores : pandas Series of goal counts (one entry per row of the frame).
    title, ylabel : text for the panel title and the y-axis label.

    A red vertical line marks the mean of ``scores``.
    """
    counts, _ = np.histogram(scores, goal_bin_edges)
    ax.bar(np.arange(len(counts)), counts / np.sum(counts), color="lightskyblue")
    ax.set_title(title)
    ax.set_xlabel("Number of Goals")
    ax.set_ylabel(ylabel)
    # PercentFormatter rounds the tick value; the previous int(x * 100) lambda
    # truncated, so a tick like 0.29 (28.999... after scaling) showed as 28%.
    ax.yaxis.set_major_formatter(
        matplotlib.ticker.PercentFormatter(xmax=1.0, decimals=0)
    )
    ax.set_ylim([0, share_axis_max])
    ax.grid(axis="y")
    ax.vlines(x=scores.mean(), ymin=0, ymax=share_axis_max, colors="r")


fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(10, 5)
plot_goal_distribution(
    ax1, train_data_removed.home_score, "Home Observed", "Observed home goal count"
)
plot_goal_distribution(
    ax2, train_data_removed.away_score, "Away Observed", "Observed away goal count"
)

In [None]:
# Quick default-binned histogram of the away_score column.
train_data_removed.away_score.hist()

In [None]:
# Summary statistics of the selected training columns after standardisation.
train_data_removed.describe()

In [None]:
# Same three home-side features as in the first boxplot, now read from the
# standardised training frame.
boxplot_dict = {
    "Elo Diff Home": train_data_removed["elo_diff_home"],
    "Home xG": train_data_removed["home_xG"],
    "Form Home": train_data_removed["ha_form_home_for"],
}
# Values give the boxes, keys give the matching tick labels.
bplot = sns.boxplot(list(boxplot_dict.values()), color="lightgreen")
bplot.set_xticklabels(boxplot_dict.keys())

In [None]:
# Correlation input: every feature column except "elo_diff_away", which is
# just the negation of "elo_diff_home". Column order follows `features`.
data_corr = train_data_removed[features].drop(columns="elo_diff_away")

In [None]:
import seaborn as sns

In [None]:
# Absolute pairwise correlations (pandas default method) of the data_corr columns.
corr_abs = data_corr.corr().abs()
# Hide the strict upper triangle; the matrix is symmetric, so only the lower
# triangle and the diagonal are drawn.
mask = np.triu(np.ones_like(corr_abs, dtype=bool), k=1)

# One display label per data_corr column, in column order, shared by both axes.
# Raw strings keep the LaTeX backslashes intact: "\o" in a normal string is an
# invalid escape sequence that newer Python versions warn about.
tick_labels = [
    "Home xG",
    "Away xG",
    r"Home $\overline{xG}$",
    r"Away $\overline{xG}$",
    "Home xT",
    "Away xT",
    r"Home $\overline{xT}$",
    r"Away $\overline{xT}$",
    "Form Home",
    "Form Away",
    r"$\overline{Form Home}$",
    r"$\overline{Form Away}$",
    "ELO Diff",
]

plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(
    corr_abs, mask=mask, vmin=0, vmax=1, annot=True, cmap="crest"
)
heatmap.set_title("Correlation Heatmap", fontdict={"fontsize": 18}, pad=16)
heatmap.set_xticklabels(tick_labels)
heatmap.set_yticklabels(tick_labels)

In [None]:
# training_standardized = all_data_removed[all_data_removed["game_id"].isin(train_data["game_id"])]
# test_standardized = all_data_removed[all_data_removed["game_id"].isin(test_data["game_id"])]

In [None]:
# train_data_removed.to_csv("data_acquisition/data_0.3/training_standardized.csv", sep=";", index=False)
# test_data_removed.to_csv("data_acquisition/data_0.3/test_standardized.csv", sep=";", index=False)

In [None]:
import math

# Number of ways to pick 6 or 7 items out of n = 16: C(16, 6) + C(16, 7).
# math.comb is exact integer arithmetic, so the total is the int 19448 rather
# than the float 19448.0 produced by dividing factorials.
n = 16
fact = sum(math.comb(n, m) for m in (6, 7))

print(fact)

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# NOTE(review): `explore_data` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless it is created elsewhere.
# Scale home_xG by its standard deviation (the mean is not subtracted).
explore_data["home_xG_norm2"] = explore_data["home_xG"] / explore_data["home_xG"].std()

fig, ax = plt.subplots()
# density=True makes the bars integrate to 1, so they are on the same scale as
# the normal PDF drawn on top; per-bin proportions are not comparable to a PDF.
hist, bins = np.histogram(explore_data["home_xG_norm2"], bins=10, density=True)
width = 0.95 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
ax.bar(center, hist, align="center", width=width)

# Standard normal reference curve over +/- 3 sigma.
mu = 0
variance = 1
sigma = math.sqrt(variance)
x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
ax.plot(x, stats.norm.pdf(x, mu, sigma), color="black")

# plt.show() instead of fig.show(): the latter warns on non-interactive
# (inline) notebook backends.
plt.show()

In [None]:
import itertools

# Grid of candidate values 0.0, 0.1, ... (step 0.1, below 3) for both operands.
error1 = np.arange(0, 3, 0.1)
error2 = np.arange(0, 3, 0.1)

# Every pairwise difference first - second, in itertools.product order,
# rounded to one decimal to remove floating-point noise.
raw_differences = [
    first - second for first, second in itertools.product(error1, error2)
]
errors = np.array(np.round(raw_differences, 1))

In [None]:
# Distinct rounded differences present in `errors`, sorted ascending.
np.unique(errors)

In [None]:
# int() drops the fractional part of each difference (truncation toward zero).
int_rounding = [
    int(first - second) for first, second in itertools.product(error1, error2)
]

In [None]:
# np.rint rounds each difference to the nearest integer value (kept as float).
bankers_rounding = [
    np.rint(first - second) for first, second in itertools.product(error1, error2)
]

In [None]:
# Custom rule: when the two values are at least 0.5 apart, truncate each one
# separately and take the difference; otherwise treat them as equal.
my_rounding = []
for first, second in itertools.product(error1, error2):
    if abs(first - second) >= 0.5:
        rounded_difference = int(first) - int(second)
    else:
        # Both sides collapse to int(max(first, second)), so the difference is 0.
        rounded_difference = 0
    my_rounding.append(rounded_difference)

In [None]:
# One panel per rounding scheme, each plotted against the same rounded
# differences. Titles and axis labels make the panels distinguishable.
rounding_results = {
    "int() truncation": int_rounding,
    "np.rint": bankers_rounding,
    "custom rule (my_rounding)": my_rounding,
}

fig = plt.figure(figsize=(20, 5))
ax = fig.subplots(1, 3)
for panel, (label, rounded) in zip(ax, rounding_results.items()):
    panel.scatter(errors, rounded)
    panel.set_title(label)
    panel.set_xlabel("difference (rounded to 1 decimal)")
    panel.set_ylabel("integer result")
    panel.grid()

In [None]:
# ReLU, max(0, x), sampled at 1000 points on [-10, 10].
x = np.linspace(-10, 10, 1000)
y = np.maximum(0, x)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x, y)
ax.legend(["Relu"])
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
# Thin black lines through the origin serve as the coordinate axes.
ax.axhline(0, color="black", linewidth=0.5)
ax.axvline(0, color="black", linewidth=0.5)
plt.show()