In [None]:
import soccerdata as sd
from pathlib import PosixPath
from gde_utils.date_utils import to_datetime
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

In [None]:
# load all data
# Scraper options for the "GER-Bundesliga2" league, seasons 15 through 20.
# NOTE(review): data_dir and path_to_browser are absolute, machine-specific paths.
whoscored_options = {
    "leagues": "GER-Bundesliga2",
    "seasons": [15, 16, 17, 18, 19, 20],
    "no_cache": False,
    "no_store": False,
    "data_dir": PosixPath("/home/morten/Develop/Open-Data/soccerdata"),
    "path_to_browser": "/usr/bin/chromium",
    "headless": False,
}
ws = sd.WhoScored(**whoscored_options)

In [None]:
# Schedule with its index moved into ordinary columns, so later cells can
# filter and read it by column name.
buli_schedule = ws.read_schedule().reset_index()
# Reader used later to look up Elo ratings by date.
elo = sd.ClubElo()

In [None]:
def is_own_goal(qualifiers):
    """Flag, per event, whether any of its qualifiers is an ``OwnGoal``.

    Parameters
    ----------
    qualifiers : iterable
        One entry per event; each entry is an iterable of qualifier mappings
        that expose ``["type"]["displayName"]``.

    Returns
    -------
    list of bool
        ``True`` where at least one qualifier is named ``"OwnGoal"``. An event
        without any qualifiers yields ``False``.
    """
    flags = []
    for event_qualifiers in qualifiers:
        # Build the full list first so every qualifier is inspected.
        matches = [
            qualifier["type"]["displayName"] == "OwnGoal"
            for qualifier in event_qualifiers
        ]
        flags.append(any(matches))
    return flags

def get_opposition_team(df_goals: pd.DataFrame, df_teams: pd.DataFrame):
    """Return a copy of ``df_goals`` in which the two team ids are swapped.

    Parameters
    ----------
    df_goals : pandas object
        Values containing team ids (the caller in this notebook passes the
        ``team_id`` column of the goal events). It is not modified.
    df_teams : pd.DataFrame
        Its ``team_id`` column provides the ids; the first two distinct values
        are the pair that gets exchanged.

    Returns
    -------
    Same type as ``df_goals``
        A new object where every occurrence of the first id is replaced by the
        second and vice versa.
    """
    team_ids = df_teams["team_id"].unique()
    first_id = team_ids[0]
    second_id = team_ids[1]
    swap = {first_id: second_id, second_id: first_id}
    return df_goals.replace(swap)

def get_score(events_df: pd.DataFrame, df_teams: pd.DataFrame):
    """List every goal of a match together with the team it counts for.

    Parameters
    ----------
    events_df : pd.DataFrame
        Match events; the columns ``is_goal``, ``qualifiers``, ``team_id`` and
        ``expanded_minute`` are read.
    df_teams : pd.DataFrame
        Teams of the match; its ``team_id`` column supplies the two ids that
        are exchanged for own goals.

    Returns
    -------
    pd.DataFrame
        One row per goal with the columns ``expanded_minute`` and
        ``goal_team_id``, indexed 0..n-1. For a match without goals the frame
        is empty but still has both columns.
    """
    goals = events_df.loc[events_df["is_goal"] == True].copy()
    # Build the mask with an explicit bool dtype: for a goalless match the
    # helper returns an empty list, which on its own does not produce a
    # boolean column and therefore cannot be negated or used as a mask.
    own_goal = pd.Series(
        is_own_goal(goals["qualifiers"]), index=goals.index, dtype=bool
    )
    # An own goal counts for the other team, any other goal for the team of
    # the event itself.
    opposition_ids = get_opposition_team(goals["team_id"], df_teams)
    goals["goal_team_id"] = goals["team_id"].where(~own_goal, opposition_ids)
    return goals[["expanded_minute", "goal_team_id"]].reset_index(drop=True)

In [None]:
data = []
failed_count = 0
# (game_id, error) pairs of skipped matches, kept so failures can be inspected
failed_games = []
# create dataset: one row per match with team ids, goals scored and Elo values
for game_id in buli_schedule.game_id.values:
    try:
        # pre match information:
        match_rows = buli_schedule[buli_schedule["game_id"] == game_id]
        # Elo values are read for the day before the match date
        # (presumably so the rating does not yet include this match - confirm).
        pre_game_date = to_datetime(match_rows["date"].values[0]) - timedelta(days=1)
        game_elo = elo.read_by_date(pre_game_date).reset_index()
        home_team = match_rows.home_team.values[0]
        away_team = match_rows.away_team.values[0]
        home_elo = game_elo[game_elo["team"] == home_team].elo.values[0]
        away_elo = game_elo[game_elo["team"] == away_team].elo.values[0]

        # match events and the team table of this match
        df_events = ws.read_events([game_id]).reset_index()
        loader = ws.read_events(match_id=[game_id], output_fmt='loader')
        df_teams = loader.teams(game_id)

        # goals per team id; a team without goals has no row here
        game_score = get_score(df_events, df_teams)
        score_readable = (
            game_score.groupby("goal_team_id")
            .count()
            .reset_index()
            .rename(columns={"expanded_minute": "score"})
        )
        home_id = df_teams[df_teams["team_name"] == home_team].team_id.values[0]
        away_id = df_teams[df_teams["team_name"] == away_team].team_id.values[0]
        score_value_home = score_readable[score_readable["goal_team_id"] == home_id].score.values
        score_value_away = score_readable[score_readable["goal_team_id"] == away_id].score.values
        home_score = score_value_home[0] if len(score_value_home) > 0 else 0
        away_score = score_value_away[0] if len(score_value_away) > 0 else 0
        data.append([home_id, home_score, home_elo, away_id, away_score, away_elo])
    except Exception as exc:
        # Best effort: skip matches that cannot be processed, but keep the
        # reason. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop the loop.
        failed_count += 1
        failed_games.append((game_id, repr(exc)))
print("Failed: ", failed_count)

In [None]:
# Number of matches the dataset loop skipped because processing raised an error.
print(failed_count)

In [None]:
# One row per collected match; the labels follow the order in which the loop
# appended the values.
column_names = ["home_id", "home_score", "home_elo", "away_id", "away_score", "away_elo"]
elo_dif = pd.DataFrame(data, columns=column_names)

In [None]:
# Display the assembled match table for inspection.
elo_dif

In [None]:
# get elo and margin of victory
# Both derived columns are "home minus away" differences.
difference_columns = {
    "elo_dif": ("home_elo", "away_elo"),
    "goal_dif": ("home_score", "away_score"),
}
for target_col, (home_col, away_col) in difference_columns.items():
    elo_dif[target_col] = elo_dif[home_col] - elo_dif[away_col]

In [None]:
# Pairwise correlations between all columns of the match table
# (the team id columns are included as well).
elo_dif.corr()

In [None]:
# create regression model
from sklearn.linear_model import LinearRegression

# Feature (Elo difference) and target (goal difference), each reshaped to a
# single column.
elo_gap = elo_dif["elo_dif"].values.reshape(-1, 1)
goal_gap = elo_dif["goal_dif"].values.reshape(-1, 1)

lr = LinearRegression()
lr.fit(elo_gap, goal_gap)

In [None]:
# Predicted goal difference (home minus away) for an Elo difference of 0,
# i.e. for two teams with equal Elo values.
lr.predict([[0]])

In [None]:
# Intercept of the fitted line: the predicted goal difference at an Elo difference of 0.
lr.intercept_

In [None]:
# Slope of the fitted line: change in predicted goal difference per unit of Elo difference.
lr.coef_