In [36]:
import sys
sys.path.append('../')
from proto_files.player import Player

import socceraction.spadl as spadl
import socceraction.xthreat as xthreat
import soccerdata as sd
from pathlib import PosixPath
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt

import socceraction.spadl.config as spadlconfig
from socceraction.spadl.schema import SPADLSchema
from pandera.typing import DataFrame, Series
import numpy.typing as npt
from sklearn.exceptions import NotFittedError
from typing import Callable, List, Optional, Tuple
import os
from datetime import date, timedelta
from ipywidgets import interact
import math
from oddCalculation import OddToPercentage
import json

In [37]:
# Configure the soccerdata WhoScored reader for Serie A (season 21) with on-disk caching.
# NOTE(review): data_dir, the browser path and the JSON path below are absolute,
# machine-specific paths; they must be adapted before running elsewhere.
ws = sd.WhoScored(
    leagues="ITA-Serie A",
    seasons=[21],
    no_cache=False,
    no_store=False,
    data_dir=PosixPath("/home/morten/Develop/Open-Data/soccerdata"),
    path_to_browser="/usr/bin/chromium",
    headless=True,
)
# Game ids handed to read_events(skip=...) so that these matches are not loaded.
corona_games = [1376730, 1376707, 1376732, 1376726, 1376733, 1376725, 1376729, 1376716, 1376735, 1376734, 1376721, 1376722, 1376723, 1376728, 1376724, 1376731, 1376727, 1376720, 1376718, 1376717, 1376719]
loader = ws.read_events(output_fmt='loader', skip=corona_games)
loader_season_ids = ["2122"]
# Collect the game table of every requested season and stack them into one frame.
all_games = []
for s_id in loader_season_ids:
    all_games.append(loader.games(competition_id="ITA-Serie A", season_id=s_id))

df_games = pd.concat(all_games)
# ["GER-Bundesliga", "GER-Bundesliga2", "ENG-Premier League", "ESP-La Liga", "FRA-Ligue 1", "ITA-Serie A"]
betting_data = pd.read_csv("betting-data/I1.csv", sep=',')
# Read the replacement table inside a context manager so the file handle is
# closed immediately instead of being left to the garbage collector.
with open("/home/morten/soccerdata/config/teamname_replacements.json") as replacements_file:
    name_substitutes = json.load(replacements_file)
# The JSON maps a replacement name to the list of spellings it should replace;
# every occurrence of such a spelling anywhere in betting_data is rewritten.
for replace in name_substitutes:
    for name in name_substitutes[replace]:
        betting_data.replace(name, replace, inplace=True)

In [38]:
# Inspect the distinct home-team names present in betting_data after the name substitutions.
betting_data.HomeTeam.unique()

array(['Inter', 'Verona', 'Empoli', 'Torino', 'Bologna', 'Udinese',
       'Napoli', 'Roma', 'Cagliari', 'Sampdoria', 'Atalanta', 'Lazio',
       'Fiorentina', 'Juventus', 'Genoa', 'Sassuolo', 'AC Milan',
       'Salernitana', 'Spezia', 'Venezia'], dtype=object)

In [39]:
def get_odds_by_names(home, away):
    """Return Bet365 odds and normalised outcome shares for one fixture.

    The fixture is looked up in the module-level ``betting_data`` frame by its
    ``HomeTeam`` and ``AwayTeam`` columns. If several rows match, the first one
    is used.

    Args:
        home: Value compared against the ``HomeTeam`` column.
        away: Value compared against the ``AwayTeam`` column.

    Returns:
        A list ``[home_perc, draw_perc, away_perc, home_odds, draw_odds,
        away_odds]``. The odds are the raw ``B365H`` / ``B365D`` / ``B365A``
        values; each ``*_perc`` is the second value returned by
        ``OddToPercentage.odd_to_percentage`` divided by the sum of the three
        first values returned by it.

    Raises:
        IndexError: If no row of ``betting_data`` matches the fixture.
    """
    fixture_mask = (betting_data["HomeTeam"] == home) & (betting_data["AwayTeam"] == away)
    betting_game_entry = betting_data[fixture_mask]
    if betting_game_entry.empty:
        # Same exception type as the former bare `.values[0]`, but the message
        # now names the fixture that could not be found.
        raise IndexError(f"no betting_data row for home={home!r}, away={away!r}")

    # get Bet365 Values
    home_odds = betting_game_entry["B365H"].values[0]
    draw_odds = betting_game_entry["B365D"].values[0]
    away_odds = betting_game_entry["B365A"].values[0]

    o2p = OddToPercentage()
    home_prob, home_perc = o2p.odd_to_percentage(home_odds)
    draw_prob, draw_perc = o2p.odd_to_percentage(draw_odds)
    away_prob, away_perc = o2p.odd_to_percentage(away_odds)

    # make them fair: rescale by the summed first return values of the three outcomes
    # NOTE(review): the divisor is built from the *_prob values while the *_perc
    # values are the ones being divided - confirm against OddToPercentage that
    # this yields the intended scale.
    bookie_sum = home_prob + draw_prob + away_prob
    home_perc = home_perc / bookie_sum
    draw_perc = draw_perc / bookie_sum
    away_perc = away_perc / bookie_sum

    return [home_perc, draw_perc, away_perc, home_odds, draw_odds, away_odds]


def up_team_count(count_t, df_g, g_id):
    """Register one more game for both teams of ``g_id`` in ``count_t`` (in place).

    ``count_t`` holds three tallies keyed by team id: ``'all'`` (touched for both
    teams), ``'home'`` (home team only) and ``'away'`` (away team only). A team
    seen for the first time in a tally is stored with 0; on every later
    occurrence its value is increased by one, so each value is the number of
    games counted for that team *before* the current one.

    Args:
        count_t: Dict with the keys ``'all'``, ``'home'`` and ``'away'``, each a dict.
        df_g: Frame with ``game_id``, ``home_team_id`` and ``away_team_id`` columns.
        g_id: Id of the game to register; the first matching row is used.
    """
    fixture = df_g[df_g['game_id'] == g_id]
    home_team = fixture['home_team_id'].values[0]
    away_team = fixture['away_team_id'].values[0]

    # (tally name, team id) pairs in the order they are updated
    updates = (
        ('all', home_team),
        ('all', away_team),
        ('home', home_team),
        ('away', away_team),
    )
    for tally_name, team in updates:
        tally = count_t[tally_name]
        tally[team] = tally[team] + 1 if team in tally else 0

In [40]:
# Build one feature row per match and append it to the odds test CSV.
# NOTE: the CSV is written in append mode without a header, so re-running this
# cell adds the same rows again.

# (key inside `data`, column of the player's expected_game_impact table, suffix of the output column)
PLAYER_METRICS = [
    ("xg_for", "xg", "xG"),
    ("xt_all", "xt_all", "xT_all"),
    ("xt_only_pos", "xt_only_pos", "xT_only_pos"),
    ("xd_press", "xd_press", "xD_press"),
    ("xd_normal", "xd_def", "xD_normal"),
    ("xk", "xk_save", "xK"),
    ("gi", "gi", "gi"),
    ("xg_against", "xg_against", "xg_against"),
    ("xt_against_all", "xt_against_all", "xt_all_against"),
    ("xt_against_only_pos", "xt_against_only_pos", "xt_only_pos_against"),
]

# (key for the home team, key for the away team,
#  player-table column describing the player's own team, column describing the opponent)
MATCH_CONTEXT = [
    ("table_pos_home", "table_pos_away", "team_pos", "opp_position"),
    ("ha_table_pos_home", "ha_table_pos_away", "team_pos_home_away", "opp_position_home_away"),
    ("form_home_for", "form_away_for", "team_form_for", "opp_form_for"),
    ("form_home_against", "form_away_against", "team_form_against", "opp_form_against"),
    ("ha_form_home_for", "ha_form_away_for", "team_form_home_away_for", "opp_form_home_away_for"),
    ("ha_form_home_against", "ha_form_away_against", "team_form_home_away_against", "opp_form_home_away_against"),
    ("elo_home", "elo_away", "team_elo", "opposition_elo"),
]

# Match-level keys in the exact order they are written to the CSV.
GENERAL_KEYS = [
    "game_id", "table_pos_home", "table_pos_away", "ha_table_pos_home", "ha_table_pos_away",
    "form_home_for", "form_home_against", "form_away_for", "form_away_against", "ha_form_home_for",
    "ha_form_home_against", "ha_form_away_for", "ha_form_away_against", "elo_home", "elo_away",
]


def _per90_mean(history, column):
    """Mean of ``column`` per minute played over ``history``, scaled to 90 minutes (0 if NaN)."""
    value = np.mean(history[column] / history["minutes_played"]) * 90
    return 0 if math.isnan(value) else value


df_games = df_games.sort_values("game_date")
count_teams = {"all": {}, "home": {}, "away": {}}
for game_id in tqdm(df_games['game_id']):
    df_game = df_games[df_games['game_id'] == game_id]
    home_id = df_game.home_team_id.values[0]
    away_id = df_game.away_team_id.values[0]
    # The tallies must be updated for every game, including the skipped ones.
    up_team_count(count_teams, df_games, game_id)
    # skip first ten games of season because of insufficient data
    # (a tally value is the number of games counted for the team before this one)
    if count_teams['all'][home_id] < 10 or count_teams['all'][away_id] < 10:
        continue
    # Team and player tables are only fetched for games that are actually processed.
    df_teams = loader.teams(game_id=game_id)
    df_players = loader.players(game_id=game_id)
    # Keep only the leading YYYY-MM-DD part of the timestamp.
    game_date = df_game.game_date.values[0].astype(str)[:10]
    home_score = df_game.home_score.values[0]
    away_score = df_game.away_score.values[0]
    home_team_name = df_teams[df_teams["team_id"] == home_id]['team_name'].values[0]
    away_team_name = df_teams[df_teams["team_id"] == away_id]['team_name'].values[0]
    # Hard-coded exclusion of this fixture; the reason is not recorded in the notebook.
    if home_team_name == "Torino" and away_team_name == "Fiorentina":
        continue
    bookie_h, bookie_d, bookie_a, bookie_home_odd, bookie_draw_odd, bookie_away_odd = get_odds_by_names(home_team_name, away_team_name)

    # Accumulators: per side and role the summed per-90 metrics of all players,
    # plus match-level values gathered from the player tables. The latter are
    # stored in sets so that identical values reported by several players collapse.
    data = {
        side: {role: {metric_key: 0. for metric_key, _, _ in PLAYER_METRICS} for role in ("starter", "subs")}
        for side in ("home", "away")
    }
    data["general"] = {key: set() for key in GENERAL_KEYS}

    starter_home_empty, starter_away_empty = 0, 0
    for player_id in df_players["player_id"]:
        player_row = df_players[df_players['player_id'] == player_id]
        if player_row.minutes_played.values[0] == 0:
            continue
        # Close each protobuf file right after reading it; a full run opens thousands of them.
        with open(f"./../data/data_0.3/{str(player_id)}.pb", "rb") as pb_file:
            proto_player = Player().parse(pb_file.read())
        player_df = (
            pd.DataFrame(proto_player.expected_game_impact)
            .drop_duplicates()
            .sort_values("game_date")
            .reset_index(drop=True)
        )
        game_df = player_df[player_df["game_date"] == game_date]
        played_home = game_df["home"].values[0]
        started = game_df["starter"].values[0]
        # History: the player's last five earlier games in the same role
        # (starter/sub) and at the same venue type (home/away) as in this match.
        previous_df = player_df[player_df["game_date"] < game_date]
        previous_df = previous_df.loc[(previous_df["starter"] == started) & (previous_df["home"] == played_home)]
        previous_df = previous_df[-5:]
        is_home = "home" if played_home else "away"
        is_starter = "starter" if started else "subs"
        # Count starters that have no comparable history at all.
        if previous_df.shape[0] == 0 and started:
            if played_home:
                starter_home_empty += 1
            else:
                starter_away_empty += 1

        totals = data[is_home][is_starter]
        for metric_key, source_column, _ in PLAYER_METRICS:
            totals[metric_key] += _per90_mean(previous_df, source_column)

        # Re-express the player's team/opponent values from the home/away point of view.
        general = data["general"]
        general["game_id"].add(game_df['game_id'].values[0])
        for home_key, away_key, own_column, opp_column in MATCH_CONTEXT:
            own_value = game_df[own_column].values[0]
            opp_value = game_df[opp_column].values[0]
            general[home_key].add(own_value if played_home else opp_value)
            general[away_key].add(opp_value if played_home else own_value)

    # Drop matches where more than three starters of one side have no history.
    if starter_home_empty > 3 or starter_away_empty > 3:
        continue

    # Assemble the output row; the insertion order defines the (header-less) CSV column order.
    row = {key: [list(data["general"][key])[0]] for key in GENERAL_KEYS}
    for side in ("home", "away"):
        for role, prefix in (("starter", side), ("subs", f"{side}_sub")):
            for metric_key, _, suffix in PLAYER_METRICS:
                row[f"{prefix}_{suffix}"] = [data[side][role][metric_key]]
    row.update({
        "home_score": [home_score],
        "away_score": [away_score],
        "bookie_home": [bookie_h],
        "bookie_draw": [bookie_d],
        "bookie_away": [bookie_a],
        "bookie_home_odd": [bookie_home_odd],
        "bookie_draw_odd": [bookie_draw_odd],
        "bookie_away_odd": [bookie_away_odd],
    })
    pd.DataFrame(row).to_csv("data_0.3/data_game_values_test_odds.csv", mode='a', header=False, index=False, sep=';')
    

100%|██████████| 380/380 [14:57<00:00,  2.36s/it]


In [41]:
# test 0.3
# ita_1
# fra_1