In [None]:
import subprocess
import json
import pandas as pd
import os
import numpy as np

# Show up to 1000 columns and 1000 rows when a DataFrame is displayed.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

# Stats Perform API + BigQuery backloading

In [None]:
import importlib
import data
importlib.reload(data)
from data import DataLoader

# NOTE(review): presumably the provider's identifier for the 2024/25 Premier
# League season — confirm against the DataLoader documentation.
prem2425 = '9n12waklv005j8r32sfjj2eqc'

# Shared loader; the query helpers in this notebook send their SQL through
# prem_loader.execute_bq_query.
prem_loader = DataLoader(prem2425)

# feature engineering

## data loading

In [None]:
def get_all_team_data():
    """Run an unfiltered ``select *`` on ``team_match_data`` via ``prem_loader``.

    Returns:
        Whatever ``prem_loader.execute_bq_query`` returns for the query
        (presumably a DataFrame — confirm against DataLoader).
    """
    # Plain string: there is nothing to interpolate. The literal's whitespace is
    # kept exactly as before so the submitted SQL text does not change.
    all_rows_sql = """
                select *
                from prizepicksanalytics.soccer_simulations.team_match_data
                """
    return prem_loader.execute_bq_query(all_rows_sql)

def get_all_player_data():
    """Run an unfiltered ``select *`` on ``player_match_data`` via ``prem_loader``.

    Returns:
        Whatever ``prem_loader.execute_bq_query`` returns for the query
        (presumably a DataFrame — confirm against DataLoader).
    """
    # Plain string: there is nothing to interpolate. The literal's whitespace is
    # kept exactly as before so the submitted SQL text does not change.
    all_rows_sql = """
                select *
                from prizepicksanalytics.soccer_simulations.player_match_data
                """
    return prem_loader.execute_bq_query(all_rows_sql)

def get_historic_team_data(match):
    """Query each side's ``team_match_data`` rows dated before ``match``.

    Args:
        match: object exposing ``home_id``, ``away_id`` and ``localDate``
            attributes; the values are interpolated directly into the SQL text.

    Returns:
        Tuple ``(home_result, away_result)`` of the two
        ``prem_loader.execute_bq_query`` results, home query executed first.
    """
    def _history_sql(team_id):
        # NOTE(review): values are spliced into the SQL as-is; fine for trusted
        # calendar rows, not for untrusted input.
        return f"""
                select *
                from prizepicksanalytics.soccer_simulations.team_match_data
                where team_id = '{team_id}'
                and date(REPLACE(match_date, 'Z', '')) < date('{match.localDate}')
                """

    # Build both statements before running either, as the original did.
    home_sql, away_sql = _history_sql(match.home_id), _history_sql(match.away_id)
    return prem_loader.execute_bq_query(home_sql), prem_loader.execute_bq_query(away_sql)


# def get_historic_player_data(match,
#                              list_of_player_ids):
        
#         player_query = f"""
#                 select *
#                 from prizepicksanalytics.soccer_simulations.player_match_data
#                 where playerId in ({", ".join(["'" + x + "'" for x in list_of_player_ids])})
#                 and date(REPLACE(match_date, 'Z', '')) < date('{match.localDate}')
#                 """


#         return prem_loader.execute_bq_query(player_query)

        # return player_query

def get_squads(match):
    """Query the ``squad_data`` rows for the home and away contestants of ``match``.

    Args:
        match: object exposing ``home_id`` and ``away_id`` attributes; the values
            are interpolated directly into the SQL text.

    Returns:
        Tuple ``(home_result, away_result)`` of the two
        ``prem_loader.execute_bq_query`` results, home query executed first.
    """
    def _squad_sql(contestant_id):
        return f"""
                select *
                from prizepicksanalytics.soccer_simulations.squad_data
                where contestantId = '{contestant_id}'
                """

    # Build both statements before running either, as the original did.
    home_sql, away_sql = _squad_sql(match.home_id), _squad_sql(match.away_id)
    return prem_loader.execute_bq_query(home_sql), prem_loader.execute_bq_query(away_sql)


def get_calendar():
    """Run an unfiltered ``select *`` on ``match_calendar`` via ``prem_loader``."""
    return prem_loader.execute_bq_query("""
        select * from prizepicksanalytics.soccer_simulations.match_calendar
    """)

In [None]:


# away_query = f"""
#                 select *
#                 from prizepicksanalytics.soccer_simulations.squad_data
#                 """


# squad_data = prem_loader.execute_bq_query(away_query)

# calendar_data = get_calendar()
# calendar_data.to_csv('calendar_data.csv', index=False)


# tourneys = get_statsperform_tourneys()
# team_data = get_all_team_data()
# team_data.to_csv('team_data.csv', index=False)
# player_data = get_all_player_data()
# player_data.to_csv('player_data.csv', index=False)
# ars_lfc_match = prem_loader.matches.iloc[290]

## agg methods

In [None]:
def _get_team_stats_vector(team_data,
                           opposite_team_data,
                           position):
    
    home = True if position == 'home' else False
    position_performances = team_data[team_data['home'] == home]

    if team_data.shape[0] == 0:
        avg_possession = np.nan
        avg_finalThirdEntries = np.nan
        avg_accurateBackZonePass = np.nan
        avg_accurateFwdZonePass = np.nan

        avg_possWonAtt3rd = np.nan
        avg_possWonMid3rd = np.nan
        avg_interception = np.nan
    else:
        avg_possession = team_data.possessionPercentage.mean()
        avg_finalThirdEntries = team_data.finalThirdEntries.mean()
        avg_accurateBackZonePass = team_data.accurateBackZonePass.mean()
        avg_accurateFwdZonePass = team_data.accurateFwdZonePass.mean()

        avg_possWonAtt3rd = team_data.possWonAtt3rd.mean()
        avg_possWonMid3rd = team_data.possWonMid3rd.mean()
        avg_interception = team_data.interception.mean()

    if position_performances.shape[0] == 0:
        ps_possession = np.nan
        ps_finalThirdEntries = np.nan
        ps_accurateBackZonePass = np.nan
        ps_accurateFwdZonePass = np.nan

        ps_possWonAtt3rd = np.nan
        ps_possWonMid3rd = np.nan
        ps_interception = np.nan
    else:
        ps_possession = position_performances.possessionPercentage.mean()
        ps_finalThirdEntries = position_performances.finalThirdEntries.mean()
        ps_accurateBackZonePass = position_performances.accurateBackZonePass.mean()
        ps_accurateFwdZonePass = position_performances.accurateFwdZonePass.mean()

        ps_possWonAtt3rd = position_performances.possWonAtt3rd.mean()
        ps_possWonMid3rd = position_performances.possWonMid3rd.mean()
        ps_interception = position_performances.interception.mean()
    
    
    return {
        'avg_possession': avg_possession,
        # 'avg_finalThirdEntries': avg_finalThirdEntries,
        # 'avg_accurateBackZonePass': avg_accurateBackZonePass,
        # 'avg_accurateFwdZonePass': avg_accurateFwdZonePass,
        # 'avg_possWonAtt3rd': avg_possWonAtt3rd,
        # 'avg_possWonMid3rd': avg_possWonMid3rd,
        # 'avg_interception': avg_interception,

        'ps_possession': ps_possession,
        # 'ps_finalThirdEntries': ps_finalThirdEntries,
        # 'ps_accurateBackZonePass': ps_accurateBackZonePass,
        # 'ps_accurateFwdZonePass': ps_accurateFwdZonePass,
        # 'ps_possWonAtt3rd': ps_possWonAtt3rd,
        # 'ps_possWonMid3rd': ps_possWonMid3rd,
        # 'ps_interception': ps_interception,
    }

def _get_player_stats(player_performances,
                      position):
    home = True if position == 'home' else False
    position_performances = player_performances[player_performances['home'] == home]

    if player_performances.shape[0] == 0:
        avg_passAtt = np.nan
        std_passAtt = np.nan
        avg_minsPlayed = np.nan
    else:
        avg_passAtt = player_performances.totalPass.mean()
        std_passAtt = player_performances.totalPass.std()
        avg_minsPlayed = player_performances.minsPlayed.mean()

    if position_performances.shape[0] == 0:
        ps_passAtt = np.nan
        std_ps_passAtt = np.nan
        ps_minsPlayed = np.nan
    else:
        ps_passAtt = position_performances.totalPass.mean()
        std_ps_passAtt = position_performances.totalPass.std()
        ps_minsPlayed = position_performances.minsPlayed.mean()
    
    
    return {
        'avg_passes': avg_passAtt,
        'std_passes': std_passAtt,
        'avg_minsPlayed': avg_minsPlayed,
        'ps_passes': ps_passAtt,
        'std_ps_passes':std_ps_passAtt,
        'ps_minsPlayed': ps_minsPlayed,
    }

## get match vectors

In [None]:
import numpy as np

    
def get_match_vectors(match,
                      all_performances = None,
                      all_squads=None,
                      all_player_data=None,
                      training=True):
    """Build one feature dict per player row recorded for ``match``.

    For each side (home, then away) the team's earlier matches are summarised
    with ``_get_team_stats_vector``. Each player row of that side in this match
    (taken from ``all_player_data``) is summarised from the player's earlier
    rows with ``_get_player_stats``.

    Args:
        match: object with ``id``, ``home_id``, ``away_id``, ``localDate`` and
            ``week`` attributes (e.g. a row from the match calendar).
        all_performances: optional DataFrame of team match rows with ``team_id``
            and ``match_date`` columns. When None, history is fetched with
            ``get_historic_team_data(match)``.
        all_squads: optional mapping of team id to a squad DataFrame with an
            ``id`` column. When None, squads are fetched with ``get_squads(match)``.
        all_player_data: DataFrame of player match rows with ``playerId``,
            ``matchName``, ``position``, ``team_id``, ``match_id``,
            ``match_date`` and ``totalPass`` columns. Required.
        training: when truthy, each dict also gets ``target`` (the player's
            ``totalPass`` in this match); players without such a row are skipped.

    Returns:
        list of dicts: identifying fields, then the player stats, then the team
        stats, then ``target`` when training.

    Raises:
        ValueError: if ``all_player_data`` is None. That path previously called
            ``get_historic_player_data``, which exists only as commented-out
            code, and then indexed ``None``; it could never succeed.
    """
    if all_player_data is None:
        raise ValueError(
            "get_match_vectors requires all_player_data: the match line-up and "
            "the training target are both read from it."
        )

    def _rows_before_match(frame):
        # 'Z'-stripped match_date strings are compared as strings with
        # match.localDate, exactly as the previous per-filter lambdas did.
        stripped_dates = frame['match_date'].str.replace('Z', '', regex=False)
        return frame[stripped_dates < match.localDate]

    team_ids = {'home': match.home_id,
                'away': match.away_id}

    if all_performances is None:
        home_data, away_data = get_historic_team_data(match)
    else:
        # One date pass over the frame instead of one per side.
        prior_performances = _rows_before_match(all_performances)
        home_data = prior_performances[prior_performances['team_id'] == team_ids['home']]
        away_data = prior_performances[prior_performances['team_id'] == team_ids['away']]

    team_data = {
        'home': home_data,
        'away': away_data
    }

    if all_squads is None:
        home_squad, away_squad = get_squads(match)
    else:
        home_squad, away_squad = all_squads[team_ids['home']], all_squads[team_ids['away']]

    squad_data = {
        'home': home_squad,
        'away': away_squad
    }

    # Computed once per match; previously the full player table was rescanned
    # for every side and again for every player's target lookup.
    prior_player_rows = _rows_before_match(all_player_data)
    match_player_rows = all_player_data[all_player_data['match_id'] == match.id]

    final_feat_list = []

    for team in ['home', 'away']:
        opposite_position = 'away' if team == 'home' else 'home'

        team_stats_vector = _get_team_stats_vector(team_data[team],
                                                   team_data[opposite_position],
                                                   team)

        # History is limited to players listed in the side's squad.
        squad_player_ids = squad_data[team]['id'].unique()
        historical_player_data = prior_player_rows[
            prior_player_rows['playerId'].isin(squad_player_ids)]

        # Rows to emit come from this match's recorded player rows, not the squad.
        lineup = match_player_rows[match_player_rows['team_id'] == team_ids[team]]

        for _, player in lineup.iterrows():
            final_dict = {
                'player_id': player.playerId,
                'player_name': player.matchName,
                'player_position': player.position,
                'team_id': team_ids[team],
                'match_id': match.id,
                'match_date': match.localDate,
                'match_week': match.week,
            }

            player_performances = historical_player_data[
                historical_player_data['playerId'] == player.playerId]
            final_dict.update(_get_player_stats(player_performances, team))
            final_dict.update(team_stats_vector)

            if training:
                performance_rows = match_player_rows[
                    match_player_rows['playerId'] == player.playerId]
                if performance_rows.shape[0] == 0:
                    # No row to read the target from (e.g. missing playerId): skip.
                    continue
                final_dict['target'] = performance_rows.iloc[0].totalPass

            final_feat_list.append(final_dict)

    return final_feat_list

## feature generation

In [None]:
import pickle
# Load the local CSV snapshots of the calendar, team rows and player rows.
calendar = pd.read_csv('match_calendar.csv')

team_data = (
    pd.read_csv('team_data.csv')
    .sort_values(['match_date'], ascending=True)
    .reset_index(drop=True)
)

player_data = (
    pd.read_csv('player_data.csv')
    .sort_values(['match_date', 'match_id', 'team_id', 'playerId'], ascending=True)
    .reset_index(drop=True)
)

# NOTE(review): pickle.load can execute arbitrary code; only load a
# squads.pickle that was produced locally by a trusted run.
with open('squads.pickle', 'rb') as fila:
    all_squads = pickle.load(fila)

# One list entry per player per match, accumulated across the whole calendar.
list_of_all_vectors = []
for i, match in calendar.iterrows():
    vectors = get_match_vectors(
        match,
        all_performances=team_data,
        all_squads=all_squads,
        all_player_data=player_data,
        training=True,
    )
    list_of_all_vectors.extend(vectors)

# The index is written as the first CSV column.
feature_list = pd.DataFrame(list_of_all_vectors)
feature_list.to_csv('./v4features.csv')

# modeling

In [None]:
version = 4

# index_col=0: the v4 feature file is written by DataFrame.to_csv with its index
# as the first column.
features = pd.read_csv(f'v{version}features.csv', index_col=0)
# Drops every row with any missing value (including NaN history statistics).
features = features.dropna()
features = features.sort_values('match_date', ascending=True)

if version != 2:
    # Remove 'Substitute' rows, then one-hot encode the remaining positions.
    features = features.drop(index=features[features['player_position'] == 'Substitute'].index)
    position_dummies = pd.get_dummies(features.player_position).astype(int)
    features = features.merge(position_dummies, left_index=True, right_index=True, how='left')

# Split by match week: the first 80% of distinct weeks (in order of first
# appearance after the date sort) train the models, the rest are held out.
weeks = features.match_week.unique()
test_split = int(len(weeks) * 0.8)

train_weeks = weeks[:test_split]
test_weeks = weeks[test_split:]

# .copy() makes train/test independent frames. Later cells add prediction and
# error columns to `test`; assigning into a slice of `features` raises
# SettingWithCopyWarning.
train = features[features['match_week'].isin(train_weeks)].copy()
test = features[features['match_week'].isin(test_weeks)].copy()


# Identifier/bookkeeping columns (plus the two std columns) excluded from the
# model inputs; everything else except 'target' is a feature.
config_cols = ['player_name', 'player_position', 'team_id', 'match_id', 'match_date', 'match_week', 'player_id', 'std_ps_passes', 'std_passes']
feats = train.drop(columns=config_cols + ['target']).columns
target = ['target']

## alexboost

In [None]:
from alexboost import NegBinomial
from alexboost import Normal

In [None]:
# Both distribution models are built with the same two settings.
alx_norm = Normal(num_trees=50, num_samples_per_bucket=30)
alx_neg_bin = NegBinomial(num_trees=50, num_samples_per_bucket=30)

# NOTE(review): an earlier comment here read "add a small amount to target to
# avoid error", but the target is passed to fit unmodified — confirm whether
# that adjustment is still needed.
alx_norm.fit(train[feats], train['target'])
alx_neg_bin.fit(train[feats], train['target'])

In [None]:
# Predict on the held-out rows with both fitted models.
# NOTE(review): the assignments below assume each predict call returns two
# values per row (unpacked into two columns) — confirm against alexboost.
norm_preds = alx_norm.predict(test[feats])
neg_bin_preds = alx_neg_bin.predict(test[feats])

test[['norm_mean', 'norm_std']] = norm_preds
test[['nb_mean', 'nb_r']] = neg_bin_preds

# Signed residual of the Normal model's mean prediction.
test['error'] = (test.target - test.norm_mean)
# test['avg_error'] = (test.target - test.avg_passes)

In [None]:
# Spot check: held-out rows whose player name contains 'Bruyne'.
test.loc[test['player_name'].str.contains('Bruyne')]

# evaluation

In [None]:
from scipy.stats import norm, nbinom # Import the distributions

def normal_nll(y_true, mu, sigma):
    """Return the negative log-density of ``y_true`` under Normal(mu, sigma).

    Args:
        y_true: observed value.
        mu: mean of the normal distribution (``loc``).
        sigma: standard deviation of the normal distribution (``scale``).
    """
    return -norm.logpdf(y_true, loc=mu, scale=sigma)

def negative_binomial_nll(y_true, mu, r):
    """Return the negative log-probability of a count under a negative binomial.

    The distribution is given by its mean ``mu`` and size ``r`` and converted to
    scipy's ``(n, p)`` form with ``n = r`` and ``p = r / (mu + r)``.

    The previous version called ``int(y_true)`` and so accepted scalars only.
    This version also accepts arrays/Series, which broadcast against each other;
    scalar calls return the same value as before.

    Args:
        y_true: observed count(s). Fractional values are truncated toward zero,
            as ``int()`` did.
        mu: mean(s) of the distribution.
        r: size parameter(s) of the distribution.

    Returns:
        A float for scalar inputs, otherwise a numpy array of per-element values
        (any pandas index on the inputs is not preserved).
    """
    mu = np.asarray(mu, dtype=float)
    r = np.asarray(r, dtype=float)

    p = r / (mu + r)
    # Keep p strictly inside (0, 1).
    p = np.clip(p, 1e-16, 1.0 - 1e-16)

    # np.trunc is the element-wise counterpart of int(): it drops the fraction.
    counts = np.trunc(np.asarray(y_true, dtype=float))
    log_prob = nbinom.logpmf(counts, n=r, p=p)
    return -log_prob

# Per-row negative log-likelihood of the observed target under each model:
# the fitted Normal, the fitted negative binomial, and a baseline Normal built
# from the player's historical mean and standard deviation of passes.
test['norm_nll'] = test.apply(
    lambda row: normal_nll(row['target'], row['norm_mean'], row['norm_std']), axis=1)
test['nb_nll'] = test.apply(
    lambda row: negative_binomial_nll(row['target'], row['nb_mean'], row['nb_r']), axis=1)
test['avg_nll'] = test.apply(
    lambda row: normal_nll(row['target'], row['avg_passes'], row['std_passes']), axis=1)

In [None]:
# Mean negative log-likelihood per model over the held-out rows.
print("AlexBoost Norm Dist NLL:", test['norm_nll'].mean())
print("AlexBoost NegBinom Dist NLL:", test['nb_nll'].mean())
print("Average Passes Norm Dist NLL:", test['avg_nll'].mean())

In [None]:
# Mean absolute error of each predictor on the held-out rows. Summing
# |error| / n_rows is the mean of |error|.
norm_mae = np.sum((np.abs(test.target - test.norm_mean)) / test.shape[0])
negbin_mae = np.sum((np.abs(test.target - test.nb_mean)) / test.shape[0])
avg_mae = np.sum((np.abs(test.target - test.avg_passes)) / test.shape[0])

# Root mean squared error: square root of the mean squared error (the abs()
# before squaring does not change the result).
norm_rmse = np.sqrt(np.sum(np.abs(test.target - test.norm_mean)**2 / test.shape[0]))
negbin_rmse = np.sqrt(np.sum(np.abs(test.target - test.nb_mean)**2 / test.shape[0]))
avg_rmse = np.sqrt(np.sum(np.abs(test.target - test.avg_passes)**2 / test.shape[0]))

# 'Average Passes' is the baseline that predicts each player's historical mean.
print(f"Norm MAE: {norm_mae} | Norm RMSE: {norm_rmse}")
print(f"NegBin MAE: {negbin_mae} | NegBin RMSE: {negbin_rmse}")
print(f"Average Passes MAE: {avg_mae} | Average Passes RMSE: {avg_rmse}")

In [None]:
# Feature importances of the Normal model's regressor, largest first.
# (The bare attribute expression that used to precede this was never displayed,
# because only a cell's last expression is rendered.)
(
    pd.DataFrame({
        "feature": train[feats].columns,
        "importance": alx_norm.regressor.feature_importances_,
    })
    .sort_values(by="importance", ascending=False)
    .round(3)
)

In [None]:
# Feature importances of the negative-binomial model's regressor, largest first.
# (The bare attribute expression that used to precede this was never displayed,
# because only a cell's last expression is rendered.)
(
    pd.DataFrame({
        "feature": train[feats].columns,
        "importance": alx_neg_bin.regressor.feature_importances_,
    })
    .sort_values(by="importance", ascending=False)
    .round(3)
)

## misc

In [None]:
# Total absolute error per position. `test` never gets a 'mae_error' column in
# this notebook, so it is derived here from the signed 'error' column without
# mutating `test`.
# NOTE(review): assumes 'mae_error' was meant to be |error| — confirm.
(
    test.assign(mae_error=test['error'].abs())
    .groupby('player_position')['mae_error']
    .sum()
    .sort_values()
)

In [None]:
# Mean absolute error per position. `test` never gets a 'mae_error' column in
# this notebook, so it is derived here from the signed 'error' column without
# mutating `test`.
# NOTE(review): assumes 'mae_error' was meant to be |error| — confirm.
(
    test.assign(mae_error=test['error'].abs())
    .groupby('player_position')['mae_error']
    .mean()
    .sort_values()
)

In [None]:
# Mean signed error per team, most negative first.
(
    test.groupby('team_id')['error']
    .apply(lambda errors: sum(errors) / len(errors))
    .sort_values()
)

In [None]:
import seaborn as sns # Often imported as sns

# test.exp_std.plot()
# Signed error against average minutes played, coloured by position, for the
# first 1000 held-out rows whose error is below 60.
sns.scatterplot(
    x='error',
    y='avg_minsPlayed',
    hue='player_position',
    data=test.loc[test['error'] < 60].head(1000),
)

In [None]:
import matplotlib.pyplot as plt
# Distribution of the training target (total passes).
train['target'].hist()

In [None]:
# Training rows ordered from the smallest to the largest target.
train.sort_values(by='target')