In [None]:
import subprocess
import json
import pandas as pd
import os
import numpy as np

# Let pandas display up to 1000 columns / rows before truncating output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import warnings
# NOTE(review): this silences *every* warning for the whole notebook, including
# pandas' SettingWithCopyWarning and numpy divide-by-zero RuntimeWarnings.
warnings.filterwarnings('ignore')


# statsperform API + bq backloading

In [None]:
import importlib
import data
# Reload the local `data` module so edits to it are picked up without a kernel restart;
# DataLoader must be imported *after* the reload to get the fresh class.
importlib.reload(data)
from data import DataLoader

# Identifier handed to DataLoader.
# NOTE(review): presumably the provider's id for the 2024-25 Premier League season -- confirm.
prem2425 = '9n12waklv005j8r32sfjj2eqc'

# Shared loader instance; the query helpers in this notebook call prem_loader.execute_bq_query().
prem_loader = DataLoader(prem2425)

# feature engineering

## data loading

In [None]:
def get_all_team_data():
    """Return every row of the ``team_match_data`` table via ``prem_loader``."""
    return prem_loader.execute_bq_query(
        """
                select *
                from prizepicksanalytics.soccer_simulations.team_match_data
                """
    )

def get_all_player_data():
    """Return every row of the ``player_match_data`` table via ``prem_loader``."""
    return prem_loader.execute_bq_query(
        """
                select *
                from prizepicksanalytics.soccer_simulations.player_match_data
                """
    )

def get_historic_team_data(match):
    """Fetch each side's team rows dated strictly before ``match.localDate``.

    Args:
        match: object exposing ``home_id``, ``away_id`` and ``localDate``.

    Returns:
        ``(home_rows, away_rows)`` -- the results of ``prem_loader.execute_bq_query``
        for the home and the away team, queried in that order.
    """

    def _history_sql(team_id):
        # One statement serves both sides; only the team id differs.
        return f"""
                select *
                from prizepicksanalytics.soccer_simulations.team_match_data
                where team_id = '{team_id}'
                and date(REPLACE(match_date, 'Z', '')) < date('{match.localDate}')
                """

    home_sql, away_sql = _history_sql(match.home_id), _history_sql(match.away_id)
    return prem_loader.execute_bq_query(home_sql), prem_loader.execute_bq_query(away_sql)


# def get_historic_player_data(match,
#                              list_of_player_ids):
        
#         player_query = f"""
#                 select *
#                 from prizepicksanalytics.soccer_simulations.player_match_data
#                 where playerId in ({", ".join(["'" + x + "'" for x in list_of_player_ids])})
#                 and date(REPLACE(match_date, 'Z', '')) < date('{match.localDate}')
#                 """


#         return prem_loader.execute_bq_query(player_query)

        # return player_query

def get_squads(match):
    """Fetch the ``squad_data`` rows of both teams in ``match``.

    Args:
        match: object exposing ``home_id`` and ``away_id``.

    Returns:
        ``(home_squad, away_squad)`` -- the results of ``prem_loader.execute_bq_query``
        for the home and the away contestant, queried in that order.
    """

    def _squad_sql(contestant_id):
        # One statement serves both sides; only the contestant id differs.
        return f"""
                select *
                from prizepicksanalytics.soccer_simulations.squad_data
                where contestantId = '{contestant_id}'
                """

    home_sql, away_sql = _squad_sql(match.home_id), _squad_sql(match.away_id)
    return prem_loader.execute_bq_query(home_sql), prem_loader.execute_bq_query(away_sql)


def get_calendar():
    """Return every row of the ``match_calendar`` table via ``prem_loader``."""
    return prem_loader.execute_bq_query("""
        select * from prizepicksanalytics.soccer_simulations.match_calendar
    """)

In [None]:


# away_query = f"""
#                 select *
#                 from prizepicksanalytics.soccer_simulations.squad_data
#                 """


# squad_data = prem_loader.execute_bq_query(away_query)

# calendar_data = get_calendar()
# calendar_data.to_csv('calendar_data.csv', index=False)


# tourneys = get_statsperform_tourneys()
# team_data = get_all_team_data()
# team_data.to_csv('team_data.csv', index=False)
# player_data = get_all_player_data()
# player_data.to_csv('player_data.csv', index=False)
# ars_lfc_match = prem_loader.matches.iloc[290]

## agg methods

In [None]:
def _get_team_stats_vector(team_data,
                           opposite_team_data,
                           position):
    
    home = True if position == 'home' else False

    if team_data.shape[0] == 0:
        avg_possession = np.nan
        weighted_possession = np.nan
    else:
        avg_possession = team_data.possessionPercentage.mean()
        weighted_possession = (team_data.possessionPercentage * team_data.poss_weight).sum() / team_data.poss_weight.sum()


    
    return {
        't_avg_possession': avg_possession,
        "t_posw_possession": weighted_possession
    }



def _get_player_stats(player_performances,
                      team_performances,
                      position):
    home = True if position == 'home' else False
    
    team_performances = team_performances.sort_values('match_date', ascending=True)
    last_5_team_perf = team_performances.iloc[-5:].match_id

    player_last_5_perf = player_performances[player_performances['match_id'].isin(last_5_team_perf)]

    if player_performances.shape[0] == 0:
        avg_passAtt = np.nan
        std_passAtt = np.nan
        avg_minsPlayed = np.nan
        # last5_mins = np.nan
        weighted_passAtt = np.nan

    else:
        avg_passAtt = player_performances.totalPass.mean()
        std_passAtt = player_performances.totalPass.std()
        avg_minsPlayed = player_performances.minsPlayed.mean()
        # last5_mins = player_performances
        weighted_passAtt = (player_performances['totalPass'] * player_performances['poss_weight'].astype(float)).sum() / player_performances['poss_weight'].astype(float).sum()
    
    
    return {
        'p_avg_passes': avg_passAtt,
        'p_std_passes': std_passAtt,
        'p_avg_minsPlayed': avg_minsPlayed,
        'p_weighted_passes': weighted_passAtt,
        'p_last5_minsPlayed': player_last_5_perf.minsPlayed.mean() if not player_last_5_perf.empty else 0
    }

In [None]:
# Team id -> short display name. Used in this notebook to translate opponent_id /
# home_id / away_id values; a plain `team_mappings[x]` lookup raises KeyError for
# any id that is not listed here.
team_mappings = {
  "b9si1jn1lfxfund69e9ogcu2n": "Wolves",
  "7yx5dqhhphyvfisohikodajhv": "Brentford",
  "22doj4sgsocqpxw45h607udje": "Tottenham",
  "e5p0ehyguld7egzhiedpdnc3w": "Brighton",
  "d5ydtvt96bv7fq04yqm2w2632": "Southampton",
  "4dsgumo7d4zupm2ugsvm4zm4d": "Arsenal",
  "1qtaiy11gswx327s0vkibf70n": "Nottm Forest",
  "9q0arba2kbnywth8bkxlhgmdr": "Chelsea",
  "7vn2i2kd35zuetw6b38gw9jsz": "Newcastle",
  "ehd2iemqmschhj2ec0vayztzz": "Everton",
  "6eqit8ye8aomdsrrq0hk3v7gh": "Man Utd",
  "b496gs285it6bheuikox6z9mj": "Aston Villa",
  "c8h9bw1l82s06h77xxrelzhur": "Liverpool",
  "1c8m2ko0wxq1asfkuykurdr0y": "Crystal Palace",
  "8b523ujgl21tbc01me65q0aoh": "Ipswich",
  "4txjdaqveermfryvbfrr4taf7": "West Ham",
  "hzqh7z0mdl3v7gwete66syxp": "Fulham",
  "a3nyxabgsqlnqfkeg41m6tnpp": "Man City",
  "1pse9ta7a45pi2w2grjim70ge": "Bournemouth",
  "avxknfz4f6ob0rv9dbnxdzde0": "Leicester"
}

## get match vectors

In [None]:
import numpy as np

def gaussian_weight(historic_possession, predicted_next_possession, bandwidth=20.0):
    """
    Gaussian similarity weight between historic and target possession values.

    Returns exp(-d^2 / (2 * bandwidth^2)) where d is the possession difference:
    1.0 for identical values, decaying towards 0 as they diverge. A larger
    bandwidth (sigma) makes the decay slower. Works element-wise on arrays/Series.
    """
    gap = historic_possession - predicted_next_possession
    squared_gap = gap**2
    two_sigma_squared = 2 * bandwidth**2
    return np.exp(-squared_gap / two_sigma_squared)

    
def get_match_vectors(match,
                      all_performances = None,
                      all_squads=None,
                      all_player_data=None,
                      training=True):
    """Build one feature dict per player who has a row for ``match``.

    For each side, the team's earlier matches are weighted with a Gaussian kernel
    centred on the possession the team recorded in ``match`` itself (the "cheat":
    that value is only known once the match has been played). The weights are
    squared, and team- and player-level aggregates are computed from that history.

    Parameters
    ----------
    match : row-like
        Must expose ``id``, ``home_id``, ``away_id``, ``localDate`` and ``week``.
    all_performances : pandas.DataFrame, optional
        Team rows for all matches, including ``match``. Fetched with
        ``get_all_team_data()`` when omitted.
    all_squads : mapping, optional
        Team id -> squad DataFrame (with an ``id`` column). Fetched with
        ``get_squads(match)`` when omitted.
    all_player_data : pandas.DataFrame, optional
        Player rows for all matches, including ``match``. Fetched with
        ``get_all_player_data()`` when omitted.
    training : bool
        When true, ``is_sub`` and ``target`` (``totalPass``) are added from the
        player's own row for ``match``.

    Returns
    -------
    list of dict
        One dict per player: identifying fields, the ``p_*`` player features and
        the ``t_*`` team features.

    Raises
    ------
    IndexError
        If ``all_performances`` holds no row for one of the two sides of ``match``.
    """
    # The kernel centre and the line-up lookup need the full tables (including this
    # match's own rows), so load them when the caller did not pass them in.
    if all_performances is None:
        all_performances = get_all_team_data()
    if all_player_data is None:
        all_player_data = get_all_player_data()

    final_feat_list = []

    team_ids = {'home':match.home_id,
                'away':match.away_id}

    # match_date strings carry a trailing 'Z'; strip it once, then keep only rows
    # dated strictly before this match.
    team_rows_before = all_performances['match_date'].str.replace('Z', '', regex=False) < match.localDate
    player_rows_before = all_player_data['match_date'].str.replace('Z', '', regex=False) < match.localDate

    # .copy(): columns are added to these frames below; never write through to the input.
    team_data = {
        side: all_performances[(all_performances['team_id'] == side_id) & team_rows_before].copy()
        for side, side_id in team_ids.items()
    }

    if all_squads is None:
        home_squad, away_squad = get_squads(match)
    else:
        home_squad, away_squad = all_squads[team_ids['home']], all_squads[team_ids['away']]

    squad_data = {
        'home':home_squad,
        'away':away_squad
    }

    for team in ['home', 'away']:
        history = team_data[team]

        # USED FOR GAUSSIAN KERNEL: possession recorded by this team in *this* match.
        this_match_row = (all_performances['match_id'] == match.id) & (all_performances['team_id'] == team_ids[team])
        cheat_possession = all_performances[this_match_row].iloc[0].possessionPercentage

        # GAUSSIAN KERNEL APPLICATION, bandwidth = spread of the team's historic possession.
        # NOTE(review): std() is NaN with fewer than two historic rows, which makes
        # every weight (and every weighted feature) NaN for that team.
        history['poss_weight'] = gaussian_weight(history.possessionPercentage,
                                                 cheat_possession,
                                                 history.possessionPercentage.std())

        # Squaring sharpens the kernel (dissimilar matches are down-weighted further).
        history['poss_weight'] = history['poss_weight'].astype(float)**2

        # Readable opponent names; ids missing from team_mappings are kept as-is
        # instead of raising KeyError.
        history['opponent_id'] = history['opponent_id'].apply(lambda x: team_mappings.get(x, x))

        opposite_position = 'away' if team == 'home' else 'home'

        team_stats_vector = _get_team_stats_vector(history,
                                                   team_data[opposite_position],
                                                   team)

        team_squad = squad_data[team]

        # Earlier performances of everyone in the registered squad.
        historical_player_data = all_player_data[all_player_data['playerId'].isin(team_squad.id.unique()) &
                                                 player_rows_before]

        # merge match weights on historical player data
        historical_player_data = historical_player_data.merge(history[['match_id', 'team_id', 'poss_weight']],
                                     how='left',
                                     on=['match_id', 'team_id'])

        # Feature rows are generated for the players who have a row in this match.
        team_squad = all_player_data[(all_player_data['team_id'] == team_ids[team]) & (all_player_data['match_id'] == match.id)]

        for i, player in team_squad.iterrows():

            final_dict = {
                'player_id':player.playerId,
                'player_name':player.matchName,
                'player_position':player.position,
                'team_id':team_ids[team],
                'match_id':match.id,
                'match_date':match.localDate,
                'match_week':match.week,
                'is_home':team == 'home'
            }

            player_performances = historical_player_data[historical_player_data['playerId'] == player.playerId]

            player_vector = _get_player_stats(player_performances,
                                              history,
                                              team)

            final_dict.update(player_vector)
            final_dict.update(team_stats_vector)

            if training:
                try:
                    mask = (all_player_data['match_id'] == match.id) & (all_player_data['playerId'] == player.playerId)
                    performance = all_player_data[mask].iloc[0]

                    final_dict['is_sub'] = performance.isSub
                    final_dict['target'] = performance.totalPass

                except IndexError:
                    # No row to take the target from: leave this player out.
                    continue

            final_feat_list.append(final_dict)

    return final_feat_list

## feature generation

In [None]:
import pickle
# Inputs are read from local files rather than BigQuery.
calendar = pd.read_csv('match_calendar.csv')

team_data = pd.read_csv('team_data.csv')
team_data = team_data.sort_values(['match_date'], ascending=True).reset_index(drop=True)

player_data = pd.read_csv('player_data.csv')
player_data = player_data.sort_values(['match_date', 'match_id', 'team_id', 'playerId'], ascending=True).reset_index(drop=True)

# NOTE(review): pickle.load executes arbitrary code from the file -- only load a
# squads.pickle that was produced locally / by a trusted source.
with open('squads.pickle', 'rb') as fila:
    all_squads = pickle.load(fila)

list_of_all_vectors = []

# One call per calendar row; each call returns a list of per-player feature dicts.
for i, match in calendar.iterrows():

    vectors = get_match_vectors(match,
                    all_performances=team_data,
                    all_squads=all_squads,
                    all_player_data=player_data,
                    training=True)
                    

    list_of_all_vectors.extend(vectors)

# Build the frame once from the accumulated dicts and persist it (index included);
# the modeling cell reads this file back with index_col=0.
feature_list = pd.DataFrame(list_of_all_vectors)
feature_list.to_csv('./vcheatfeatures.csv')

In [None]:
# Quick look at the first generated feature rows.
feature_list.head()

# modeling

In [None]:
# Label of the feature file to model; selects `v{version}features.csv`.
version = 'cheat'

# Feature table written by the feature-generation cell (saved index is the first column).
features = pd.read_csv(f'v{version}features.csv', index_col=0)
features = features.dropna()
features = features.sort_values('match_date', ascending=True)

# `version` is a string label, so compare as strings: the previous `version != 2`
# compared str to int and was therefore always True.
if str(version) != '2':
    # Drop rows whose position is 'Substitute', then one-hot encode the remaining positions.
    features = features.drop(index=features[features['player_position'] == 'Substitute'].index)
    position_dummies = pd.get_dummies(features.player_position).astype(int)
    features = features.merge(position_dummies, left_index=True, right_index=True, how='left')

# Chronological split by match week: first 80% of weeks train, the rest test.
weeks = features.match_week.unique()
test_split = int(len(weeks) * 0.8)

train_weeks = weeks[:test_split]
test_weeks = weeks[test_split:]

# .copy(): later cells add prediction/error columns to `test`; keep both frames
# independent of `features`.
train = features[features['match_week'].isin(train_weeks)].copy()
test = features[features['match_week'].isin(test_weeks)].copy()

# Identifier columns (and p_std_passes) are excluded from the model inputs.
config_cols = ['player_name', 'player_position', 'team_id', 'match_id', 'match_date', 'match_week', 'player_id', 'p_std_passes']
feats = train.drop(columns=config_cols + ['target']).columns
target = ['target']

## alexboost

In [None]:
from alexboost import NegBinomial
from alexboost import Normal

In [None]:
# Two models with identical hyper-parameters; only the negative-binomial one is
# fitted below (the Normal fit is commented out).
alx_norm = Normal(
        num_trees=50, 
        num_samples_per_bucket=30,
    )

alx_neg_bin = NegBinomial(
        num_trees=50, 
        num_samples_per_bucket=30,
    )


# alx_norm.fit(train[feats], train[target].target)
# Fit on the training weeks; the target is passed as a Series.
alx_neg_bin.fit(train[feats], train[target].target)

In [None]:
# norm_preds = alx_norm.predict(test[feats])
# Predict on the held-out weeks.
neg_bin_preds = alx_neg_bin.predict(test[feats])

# test[['norm_mean', 'norm_std']] = norm_preds
# The prediction output is stored as two columns, nb_mean and nb_r
# (the evaluation cell passes them to negative_binomial_nll as mu and r).
test[['nb_mean', 'nb_r']] = neg_bin_preds

# Signed residual: positive when the actual value exceeds the predicted mean.
test['error'] = (test.target - test.nb_mean)
# test['avg_error'] = (test.target - test.avg_passes)

In [None]:
# test.team_id = test.team_id.apply(lambda x: team_mappings[x])
# Mean signed error per (team, player), shown as a one-column frame.
test.groupby(['team_id', 'player_name'])['error'].mean().to_frame()

# evaluation

In [None]:
from scipy.stats import norm, nbinom # Import the distributions

def normal_nll(y_true, mu, sigma):
    """Negative log-likelihood of ``y_true`` under Normal(``mu``, ``sigma``)."""
    return -norm.logpdf(y_true, loc=mu, scale=sigma)

def negative_binomial_nll(y_true, mu, r):
    """Negative log-likelihood of a count under a mean/dispersion negative binomial.

    The (``mu``, ``r``) parameterisation is converted to scipy's (n, p) form with
    ``n = r`` and ``p = r / (mu + r)``; ``p`` is clipped away from 0 and 1.
    ``y_true`` is truncated to an integer before evaluation.
    """
    success_prob = np.clip(r / (mu + r), 1e-16, 1.0 - 1e-16)
    return -nbinom.logpmf(int(y_true), n=r, p=success_prob)

# test['norm_nll'] = test.apply(lambda x: normal_nll(x.target, x.norm_mean, x.norm_std), axis=1)
# Per-row NLL of the model's negative-binomial prediction.
test['nb_nll'] = test.apply(lambda x: negative_binomial_nll(x.target, x.nb_mean, x.nb_r), axis=1)
# Baseline: Normal distribution built from the player's historic mean / std of passes.
test['avg_nll'] = test.apply(lambda x: normal_nll(x.target, x.p_avg_passes, x.p_std_passes), axis=1)

In [None]:
# MAE = mean |target - prediction|, for the model and for the historic-average baseline.
# norm_mae = np.sum((np.abs(test.target - test.norm_mean)) / test.shape[0])
negbin_mae = np.sum((np.abs(test.target - test.nb_mean)) / test.shape[0])
avg_mae = np.sum((np.abs(test.target - test.p_avg_passes)) / test.shape[0])

# RMSE = sqrt(mean (target - prediction)^2); each error is squared before summing.
# norm_rmse = np.sqrt(np.sum(np.abs(test.target - test.norm_mean)**2 / test.shape[0]))
negbin_rmse = np.sqrt(np.sum(np.abs(test.target - test.nb_mean)**2 / test.shape[0]))
avg_rmse = np.sqrt(np.sum(np.abs(test.target - test.p_avg_passes)**2 / test.shape[0]))

# print(f"AlexBoost Norm Dist NLL: {test.norm_nll.mean()}")
# print(f"AlexBoost NegBinom Dist ")
# print(f"Average Passes Norm Dist ")
# print(f"Norm MAE: {norm_mae} | Norm RMSE: {norm_rmse}")
print(f"AlexBoost NegBinom -> MAE: {negbin_mae} | RMSE: {negbin_rmse} | NLL: {test.nb_nll.mean()}")
print(f"Average Passes -> MAE: {avg_mae} | RMSE: {avg_rmse} | NLL: {test.avg_nll.mean()}")

In [None]:
# alx_norm.regressor.feature_importances_
# pd.DataFrame({"feature": train[feats].columns, "importance": alx_norm.regressor.feature_importances_}).sort_values(by="importance", ascending=False).round(3)

In [None]:
# NOTE(review): this bare expression is not the cell's last line, so it displays nothing.
alx_neg_bin.regressor.feature_importances_
# Feature importances of the fitted regressor, most important first.
pd.DataFrame({"feature": train[feats].columns, "importance": alx_neg_bin.regressor.feature_importances_}).sort_values(by="importance", ascending=False).round(3)

In [None]:
# Rows ordered by signed error, largest under-prediction (target far above nb_mean) first.
test.sort_values('error', ascending=False)

In [None]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Bin the test rows into 20 equal-width intervals of the predicted mean.
test['pred_mean_bin'] = pd.cut(test['nb_mean'], bins=20)

# Per bin: observed mean target vs. mean prediction; empty bins are dropped.
calibration_data = test.groupby('pred_mean_bin').agg(
    observed_mean=('target', 'mean'),
    predicted_mean_avg=('nb_mean', 'mean'),
    count=('target', 'size') # helpful to see how many samples in each bin
).dropna()


plt.figure(figsize=(8, 8))
sns.scatterplot(
    x=calibration_data['predicted_mean_avg'],
    y=calibration_data['observed_mean'],
    s=calibration_data['count'] * 2, # Scale point size by count in bin
    color='blue',
    alpha=0.7,
    label='Binned Data (Size proportional to count)'
)

# Plot the ideal calibration line (y = x)
max_val = max(calibration_data['predicted_mean_avg'].max(), calibration_data['observed_mean'].max())
min_val = min(calibration_data['predicted_mean_avg'].min(), calibration_data['observed_mean'].min())
plt.plot([min_val, max_val], [min_val, max_val], 'k--', label='Perfectly Calibrated')

plt.xlabel("Average Predicted Mean")
plt.ylabel("Average Actual Value")
plt.title("Calibration Curve: Average Actual vs. Average Predicted Mean")
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()

# true_prob, pred_prob = calibration_curve(test['target'], test['nb_mean'], n_bins=10)
# plt.figure(figsize=(8, 8))
# plt.plot(pred_prob, true_prob, marker='o', label='Calibration Plot')
# plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly calibrated')
# plt.xlabel('Predicted "Mean" (conceptually probability)')
# plt.ylabel('Actual "Mean" (conceptually frequency)')
# plt.title('Scikit-learn style Calibration Curve (Conceptual)')
# plt.legend()
# plt.show()


In [None]:
import matplotlib.pyplot as plt
# Mean signed error within 15 quantile bins of the actual target value.
test['target_bin'] = pd.qcut(test['target'], q=15)

sums_per_bin = test.groupby('target_bin')['error'].mean()

# "left-right" label per bin, with both edges truncated to integers.
bin_labels = [
    f"{int(round(interval.left, 1))}-{int(round(interval.right, 1))}"
    for interval in sums_per_bin.index
]
bin_values = sums_per_bin.values

plt.figure(figsize=(10, 6))
plt.bar(bin_labels, bin_values, color='mediumseagreen')

In [None]:
# Mean signed error within 15 quantile bins of the player's historic average minutes.
test['min_bin'] = pd.qcut(test['p_avg_minsPlayed'], q=15)

sums_per_bin = test.groupby('min_bin')['error'].mean()

# "left-right" label per bin, with both edges truncated to integers.
bin_labels = [
    f"{int(round(interval.left, 1))}-{int(round(interval.right, 1))}"
    for interval in sums_per_bin.index
]
bin_values = sums_per_bin.values

plt.figure(figsize=(10, 6))
plt.bar(bin_labels, bin_values, color='mediumseagreen')

In [None]:
# Mean signed error within 5 quantile bins of the player's recent (last-5) minutes.
test['p_last5_minsPlayed_bin'] = pd.qcut(test['p_last5_minsPlayed'], q=5)

sums_per_bin = test.groupby('p_last5_minsPlayed_bin')['error'].mean()

# "left-right" label per bin, with both edges truncated to integers.
bin_labels = [
    f"{int(round(interval.left, 1))}-{int(round(interval.right, 1))}"
    for interval in sums_per_bin.index
]
bin_values = sums_per_bin.values

plt.figure(figsize=(10, 6))
plt.bar(bin_labels, bin_values, color='mediumseagreen')

In [None]:
# Preview of the test frame including the columns added by the evaluation cells.
test.head()

## prizepicks lines eval

In [None]:
from google.cloud import bigquery as bqc

# Earliest-created 'Passes Attempted' soccer line per (start date, stat type, player)
# -- ROW_NUMBER() ordered by created_at ASC, keeping rn = 1 -- restricted to rows whose
# `description` is one of the listed team names and that start after 2025-04-01.
# Every entry of the IN (...) list must be comma-separated ("Newcastle" was missing
# its comma, fusing it with "Nottm Forest").
prizepicks_lines_query = """ 
SELECT
    start_time,
    created_at,
    game_id,
    stat_type_name,
    player_name,
    line_score,
    -- probability_of_more,
    score,
    description
FROM (
    SELECT
        start_time,
        created_at,
        game_id,
        stat_type_name,
        player_name,
        line_score,
        -- COALESCE(probability_of_more, 0.5) AS probability_of_more,
        -- probability_of_more,
        score,
        description,
        ROW_NUMBER() OVER(PARTITION BY date(start_time_est), stat_type_name, player_name ORDER BY created_at ASC) as rn
    FROM
        `pick_level.pick_level`
      WHERE stat_type_name = 'Passes Attempted'
      AND league_name = 'SOCCER'
) AS subquery
WHERE
    rn = 1
    and description in (
    "Arsenal",
    "Aston Villa",
    "Bournemouth",
    "Brentford",
    "Brighton",
    "Chelsea",
    "Crystal Palace",
    "Everton",
    "Fulham",
    "Ipswich", # Promoted from Championship
    "Leicester", # Promoted from Championship
    "Liverpool",
    "Man City",
    "Man Utd",
    "Newcastle",
    "Nottm Forest",
    "Southampton", # Promoted from Championship
    "Tottenham",
    "West Ham",
    "Wolves"
)
and date(start_time) > date(2025, 4, 1)
"""

def execute_bq_query(query: str) -> pd.DataFrame:
    """Run ``query`` with a BigQuery client for project ``prizepicksanalytics``
    and return the result as a DataFrame. A new client is created on every call."""
    return bqc.Client(project='prizepicksanalytics').query(query).to_dataframe()

prizepicks_lines = execute_bq_query(prizepicks_lines_query)
# Keep only the first 10 characters of the stringified start_time
# (NOTE(review): presumably the 'YYYY-MM-DD' date part, so it can be compared with localDate -- confirm).
prizepicks_lines.start_time = prizepicks_lines.start_time.apply(lambda x: str(x)[:10])

In [None]:
# Match calendar with readable team names; description is "<home> vs <away>".
# team_mappings[x] raises KeyError for any id missing from the mapping.
matches = execute_bq_query("""select * from soccer_simulations.match_calendar""")
matches['home_team'] = matches.home_id.apply(lambda x: team_mappings[x])
matches['away_team'] = matches.away_id.apply(lambda x: team_mappings[x])
matches['description'] = matches.home_team + ' vs ' + matches.away_team

In [None]:
def get_team(row):
    """Return the other side of the fixture that ``row.description`` played on ``row.start_time``.

    Looks through the ``matches`` descriptions ("<home> vs <away>") dated
    ``row.start_time``, takes the first one containing ``row.description`` and
    returns the team name in it that is *not* ``row.description``.
    Returns NaN when no such fixture exists on that date.
    """
    same_day = matches[matches['localDate'] == row.start_time].description.to_list()
    fixture = next((label for label in same_day if row.description in label), None)

    if fixture is None:
        return np.nan

    return [side for side in fixture.split(' vs ') if side != row.description][0]

# Attach the team returned by get_team to each line; rows for which no fixture
# was found on that date (NaN) are dropped.
prizepicks_lines['team'] = prizepicks_lines.apply(get_team, axis=1)
prizepicks_lines = prizepicks_lines[~prizepicks_lines.team.isna()]

In [None]:
%pip install fuzzywuzzy
from fuzzywuzzy import process, fuzz

# {contestantShortName: {player id: "<shortFirstName> <shortLastName>"}}
# NOTE(review): `squads` is not defined in any cell shown here (the pickle is loaded
# as `all_squads`); this cell fails on a fresh kernel unless `squads` is created elsewhere.
team_squads = {x:{k:v for k, v in zip(y.id, y.shortFirstName + ' ' + y.shortLastName)} for x, y in squads.groupby('contestantShortName')}
# NOTE(review): team_squads is keyed by contestantShortName but is indexed with
# x.team_id here; this only works if team_id already holds short names -- confirm.
test.player_name = test.apply(lambda x: team_squads[x.team_id][x.player_id], axis=1)

def fuzzy_match(query, choices):
    """Return the entry of ``choices`` that best matches ``query``
    according to ``fuzz.token_set_ratio``; the match score is discarded."""
    top_choice, _score = process.extractOne(query, choices, scorer=fuzz.token_set_ratio)
    return top_choice

def get_fuzzy_name(row):
    """Map ``row.player_name`` onto the closest name in the squad of ``row.team``."""
    return fuzzy_match(row.player_name, team_squads[row.team].values())

# Replace each line's player name with its closest squad name.
prizepicks_lines.player_name = prizepicks_lines.apply(get_fuzzy_name, axis=1)

In [None]:
# Attach the line for the same player and date to each test row, then keep only
# the rows that actually received a line.
pp_eval = test.merge(
    prizepicks_lines,
    how='left',
    left_on=['player_name', 'match_date'],
    right_on=['player_name', 'start_time'],
)
pp_eval = pp_eval.dropna(subset=['line_score'])

In [None]:
# .copy(): error columns are added below; keep this frame independent of the merged one.
pp_eval = pp_eval[['target', 'player_name', 'nb_mean', 'line_score', 'score', 'p_avg_passes', 'p_weighted_passes']].copy()

# Errors are measured against `score`: model mean vs. the posted line.
pp_eval['model_error'] = pp_eval.score - pp_eval.nb_mean
pp_eval['pp_error'] = pp_eval.score - pp_eval.line_score

# MAE = mean |error|; RMSE = sqrt(mean error^2). Each error must be squared *before*
# summing -- squaring the summed absolute errors inflates the result by ~sqrt(n).
model_mae = np.sum(np.abs(pp_eval['model_error'])) / pp_eval.shape[0]
model_rmse = np.sqrt(np.sum(pp_eval['model_error']**2) / pp_eval.shape[0])

pp_mae = np.sum(np.abs(pp_eval['pp_error'])) / pp_eval.shape[0]
pp_rmse = np.sqrt(np.sum(pp_eval['pp_error']**2) / pp_eval.shape[0])

print(f"Model MAE: {model_mae} | Model RMSE: {model_rmse}")
print(f"PP MAE: {pp_mae} | PP RMSE: {pp_rmse}")

## misc

In [None]:
# Total absolute error per position. No visible cell creates a 'mae_error' column,
# so the absolute error is derived from 'error' directly.
test['error'].abs().groupby(test['player_position']).sum().sort_values()

In [None]:
# Mean absolute error per position. No visible cell creates a 'mae_error' column,
# so the absolute error is derived from 'error' directly.
test['error'].abs().groupby(test['player_position']).mean().sort_values()

In [None]:
# Mean signed error per team (sum / count), smallest first.
test.groupby('team_id')['error'].apply(lambda x: (sum(x)) / len(x)).sort_values()

In [None]:
import seaborn as sns # Often imported as sns

# test.exp_std.plot()
# Signed error vs. historic average minutes for the first 1000 test rows with
# error < 60. The feature column is named 'p_avg_minsPlayed' ('avg_minsPlayed'
# does not exist in the feature table).
sns.scatterplot(
    data=test[test['error'] < 60].iloc[:1000],
    x='error',
    y='p_avg_minsPlayed',
    hue='player_position'
)

In [None]:
import matplotlib.pyplot as plt
# Distribution of the training target.
train[target].target.hist()

In [None]:
# Training rows ordered by target, smallest first.
train.sort_values('target', ascending=True)