# End-to-end prediction steps

## Set-up

In [1]:
import os
import getpass
import pickle
from functools import reduce
import time

import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from s3fs import S3FileSystem
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import load_model, Model
from keras.layers import Dense, Flatten, LSTM, BatchNormalization, Input, Dropout, Activation, Bidirectional
from keras import initializers, optimizers, Sequential
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from hyperopt import hp, fmin, rand, tpe, Trials
from selenium import webdriver

Using TensorFlow backend.


In [2]:
os.chdir('../..')

In [119]:
from src.models.utils import \
    _load_all_historical_data, \
    _map_season_string_to_ordered_numeric, \
    _generate_known_features_for_next_gw, \
    custom_train_test_split, \
    split_sequences, \
    _load_model_from_pickle
from src.visualisation.utils import plot_learning_curve
from src.data.s3_utilities import s3_filesystem
from src.data.live_season_data import _get_fpl_json

In [4]:
pd.options.display.max_columns = None

In [5]:
RANDOM_SEED = 3
N_STEPS_IN = 5
N_STEPS_OUT = 5

## 1. Load FFS data

### Historical FFS data

In [6]:
historical_ffs_all_data = pq.read_table(
    f"s3://fantasy-football-scout/processed/fantasy_football_scout_final_features_and_total_points.parquet", 
    filesystem=s3_filesystem
).to_pandas()

print(historical_ffs_all_data.shape)
historical_ffs_all_data.head()

(92410, 68)


Unnamed: 0,Name,Team,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,position,full_name,season,gw,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,name,total_points
0,van Persie,ARS,75.0,0,0,1,0,0,1,0,5,0,FWD,Robin van Persie,2011-2012,1,0,100.0,0,0,0,0,0,0,6,0.0,2.1,71,75,75,0,0,1,0,0,21,0,1,0,0,0,0,0.0,1,100.0,90,3,0,8,0,0,0,0,8,7,0.0,4.6,0.0,0.0,0.0,inf,7,13,0,0.0,1,robin_van persie,2
1,Zamora,QPR,40.0,0,0,2,0,0,1,0,0,0,FWD,Bobby Zamora,2011-2012,1,0,50.0,0,0,0,0,1,0,10,0.0,2.0,59,67,63,0,0,3,0,0,21,0,4,0,3,1,0,0.0,2,50.0,74,1,2,15,0,0,0,0,7,6,0.0,11.5,0.0,0.0,0.0,inf,7,16,0,75.0,1,bobby_zamora,2
2,Gervinho,ARS,0.0,0,0,0,1,1,1,1,4,0,FWD,Gervais Yao Kouassi,2011-2012,1,0,80.0,0,0,0,0,1,0,7,0.0,1.7,79,73,76,1,1,4,0,0,33,0,1,0,0,0,0,0.0,5,80.0,76,1,2,20,0,0,0,0,8,12,0.0,6.7,0.0,38.0,0.0,inf,12,9,0,0.0,1,gervais_yao kouassi,-1
3,Welbeck,MUN,0.0,0,0,0,0,0,0,0,0,0,FWD,Danny Welbeck,2011-2012,1,0,0.0,0,0,1,0,3,0,3,0.0,2.3,68,50,60,0,0,2,1,0,14,0,0,0,0,1,0,66.7,1,0.0,65,2,2,4,0,0,0,0,1,3,0.0,1.2,0.0,0.0,32.5,inf,7,7,0,0.0,0,danny_welbeck,2
4,Torres,CHE,0.0,0,1,2,1,0,1,1,2,0,FWD,Fernando Torres,2011-2012,1,0,55.6,0,0,0,0,6,0,10,0.0,1.4,75,68,70,0,0,4,1,0,43,0,3,0,1,1,0,50.0,9,55.6,89,4,4,31,0,0,0,0,6,9,0.0,9.3,0.0,0.0,89.0,inf,13,17,0,25.0,1,fernando_torres,2


Note: In practice only need last 5 gameweeks so can optimise here to save time if needed.

In [7]:
historical_ffs_all_data.drop(columns=['Name', 'full_name'], inplace=True)

### Latest FFS data

New season hasn't started yet so for now we write some sample code which can be used after GW 1

In [8]:
ffs_tables = [f'FINAL_FEATURES_{x}_{pos}' for pos in ['DEF', 'FWD', 'MID', 'GK'] for x in range(1, 7)]
ffs_tables

['FINAL_FEATURES_1_DEF',
 'FINAL_FEATURES_2_DEF',
 'FINAL_FEATURES_3_DEF',
 'FINAL_FEATURES_4_DEF',
 'FINAL_FEATURES_5_DEF',
 'FINAL_FEATURES_6_DEF',
 'FINAL_FEATURES_1_FWD',
 'FINAL_FEATURES_2_FWD',
 'FINAL_FEATURES_3_FWD',
 'FINAL_FEATURES_4_FWD',
 'FINAL_FEATURES_5_FWD',
 'FINAL_FEATURES_6_FWD',
 'FINAL_FEATURES_1_MID',
 'FINAL_FEATURES_2_MID',
 'FINAL_FEATURES_3_MID',
 'FINAL_FEATURES_4_MID',
 'FINAL_FEATURES_5_MID',
 'FINAL_FEATURES_6_MID',
 'FINAL_FEATURES_1_GK',
 'FINAL_FEATURES_2_GK',
 'FINAL_FEATURES_3_GK',
 'FINAL_FEATURES_4_GK',
 'FINAL_FEATURES_5_GK',
 'FINAL_FEATURES_6_GK']

In [9]:
SEASON = '2019-2020'

In [19]:
%%time

ffs_df_dict = {}

for table in ffs_tables:
    print(f'Working on {table}')
    df = pq.read_table(
        f"s3://fantasy-football-scout/raw-member-data/table={table}/season={SEASON}", 
        filesystem=s3_filesystem
    ).to_pandas()
    ffs_df_dict[table] = df
    print(f'Finished {table}')

Working on FINAL_FEATURES_1_DEF
Finished FINAL_FEATURES_1_DEF
Working on FINAL_FEATURES_2_DEF
Finished FINAL_FEATURES_2_DEF
Working on FINAL_FEATURES_3_DEF
Finished FINAL_FEATURES_3_DEF
Working on FINAL_FEATURES_4_DEF
Finished FINAL_FEATURES_4_DEF
Working on FINAL_FEATURES_5_DEF
Finished FINAL_FEATURES_5_DEF
Working on FINAL_FEATURES_6_DEF
Finished FINAL_FEATURES_6_DEF
Working on FINAL_FEATURES_1_FWD
Finished FINAL_FEATURES_1_FWD
Working on FINAL_FEATURES_2_FWD
Finished FINAL_FEATURES_2_FWD
Working on FINAL_FEATURES_3_FWD
Finished FINAL_FEATURES_3_FWD
Working on FINAL_FEATURES_4_FWD
Finished FINAL_FEATURES_4_FWD
Working on FINAL_FEATURES_5_FWD
Finished FINAL_FEATURES_5_FWD
Working on FINAL_FEATURES_6_FWD
Finished FINAL_FEATURES_6_FWD
Working on FINAL_FEATURES_1_MID
Finished FINAL_FEATURES_1_MID
Working on FINAL_FEATURES_2_MID
Finished FINAL_FEATURES_2_MID
Working on FINAL_FEATURES_3_MID
Finished FINAL_FEATURES_3_MID
Working on FINAL_FEATURES_4_MID
Finished FINAL_FEATURES_4_MID
Working 

#### Combine into a single DataFrame

In [26]:
JOINING_KEYS = ['Name', 'Team', 'position', 'full_name', 'gw']

In [27]:
latest_ffs_all_data = pd.DataFrame()

for position in ['FWD', 'GK', 'DEF', 'MID']:
    
    dfs = [ffs_df_dict[key] for key in [key for key in ffs_df_dict.keys() if f'_{position}' in key]]
    
    combined = reduce(
        lambda left, right: pd.merge(
            left,
            right,
            on=JOINING_KEYS,
            how='outer'
        ),
        dfs
    )
    
    print(combined.shape)
    print('\n')
    
    latest_ffs_all_data = latest_ffs_all_data.append(combined)

print(latest_ffs_all_data.shape)

(1585, 65)


(765, 65)


(3514, 65)


(4679, 65)


(10543, 65)


In [28]:
latest_ffs_all_data.sample(n=5)

Unnamed: 0,Name,Team,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,position,full_name,gw,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked
1122,McArthur,CRY,40.0,0,0,0,0,0,0,0,0,0,MID,James McArthur,18,0,66.7,0,0,1,0,5,0,18,0,1.4,86,71,80,0,0,4,3,0,21,0,3,0,2,1,0,25.0,3,66.7,86,1,0,14,0,0,0,0,1,5,0.65,10.5,0.0,0.0,86.0,136.5,16,24,0,50.0,1
2414,Maguire,MUN,57.1,0,0,0,0,0,0,0,0,0,DEF,Harry Maguire,33,0,100.0,0,0,2,0,2,0,13,0,0.9,90,91,95,0,0,5,2,0,12,0,0,0,0,0,0,0.0,1,100.0,90,0,1,4,0,0,0,0,3,2,0.03,2.1,0.0,30.0,0.0,inf,16,41,0,0.0,0
11,Pukki,NOR,0.0,0,0,1,0,0,0,0,0,0,FWD,Teemu Pukki,1,0,0.0,0,1,4,0,1,0,39,0,3.0,64,100,67,0,0,2,0,0,13,0,3,0,1,1,0,100.0,1,0.0,83,2,1,21,0,0,0,0,2,4,0.52,10.1,0.0,83.0,83.0,169.4,9,2,0,33.3,1
2694,Mahrez,MCI,0.0,0,0,0,1,0,0,0,0,0,MID,Riyad Mahrez,3,0,0.0,0,0,0,0,1,0,5,0,0.8,78,67,75,0,0,2,0,0,4,0,0,0,0,0,1,100.0,0,0.0,11,1,1,14,0,0,0,0,0,1,0.04,2.0,0.0,11.0,11.0,inf,3,4,0,0.0,0
1140,Hourihane,AVL,100.0,0,0,0,0,0,0,0,1,0,MID,Conor Hourihane,18,0,0.0,0,0,3,0,0,0,2,0,1.6,85,86,83,0,0,2,0,1,10,0,0,0,0,1,0,0.0,0,0.0,56,0,1,3,0,0,0,0,1,2,0.05,1.0,0.0,0.0,0.0,1400.0,8,14,0,0.0,0


In [29]:
latest_ffs_all_data.isnull().sum().sum()

0

#### Processing

In [30]:
latest_ffs_all_data = latest_ffs_all_data.apply(pd.to_numeric, errors='ignore')

In [32]:
latest_ffs_all_data['name'] = latest_ffs_all_data['full_name'].str.split(" ", 1).apply(lambda x: '_'.join([x[0], x[1]])).str.lower()

In [33]:
latest_ffs_all_data.head()

Unnamed: 0,Name,Team,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,position,full_name,gw,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,name
0,Jesus,MCI,25.0,0,0,0,0,0,1,0,0,0,FWD,Gabriel Fernando de Jesus,1,0,100.0,0,1,0,0,5,0,33,0.0,3.8,91,100,88,0,0,1,2,0,12,0,1,0,2,1,0,0.0,1,100.0,69,0,5,5,0,0,0,0,0,2,0.25,8.0,0.0,0.0,0.0,300.0,6,2,0,100.0,0,gabriel_fernando de jesus
1,Wood,BUR,28.6,0,0,3,0,0,1,0,2,0,FWD,Chris Wood,1,0,0.0,0,0,0,0,1,0,0,0.0,3.1,38,40,40,0,0,2,0,0,14,0,1,0,0,0,0,0.0,0,0.0,90,6,2,1,0,0,0,0,3,3,0.04,0.8,0.0,0.0,0.0,2250.0,4,4,0,0.0,0,chris_wood
2,C Wilson,BOU,12.5,0,0,1,0,0,0,0,1,0,FWD,Callum Wilson,1,0,0.0,0,0,1,0,2,0,2,0.0,4.2,50,17,25,0,0,0,0,0,10,0,1,0,1,1,0,100.0,0,0.0,89,1,3,2,0,0,0,0,2,2,0.7,4.0,0.0,0.0,89.0,129.0,5,5,0,100.0,0,callum_wilson
3,Wickham,CRY,50.0,0,0,1,0,0,0,0,0,0,FWD,Connor Wickham,1,0,0.0,0,0,0,0,1,0,0,0.0,1.2,40,67,67,0,0,0,0,0,4,0,0,0,0,0,1,0.0,0,0.0,7,0,1,0,0,0,0,0,0,1,0.0,0.2,0.0,0.0,0.0,inf,1,3,0,0.0,0,connor_wickham
4,Vardy,LEI,50.0,0,0,0,0,0,1,1,1,0,FWD,Jamie Vardy,1,0,0.0,0,0,0,0,1,0,7,0.0,5.0,60,75,63,0,0,2,0,0,10,0,0,0,0,0,0,100.0,1,0.0,90,1,0,5,0,0,0,0,0,2,0.05,1.6,0.0,0.0,90.0,inf,3,5,0,0.0,0,jamie_vardy


Given that double gameweeks are collapsed into a single row, we need to slightly modify our calulcation of points for minutes played. Minutes for a single match do not exceed 90 (i.e. extra time not included) so we can use this to work out the minutes played in each match.

In [34]:
def _calculate_minutes_points_per_match(minutes):
    points = 0
    if minutes >= 60:
        points += 2
    elif minutes > 0:
        points += 1
    return points

In [35]:
print(_calculate_minutes_points_per_match(0))
print(_calculate_minutes_points_per_match(60))
print(_calculate_minutes_points_per_match(90))

0
2
2


In [36]:
def calculate_minutes_points_total_matches(total_minutes):
    second_match_mins = np.max([total_minutes - 90, 0])
    first_match_mins = total_minutes - second_match_mins
    
    second_match_points = _calculate_minutes_points_per_match(second_match_mins)
    first_match_points = _calculate_minutes_points_per_match(first_match_mins)
    
    total_points = first_match_points + second_match_points
    return total_points

In [38]:
print(0, calculate_minutes_points_total_matches(0))
print(24, calculate_minutes_points_total_matches(24))
print(90, calculate_minutes_points_total_matches(90))
print(91, calculate_minutes_points_total_matches(91))
print(149, calculate_minutes_points_total_matches(149))
print(150, calculate_minutes_points_total_matches(150))
print(180, calculate_minutes_points_total_matches(180))

0 0
24 1
90 2
91 3
149 3
150 4
180 4


In [40]:
latest_ffs_all_data['Time Played'].apply(calculate_minutes_points_total_matches).value_counts().sort_index()

1    2610
2    7874
3      18
4      41
Name: Time Played, dtype: int64

In [41]:
def calculate_fpl_points(df):
    df = df.copy()
    df['points'] = 0
    
    # Minutes
    df['points'] += df['Time Played'].apply(calculate_minutes_points_total_matches)
    
    # Goals
    df['points'] += np.where(
        df['position'].isin(['GK', 'DEF']), 
        6 * df['Goals'], 
        0
    )
    
    df['points'] += np.where(
        df['position'] == 'MID', 
        5 * df['Goals'], 
        0
    )
    
    df['points'] += np.where(
        df['position'] == 'FWD', 
        4 * df['Goals'], 
        0
    )
    
    # Assists
    df['points'] += df['Assists'] * 3
    
    # Clean sheets
    df['points'] += np.where(
        df['position'].isin(['GK', 'DEF']), 
        4 * df['Clean Sheets'], 
        0
    )
    
    df['points'] += np.where(
        df['position'] == 'MID', 
        df['Clean Sheets'], 
        0
    )
    
    # Shot saves
    df['points'] += np.where(
        df['position'] == 'GK', 
        (df['Saves'] / 3).astype(int), 
        0
    )
    
    # Penalty saves
    df['points'] += np.where(
        df['position'] == 'GK', 
        df['Saves From Penalty'] * 5, 
        0
    )
    
    # Penalty misses
    df['points'] += -2 * df['Penalties Missed']
    
    # Goals conceded
    df['points'] += np.where(
        df['position'].isin(['GK', 'DEF']), 
        -(df['Goals Conceded'] / 2).astype(int), 
        0
    )
    
    # Yellow cards
    df['points'] += -df['Premier League Yellow Cards']
    
    # Red cards
    df['points'] += -df['Premier League Total Red Cards'] * 3
    
    # Own goals
    df['points'] += -df['Own Goals'] * 2
    
    # 0 minutes
    df['points'] = np.where(
        df['Time Played'] == 0,
        0,
        df['points']
    )
    
    return df['points']

In [43]:
latest_ffs_all_data['total_points'] = calculate_fpl_points(latest_ffs_all_data)

In [44]:
latest_ffs_all_data.head()

Unnamed: 0,Name,Team,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,position,full_name,gw,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,name,total_points
0,Jesus,MCI,25.0,0,0,0,0,0,1,0,0,0,FWD,Gabriel Fernando de Jesus,1,0,100.0,0,1,0,0,5,0,33,0.0,3.8,91,100,88,0,0,1,2,0,12,0,1,0,2,1,0,0.0,1,100.0,69,0,5,5,0,0,0,0,0,2,0.25,8.0,0.0,0.0,0.0,300.0,6,2,0,100.0,0,gabriel_fernando de jesus,6
1,Wood,BUR,28.6,0,0,3,0,0,1,0,2,0,FWD,Chris Wood,1,0,0.0,0,0,0,0,1,0,0,0.0,3.1,38,40,40,0,0,2,0,0,14,0,1,0,0,0,0,0.0,0,0.0,90,6,2,1,0,0,0,0,3,3,0.04,0.8,0.0,0.0,0.0,2250.0,4,4,0,0.0,0,chris_wood,2
2,C Wilson,BOU,12.5,0,0,1,0,0,0,0,1,0,FWD,Callum Wilson,1,0,0.0,0,0,1,0,2,0,2,0.0,4.2,50,17,25,0,0,0,0,0,10,0,1,0,1,1,0,100.0,0,0.0,89,1,3,2,0,0,0,0,2,2,0.7,4.0,0.0,0.0,89.0,129.0,5,5,0,100.0,0,callum_wilson,2
3,Wickham,CRY,50.0,0,0,1,0,0,0,0,0,0,FWD,Connor Wickham,1,0,0.0,0,0,0,0,1,0,0,0.0,1.2,40,67,67,0,0,0,0,0,4,0,0,0,0,0,1,0.0,0,0.0,7,0,1,0,0,0,0,0,0,1,0.0,0.2,0.0,0.0,0.0,inf,1,3,0,0.0,0,connor_wickham,1
4,Vardy,LEI,50.0,0,0,0,0,0,1,1,1,0,FWD,Jamie Vardy,1,0,0.0,0,0,0,0,1,0,7,0.0,5.0,60,75,63,0,0,2,0,0,10,0,0,0,0,0,0,100.0,1,0.0,90,1,0,5,0,0,0,0,0,2,0.05,1.6,0.0,0.0,90.0,inf,3,5,0,0.0,0,jamie_vardy,2


#### Check against historical

In [45]:
set(latest_ffs_all_data.columns) - set(historical_ffs_all_data.columns)

{'Name', 'full_name'}

In [46]:
set(historical_ffs_all_data.columns) - set(latest_ffs_all_data.columns)

{'season'}

In [47]:
latest_ffs_all_data.shape

(10543, 67)

In [50]:
historical_ffs_all_data[historical_ffs_all_data['season'] == '2019-2020'].shape

(10543, 66)

## TODO: Add steps to append latest to historical

In [8]:
combined_ffs_all_data = historical_ffs_all_data.copy()

### Add 0 minute events back into data

FFS data only includes players who played > 0 minutes. Therefore if a player is benched one gameweek then they wouldn't appear in the data.

We therefore create a 'master' DataFrame of all unique names and all possible season and gameweek combinations. We then left join the FFS data and fill in nulls with zeros.

Note: We make sure to only add missing gameweeks for seasons in which the player played in the Premier League.

In [9]:
combined_ffs_all_data['season'].unique()

array(['2011-2012', '2012-2013', '2013-2014', '2014-2015', '2015-2016',
       '2016-2017', '2017-2018', '2018-2019', '2019-2020'], dtype=object)

In [10]:
ffs_data_names = combined_ffs_all_data[['name', 'Team', 'position', 'season']].drop_duplicates()
ffs_data_names['key'] = 1
print(ffs_data_names.shape)
ffs_data_names.head()

(4746, 5)


Unnamed: 0,name,Team,position,season,key
0,robin_van persie,ARS,FWD,2011-2012,1
1,bobby_zamora,QPR,FWD,2011-2012,1
2,gervais_yao kouassi,ARS,FWD,2011-2012,1
3,danny_welbeck,MUN,FWD,2011-2012,1
4,fernando_torres,CHE,FWD,2011-2012,1


In [11]:
gw_df = pd.DataFrame({'gw': range(1, 39)})
gw_df['key'] = 1
gw_df.head()

Unnamed: 0,gw,key
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


In [12]:
all_player_season_gw_df = gw_df.merge(ffs_data_names, on='key')
all_player_season_gw_df.drop('key', axis=1, inplace=True)
all_player_season_gw_df.shape

(180348, 5)

In [13]:
all_player_season_gw_df.head()

Unnamed: 0,gw,name,Team,position,season
0,1,robin_van persie,ARS,FWD,2011-2012
1,1,bobby_zamora,QPR,FWD,2011-2012
2,1,gervais_yao kouassi,ARS,FWD,2011-2012
3,1,danny_welbeck,MUN,FWD,2011-2012
4,1,fernando_torres,CHE,FWD,2011-2012


In [14]:
all_player_season_gw_df.sort_values(['name', 'season', 'gw'], inplace=True)

In [15]:
all_player_season_gw_df.head()

Unnamed: 0,gw,name,Team,position,season
822,1,aaron_connolly,BHA,FWD,2019-2020
5568,2,aaron_connolly,BHA,FWD,2019-2020
10314,3,aaron_connolly,BHA,FWD,2019-2020
15060,4,aaron_connolly,BHA,FWD,2019-2020
19806,5,aaron_connolly,BHA,FWD,2019-2020


In [16]:
ffs_data = all_player_season_gw_df.merge(combined_ffs_all_data, on=['name', 'Team', 'position', 'gw', 'season'], how='left', indicator=True)
print(ffs_data.shape)
ffs_data.head()

(180348, 67)


Unnamed: 0,gw,name,Team,position,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,_merge
0,1,aaron_connolly,BHA,FWD,2019-2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
1,2,aaron_connolly,BHA,FWD,2019-2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
2,3,aaron_connolly,BHA,FWD,2019-2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
3,4,aaron_connolly,BHA,FWD,2019-2020,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,100.0,0.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,inf,1.0,1.0,0.0,0.0,0.0,1.0,both
4,5,aaron_connolly,BHA,FWD,2019-2020,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.14,2.2,0.0,0.0,0.0,42.9,0.0,0.0,0.0,50.0,0.0,1.0,both


In [17]:
ffs_data['_merge'].value_counts()

both          92410
left_only     87938
right_only        0
Name: _merge, dtype: int64

In [18]:
ffs_data.drop(columns='_merge', inplace=True)

In [19]:
ffs_data.sort_values(['name', 'season', 'gw'], inplace=True)
ffs_data.head()

Unnamed: 0,gw,name,Team,position,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points
0,1,aaron_connolly,BHA,FWD,2019-2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2,aaron_connolly,BHA,FWD,2019-2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3,aaron_connolly,BHA,FWD,2019-2020,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,4,aaron_connolly,BHA,FWD,2019-2020,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,100.0,0.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,inf,1.0,1.0,0.0,0.0,0.0,1.0
4,5,aaron_connolly,BHA,FWD,2019-2020,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.14,2.2,0.0,0.0,0.0,42.9,0.0,0.0,0.0,50.0,0.0,1.0


In [20]:
ffs_data.isnull().sum()

gw                               0
name                             0
Team                             0
position                         0
season                           0
                             ...  
Passes - Forward             87938
Saves (Shots Outside Box)    87938
Shot Accuracy                87938
Shots Blocked                87938
total_points                 87938
Length: 66, dtype: int64

In [21]:
# From an offline check the missing entries are 0 minutes players in a given GW. We can therefore fill all missing data points with 0.
ffs_data.fillna(0, inplace=True)

### Position dummies

In [22]:
ffs_data = pd.get_dummies(ffs_data, columns=['position'])

In [23]:
ffs_data.rename(columns={'Team': 'team_name'}, inplace=True)

## 2. Fixture and odds data

Historical fixtures taken from FFS however looks like this only records finished matches. Instead we use the FPL API to get fixtures.

### Upcoming fixtures

In [24]:
FIXTURE_URL = "https://fantasy.premierleague.com/api/fixtures/"

In [25]:
fixtures = pd.json_normalize(_get_fpl_json(FIXTURE_URL))
print(fixtures.shape)
fixtures.head()

(380, 16)


Unnamed: 0,code,event,finished,finished_provisional,id,kickoff_time,minutes,provisional_start_time,started,team_a,team_a_score,team_h,team_h_score,stats,team_h_difficulty,team_a_difficulty
0,2128286,,False,False,379,,0,False,,13,,4,,[],4,3
1,2128290,,False,False,380,,0,False,,2,,12,,[],2,5
2,2128288,1.0,False,False,2,2020-09-12T11:30:00Z,0,False,False,1,,8,,[],3,2
3,2128287,1.0,False,False,1,2020-09-12T14:00:00Z,0,False,False,16,,6,,[],3,2
4,2128293,1.0,False,False,6,2020-09-12T14:00:00Z,0,False,False,14,,19,,[],2,2


In [26]:
fixtures.rename(columns={'event': 'gw'}, inplace=True)
fixtures = fixtures[['gw', 'team_a', 'team_h']]
fixtures.head()

Unnamed: 0,gw,team_a,team_h
0,,13,4
1,,2,12
2,1.0,1,8
3,1.0,16,6
4,1.0,14,19


### Odds data

Let's assume that there won't be any null odds for the next set of fixtures. We therefore skip the step of calculating mean odds from the previous season.

In [27]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--headless")

In [28]:
driver = webdriver.Chrome(
    "../../Python/Chrome Driver/chromedriver",
    options=chrome_options,
)

In [29]:
driver.implicitly_wait(10)

In [30]:
driver.get('https://www.oddsportal.com/soccer/england/premier-league/')
time.sleep(1)

In [31]:
driver.find_element_by_xpath('//*[@id="show-all-link"]/div/div/div/div/p/a').click()  # Expand odds box

In [32]:
TABLE_XPATH = '//*[@id="tournamentTable"]'

In [33]:
tbl = driver.find_element_by_xpath(TABLE_XPATH).get_attribute("outerHTML")
odds_table = pd.read_html(tbl, header=0)[0]

In [34]:
odds_table.head()

Unnamed: 0,Soccer» England»Premier League,Soccer» England»Premier League.1,Soccer» England»Premier League.2,Soccer» England»Premier League.3,Soccer» England»Premier League.4,Soccer» England»Premier League.5,Soccer» England»Premier League.6
0,12 Sep 2020,12 Sep 2020,12 Sep 2020,1,X,2,B's
1,,,,,,,
2,11:30,Fulham - Arsenal,Fulham - Arsenal,227/50,33/10,3/5,12
3,14:00,Crystal Palace - Southampton,Crystal Palace - Southampton,193/100,113/50,39/25,12
4,14:00,West Ham - Newcastle,West Ham - Newcastle,117/100,249/100,121/50,12


In [35]:
COLUMN_NAMES = ['KO', 'Match','Match_dup', '1', 'X', '2', 'num_available_bookmakers']

In [36]:
odds_table.columns = COLUMN_NAMES
odds_table.dropna(axis=0, how='all', inplace=True)

In [37]:
# Keep matches only (some rows are repeats of the header)
odds_table = odds_table[odds_table['Match'].str.contains('-')]

for odd_col in ['1', 'X', '2']:
    odds_table[odd_col].replace('-', np.nan, inplace=True)
    odds_table.loc[
        ~odds_table[odd_col].isnull(), 
        odd_col
    ] = odds_table.loc[
        ~odds_table[odd_col].isnull(), 
        odd_col
    ].str.split('/').apply(lambda x: float(x[0]) / float(x[1]))

odds_table['home_team'] = odds_table['Match'].str.split(' - ').apply(lambda x: x[0])
odds_table['away_team'] = odds_table['Match'].str.split(' - ').apply(lambda x: x[1])

In [38]:
odds_table

Unnamed: 0,KO,Match,Match_dup,1,X,2,num_available_bookmakers,home_team,away_team
2,11:30,Fulham - Arsenal,Fulham - Arsenal,4.54,3.3,0.6,12,Fulham,Arsenal
3,14:00,Crystal Palace - Southampton,Crystal Palace - Southampton,1.93,2.26,1.56,12,Crystal Palace,Southampton
4,14:00,West Ham - Newcastle,West Ham - Newcastle,1.17,2.49,2.42,12,West Ham,Newcastle
5,16:30,Liverpool - Leeds,Liverpool - Leeds,0.28,5.21,9.11,12,Liverpool,Leeds
8,13:00,West Brom - Leicester,West Brom - Leicester,2.77,2.65,0.99,11,West Brom,Leicester
9,15:30,Tottenham - Everton,Tottenham - Everton,0.86,2.7,3.28,11,Tottenham,Everton
12,19:00,Brighton - Chelsea,Brighton - Chelsea,3.99,3.03,0.69,11,Brighton,Chelsea
13,19:00,Sheffield Utd - Wolves,Sheffield Utd - Wolves,2.45,2.14,1.31,11,Sheffield Utd,Wolves
16,11:30,Everton - West Brom,Everton - West Brom,0.61,3.2,4.44,9,Everton,West Brom
17,14:00,Leeds - Fulham,Leeds - Fulham,0.79,2.73,3.62,9,Leeds,Fulham


In [39]:
odds_table.drop(columns=['Match', 'Match_dup', 'KO', 'num_available_bookmakers'], inplace=True)

In [40]:
odds_table.head()

Unnamed: 0,1,X,2,home_team,away_team
2,4.54,3.3,0.6,Fulham,Arsenal
3,1.93,2.26,1.56,Crystal Palace,Southampton
4,1.17,2.49,2.42,West Ham,Newcastle
5,0.28,5.21,9.11,Liverpool,Leeds
8,2.77,2.65,0.99,West Brom,Leicester


In [41]:
driver.close()

### Team data

In [42]:
team_data = pd.read_csv('data/external/team_season_data.csv')
team_data.head()

Unnamed: 0,team,team_name,promoted_side,top_6_last_season,season,team_name_ffs
0,1.0,Arsenal,0,1,2016-17,Arsenal
1,2.0,Bournemouth,0,0,2016-17,Bournemouth
2,3.0,Burnley,1,0,2016-17,Burnley
3,4.0,Chelsea,0,0,2016-17,Chelsea
4,5.0,Crystal Palace,0,0,2016-17,Crystal Palace


### TODO: May need to change FFS names for promoted sides after GW1

In [43]:
team_data['season'] = team_data['season'].apply(lambda x: x.replace('-','-20'))

In [44]:
SEASON = '2020-2021'
team_data = team_data.copy()[team_data['season'] == SEASON]

In [45]:
team_data.drop(columns=['team_name', 'season'], inplace=True)
team_data.rename(columns={'team_name_ffs': 'team_name'}, inplace=True)

In [46]:
team_data

Unnamed: 0,team,promoted_side,top_6_last_season,team_name
80,1.0,0,0,Arsenal
81,2.0,0,0,Aston Villa
82,3.0,0,0,Brighton and Hove Albion
83,4.0,0,0,Burnley
84,5.0,0,1,Chelsea
85,6.0,0,0,Crystal Palace
86,7.0,0,0,Everton
87,8.0,1,0,Fulham
88,9.0,0,1,Leicester City
89,10.0,1,0,Leeds


### Combine

In [47]:
fixtures.head()

Unnamed: 0,gw,team_a,team_h
0,,13,4
1,,2,12
2,1.0,1,8
3,1.0,16,6
4,1.0,14,19


In [48]:
team_data_a = team_data.copy()
team_data_a.columns = [col + '_a' for col in team_data_a.columns]

In [49]:
fixtures = fixtures.merge(team_data_a, on='team_a')
fixtures.head()

Unnamed: 0,gw,team_a,team_h,promoted_side_a,top_6_last_season_a,team_name_a
0,,13,4,0,1,Manchester United
1,3.0,13,3,0,1,Manchester United
2,5.0,13,14,0,1,Manchester United
3,8.0,13,7,0,1,Manchester United
4,10.0,13,16,0,1,Manchester United


In [50]:
team_data_h = team_data.copy()
team_data_h.columns = [col + '_h' for col in team_data_h.columns]

In [51]:
fixtures = fixtures.merge(team_data_h, on='team_h')
fixtures.head()

Unnamed: 0,gw,team_a,team_h,promoted_side_a,top_6_last_season_a,team_name_a,promoted_side_h,top_6_last_season_h,team_name_h
0,,13,4,0,1,Manchester United,0,0,Burnley
1,20.0,2,4,0,0,Aston Villa,0,0,Burnley
2,27.0,1,4,0,0,Arsenal,0,0,Burnley
3,3.0,16,4,0,0,Southampton,0,0,Burnley
4,31.0,14,4,0,0,Newcastle United,0,0,Burnley


In [52]:
fixtures.sort_values('gw', inplace=True)

#### Align names so all use FFS

In [53]:
ffs_team_names_set = set(team_data['team_name'])
odds_team_names_set = set(list(odds_table['home_team']) + list(odds_table['away_team']))

ffs_team_names_set - odds_team_names_set

{'Brighton and Hove Albion',
 'Leicester City',
 'Manchester United',
 'Newcastle United',
 'Sheffield United',
 'Tottenham Hotspur',
 'West Bromwich Albion',
 'West Ham United',
 'Wolverhampton Wanderers'}

In [54]:
odds_team_names_set - ffs_team_names_set

{'Brighton',
 'Leicester',
 'Manchester Utd',
 'Newcastle',
 'Sheffield Utd',
 'Tottenham',
 'West Brom',
 'West Ham',
 'Wolves'}

In [55]:
from src.data.historical_fixture_and_odds import ODDS_DATA_TEAM_NAME_TO_FFS_TEAM_NAME
ODDS_DATA_TEAM_NAME_TO_FFS_TEAM_NAME

{'Blackburn': 'Blackburn Rovers',
 'Bolton': 'Bolton Wanderers',
 'Brighton': 'Brighton and Hove Albion',
 'Cardiff': 'Cardiff City',
 'Huddersfield': 'Huddersfield Town',
 'Hull': 'Hull City',
 'Leicester': 'Leicester City',
 'Manchester Utd': 'Manchester United',
 'Newcastle': 'Newcastle United',
 'Norwich': 'Norwich City',
 'QPR': 'Queens Park Rangers',
 'Sheffield Utd': 'Sheffield United',
 'Stoke': 'Stoke City',
 'Swansea': 'Swansea City',
 'Tottenham': 'Tottenham Hotspur',
 'West Brom': 'West Bromwich Albion',
 'West Ham': 'West Ham United',
 'Wigan': 'Wigan Athletic',
 'Wolves': 'Wolverhampton Wanderers'}

In [56]:
odds_table.head()

Unnamed: 0,1,X,2,home_team,away_team
2,4.54,3.3,0.6,Fulham,Arsenal
3,1.93,2.26,1.56,Crystal Palace,Southampton
4,1.17,2.49,2.42,West Ham,Newcastle
5,0.28,5.21,9.11,Liverpool,Leeds
8,2.77,2.65,0.99,West Brom,Leicester


In [57]:
odds_table['home_team'].replace(ODDS_DATA_TEAM_NAME_TO_FFS_TEAM_NAME, inplace=True)
odds_table['away_team'].replace(ODDS_DATA_TEAM_NAME_TO_FFS_TEAM_NAME, inplace=True)

In [58]:
ffs_team_names_set = set(team_data['team_name'])
odds_team_names_set = set(list(odds_table['home_team']) + list(odds_table['away_team']))

print(ffs_team_names_set - odds_team_names_set)
print(odds_team_names_set - ffs_team_names_set)

set()
set()


#### Merge odds data

In [59]:
fixtures.head()

Unnamed: 0,gw,team_a,team_h,promoted_side_a,top_6_last_season_a,team_name_a,promoted_side_h,top_6_last_season_h,team_name_h
27,1.0,5,3,0,1,Chelsea,0,0,Brighton and Hove Albion
123,1.0,20,15,0,0,Wolverhampton Wanderers,0,0,Sheffield United
269,1.0,16,6,0,0,Southampton,0,0,Crystal Palace
215,1.0,9,18,0,1,Leicester City,1,0,West Bromwich Albion
99,1.0,14,19,0,0,Newcastle United,0,0,West Ham United


In [60]:
odds_table.head()

Unnamed: 0,1,X,2,home_team,away_team
2,4.54,3.3,0.6,Fulham,Arsenal
3,1.93,2.26,1.56,Crystal Palace,Southampton
4,1.17,2.49,2.42,West Ham United,Newcastle United
5,0.28,5.21,9.11,Liverpool,Leeds
8,2.77,2.65,0.99,West Bromwich Albion,Leicester City


In [61]:
fixtures = fixtures.merge(odds_table, left_on=['team_name_h', 'team_name_a'], right_on=['home_team', 'away_team'], how='left')
fixtures.head()

Unnamed: 0,gw,team_a,team_h,promoted_side_a,top_6_last_season_a,team_name_a,promoted_side_h,top_6_last_season_h,team_name_h,1,X,2,home_team,away_team
0,1.0,5,3,0,1,Chelsea,0,0,Brighton and Hove Albion,3.99,3.03,0.69,Brighton and Hove Albion,Chelsea
1,1.0,20,15,0,0,Wolverhampton Wanderers,0,0,Sheffield United,2.45,2.14,1.31,Sheffield United,Wolverhampton Wanderers
2,1.0,16,6,0,0,Southampton,0,0,Crystal Palace,1.93,2.26,1.56,Crystal Palace,Southampton
3,1.0,9,18,0,1,Leicester City,1,0,West Bromwich Albion,2.77,2.65,0.99,West Bromwich Albion,Leicester City
4,1.0,14,19,0,0,Newcastle United,0,0,West Ham United,1.17,2.49,2.42,West Ham United,Newcastle United


In [62]:
fixtures.rename(
    columns={
        '1': 'home_win',
        'X': 'draw',
        '2': 'away_win'
    },
    inplace=True
)

#### Reformat fixture data

Amend fixture data so that it is at a team level and specifies whether that team played home or away. This doubles the size of the dataset.

In [63]:
teams_in_gw_1 = fixtures.copy()
teams_in_gw_1['team_name'] = teams_in_gw_1['home_team']
teams_in_gw_1['team_name_opponent'] = teams_in_gw_1['away_team']
teams_in_gw_1['was_home'] = True

teams_in_gw_2 = fixtures.copy()
teams_in_gw_2['team_name'] = teams_in_gw_2['away_team']
teams_in_gw_2['team_name_opponent'] = teams_in_gw_2['home_team']
teams_in_gw_2['was_home'] = False

teams_in_gw = teams_in_gw_1.append(teams_in_gw_2)

print(teams_in_gw.shape)
teams_in_gw.head()

(760, 17)


Unnamed: 0,gw,team_a,team_h,promoted_side_a,top_6_last_season_a,team_name_a,promoted_side_h,top_6_last_season_h,team_name_h,home_win,draw,away_win,home_team,away_team,team_name,team_name_opponent,was_home
0,1.0,5,3,0,1,Chelsea,0,0,Brighton and Hove Albion,3.99,3.03,0.69,Brighton and Hove Albion,Chelsea,Brighton and Hove Albion,Chelsea,True
1,1.0,20,15,0,0,Wolverhampton Wanderers,0,0,Sheffield United,2.45,2.14,1.31,Sheffield United,Wolverhampton Wanderers,Sheffield United,Wolverhampton Wanderers,True
2,1.0,16,6,0,0,Southampton,0,0,Crystal Palace,1.93,2.26,1.56,Crystal Palace,Southampton,Crystal Palace,Southampton,True
3,1.0,9,18,0,1,Leicester City,1,0,West Bromwich Albion,2.77,2.65,0.99,West Bromwich Albion,Leicester City,West Bromwich Albion,Leicester City,True
4,1.0,14,19,0,0,Newcastle United,0,0,West Ham United,1.17,2.49,2.42,West Ham United,Newcastle United,West Ham United,Newcastle United,True


In [64]:
teams_in_gw['win_odds'] = np.where(
    teams_in_gw['was_home'],
    teams_in_gw['home_win'],
    teams_in_gw['away_win']
)

teams_in_gw['lose_odds'] = np.where(
    teams_in_gw['was_home'],
    teams_in_gw['away_win'],
    teams_in_gw['home_win']
)

teams_in_gw.rename(columns={'draw': 'draw_odds'}, inplace=True)

for col in ['win_odds', 'lose_odds', 'draw_odds']:
    teams_in_gw[col] = pd.to_numeric(teams_in_gw[col])

In [65]:
teams_in_gw.head()

Unnamed: 0,gw,team_a,team_h,promoted_side_a,top_6_last_season_a,team_name_a,promoted_side_h,top_6_last_season_h,team_name_h,home_win,draw_odds,away_win,home_team,away_team,team_name,team_name_opponent,was_home,win_odds,lose_odds
0,1.0,5,3,0,1,Chelsea,0,0,Brighton and Hove Albion,3.99,3.03,0.69,Brighton and Hove Albion,Chelsea,Brighton and Hove Albion,Chelsea,True,3.99,0.69
1,1.0,20,15,0,0,Wolverhampton Wanderers,0,0,Sheffield United,2.45,2.14,1.31,Sheffield United,Wolverhampton Wanderers,Sheffield United,Wolverhampton Wanderers,True,2.45,1.31
2,1.0,16,6,0,0,Southampton,0,0,Crystal Palace,1.93,2.26,1.56,Crystal Palace,Southampton,Crystal Palace,Southampton,True,1.93,1.56
3,1.0,9,18,0,1,Leicester City,1,0,West Bromwich Albion,2.77,2.65,0.99,West Bromwich Albion,Leicester City,West Bromwich Albion,Leicester City,True,2.77,0.99
4,1.0,14,19,0,0,Newcastle United,0,0,West Ham United,1.17,2.49,2.42,West Ham United,Newcastle United,West Ham United,Newcastle United,True,1.17,2.42


In [66]:
teams_in_gw.drop(columns=['team_a', 'team_h', 'team_name_a', 'team_name_h', 'home_win', 'away_win', 'home_team', 'away_team'], inplace=True)
teams_in_gw.head()

Unnamed: 0,gw,promoted_side_a,top_6_last_season_a,promoted_side_h,top_6_last_season_h,draw_odds,team_name,team_name_opponent,was_home,win_odds,lose_odds
0,1.0,0,1,0,0,3.03,Brighton and Hove Albion,Chelsea,True,3.99,0.69
1,1.0,0,0,0,0,2.14,Sheffield United,Wolverhampton Wanderers,True,2.45,1.31
2,1.0,0,0,0,0,2.26,Crystal Palace,Southampton,True,1.93,1.56
3,1.0,0,1,1,0,2.65,West Bromwich Albion,Leicester City,True,2.77,0.99
4,1.0,0,0,0,0,2.49,West Ham United,Newcastle United,True,1.17,2.42


In [67]:
teams_in_gw['promoted_side'] = np.where(
    teams_in_gw['was_home'],
    teams_in_gw['promoted_side_h'],
    teams_in_gw['promoted_side_a']
)

teams_in_gw['top_6_last_season'] = np.where(
    teams_in_gw['was_home'],
    teams_in_gw['top_6_last_season_h'],
    teams_in_gw['top_6_last_season_a']
)

teams_in_gw['promoted_side_opponent'] = np.where(
    teams_in_gw['was_home'],
    teams_in_gw['promoted_side_a'],
    teams_in_gw['promoted_side_h']
)

teams_in_gw['top_6_last_season_opponent'] = np.where(
    teams_in_gw['was_home'],
    teams_in_gw['top_6_last_season_a'],
    teams_in_gw['top_6_last_season_h']
)

In [68]:
teams_in_gw.drop(columns=['promoted_side_a', 'top_6_last_season_a', 'promoted_side_h', 'top_6_last_season_h'], inplace=True)

In [69]:
teams_in_gw.head()

Unnamed: 0,gw,draw_odds,team_name,team_name_opponent,was_home,win_odds,lose_odds,promoted_side,top_6_last_season,promoted_side_opponent,top_6_last_season_opponent
0,1.0,3.03,Brighton and Hove Albion,Chelsea,True,3.99,0.69,0,0,0,1
1,1.0,2.14,Sheffield United,Wolverhampton Wanderers,True,2.45,1.31,0,0,0,0
2,1.0,2.26,Crystal Palace,Southampton,True,1.93,1.56,0,0,0,0
3,1.0,2.65,West Bromwich Albion,Leicester City,True,2.77,0.99,1,0,0,1
4,1.0,2.49,West Ham United,Newcastle United,True,1.17,2.42,0,0,0,0


### Groupby aggregations

We want to collapse these cases into a single row. We will do the following aggregations:

- `draw_odds`: mean
- `win_odds`: mean
- `lose_odds`: mean
- `was_home`: sum (rename to number of home matches)
- `promoted_side_opponent`: sum (number of)
- `top_6_last_season_opponent`: sum (number of)
- `promoted_side`: mean (get original value back)
- `top_6_last_season`: mean (get original value back)
- Also add `number_of_matches` to identify double gameweeks

In [70]:
teams_in_gw.rename(
    columns={
        'was_home': 'number_of_home_matches',
        'promoted_side_opponent': 'number_of_promoted_side_opponent',
        'top_6_last_season_opponent': 'number_of_top_6_last_season_opponent',
    },
    inplace=True
)

In [71]:
teams_in_gw['number_of_matches'] = 1

In [72]:
teams_in_gw.head()

Unnamed: 0,gw,draw_odds,team_name,team_name_opponent,number_of_home_matches,win_odds,lose_odds,promoted_side,top_6_last_season,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,number_of_matches
0,1.0,3.03,Brighton and Hove Albion,Chelsea,True,3.99,0.69,0,0,0,1,1
1,1.0,2.14,Sheffield United,Wolverhampton Wanderers,True,2.45,1.31,0,0,0,0,1
2,1.0,2.26,Crystal Palace,Southampton,True,1.93,1.56,0,0,0,0,1
3,1.0,2.65,West Bromwich Albion,Leicester City,True,2.77,0.99,1,0,0,1,1
4,1.0,2.49,West Ham United,Newcastle United,True,1.17,2.42,0,0,0,0,1


In [73]:
agg_dict = {
    'draw_odds': 'mean',
    'win_odds': 'mean',
    'lose_odds': 'mean',
    'number_of_home_matches': 'sum',
    'number_of_promoted_side_opponent': 'sum',
    'number_of_top_6_last_season_opponent': 'sum',
    'promoted_side': 'mean',
    'top_6_last_season': 'mean',
    'number_of_matches': 'sum'
}

teams_in_gw_agg = teams_in_gw.groupby(['gw', 'team_name']).agg(agg_dict).reset_index()

In [74]:
teams_in_gw_agg.head()

Unnamed: 0,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,number_of_matches
0,1.0,Arsenal,3.3,0.6,4.54,False,1,0,0,0,1
1,1.0,Brighton and Hove Albion,3.03,3.99,0.69,True,0,1,0,0,1
2,1.0,Chelsea,3.03,0.69,3.99,False,0,0,0,1,1
3,1.0,Crystal Palace,2.26,1.93,1.56,True,0,0,0,0,1
4,1.0,Everton,2.7,3.28,0.86,False,0,1,0,0,1


In [75]:
teams_in_gw_agg.shape

(36, 11)

Note: Groupby exlcudes nulls, in this case gameweeks without odds data.

In [76]:
teams_in_gw_agg

Unnamed: 0,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,number_of_matches
0,1.0,Arsenal,3.3,0.6,4.54,False,1,0,0,0,1
1,1.0,Brighton and Hove Albion,3.03,3.99,0.69,True,0,1,0,0,1
2,1.0,Chelsea,3.03,0.69,3.99,False,0,0,0,1,1
3,1.0,Crystal Palace,2.26,1.93,1.56,True,0,0,0,0,1
4,1.0,Everton,2.7,3.28,0.86,False,0,1,0,0,1
5,1.0,Fulham,3.3,4.54,0.6,True,0,0,1,0,1
6,1.0,Leeds,5.21,9.11,0.28,False,0,1,1,0,1
7,1.0,Leicester City,2.65,0.99,2.77,False,1,0,0,1,1
8,1.0,Liverpool,5.21,0.28,9.11,True,1,0,0,1,1
9,1.0,Newcastle United,2.49,2.42,1.17,False,0,0,0,0,1


#### Double gameweek flag

In [77]:
teams_in_gw_agg['number_of_matches'].value_counts()

1    36
Name: number_of_matches, dtype: int64

In [78]:
teams_in_gw_agg['double_gameweek'] = np.where(
    teams_in_gw_agg['number_of_matches'] == 2,
    1,
    0
)

In [79]:
teams_in_gw_agg.drop('number_of_matches', axis=1, inplace=True)

In [80]:
teams_in_gw_agg.head()

Unnamed: 0,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek
0,1.0,Arsenal,3.3,0.6,4.54,False,1,0,0,0,0
1,1.0,Brighton and Hove Albion,3.03,3.99,0.69,True,0,1,0,0,0
2,1.0,Chelsea,3.03,0.69,3.99,False,0,0,0,1,0
3,1.0,Crystal Palace,2.26,1.93,1.56,True,0,0,0,0,0
4,1.0,Everton,2.7,3.28,0.86,False,0,1,0,0,0


### Next match/gameweek features

Need to load old data to get next gameweek features for GW 38 of previous season

In [81]:
historical_fixture_and_odds_features = pd.read_parquet('data/processed/formatted_fixture_and_odds_features_2011_to_2020.parquet')
historical_fixture_and_odds_features.head()

Unnamed: 0,season,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek
0,2011-2012,1,Arsenal,2.31,1.12,2.46,0.0,0,0,0,1,0,2.26,1.32,2.09,1.0,0.0,1.0,0.0
1,2011-2012,2,Arsenal,2.26,1.32,2.09,1.0,0,1,0,1,0,3.93,7.28,0.36,0.0,0.0,1.0,0.0
2,2011-2012,3,Arsenal,3.93,7.28,0.36,0.0,0,1,0,1,0,3.63,0.36,8.1,1.0,1.0,0.0,0.0
3,2011-2012,4,Arsenal,3.63,0.36,8.1,1.0,1,0,0,1,0,2.38,1.0,2.79,0.0,0.0,0.0,0.0
4,2011-2012,5,Arsenal,2.38,1.0,2.79,0.0,0,0,0,1,0,3.75,0.36,7.63,1.0,0.0,0.0,0.0


In [82]:
latest_fixture_and_odds = teams_in_gw_agg.copy()
latest_fixture_and_odds['season'] = SEASON

In [83]:
fixture_and_odds_features = historical_fixture_and_odds_features.append(latest_fixture_and_odds)

In [84]:
fixture_and_odds_features.sort_values(['season', 'team_name', 'gw'], inplace=True)

In [85]:
for feature in [
    'draw_odds', 
    'win_odds', 
    'lose_odds', 
    'number_of_home_matches', 
    'number_of_promoted_side_opponent', 
    'number_of_top_6_last_season_opponent',
    'double_gameweek'
]:
    fixture_and_odds_features[f'next_gameweek_{feature}'] = fixture_and_odds_features.groupby(['team_name'])[feature].shift(-1)

In [86]:
fixture_and_odds_features[(fixture_and_odds_features['season'] == '2019-2020') & (fixture_and_odds_features['gw'] == 38)].head()

Unnamed: 0,season,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek
5966,2019-2020,38.0,Arsenal,2.81,1.01,2.61,1.0,0,0,0,1,0,3.3,0.6,4.54,0.0,1.0,0.0,0.0
6003,2019-2020,38.0,Aston Villa,2.56,1.67,1.62,0.0,0,0,1,0,0,2.31,1.74,1.65,1.0,0.0,0.0,0.0
6041,2019-2020,38.0,Bournemouth,2.68,1.94,1.35,0.0,0,0,0,0,0,,,,,,,
6079,2019-2020,38.0,Brighton and Hove Albion,2.36,1.77,1.64,0.0,0,0,0,0,0,3.03,3.99,0.69,1.0,0.0,1.0,0.0
6117,2019-2020,38.0,Burnley,2.36,1.64,1.77,1.0,0,0,0,0,0,2.95,4.23,0.66,0.0,0.0,1.0,0.0


In [87]:
fixture_and_odds_features.shape

(6720, 19)

In [90]:
fixture_and_odds_features[(fixture_and_odds_features['season'] == '2020-2021')]['gw'].value_counts().sort_index()

1.0    16
2.0    20
Name: gw, dtype: int64

Given that Oddsportal includes odds for the next 2 gameweeks we can incorporate this in a future version of the model.

## 3. Combine all data

Merging fixture data also serves another purpose. In the previous step we created _all_ possible gameweek-player combinations. However, due to double gameweeks there are many cases where a player simply would not feature in a given gameweek and in this case we want to remove that player-gameweek for the data. An inner join to fixtures data does this.

In [91]:
fixture_and_odds_features.head()

Unnamed: 0,season,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek
0,2011-2012,1.0,Arsenal,2.31,1.12,2.46,0.0,0,0,0,1,0,2.26,1.32,2.09,1.0,0.0,1.0,0.0
1,2011-2012,2.0,Arsenal,2.26,1.32,2.09,1.0,0,1,0,1,0,3.93,7.28,0.36,0.0,0.0,1.0,0.0
2,2011-2012,3.0,Arsenal,3.93,7.28,0.36,0.0,0,1,0,1,0,3.63,0.36,8.1,1.0,1.0,0.0,0.0
3,2011-2012,4.0,Arsenal,3.63,0.36,8.1,1.0,1,0,0,1,0,2.38,1.0,2.79,0.0,0.0,0.0,0.0
4,2011-2012,5.0,Arsenal,2.38,1.0,2.79,0.0,0,0,0,1,0,3.75,0.36,7.63,1.0,0.0,0.0,0.0


### Format FFS team names and seasons to match

In [92]:
ffs_data.head()

Unnamed: 0,gw,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,position_DEF,position_FWD,position_GK,position_MID
0,1,aaron_connolly,BHA,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,2,aaron_connolly,BHA,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
2,3,aaron_connolly,BHA,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
3,4,aaron_connolly,BHA,2019-2020,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,100.0,0.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,inf,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0,0
4,5,aaron_connolly,BHA,2019-2020,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.14,2.2,0.0,0.0,0.0,42.9,0.0,0.0,0.0,50.0,0.0,1.0,0,1,0,0


In [93]:
ffs_abbreviation_to_full = {
    'WHU': 'West Ham United', 
    'BUR': 'Burnley', 
    'HUD': 'Huddersfield Town', 
    'ARS': 'Arsenal', 
    'CRY': 'Crystal Palace', 
    'WAT': 'Watford', 
    'FUL': 'Fulham', 
    'LIV': 'Liverpool', 
    'BOU': 'Bournemouth',
    'WOL': 'Wolverhampton Wanderers', 
    'EVE': 'Everton', 
    'LEI': 'Leicester City', 
    'WBA': 'West Bromwich Albion', 
    'NEW': 'Newcastle United', 
    'SOU': 'Southampton', 
    'MUN': 'Manchester United', 
    'SWA': 'Swansea City', 
    'BHA': 'Brighton and Hove Albion',
    'CHE': 'Chelsea',
    'CAR': 'Cardiff City',
    'MCI': 'Manchester City',
    'TOT': 'Tottenham Hotspur',
    'STK': 'Stoke City',
    'AVL': 'Aston Villa',
    'BLA': 'Blackburn Rovers',
    'BOL': 'Bolton Wanderers',
    'HUL': 'Hull City',
    'MID': 'Middlesbrough',
    'NOR': 'Norwich City',
    'QPR': 'Queens Park Rangers',
    'RDG': 'Reading',
    'SHU': 'Sheffield United',
    'SUN': 'Sunderland',
    'WIG': 'Wigan Athletic'
}

In [94]:
set(ffs_data['team_name'].replace(ffs_abbreviation_to_full)) - set(fixture_and_odds_features['team_name'])

set()

In [95]:
set(fixture_and_odds_features['team_name']) - set(ffs_data['team_name'].replace(ffs_abbreviation_to_full))

{'Leeds'}

In [96]:
ffs_data['team_name'].replace(ffs_abbreviation_to_full, inplace=True)

In [97]:
ffs_data.head()

Unnamed: 0,gw,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,position_DEF,position_FWD,position_GK,position_MID
0,1,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,2,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
2,3,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
3,4,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,100.0,0.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,inf,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0,0
4,5,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.14,2.2,0.0,0.0,0.0,42.9,0.0,0.0,0.0,50.0,0.0,1.0,0,1,0,0


In [98]:
ffs_data.shape

(180348, 69)

### Merge

In [99]:
fixture_and_odds_features.head()

Unnamed: 0,season,gw,team_name,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek
0,2011-2012,1.0,Arsenal,2.31,1.12,2.46,0.0,0,0,0,1,0,2.26,1.32,2.09,1.0,0.0,1.0,0.0
1,2011-2012,2.0,Arsenal,2.26,1.32,2.09,1.0,0,1,0,1,0,3.93,7.28,0.36,0.0,0.0,1.0,0.0
2,2011-2012,3.0,Arsenal,3.93,7.28,0.36,0.0,0,1,0,1,0,3.63,0.36,8.1,1.0,1.0,0.0,0.0
3,2011-2012,4.0,Arsenal,3.63,0.36,8.1,1.0,1,0,0,1,0,2.38,1.0,2.79,0.0,0.0,0.0,0.0
4,2011-2012,5.0,Arsenal,2.38,1.0,2.79,0.0,0,0,0,1,0,3.75,0.36,7.63,1.0,0.0,0.0,0.0


In [100]:
ffs_data.head()

Unnamed: 0,gw,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,position_DEF,position_FWD,position_GK,position_MID
0,1,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,2,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
2,3,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
3,4,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,100.0,0.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,inf,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0,0
4,5,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.14,2.2,0.0,0.0,0.0,42.9,0.0,0.0,0.0,50.0,0.0,1.0,0,1,0,0


In [102]:
# Inner join also removes gw-season fixtures which did not happen but were filled with 0s
print(ffs_data.shape)

ffs_data = ffs_data.merge(
    fixture_and_odds_features, 
    on=['season', 'gw', 'team_name'], 
    how='inner'
)

print(ffs_data.shape)

ffs_data.head()

(180348, 69)
(176265, 85)


Unnamed: 0,gw,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,position_DEF,position_FWD,position_GK,position_MID,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek
0,1,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,2.29,3.09,1.08,0.0,0,0,0,0,0,2.45,1.21,2.41,1.0,0.0,0.0,0.0
1,1,aaron_mooy,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,2.29,3.09,1.08,0.0,0,0,0,0,0,2.45,1.21,2.41,1.0,0.0,0.0,0.0
2,1,adam_webster,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,2.29,3.09,1.08,0.0,0,0,0,0,0,2.45,1.21,2.41,1.0,0.0,0.0,0.0
3,1,alexis_mac allister,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,2.29,3.09,1.08,0.0,0,0,0,0,0,2.45,1.21,2.41,1.0,0.0,0.0,0.0
4,1,alireza_jahanbakhsh,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,2.29,3.09,1.08,0.0,0,0,0,0,0,2.45,1.21,2.41,1.0,0.0,0.0,0.0


In [103]:
ffs_data[
    (ffs_data['season'] == '2020-2021')
].head()

Unnamed: 0,gw,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,position_DEF,position_FWD,position_GK,position_MID,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek


In [104]:
ffs_data[
    (ffs_data['season'] == '2019-2020') & (ffs_data['gw'] == 38)
].head()

Unnamed: 0,gw,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Big Chances Created,Caught Offside,Chances Created (Right Zone),Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Distribution - Unsuccessful,Dribbles - Successful Percentage,Goal Kicks,Goals,Goals Conceded,Goals From Penalties,Ground Duels Lost,Handballs,ICT Influence,Minutes Per Save,Minutes Per Touch,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Tackles Lost,Throw Ins,Touches - Final Third,Saves,Shots - Inside Box,Shots - Long - Attempts,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Take Ons,Take Ons - Successful Percentage,Time Played,Bad Touches,Fouls,ICT Creativity,Own Goals,Penalties Missed,Premier League Yellow Cards,Saves From Penalty,Touches - Opponents Half - Right,Touches - Penalty Area,xGI Expected Goal Involvement,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Tackle Won,Minutes Per xG,Passes - Backward,Passes - Forward,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,total_points,position_DEF,position_FWD,position_GK,position_MID,draw_odds,win_odds,lose_odds,number_of_home_matches,number_of_promoted_side_opponent,number_of_top_6_last_season_opponent,promoted_side,top_6_last_season,double_gameweek,next_gameweek_draw_odds,next_gameweek_win_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_double_gameweek
925,38,aaron_connolly,Brighton and Hove Albion,2019-2020,16.7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,1.0,1.0,0.0,2.0,1.0,32.0,0.0,3.6,70.0,100.0,71.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,5.0,0.0,2.0,1.0,0.0,100.0,1.0,100.0,89.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,6.0,0.68,12.7,0.0,0.0,89.0,132.8,5.0,2.0,0.0,33.3,2.0,6.0,0,1,0,0,2.36,1.77,1.64,0.0,0,0,0,0,0,3.03,3.99,0.69,1.0,0.0,1.0,0.0
926,38,aaron_mooy,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.1,100.0,100.0,100.0,0.0,0.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,4.0,0.0,0.06,1.1,0.0,0.0,0.0,533.3,3.0,2.0,0.0,0.0,0.0,1.0,0,0,0,1,2.36,1.77,1.64,0.0,0,0,0,0,0,3.03,3.99,0.69,1.0,0.0,1.0,0.0
927,38,adam_webster,Brighton and Hove Albion,2019-2020,83.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,19.0,0.0,1.8,62.0,100.0,80.0,0.0,0.0,8.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.01,2.0,90.0,90.0,0.0,inf,6.0,29.0,0.0,0.0,0.0,2.0,1,0,0,0,2.36,1.77,1.64,0.0,0,0,0,0,0,3.03,3.99,0.69,1.0,0.0,1.0,0.0
928,38,alexis_mac allister,Brighton and Hove Albion,2019-2020,33.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.3,86.0,100.0,78.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,74.0,2.0,2.0,11.0,0.0,0.0,0.0,0.0,0.0,2.0,0.34,3.4,0.0,0.0,0.0,224.2,6.0,7.0,0.0,33.3,0.0,2.0,0,0,0,1,2.36,1.77,1.64,0.0,0,0,0,0,0,3.03,3.99,0.69,1.0,0.0,1.0,0.0
929,38,alireza_jahanbakhsh,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,100.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,inf,0.0,1.0,0.0,0.0,0.0,1.0,0,0,0,1,2.36,1.77,1.64,0.0,0,0,0,0,0,3.03,3.99,0.69,1.0,0.0,1.0,0.0


### Filter final features

In [107]:
FINAL_FEATURES = [
    'Aerial Duels - Won - Percentage',
    'Assists',
    'Bad Touches',
    'Big Chances Created',
    'Caught Offside',
    'Chances From Counter Attack',
    'Clean Sheets',
    'Crosses - Open Play - Successful',
    'Crosses - Unsuccessful',
    'Distribution - Successful',
    'Dribbles - Successful Percentage',
    'Fouls',
    'Goals',
    'Goals Conceded',
    'Handballs',
    'ICT Creativity',
    'ICT Index',
    'Minutes Per Block',
    'Minutes Per Interception',
    'Minutes Per Save',
    'Minutes Per Tackle Won',
    'Minutes Per Touch',
    'xGI Expected Goal Involvement',
    'Pass Completion',
    'Pass Completion - Final Third',
    'Pass Completion - Opponents Half',
    'Passes - Backward',
    'Passes - Forward',
    'Premier League Straight Red Cards',
    'Premier League Total Red Cards',
    'Recoveries',
    'Saves (Shots Outside Box)',
    'Shot Accuracy',
    'Shots Blocked',
    'Shots On Target',
    'Subbed Off',
    'Subbed On',
    'Tackles - Won - Percentage',
    'Tackles Lost',
    'Take Ons',
    'Take Ons - Successful Percentage',
    'Throw Ins',
    'Time Played',
    'Touches - Final Third',
    'Touches - Penalty Area',
    'double_gameweek',
    'gw',
    'next_gameweek_double_gameweek',
    'next_gameweek_draw_odds',
    'next_gameweek_lose_odds',
    'next_gameweek_number_of_home_matches',
    'next_gameweek_number_of_promoted_side_opponent',
    'next_gameweek_number_of_top_6_last_season_opponent',
    'next_gameweek_win_odds',
    'number_of_home_matches',
    'number_of_top_6_last_season_opponent',
    'position_DEF',
    'position_FWD',
    'position_GK',
    'position_MID',
    'top_6_last_season',
    'total_points'
]

In [108]:
len(FINAL_FEATURES)

62

In [109]:
ffs_data = ffs_data[['name', 'team_name', 'season'] + FINAL_FEATURES]

In [110]:
ffs_data.shape

(176265, 65)

## 4. Make predictions

For now we just check that predictions can be made. The actual code to calculate predictions for a player will be similar to the original LSTM class.

In [111]:
ffs_data.sort_values(['name', 'season', 'gw'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [112]:
ffs_data.head()

Unnamed: 0,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Bad Touches,Big Chances Created,Caught Offside,Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Dribbles - Successful Percentage,Fouls,Goals,Goals Conceded,Handballs,ICT Creativity,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Save,Minutes Per Tackle Won,Minutes Per Touch,xGI Expected Goal Involvement,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Passes - Backward,Passes - Forward,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Tackles Lost,Take Ons,Take Ons - Successful Percentage,Throw Ins,Time Played,Touches - Final Third,Touches - Penalty Area,double_gameweek,gw,next_gameweek_double_gameweek,next_gameweek_draw_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_win_odds,number_of_home_matches,number_of_top_6_last_season_opponent,position_DEF,position_FWD,position_GK,position_MID,top_6_last_season,total_points
0,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0,2.45,2.41,1.0,0.0,0.0,1.21,0.0,0,0,1,0,0,0,0.0
25,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.0,2.3,2.29,1.0,0.0,0.0,1.34,1.0,0,0,1,0,0,0,0.0
50,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,0.0,10.67,0.09,0.0,0.0,1.0,30.23,1.0,0,0,1,0,0,0,0.0
75,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,100.0,0.0,100.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0,4,0.0,2.4,2.67,1.0,0.0,0.0,1.14,0.0,1,0,1,0,0,0,1.0
100,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.2,0.0,0.0,0.0,0.0,2.0,0.14,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,2.0,0,5,0.0,2.22,1.4,0.0,0.0,0.0,2.26,1.0,0,0,1,0,0,0,1.0


### Filter out ineligible players

In [116]:
def filter_eligible_players(training_subset_df):
    training_subset_df['total_number_of_gameweeks'] = training_subset_df.groupby(['name']).transform('count')['team_name']
    training_subset_df = training_subset_df[training_subset_df['total_number_of_gameweeks'] >= N_STEPS_IN]
    training_subset_df.drop('total_number_of_gameweeks', axis=1, inplace=True)

    return training_subset_df

In [117]:
print(ffs_data.shape)
ffs_data = filter_eligible_players(ffs_data)
print(ffs_data.shape)

(176265, 65)
(176265, 65)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Normalise features

In [121]:
mms = _load_model_from_pickle('src/models/pickles/min_max_scalar_DeepFantasyFootball_v01.pickle')
mms

MinMaxScaler(copy=True, feature_range=(0, 1))

In [123]:
ffs_data[FINAL_FEATURES] = mms.transform(ffs_data[FINAL_FEATURES])

In [124]:
ffs_data.head()

Unnamed: 0,name,team_name,season,Aerial Duels - Won - Percentage,Assists,Bad Touches,Big Chances Created,Caught Offside,Chances From Counter Attack,Clean Sheets,Crosses - Open Play - Successful,Crosses - Unsuccessful,Distribution - Successful,Dribbles - Successful Percentage,Fouls,Goals,Goals Conceded,Handballs,ICT Creativity,ICT Index,Minutes Per Block,Minutes Per Interception,Minutes Per Save,Minutes Per Tackle Won,Minutes Per Touch,xGI Expected Goal Involvement,Pass Completion,Pass Completion - Final Third,Pass Completion - Opponents Half,Passes - Backward,Passes - Forward,Premier League Straight Red Cards,Premier League Total Red Cards,Recoveries,Saves (Shots Outside Box),Shot Accuracy,Shots Blocked,Shots On Target,Subbed Off,Subbed On,Tackles - Won - Percentage,Tackles Lost,Take Ons,Take Ons - Successful Percentage,Throw Ins,Time Played,Touches - Final Third,Touches - Penalty Area,double_gameweek,gw,next_gameweek_double_gameweek,next_gameweek_draw_odds,next_gameweek_lose_odds,next_gameweek_number_of_home_matches,next_gameweek_number_of_promoted_side_opponent,next_gameweek_number_of_top_6_last_season_opponent,next_gameweek_win_odds,number_of_home_matches,number_of_top_6_last_season_opponent,position_DEF,position_FWD,position_GK,position_MID,top_6_last_season,total_points
0,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04252,0.070418,0.5,0.0,0.0,0.034306,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.129032
25,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.030709,0.066807,0.5,0.0,0.0,0.038218,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.129032
50,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054054,0.0,0.689764,0.000602,0.0,0.0,0.5,0.907614,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.129032
75,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.0,1.0,0.0,1.0,0.013158,0.006944,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0,0.0,0.081081,0.0,0.038583,0.078243,0.5,0.0,0.0,0.0322,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.16129
100,aaron_connolly,Brighton and Hove Albion,2019-2020,0.0,0.0,0.0,0.0,0.166667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.050228,0.0,0.0,0.0,0.0,0.111111,0.045016,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.111111,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.033333,0.026087,0.083333,0.0,0.108108,0.0,0.024409,0.040024,0.0,0.0,0.0,0.065904,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.16129


### Prepare data for LSTM

In [125]:
ffs_data['total_points_plus1_gw'] = 0  # Add target column to make preparation step the same

In [126]:
def prepare_data_for_lstm(df, feature_list):
    df = df.copy()

    X_list = []
    y_list = []

    for player in list(df['name'].unique()):
        player_df = df[df['name'] == player]
        player_df = player_df[feature_list + ['total_points_plus1_gw']]

        X_player, y_player = split_sequences(
            df=player_df,
            target_column='total_points_plus1_gw',
            n_steps_in=N_STEPS_IN,
            n_steps_out=N_STEPS_OUT
        )
        X_list.append(X_player)
        y_list.append(y_player)

    X = np.concatenate(X_list, axis=0)
    y = np.concatenate(y_list, axis=0)
    print(X.shape)
    print(y.shape)
    
    return X, y

In [127]:
%%time
X_ffs, _ = prepare_data_for_lstm(ffs_data, feature_list=FINAL_FEATURES)

(162241, 5, 62)
(162241, 5)
CPU times: user 32.8 s, sys: 314 ms, total: 33.1 s
Wall time: 33.1 s


### Load model

In [128]:
lstm_model = load_model("src/models/pickles/DeepFantasyFootball_v01.h5")
lstm_model.summary()

Model: "new_lstm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 5, 62)             0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30)                11160     
_________________________________________________________________
dropout_4 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 155       
Total params: 11,315
Trainable params: 11,315
Non-trainable params: 0
_________________________________________________________________


In [130]:
predictions = lstm_model.predict(X_ffs)
predictions

array([[1.2857234 , 1.2404766 , 1.2033465 , 1.1845675 , 1.1911798 ],
       [1.2240145 , 1.4976249 , 1.3844306 , 1.377922  , 1.334449  ],
       [1.3661089 , 1.3590624 , 1.2991513 , 1.2677128 , 1.2416898 ],
       ...,
       [2.231287  , 2.965804  , 2.7072983 , 2.7400243 , 2.6267664 ],
       [0.91098905, 1.6178327 , 1.6588471 , 1.7360748 , 1.7439619 ],
       [0.8235327 , 1.1318668 , 1.2929902 , 1.3336557 , 1.4008298 ]],
      dtype=float32)

In [133]:
pd.DataFrame(predictions).isnull().sum()

0    376
1    376
2    376
3    376
4    376
dtype: int64

Suspect these nulls are due to relegated teams at GW 38. Otherwise looks like everything working :)