<a href="https://colab.research.google.com/github/robert-shepherd/fpl/blob/main/Project_3_1_model_build_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model build setup


---

The purpose of this notebook is to select the features to be used in the model, separate the data into X (feature variables) and Y (response variable), and partition both into train and test sets.

Data sources:
* Data post feature engineering:  https://raw.githubusercontent.com/robert-shepherd/fpl/main/fpl_features.csv

Output:

* The train/test files are downloaded from Colab to the local machine and then uploaded to GitHub at:
 * https://raw.githubusercontent.com/robert-shepherd/fpl/main/X_train.csv
 * https://raw.githubusercontent.com/robert-shepherd/fpl/main/X_test.csv
 * https://raw.githubusercontent.com/robert-shepherd/fpl/main/Y_train.csv
 * https://raw.githubusercontent.com/robert-shepherd/fpl/main/Y_test.csv

In [None]:
# Loading libraries
import pandas as pd
import pickle
import numpy as np
import scipy.stats as stats
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Import measures
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

  import pandas.util.testing as tm


## Setup

In [None]:
# Location of the post-feature-engineering dataset in the project's GitHub repo
url = 'https://raw.githubusercontent.com/robert-shepherd/fpl/main/fpl_features.csv'

# Read the remote CSV into a DataFrame (requires network access)
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first ten rows as a quick sanity check of the load
df.head(n=10)

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out,element_type,position,opponent_strength,season,cards,transfers_in_ratio,transfers_out_ratio,points_lag_1,points_lag_2,...,points_lag_21,points_lag_22,points_lag_23,points_lag_24,points_lag_25,points_lag_26,points_lag_27,points_lag_28,points_lag_29,points_lag_30,minutes_lag,minutes_rolling,goals_scored_lag,goals_scored_rolling,assists_lag,assists_rolling,clean_sheets_lag,clean_sheets_rolling,goals_conceded_lag,goals_conceded_rolling,saves_lag,saves_rolling,cards_lag,cards_rolling,bonus_lag,bonus_rolling,bps_lag,bps_rolling,influence_lag,influence_rolling,creativity_lag,creativity_rolling,threat_lag,threat_rolling,selected_lag,selected_rolling,transfers_in_ratio_lag,transfers_in_ratio_rolling,transfers_out_ratio_lag,transfers_out_ratio_rolling
0,1,1,13,3,True,2018-08-12T15:00:00Z,0.0,2.0,1,90,0,0,0,2,0,0,0,0,0,6,0,24,47.0,0.0,0.0,4.7,50,0,70767,0,0,1,GKP,5,2018,0,0.0,0.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,14,6,3,False,2018-08-18T16:30:00Z,3.0,2.0,2,90,0,0,0,3,0,0,0,0,0,8,0,26,53.0,0.0,0.0,5.3,50,188,90038,6059,5871,1,GKP,4,2018,0,0.067294,0.065206,3.0,,...,,,,,,,,,,,90.0,,0.0,,0.0,,0.0,,2.0,,6.0,,0.0,,0.0,,24.0,,47.0,,0.0,,0.0,,70767.0,,0.0,,0.0,
2,1,21,19,3,True,2018-08-25T14:00:00Z,3.0,1.0,3,90,0,0,0,1,0,0,0,0,0,4,0,17,27.8,0.0,0.0,2.8,50,8540,107301,19550,11010,1,GKP,3,2018,0,0.182198,0.102609,3.0,3.0,...,,,,,,,,,,,90.0,,0.0,,0.0,,0.0,,3.0,,8.0,,0.0,,0.0,,26.0,,53.0,,0.0,,0.0,,90038.0,,0.067294,,0.065206,
3,1,33,5,1,False,2018-09-02T12:30:00Z,2.0,3.0,4,90,0,0,0,2,0,0,0,0,0,1,0,8,2.4,0.0,0.0,0.2,50,9582,123566,19332,9750,1,GKP,2,2018,0,0.156451,0.078905,3.0,3.0,...,,,,,,,,,,,90.0,,0.0,,0.0,,0.0,,1.0,,4.0,,0.0,,0.0,,17.0,,27.8,,0.0,,0.0,,107301.0,,0.182198,,0.102609,
4,1,46,15,2,False,2018-09-15T14:00:00Z,1.0,2.0,5,90,0,0,0,1,0,0,0,0,0,1,0,13,14.2,0.0,0.0,1.4,50,-3297,123310,8837,12134,1,GKP,3,2018,0,0.071665,0.098402,1.0,3.0,...,,,,,,,,,,,90.0,,0.0,,0.0,,0.0,,2.0,,1.0,,0.0,,0.0,,8.0,,2.4,,0.0,,0.0,,123566.0,,0.156451,,0.078905,
5,1,51,8,11,True,2018-09-23T15:00:00Z,2.0,0.0,6,90,0,0,1,0,0,0,0,0,0,6,3,36,43.4,0.0,0.0,4.3,50,-797,124787,6593,7390,1,GKP,3,2018,0,0.052834,0.059221,2.0,1.0,...,,,,,,,,,,,90.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.8,1.0,4.0,0.0,0.0,0.0,0.0,13.0,17.6,14.2,28.88,0.0,0.0,0.0,0.0,123310.0,102996.4,0.071665,0.095521,0.098402,0.069024
6,1,61,18,1,True,2018-09-29T14:00:00Z,2.0,0.0,7,45,0,0,0,0,0,0,0,0,0,1,0,6,17.2,0.0,0.0,1.7,50,9392,138891,13595,4203,1,GKP,3,2018,0,0.097883,0.030261,11.0,2.0,...,,,,,,,,,,,90.0,90.0,0.0,0.0,0.0,0.0,1.0,0.2,0.0,1.4,6.0,4.0,0.0,0.0,3.0,0.6,36.0,20.0,43.4,28.16,0.0,0.0,0.0,0.0,124787.0,113800.4,0.052834,0.106088,0.059221,0.080869
7,1,74,9,0,False,2018-10-07T11:00:00Z,1.0,5.0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,50,-28910,107912,924,29834,1,GKP,2,2018,0,0.008563,0.276466,1.0,11.0,...,,,,,,,,,,,45.0,81.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.8,1.0,2.6,0.0,0.0,0.0,0.6,6.0,16.0,17.2,21.0,0.0,0.0,0.0,0.0,138891.0,123571.0,0.097883,0.112206,0.030261,0.07388
8,1,81,11,0,True,2018-10-22T19:00:00Z,3.0,1.0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,50,-16395,92724,336,16731,1,GKP,3,2018,0,0.003624,0.180439,0.0,1.0,...,,,,,,,,,,,0.0,63.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.6,0.0,1.8,0.0,0.0,0.0,0.6,0.0,12.6,0.0,15.44,0.0,0.0,0.0,0.0,107912.0,123693.2,0.008563,0.077479,0.276466,0.108651
9,1,93,7,0,False,2018-10-28T13:30:00Z,2.0,2.0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,50,-2999,90783,1139,4138,1,GKP,2,2018,0,0.012546,0.045581,0.0,0.0,...,,,,,,,,,,,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.0,1.6,0.0,0.0,0.0,0.6,0.0,11.0,0.0,14.96,0.0,0.0,0.0,0.0,92724.0,117524.8,0.003624,0.046914,0.180439,0.128958


In [None]:
# Assemble the columns used for modelling. The list is built in three parts
# so the naming pattern is explicit rather than hand-typed:
#   1. the response plus the fixture/player context fields
#   2. the ten most recent individual points lags
#   3. a "<stat>_lag" / "<stat>_rolling" pair for each tracked statistic
context_columns = ['total_points', 'was_home', 'position', 'opponent_strength']

points_lag_columns = [f'points_lag_{n}' for n in range(1, 11)]

tracked_stats = [
    'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
    'saves', 'cards', 'bonus', 'bps', 'influence', 'creativity', 'threat',
    'selected', 'transfers_in_ratio', 'transfers_out_ratio',
]
stat_columns = [
    f'{stat}_{kind}'
    for stat in tracked_stats
    for kind in ('lag', 'rolling')
]

variables_for_selection = context_columns + points_lag_columns + stat_columns

# Restrict the frame to the selected columns (in the order listed above)
df = df.loc[:, variables_for_selection]
df.head()

Unnamed: 0,total_points,was_home,position,opponent_strength,points_lag_1,points_lag_2,points_lag_3,points_lag_4,points_lag_5,points_lag_6,points_lag_7,points_lag_8,points_lag_9,points_lag_10,minutes_lag,minutes_rolling,goals_scored_lag,goals_scored_rolling,assists_lag,assists_rolling,clean_sheets_lag,clean_sheets_rolling,goals_conceded_lag,goals_conceded_rolling,saves_lag,saves_rolling,cards_lag,cards_rolling,bonus_lag,bonus_rolling,bps_lag,bps_rolling,influence_lag,influence_rolling,creativity_lag,creativity_rolling,threat_lag,threat_rolling,selected_lag,selected_rolling,transfers_in_ratio_lag,transfers_in_ratio_rolling,transfers_out_ratio_lag,transfers_out_ratio_rolling
0,3,True,GKP,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,3,False,GKP,4,3.0,,,,,,,,,,90.0,,0.0,,0.0,,0.0,,2.0,,6.0,,0.0,,0.0,,24.0,,47.0,,0.0,,0.0,,70767.0,,0.0,,0.0,
2,3,True,GKP,3,3.0,3.0,,,,,,,,,90.0,,0.0,,0.0,,0.0,,3.0,,8.0,,0.0,,0.0,,26.0,,53.0,,0.0,,0.0,,90038.0,,0.067294,,0.065206,
3,1,False,GKP,2,3.0,3.0,3.0,,,,,,,,90.0,,0.0,,0.0,,0.0,,1.0,,4.0,,0.0,,0.0,,17.0,,27.8,,0.0,,0.0,,107301.0,,0.182198,,0.102609,
4,2,False,GKP,3,1.0,3.0,3.0,3.0,,,,,,,90.0,,0.0,,0.0,,0.0,,2.0,,1.0,,0.0,,0.0,,8.0,,2.4,,0.0,,0.0,,123566.0,,0.156451,,0.078905,


In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded column (e.g. False for was_home), so the remaining
# indicator columns are not perfectly collinear.
#
# dtype is pinned explicitly: pandas < 2.0 produced uint8 indicators (the 0/1
# values displayed below), whereas pandas >= 2.0 defaults to bool, which would
# change the indicators to True/False both here and in the exported CSV files.
df_encoded = pd.get_dummies(data=df, columns=['was_home','position'],
                            drop_first=True, dtype=np.uint8)
df_encoded.head()

Unnamed: 0,total_points,opponent_strength,points_lag_1,points_lag_2,points_lag_3,points_lag_4,points_lag_5,points_lag_6,points_lag_7,points_lag_8,points_lag_9,points_lag_10,minutes_lag,minutes_rolling,goals_scored_lag,goals_scored_rolling,assists_lag,assists_rolling,clean_sheets_lag,clean_sheets_rolling,goals_conceded_lag,goals_conceded_rolling,saves_lag,saves_rolling,cards_lag,cards_rolling,bonus_lag,bonus_rolling,bps_lag,bps_rolling,influence_lag,influence_rolling,creativity_lag,creativity_rolling,threat_lag,threat_rolling,selected_lag,selected_rolling,transfers_in_ratio_lag,transfers_in_ratio_rolling,transfers_out_ratio_lag,transfers_out_ratio_rolling,was_home_True,position_FWD,position_GKP,position_MID
0,3,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,1,0
1,3,4,3.0,,,,,,,,,,90.0,,0.0,,0.0,,0.0,,2.0,,6.0,,0.0,,0.0,,24.0,,47.0,,0.0,,0.0,,70767.0,,0.0,,0.0,,0,0,1,0
2,3,3,3.0,3.0,,,,,,,,,90.0,,0.0,,0.0,,0.0,,3.0,,8.0,,0.0,,0.0,,26.0,,53.0,,0.0,,0.0,,90038.0,,0.067294,,0.065206,,1,0,1,0
3,1,2,3.0,3.0,3.0,,,,,,,,90.0,,0.0,,0.0,,0.0,,1.0,,4.0,,0.0,,0.0,,17.0,,27.8,,0.0,,0.0,,107301.0,,0.182198,,0.102609,,0,0,1,0
4,2,3,1.0,3.0,3.0,3.0,,,,,,,90.0,,0.0,,0.0,,0.0,,2.0,,1.0,,0.0,,0.0,,8.0,,2.4,,0.0,,0.0,,123566.0,,0.156451,,0.078905,,0,0,1,0


In [None]:
# Separate the response from the predictors:
#   Y - the target variable (total points)
#   X - every remaining column of the encoded frame
Y = df_encoded.loc[:, 'total_points']
X = df_encoded.drop(columns=['total_points'])

In [None]:
# Hold out a fixed share of the rows as a test set. The random seed is fixed
# so the same rows land in each partition on every run.
test_size = 0.20
seed = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Report the dimensions of each partition
for label, part in (
    ('X_train      : ', X_train),
    ('X_test       : ', X_test),
    ('Y_train      : ', Y_train),
    ('Y_test       : ', Y_test),
):
    print(label, part.shape)

X_train      :  (40388, 45)
X_test       :  (10098, 45)
Y_train      :  (40388,)
Y_test       :  (10098,)


In [None]:
# Persist each partition to CSV (without the index) and trigger a browser
# download from Colab, so later notebooks can reuse exactly the same split.
outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'Y_train.csv': Y_train,
    'Y_test.csv': Y_test,
}

for filename, partition in outputs.items():
    partition.to_csv(filename, index=False)
    files.download(filename)

Files downloaded to the local machine and uploaded to GitHub.