# Initial Model Building

## Setup

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib.pyplot import subplots
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [5]:
gw_url = "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/gws/merged_gw.csv"

df_og = pd.read_csv(gw_url)
print(df_og.shape)
print(df_og.columns)

# Saving a original df
df_og["value"] = df_og["value"]/ 10
df = df_og.copy()

(4541, 41)
Index(['name', 'position', 'team', 'xP', 'assists', 'bonus', 'bps',
       'clean_sheets', 'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'starts', 'team_a_score', 'team_h_score',
       'threat', 'total_points', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'was_home', 'yellow_cards', 'GW'],
      dtype='object')


## Data Preprocessing

### Functions

In [6]:
def preprocess_fpl_data(df, rolling):
    # Calculate rolling averages
    transformed_df = df.copy()
    transformed_df["goal_involvements"] = transformed_df["goals_scored"] + transformed_df["assists"]
    
    rolling_columns = ["goal_involvements", 'assists', 'bonus', 'clean_sheets',
       'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'starts',
       'threat', 'total_points',
       'yellow_cards']
    
    for col in rolling_columns:
        transformed_df[f'{col}_rolling_avg_{rolling}'] = transformed_df.groupby('name')[col].rolling(window=rolling, min_periods=1).mean().reset_index(0, drop=True)
    
    transformed_df.drop(columns= rolling_columns, inplace= True)
    # Calculate per 90 minutes stats
    
    # per_90_columns = ["goal_involvements", 'goals_scored', 'assists', "expected_goal_involvements", 'expected_goals', 'expected_assists', ]
    # for col in per_90_columns:
    #     transformed_df[f'{col}_per_90'] = transformed_df[col] / transformed_df['minutes'] * 90
    
    # Opponent team strength (example: based on goals conceded)
    # team_defence_strength = transformed_df.groupby('opponent_team')[f'goals_conceded_rolling_avg_{rolling}'].rolling(window= rolling, min_periods = 1).mean().reset_index()
    # team_defence_strength.columns = ['opponent_team', f'goals_conceded_rolling_avg_{rolling}']
    # transformed_df = transformed_df.merge(team_defence_strength, on='opponent_team', how='left')
    
    # Fixture difficulty (simplified example)
    # transformed_df['fixture_difficulty'] = transformed_df[f'goals_conceded_rolling_avg_{rolling}'].rank(pct=True)
    
    # Form (last 5 games point average)
    # transformed_df['form'] = transformed_df.groupby('name')['total_points'].rolling(window=5, min_periods=1).mean().reset_index(0, drop=True)
    
    # Add more feature engineering as needed
    
    return transformed_df


In [22]:
df = df_og.copy()

dropped_cols = ["xP", "bps", "fixture", "ict_index", "kickoff_time", "round", "selected", "transfers_balance", "transfers_in", "transfers_out", "value",
                "GW"]
df.drop(columns= dropped_cols, inplace= True)
preprocessed_df = preprocess_fpl_data(df, 3)
print(df.columns)

rolling_cols = ['assists', 'bonus', 'clean_sheets',
       'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'starts',
       'threat', 'total_points',
       'yellow_cards']

df[df["name"] == "Cole Palmer"][rolling_cols]

Index(['name', 'position', 'team', 'assists', 'bonus', 'clean_sheets',
       'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'starts',
       'team_a_score', 'team_h_score', 'threat', 'total_points', 'was_home',
       'yellow_cards'],
      dtype='object')


Unnamed: 0,assists,bonus,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,goals_conceded,goals_scored,influence,minutes,own_goals,penalties_missed,penalties_saved,red_cards,saves,starts,threat,total_points,yellow_cards
13,0,0,0,37.8,182,0.19,0.26,0.07,0.77,2,0,20.4,90,0,0,0,0,0,1,10.0,2,0
629,3,2,0,43.8,182,0.15,0.82,0.67,1.8,2,1,82.8,82,0,0,0,0,0,1,21.0,17,1
1256,1,2,0,62.2,182,0.7,0.81,0.11,0.44,1,0,30.8,90,0,0,0,0,0,1,14.0,6,1
1904,0,0,1,13.3,182,0.03,0.03,0.0,1.78,0,0,11.8,90,0,0,0,0,0,1,0.0,3,0
2563,0,0,1,16.4,182,0.0,0.42,0.42,0.57,0,1,35.6,63,0,0,0,0,0,1,21.0,8,0
3224,0,3,0,54.7,182,0.62,3.03,2.41,1.11,2,4,149.0,90,0,0,0,0,0,1,82.0,25,0
3888,1,2,0,69.9,182,0.1,0.49,0.39,0.85,1,0,36.0,90,0,0,0,0,0,1,56.0,6,1


In [23]:
preprocessed_df[preprocessed_df["name"] == "Cole Palmer"]

Unnamed: 0,name,position,team,opponent_team,team_a_score,team_h_score,was_home,goal_involvements_rolling_avg_3,assists_rolling_avg_3,bonus_rolling_avg_3,clean_sheets_rolling_avg_3,creativity_rolling_avg_3,element_rolling_avg_3,expected_assists_rolling_avg_3,expected_goal_involvements_rolling_avg_3,expected_goals_rolling_avg_3,expected_goals_conceded_rolling_avg_3,goals_conceded_rolling_avg_3,goals_scored_rolling_avg_3,influence_rolling_avg_3,minutes_rolling_avg_3,own_goals_rolling_avg_3,penalties_missed_rolling_avg_3,penalties_saved_rolling_avg_3,red_cards_rolling_avg_3,saves_rolling_avg_3,starts_rolling_avg_3,threat_rolling_avg_3,total_points_rolling_avg_3,yellow_cards_rolling_avg_3
13,Cole Palmer,MID,Chelsea,13,2,0,True,0.0,0.0,0.0,0.0,37.8,182.0,0.19,0.26,0.07,0.77,2.0,0.0,20.4,90.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0,2.0,0.0
629,Cole Palmer,MID,Chelsea,20,6,2,False,2.0,1.5,1.0,0.0,40.8,182.0,0.17,0.54,0.37,1.285,2.0,0.5,51.6,86.0,0.0,0.0,0.0,0.0,0.0,1.0,15.5,9.5,0.5
1256,Cole Palmer,MID,Chelsea,7,1,1,True,1.666667,1.333333,1.333333,0.0,47.933333,182.0,0.346667,0.63,0.283333,1.003333,1.666667,0.333333,44.666667,87.333333,0.0,0.0,0.0,0.0,0.0,1.0,15.0,8.333333,0.666667
1904,Cole Palmer,MID,Chelsea,3,1,0,False,1.666667,1.333333,1.333333,0.333333,39.766667,182.0,0.293333,0.553333,0.26,1.34,1.0,0.333333,41.8,87.333333,0.0,0.0,0.0,0.0,0.0,1.0,11.666667,8.666667,0.666667
2563,Cole Palmer,MID,Chelsea,19,3,0,False,0.666667,0.333333,0.666667,0.666667,30.633333,182.0,0.243333,0.42,0.176667,0.93,0.333333,0.333333,26.066667,81.0,0.0,0.0,0.0,0.0,0.0,1.0,11.666667,5.666667,0.333333
3224,Cole Palmer,MID,Chelsea,5,2,4,True,1.666667,0.0,1.0,0.666667,28.133333,182.0,0.216667,1.16,0.943333,1.153333,0.666667,1.666667,65.466667,81.0,0.0,0.0,0.0,0.0,0.0,1.0,34.333333,12.0,0.0
3888,Cole Palmer,MID,Chelsea,16,1,1,True,2.0,0.333333,1.666667,0.333333,47.0,182.0,0.24,1.313333,1.073333,0.843333,1.0,1.666667,73.533333,81.0,0.0,0.0,0.0,0.0,0.0,1.0,53.0,13.0,0.333333


In [9]:

print(df.columns)
pd.set_option('display.max_columns', None)

static_cols = ["name", "position", 'team', 'opponent_team', 'was_home']
df.head()


Index(['name', 'position', 'team', 'assists', 'bonus', 'clean_sheets',
       'creativity', 'element', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'goals_conceded', 'goals_scored',
       'influence', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'starts',
       'team_a_score', 'team_h_score', 'threat', 'total_points', 'was_home',
       'yellow_cards'],
      dtype='object')


Unnamed: 0,name,position,team,assists,bonus,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,goals_conceded,goals_scored,influence,minutes,opponent_team,own_goals,penalties_missed,penalties_saved,red_cards,saves,starts,team_a_score,team_h_score,threat,total_points,was_home,yellow_cards
0,Alex Scott,MID,Bournemouth,0,0,0,12.8,77,0.01,0.01,0.0,1.02,1,0,22.8,62,16,0,0,0,0,0,1,1,1,0.0,2,False,0
1,Carlos Miguel dos Santos Pereira,GK,Nott'm Forest,0,0,0,0.0,427,0.0,0.0,0.0,0.0,0,0,0.0,0,3,0,0,0,0,0,0,1,1,0.0,0,True,0
2,Tomiyasu Takehiro,DEF,Arsenal,0,0,0,0.0,22,0.0,0.0,0.0,0.0,0,0,0.0,0,20,0,0,0,0,0,0,0,2,0.0,0,True,0
3,Malcolm Ebiowei,MID,Crystal Palace,0,0,0,0.0,197,0.0,0.0,0.0,0.0,0,0,0.0,0,4,0,0,0,0,0,0,1,2,0.0,0,False,0
4,Ben Brereton Díaz,MID,Southampton,0,0,0,14.0,584,0.02,0.32,0.3,0.25,1,0,2.6,70,15,0,0,0,0,0,1,0,1,16.0,1,False,1


In [10]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_regression

def boolean_to_int(X):
    return X.astype(int)

def create_preprocessing_pipeline():
    # Categorical features
    categorical_features = ['position', 'team', 'opponent_team']
    
    # Numerical features (excluding the target variable and some identifier columns)
    numerical_features = [
        'assists', 'bonus', 'clean_sheets', 'creativity', 'expected_assists',
        'expected_goal_involvements', 'expected_goals', 'expected_goals_conceded',
        'goals_conceded', 'goals_scored', 'influence', 'minutes', 'own_goals',
        'penalties_missed', 'penalties_saved', 'red_cards', 'saves', 'threat',
        'yellow_cards', 'total_points_rolling_avg_3', 'minutes_rolling_avg_3',
        'goals_scored_rolling_avg_3', 'assists_rolling_avg_3', 'clean_sheets_rolling_avg_3',
        'saves_rolling_avg_3', 'bonus_rolling_avg_3', 'goal_involvements',
        'goal_involvements_per_90', 'goals_scored_per_90', 'assists_per_90',
        'expected_goal_involvements_per_90', 'expected_goals_per_90',
        'expected_assists_per_90', 'avg_goals_conceded', 'fixture_difficulty', 'form'
    ]
    
    # Boolean features
    boolean_features = ['was_home']
    
    # Create preprocessing steps for each type of feature
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    boolean_transformer = Pipeline(steps=[
        ('to_int', FunctionTransformer(boolean_to_int)),
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ])
    
    # Combine all preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
            ('bool', boolean_transformer, boolean_features)
        ])
    
    # Create the full preprocessing pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(f_regression, k=30))  # Select top 30 features
    ])
    
    return pipeline

def preprocess_data(df, target_column='total_points'):
    # Create preprocessing pipeline
    pipeline = create_preprocessing_pipeline()
    
    # Separate features and target
    X = df.drop(columns=[target_column, 'name', 'element'])  # Dropping identifier columns
    y = df[target_column]
    
    # Fit and transform the data
    X_preprocessed = pipeline.fit_transform(X, y)
    
    # Get feature names after preprocessing
    feature_names = (pipeline.named_steps['preprocessor']
                     .named_transformers_['num'].get_feature_names_out(numerical_features).tolist() +
                     pipeline.named_steps['preprocessor']
                     .named_transformers_['cat'].get_feature_names_out(categorical_features).tolist() +
                     boolean_features)
    
    # Create a new dataframe with preprocessed data
    X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)
    
    return X_preprocessed_df, y, pipeline

# Usage example
# df = pd.read_csv('your_fpl_data.csv')
# X_preprocessed, y, preprocessing_pipeline = preprocess_data(df)

# Now X_preprocessed and y are ready for model training
# You can also use preprocessing_pipeline to transform new data consistently

In [11]:

X_train, X_test, y_train, y_test = train_test_split(preprocessed_df.drop(columns= "total_points").select_dtypes(include=np.number), preprocessed_df["total_points"], train_size= 0.6)

KeyError: "['total_points'] not found in axis"

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import uniform, randint

In [10]:
reg = DecisionTreeRegressor(random_state= 1)

param_grid = {
    "max_leaf_nodes": randint(2, 50),
    "max_depth": randint(2, 50),
    "max_features": randint(2, X_train.shape[1]),
    "min_impurity_decrease": uniform(0,0.75)    
}

rscv = RandomizedSearchCV(reg, param_grid, n_iter= 100, random_state= 1)
rscv.fit(X_train, y_train)

In [11]:
pd.DataFrame(rscv.cv_results_).sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_impurity_decrease,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
71,0.008077,0.000167,0.000994,0.000025,7,36,35,0.011264,"{'max_depth': 7, 'max_features': 36, 'max_leaf...",0.860056,0.879280,0.891513,0.872252,0.895767,0.879774,0.012955,1
69,0.003849,0.000089,0.000927,0.000042,14,14,28,0.026136,"{'max_depth': 14, 'max_features': 14, 'max_lea...",0.861582,0.892122,0.889842,0.863174,0.891556,0.879655,0.014136,2
36,0.004070,0.000127,0.000930,0.000029,27,23,41,0.052561,"{'max_depth': 27, 'max_features': 23, 'max_lea...",0.912676,0.846808,0.869729,0.885894,0.879381,0.878897,0.021466,3
80,0.004365,0.000226,0.000940,0.000026,14,20,44,0.040432,"{'max_depth': 14, 'max_features': 20, 'max_lea...",0.874922,0.877586,0.869416,0.888120,0.877602,0.877529,0.006079,4
70,0.003916,0.000065,0.000942,0.000012,25,14,40,0.016144,"{'max_depth': 25, 'max_features': 14, 'max_lea...",0.857485,0.892878,0.883995,0.862205,0.877790,0.874871,0.013257,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,0.001695,0.000040,0.000903,0.000034,14,3,32,0.661006,"{'max_depth': 14, 'max_features': 3, 'max_leaf...",0.308174,0.356498,0.346989,0.417401,0.305622,0.346937,0.040660,96
62,0.001420,0.000030,0.000905,0.000026,46,2,24,0.432438,"{'max_depth': 46, 'max_features': 2, 'max_leaf...",0.284913,0.321988,0.234447,0.258220,0.350524,0.290019,0.041972,97
44,0.001534,0.000111,0.000880,0.000010,5,4,16,0.420773,"{'max_depth': 5, 'max_features': 4, 'max_leaf_...",0.095683,0.434354,0.175822,0.217763,0.523576,0.289439,0.162116,98
46,0.001535,0.000107,0.000900,0.000050,35,4,22,0.320182,"{'max_depth': 35, 'max_features': 4, 'max_leaf...",0.095683,0.434354,0.175822,0.217763,0.523576,0.289439,0.162116,98


In [12]:
cross_val_score(rscv.best_estimator_, X_train, y_train)

array([0.86005613, 0.87928045, 0.89151322, 0.87225203, 0.89576733])