# Initial Modeling

## Setup

In [212]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from scipy import stats

from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, PowerTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, BaseCrossValidator
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, make_scorer

from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

pd.set_option("display.max_columns", None)

## Data Prep

### Initial Look

In [213]:
og_df = pd.read_csv("./marts.csv")
og_df.columns = [col.lower() for col in og_df.columns]
og_df.sort_values("fix_id", inplace= True)
og_df.reset_index(drop= True, inplace= True)
df = og_df.drop(columns= ["label_goals", "label_assists"]).copy()
labels = og_df[["label_goals", "label_assists"]].copy()
og_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13137 entries, 0 to 13136
Columns: 132 entries, label_goals to rolling_goals_conceded_ub
dtypes: float64(106), int64(25), object(1)
memory usage: 13.2+ MB


In [214]:
df.head()

Unnamed: 0,player_id,fix_id,season,game_week,team_id,team_strength,opponent_strength,rolling_ub_minutes,rolling_ub_np_expected_goals,rolling_ub_expected_goals,rolling_ub_goals,rolling_ub_expected_assists,rolling_ub_assists,rolling_ub_shots,rolling_ub_shots_on_target,rolling_ub_pk_made,rolling_ub_pk_attempt,rolling_ub_yellow_cards,rolling_ub_red_cards,rolling_ub_touches,rolling_ub_tackles,rolling_ub_interceptions,rolling_ub_blocks,rolling_ub_shot_creating_actions,rolling_ub_goal_creating_actions,rolling_ub_pass_attempted,rolling_ub_pass_completed,rolling_ub_pass_progressive,rolling_ub_carries,rolling_ub_carries_progressive,rolling_ub_take_ons_attempted,rolling_ub_take_ons_success,rolling_bound_last3_minutes,rolling_bound_last3_expected_goals,rolling_bound_last3_np_expected_goals,rolling_bound_last3_expected_assists,rolling_bound_last3_goals,rolling_bound_last3_assists,rolling_bound_last3_shots,rolling_bound_last3_pk_made,rolling_bound_last3_pk_attempt,rolling_bound_last3_shots_on_target,rolling_bound_last3_yellow_cards,rolling_bound_last3_red_cards,rolling_bound_last3_touches,rolling_bound_last3_tackles,rolling_bound_last3_interceptions,rolling_bound_last3_blocks,rolling_bound_last3_shot_creating_actions,rolling_bound_last3_goal_creating_actions,rolling_bound_last3_pass_completed,rolling_bound_last3_pass_attempted,rolling_bound_last3_pass_progressive,rolling_bound_last3_carries,rolling_bound_last3_carries_progressive,rolling_bound_last3_take_ons_attempted,rolling_bound_last3_take_ons_success,rolling_bound_last6_minutes,rolling_bound_last6_expected_goals,rolling_bound_last6_np_expected_goals,rolling_bound_last6_expected_assists,rolling_bound_last6_goals,rolling_bound_last6_assists,rolling_bound_last6_shots,rolling_bound_last6_pk_made,rolling_bound_last6_pk_attempt,rolling_bound_last6_shots_on_target,rolling_bound_last6_yellow_cards,rolling_bound_last6_red_cards,rolling_bound_last6_touches,rolling_bound_last6_tackles,rolling_bound_last6_interceptions,rolling_bound_last6_blocks,rolling_bound_last6_shot_creating_actions,rolling_bound_last6_goal_creating_actions,rolling_bound_last6_pass_completed,rolling_bound_last6_pass_attempted,rolling_bound_last6_pass_progressive,rolling_bound_last6_carries,rolling_bound_last6_carries_progressive,rolling_bound_last6_take_ons_attempted,rolling_bound_last6_take_ons_success,goals_efficiency_ub,assists_efficiency_ub,shots_efficiency_ub,contribution_real_ub,contribution_xgi_ub,contribution_npxgi_ub,contribution_efficiency_xgi_ub,contribution_efficiency_npxgi_ub,goals_efficiency_last3,assists_efficiency_last3,shots_efficiency_last3,contribution_real_last3,contribution_xgi_last3,contribution_npxgi_last3,contribution_efficiency_xgi_last3,contribution_efficiency_npxgi_last3,goals_per_90_ub,assists_per_90_ub,real_contribution_per_90_ub,xg_per_90_ub,npxg_per_90_ub,xa_per_90_ub,xgi_per_90_ub,npxgi_per_90_ub,shots_per_90_ub,shots_on_target_per_90_ub,shot_creating_actions_per_90_ub,goal_creating_actions_per_90_ub,goals_per_90_last3,assists_per_90_last3,real_contribution_per_90_last3,xg_per_90_last3,npxg_per_90_last3,xa_per_90_last3,xgi_per_90_last3,npxgi_per_90_last3,shots_per_90_last3,shots_on_target_per_90_last3,shot_creating_actions_per_90_last3,goal_creating_actions_per_90_last3,team_goals_scored,team_goals_conceded,rolling_goals_scored_last3,rolling_goals_conceded_last3,rolling_goals_scored_last6,rolling_goals_conceded_last6,rolling_goals_scored_ub,rolling_goals_conceded_ub
0,1005,15,2022-23,2,7,2330,2520,90,,,0,,0,,,0,0,0,0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,1,0,1,0,1,0
1,1355,15,2022-23,2,7,2330,2520,90,0.0,0.0,0,0.0,0,1.0,0.0,0,0,0,0,98.0,2.0,5.0,0.0,1.0,0.0,86.0,73.0,3.0,64.0,2.0,1.0,1.0,90.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,98.0,2.0,5.0,0.0,1.0,0.0,73.0,86.0,3.0,64.0,2.0,1.0,1.0,90.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,98.0,2.0,5.0,0.0,1.0,0.0,73.0,86.0,3.0,64.0,2.0,1.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2,2,1,0,1,0,1,0
2,520,17,2022-23,2,3,2070,2230,90,,,0,,0,,,0,0,0,0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4,2,0,2,0,2,0
3,465,21,2022-23,3,1,2490,2190,90,0.4,0.4,1,0.1,1,2.0,1.0,0,0,0,0,53.0,0.0,0.0,1.0,3.0,1.0,47.0,40.0,4.0,34.0,2.0,0.0,0.0,90.0,0.4,0.4,0.1,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,3.0,1.0,40.0,47.0,4.0,34.0,2.0,0.0,0.0,90.0,0.4,0.4,0.1,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,3.0,1.0,40.0,47.0,4.0,34.0,2.0,0.0,0.0,2.5,10.0,0.5,2,0.5,0.5,4.0,4.0,2.5,10.0,0.5,2,0.5,0.5,4.0,4.0,1.0,1.0,2.0,0.4,0.4,0.1,0.5,0.5,2.0,1.0,3.0,1.0,1.0,1.0,2.0,0.4,0.4,0.1,0.5,0.5,2.0,1.0,3.0,1.0,3,0,6,2,6,2,6,2
4,237,21,2022-23,3,3,2070,2570,90,0.0,0.0,0,0.0,0,0.0,0.0,0,0,1,0,46.0,1.0,2.0,0.0,0.0,0.0,40.0,33.0,0.0,27.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,46.0,1.0,2.0,0.0,0.0,0.0,33.0,40.0,0.0,27.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,46.0,1.0,2.0,0.0,0.0,0.0,33.0,40.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,2,4,2,4,2,4


In [215]:
cols_to_drop = []

#### Numerical Columns

In [216]:
num_df = df.select_dtypes(include= np.number)
num_df.head()

Unnamed: 0,player_id,fix_id,game_week,team_id,team_strength,opponent_strength,rolling_ub_minutes,rolling_ub_np_expected_goals,rolling_ub_expected_goals,rolling_ub_goals,rolling_ub_expected_assists,rolling_ub_assists,rolling_ub_shots,rolling_ub_shots_on_target,rolling_ub_pk_made,rolling_ub_pk_attempt,rolling_ub_yellow_cards,rolling_ub_red_cards,rolling_ub_touches,rolling_ub_tackles,rolling_ub_interceptions,rolling_ub_blocks,rolling_ub_shot_creating_actions,rolling_ub_goal_creating_actions,rolling_ub_pass_attempted,rolling_ub_pass_completed,rolling_ub_pass_progressive,rolling_ub_carries,rolling_ub_carries_progressive,rolling_ub_take_ons_attempted,rolling_ub_take_ons_success,rolling_bound_last3_minutes,rolling_bound_last3_expected_goals,rolling_bound_last3_np_expected_goals,rolling_bound_last3_expected_assists,rolling_bound_last3_goals,rolling_bound_last3_assists,rolling_bound_last3_shots,rolling_bound_last3_pk_made,rolling_bound_last3_pk_attempt,rolling_bound_last3_shots_on_target,rolling_bound_last3_yellow_cards,rolling_bound_last3_red_cards,rolling_bound_last3_touches,rolling_bound_last3_tackles,rolling_bound_last3_interceptions,rolling_bound_last3_blocks,rolling_bound_last3_shot_creating_actions,rolling_bound_last3_goal_creating_actions,rolling_bound_last3_pass_completed,rolling_bound_last3_pass_attempted,rolling_bound_last3_pass_progressive,rolling_bound_last3_carries,rolling_bound_last3_carries_progressive,rolling_bound_last3_take_ons_attempted,rolling_bound_last3_take_ons_success,rolling_bound_last6_minutes,rolling_bound_last6_expected_goals,rolling_bound_last6_np_expected_goals,rolling_bound_last6_expected_assists,rolling_bound_last6_goals,rolling_bound_last6_assists,rolling_bound_last6_shots,rolling_bound_last6_pk_made,rolling_bound_last6_pk_attempt,rolling_bound_last6_shots_on_target,rolling_bound_last6_yellow_cards,rolling_bound_last6_red_cards,rolling_bound_last6_touches,rolling_bound_last6_tackles,rolling_bound_last6_interceptions,rolling_bound_last6_blocks,rolling_bound_last6_shot_creating_actions,rolling_bound_last6_goal_creating_actions,rolling_bound_last6_pass_completed,rolling_bound_last6_pass_attempted,rolling_bound_last6_pass_progressive,rolling_bound_last6_carries,rolling_bound_last6_carries_progressive,rolling_bound_last6_take_ons_attempted,rolling_bound_last6_take_ons_success,goals_efficiency_ub,assists_efficiency_ub,shots_efficiency_ub,contribution_real_ub,contribution_xgi_ub,contribution_npxgi_ub,contribution_efficiency_xgi_ub,contribution_efficiency_npxgi_ub,goals_efficiency_last3,assists_efficiency_last3,shots_efficiency_last3,contribution_real_last3,contribution_xgi_last3,contribution_npxgi_last3,contribution_efficiency_xgi_last3,contribution_efficiency_npxgi_last3,goals_per_90_ub,assists_per_90_ub,real_contribution_per_90_ub,xg_per_90_ub,npxg_per_90_ub,xa_per_90_ub,xgi_per_90_ub,npxgi_per_90_ub,shots_per_90_ub,shots_on_target_per_90_ub,shot_creating_actions_per_90_ub,goal_creating_actions_per_90_ub,goals_per_90_last3,assists_per_90_last3,real_contribution_per_90_last3,xg_per_90_last3,npxg_per_90_last3,xa_per_90_last3,xgi_per_90_last3,npxgi_per_90_last3,shots_per_90_last3,shots_on_target_per_90_last3,shot_creating_actions_per_90_last3,goal_creating_actions_per_90_last3,team_goals_scored,team_goals_conceded,rolling_goals_scored_last3,rolling_goals_conceded_last3,rolling_goals_scored_last6,rolling_goals_conceded_last6,rolling_goals_scored_ub,rolling_goals_conceded_ub
0,1005,15,2,7,2330,2520,90,,,0,,0,,,0,0,0,0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2,1,0,1,0,1,0
1,1355,15,2,7,2330,2520,90,0.0,0.0,0,0.0,0,1.0,0.0,0,0,0,0,98.0,2.0,5.0,0.0,1.0,0.0,86.0,73.0,3.0,64.0,2.0,1.0,1.0,90.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,98.0,2.0,5.0,0.0,1.0,0.0,73.0,86.0,3.0,64.0,2.0,1.0,1.0,90.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,98.0,2.0,5.0,0.0,1.0,0.0,73.0,86.0,3.0,64.0,2.0,1.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2,2,1,0,1,0,1,0
2,520,17,2,3,2070,2230,90,,,0,,0,,,0,0,0,0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4,2,0,2,0,2,0
3,465,21,3,1,2490,2190,90,0.4,0.4,1,0.1,1,2.0,1.0,0,0,0,0,53.0,0.0,0.0,1.0,3.0,1.0,47.0,40.0,4.0,34.0,2.0,0.0,0.0,90.0,0.4,0.4,0.1,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,3.0,1.0,40.0,47.0,4.0,34.0,2.0,0.0,0.0,90.0,0.4,0.4,0.1,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,3.0,1.0,40.0,47.0,4.0,34.0,2.0,0.0,0.0,2.5,10.0,0.5,2,0.5,0.5,4.0,4.0,2.5,10.0,0.5,2,0.5,0.5,4.0,4.0,1.0,1.0,2.0,0.4,0.4,0.1,0.5,0.5,2.0,1.0,3.0,1.0,1.0,1.0,2.0,0.4,0.4,0.1,0.5,0.5,2.0,1.0,3.0,1.0,3,0,6,2,6,2,6,2
4,237,21,3,3,2070,2570,90,0.0,0.0,0,0.0,0,0.0,0.0,0,0,1,0,46.0,1.0,2.0,0.0,0.0,0.0,40.0,33.0,0.0,27.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,46.0,1.0,2.0,0.0,0.0,0.0,33.0,40.0,0.0,27.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,46.0,1.0,2.0,0.0,0.0,0.0,33.0,40.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,2,4,2,4,2,4


In [217]:
num_df.describe()

Unnamed: 0,player_id,fix_id,game_week,team_id,team_strength,opponent_strength,rolling_ub_minutes,rolling_ub_np_expected_goals,rolling_ub_expected_goals,rolling_ub_goals,rolling_ub_expected_assists,rolling_ub_assists,rolling_ub_shots,rolling_ub_shots_on_target,rolling_ub_pk_made,rolling_ub_pk_attempt,rolling_ub_yellow_cards,rolling_ub_red_cards,rolling_ub_touches,rolling_ub_tackles,rolling_ub_interceptions,rolling_ub_blocks,rolling_ub_shot_creating_actions,rolling_ub_goal_creating_actions,rolling_ub_pass_attempted,rolling_ub_pass_completed,rolling_ub_pass_progressive,rolling_ub_carries,rolling_ub_carries_progressive,rolling_ub_take_ons_attempted,rolling_ub_take_ons_success,rolling_bound_last3_minutes,rolling_bound_last3_expected_goals,rolling_bound_last3_np_expected_goals,rolling_bound_last3_expected_assists,rolling_bound_last3_goals,rolling_bound_last3_assists,rolling_bound_last3_shots,rolling_bound_last3_pk_made,rolling_bound_last3_pk_attempt,rolling_bound_last3_shots_on_target,rolling_bound_last3_yellow_cards,rolling_bound_last3_red_cards,rolling_bound_last3_touches,rolling_bound_last3_tackles,rolling_bound_last3_interceptions,rolling_bound_last3_blocks,rolling_bound_last3_shot_creating_actions,rolling_bound_last3_goal_creating_actions,rolling_bound_last3_pass_completed,rolling_bound_last3_pass_attempted,rolling_bound_last3_pass_progressive,rolling_bound_last3_carries,rolling_bound_last3_carries_progressive,rolling_bound_last3_take_ons_attempted,rolling_bound_last3_take_ons_success,rolling_bound_last6_minutes,rolling_bound_last6_expected_goals,rolling_bound_last6_np_expected_goals,rolling_bound_last6_expected_assists,rolling_bound_last6_goals,rolling_bound_last6_assists,rolling_bound_last6_shots,rolling_bound_last6_pk_made,rolling_bound_last6_pk_attempt,rolling_bound_last6_shots_on_target,rolling_bound_last6_yellow_cards,rolling_bound_last6_red_cards,rolling_bound_last6_touches,rolling_bound_last6_tackles,rolling_bound_last6_interceptions,rolling_bound_last6_blocks,rolling_bound_last6_shot_creating_actions,rolling_bound_last6_goal_creating_actions,rolling_bound_last6_pass_completed,rolling_bound_last6_pass_attempted,rolling_bound_last6_pass_progressive,rolling_bound_last6_carries,rolling_bound_last6_carries_progressive,rolling_bound_last6_take_ons_attempted,rolling_bound_last6_take_ons_success,goals_efficiency_ub,assists_efficiency_ub,shots_efficiency_ub,contribution_real_ub,contribution_xgi_ub,contribution_npxgi_ub,contribution_efficiency_xgi_ub,contribution_efficiency_npxgi_ub,goals_efficiency_last3,assists_efficiency_last3,shots_efficiency_last3,contribution_real_last3,contribution_xgi_last3,contribution_npxgi_last3,contribution_efficiency_xgi_last3,contribution_efficiency_npxgi_last3,goals_per_90_ub,assists_per_90_ub,real_contribution_per_90_ub,xg_per_90_ub,npxg_per_90_ub,xa_per_90_ub,xgi_per_90_ub,npxgi_per_90_ub,shots_per_90_ub,shots_on_target_per_90_ub,shot_creating_actions_per_90_ub,goal_creating_actions_per_90_ub,goals_per_90_last3,assists_per_90_last3,real_contribution_per_90_last3,xg_per_90_last3,npxg_per_90_last3,xa_per_90_last3,xgi_per_90_last3,npxgi_per_90_last3,shots_per_90_last3,shots_on_target_per_90_last3,shot_creating_actions_per_90_last3,goal_creating_actions_per_90_last3,team_goals_scored,team_goals_conceded,rolling_goals_scored_last3,rolling_goals_conceded_last3,rolling_goals_scored_last6,rolling_goals_conceded_last6,rolling_goals_scored_ub,rolling_goals_conceded_ub
count,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13120.0,13120.0,13137.0,13120.0,13137.0,13120.0,13120.0,13137.0,13137.0,13137.0,13137.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13120.0,13136.0,13119.0,13119.0,13119.0,13136.0,13136.0,13119.0,13136.0,13136.0,13119.0,13136.0,13136.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13136.0,13119.0,13119.0,13119.0,13136.0,13136.0,13119.0,13136.0,13136.0,13119.0,13136.0,13136.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13119.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0,13137.0
mean,707.104971,424.748725,20.781,11.98508,2253.780163,2360.90051,786.063409,1.269756,1.373537,1.419274,0.94734,0.957677,11.730412,4.042378,0.113725,0.135495,1.673746,0.058841,493.828277,14.257012,7.310061,10.281631,19.889024,2.240244,399.019893,320.228125,32.224848,281.143521,15.704497,15.70747,6.875,196.955847,0.3323729,0.3074625,0.2289656,0.340667,0.228076,2.853038,0.026949,0.032202,0.981325,0.415956,0.013931,124.445232,3.62459,1.854181,2.594176,4.894199,0.550423,81.005488,100.711945,8.062047,70.770028,3.917524,3.957009,1.75646,357.762561,0.6087964,0.5629621,0.4173489,0.62363,0.416032,5.216404,0.050244,0.059455,1.794039,0.755557,0.025122,226.010138,6.584496,3.368245,4.716442,8.921793,1.000762,146.84511,182.759814,14.661941,128.505374,7.128059,7.218538,3.18599,0.887087,0.822096,0.093161,2.376951,2.317873,2.214227,0.958368,0.98525,19060680000000.0,17140900000000.0,0.086017,0.568699,0.5605694,0.5356931,18903960000000.0,21989320000000.0,0.155555,0.109079,0.264634,0.15537,0.145602,0.1065,0.26187,0.252102,1.377434,0.47008,2.276141,0.257435,0.163713,0.113429,0.277142,0.1606514,0.1496983,0.1104987,0.2711501,0.260197,1.400742,0.48323,2.334256,0.26825,1.536957,1.548832,4.514958,4.548223,8.620766,8.780696,28.521428,29.148207
std,412.626975,235.122166,10.538251,7.214718,160.999639,194.0605,622.6147,2.02872,2.30497,2.562832,1.305891,1.555766,14.085096,5.647579,0.548923,0.621064,1.995764,0.260807,438.5982,13.353826,8.068229,9.307882,21.189572,3.042033,380.19898,321.400801,32.81135,258.722043,18.740007,19.061527,8.468939,73.139086,0.5199204,0.4571216,0.3352986,0.713378,0.532083,2.887438,0.187232,0.205627,1.380953,0.639829,0.11978,66.904171,2.921185,1.95029,2.13815,4.209352,0.930026,55.758917,62.4598,6.237055,42.872983,4.026297,4.134643,2.117434,148.780111,0.8693948,0.7551568,0.5249131,1.106207,0.764425,4.937378,0.277961,0.305833,2.230598,0.914049,0.163635,128.026628,4.862244,3.168251,3.395547,7.226593,1.379671,103.712401,117.536477,10.884393,79.594322,6.945944,7.165227,3.466809,1.460785,1.770734,0.146283,3.635318,3.265096,3.001747,1.36624,1.398221,792072300000000.0,695343800000000.0,0.196088,0.94988,0.6904712,0.634144,725693000000000.0,791748900000000.0,0.246388,0.187571,0.34158,0.189662,0.170386,0.1168,0.249334,0.232263,1.099303,0.512745,1.441616,0.327533,0.425925,0.314614,0.547822,0.2822364,0.2624735,0.1799131,0.3544471,0.336856,1.462954,0.772994,1.968447,0.519373,1.325127,1.332078,2.554898,2.449019,4.143776,3.87714,19.349536,18.172196
min,2.0,15.0,2.0,1.0,2065.0,2080.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,4.0,0.0,0.0,0.0,4.0,-1.665335e-15,-7.771561e-16,-3.608225e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,-1.221245e-15,-1.221245e-15,-3.608225e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.326673e-16,-8.326673e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.787849e-15,-4.787849e-15,-4.996009e-16,-5.204183e-15,-5.204183e-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,330.0,223.0,11.0,5.0,2140.0,2205.0,297.0,0.2,0.2,0.0,0.1,0.0,3.0,1.0,0.0,0.0,0.0,0.0,168.0,5.0,2.0,3.0,5.0,0.0,125.0,96.0,10.0,94.0,4.0,3.0,1.0,140.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,71.0,1.0,0.0,1.0,2.0,0.0,38.0,51.0,4.0,39.0,1.0,1.0,0.0,228.0,0.1,0.1,2.775558e-17,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,120.0,3.0,1.0,2.0,4.0,0.0,65.0,87.0,6.0,68.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.4,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.029126,0.028571,0.02236,0.079179,0.078488,0.523509,0.064011,1.122524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.03947369,0.333333,0.0,0.994475,0.0,1.0,1.0,3.0,3.0,6.0,6.0,12.0,14.0
50%,717.0,420.0,21.0,12.0,2200.0,2350.0,624.0,0.6,0.6,0.0,0.5,0.0,7.0,2.0,0.0,0.0,1.0,0.0,361.0,10.0,5.0,8.0,13.0,1.0,279.0,217.0,22.0,204.0,9.0,9.0,4.0,218.0,0.1,0.1,0.1,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,117.0,3.0,1.0,2.0,4.0,0.0,69.0,90.0,7.0,63.0,3.0,3.0,1.0,381.0,0.3,0.3,0.2,0.0,0.0,4.0,0.0,0.0,1.0,1.0,0.0,210.0,6.0,3.0,4.0,7.0,1.0,123.0,160.0,12.0,114.0,5.0,5.0,2.0,0.0,0.0,0.0,1.0,1.2,1.2,0.769231,0.793651,0.0,0.0,0.0,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.154639,0.084906,0.083591,0.074654,0.192351,0.189208,1.104295,0.314136,2.063319,0.162455,0.0,0.0,0.0,0.06666667,0.06666667,0.04687501,0.1656442,0.1616766,1.0,0.333333,2.0,0.0,1.0,1.0,4.0,4.0,8.0,9.0,26.0,27.0
75%,1062.0,629.0,30.0,18.0,2320.0,2520.0,1135.0,1.5,1.5,2.0,1.2,1.0,15.0,5.0,0.0,0.0,2.0,0.0,692.0,20.0,10.0,15.0,27.0,3.0,554.0,436.0,44.0,389.25,21.0,21.0,9.0,269.0,0.4,0.4,0.3,0.0,0.0,4.0,0.0,0.0,1.0,1.0,0.0,168.0,5.0,3.0,4.0,7.0,1.0,112.0,139.0,11.0,95.0,6.0,6.0,3.0,498.0,0.8,0.7,0.6,1.0,1.0,7.0,0.0,0.0,3.0,1.0,0.0,312.0,9.0,5.0,7.0,12.0,2.0,204.0,255.0,20.0,173.0,10.0,10.0,5.0,1.25,1.190476,0.15,3.0,2.8,2.7,1.304348,1.372549,0.0,0.0,0.0,1.0,0.8,0.8,1.111111,1.25,0.234528,0.166976,0.404192,0.213018,0.20339,0.155941,0.371329,0.36,2.066326,0.738255,3.207127,0.393228,0.0,0.0,0.393013,0.2,0.2,0.1516854,0.3721804,0.3666667,2.033898,0.725806,3.333333,0.375,2.0,2.0,6.0,6.0,11.0,11.0,40.0,42.0
max,1401.0,860.0,38.0,24.0,2710.0,2750.0,6820.0,21.5,27.1,25.0,11.5,12.0,127.0,50.0,8.0,8.0,30.0,4.0,4404.0,105.0,61.0,71.0,167.0,26.0,3831.0,3098.0,303.0,2528.0,183.0,181.0,97.0,270.0,6.2,4.6,3.3,8.0,5.0,23.0,3.0,3.0,13.0,4.0,3.0,442.0,22.0,15.0,15.0,30.0,9.0,390.0,426.0,61.0,355.0,54.0,30.0,16.0,540.0,8.0,7.2,5.8,11.0,6.0,36.0,4.0,4.0,21.0,6.0,4.0,829.0,34.0,23.0,25.0,49.0,13.0,733.0,786.0,90.0,601.0,77.0,56.0,29.0,10.0,60.0,3.0,32.0,31.4,28.8,30.0,30.0,3.60288e+16,3.60288e+16,2.0,9.0,7.0,5.4,3.60288e+16,3.60288e+16,3.01676,2.571428,4.285713,1.759776,1.357542,1.342105,1.810056,1.7,8.0,4.615385,13.0,3.781513,22.500225,5.624993,22.500225,13.50014,13.50014,3.374996,13.50014,13.50014,30.0,22.500225,26.470573,8.571441,9.0,9.0,15.0,15.0,26.0,22.0,94.0,101.0


In [218]:
nulls = num_df.isna().sum() / num_df.shape[0]
nulls[nulls > 0]

rolling_ub_np_expected_goals               0.001294
rolling_ub_expected_goals                  0.001294
rolling_ub_expected_assists                0.001294
rolling_ub_shots                           0.001294
rolling_ub_shots_on_target                 0.001294
                                             ...   
rolling_bound_last6_pass_progressive       0.001370
rolling_bound_last6_carries                0.001370
rolling_bound_last6_carries_progressive    0.001370
rolling_bound_last6_take_ons_attempted     0.001370
rolling_bound_last6_take_ons_success       0.001370
Length: 68, dtype: float64

In [219]:
cols_to_drop.extend(['player_id', 'game_week', 'team_id'])

#### Categorical Columns

In [220]:
cat_df = df.select_dtypes(include= "object")
cat_df.head()

Unnamed: 0,season
0,2022-23
1,2022-23
2,2022-23
3,2022-23
4,2022-23


In [221]:
cols_to_drop.extend(list(cat_df.columns))

### Preprocessing

In [222]:
unique_fixtures = df['fix_id'].unique().tolist()

In [223]:
df

Unnamed: 0,player_id,fix_id,season,game_week,team_id,team_strength,opponent_strength,rolling_ub_minutes,rolling_ub_np_expected_goals,rolling_ub_expected_goals,rolling_ub_goals,rolling_ub_expected_assists,rolling_ub_assists,rolling_ub_shots,rolling_ub_shots_on_target,rolling_ub_pk_made,rolling_ub_pk_attempt,rolling_ub_yellow_cards,rolling_ub_red_cards,rolling_ub_touches,rolling_ub_tackles,rolling_ub_interceptions,rolling_ub_blocks,rolling_ub_shot_creating_actions,rolling_ub_goal_creating_actions,rolling_ub_pass_attempted,rolling_ub_pass_completed,rolling_ub_pass_progressive,rolling_ub_carries,rolling_ub_carries_progressive,rolling_ub_take_ons_attempted,rolling_ub_take_ons_success,rolling_bound_last3_minutes,rolling_bound_last3_expected_goals,rolling_bound_last3_np_expected_goals,rolling_bound_last3_expected_assists,rolling_bound_last3_goals,rolling_bound_last3_assists,rolling_bound_last3_shots,rolling_bound_last3_pk_made,rolling_bound_last3_pk_attempt,rolling_bound_last3_shots_on_target,rolling_bound_last3_yellow_cards,rolling_bound_last3_red_cards,rolling_bound_last3_touches,rolling_bound_last3_tackles,rolling_bound_last3_interceptions,rolling_bound_last3_blocks,rolling_bound_last3_shot_creating_actions,rolling_bound_last3_goal_creating_actions,rolling_bound_last3_pass_completed,rolling_bound_last3_pass_attempted,rolling_bound_last3_pass_progressive,rolling_bound_last3_carries,rolling_bound_last3_carries_progressive,rolling_bound_last3_take_ons_attempted,rolling_bound_last3_take_ons_success,rolling_bound_last6_minutes,rolling_bound_last6_expected_goals,rolling_bound_last6_np_expected_goals,rolling_bound_last6_expected_assists,rolling_bound_last6_goals,rolling_bound_last6_assists,rolling_bound_last6_shots,rolling_bound_last6_pk_made,rolling_bound_last6_pk_attempt,rolling_bound_last6_shots_on_target,rolling_bound_last6_yellow_cards,rolling_bound_last6_red_cards,rolling_bound_last6_touches,rolling_bound_last6_tackles,rolling_bound_last6_interceptions,rolling_bound_last6_blocks,rolling_bound_last6_shot_creating_actions,rolling_bound_last6_goal_creating_actions,rolling_bound_last6_pass_completed,rolling_bound_last6_pass_attempted,rolling_bound_last6_pass_progressive,rolling_bound_last6_carries,rolling_bound_last6_carries_progressive,rolling_bound_last6_take_ons_attempted,rolling_bound_last6_take_ons_success,goals_efficiency_ub,assists_efficiency_ub,shots_efficiency_ub,contribution_real_ub,contribution_xgi_ub,contribution_npxgi_ub,contribution_efficiency_xgi_ub,contribution_efficiency_npxgi_ub,goals_efficiency_last3,assists_efficiency_last3,shots_efficiency_last3,contribution_real_last3,contribution_xgi_last3,contribution_npxgi_last3,contribution_efficiency_xgi_last3,contribution_efficiency_npxgi_last3,goals_per_90_ub,assists_per_90_ub,real_contribution_per_90_ub,xg_per_90_ub,npxg_per_90_ub,xa_per_90_ub,xgi_per_90_ub,npxgi_per_90_ub,shots_per_90_ub,shots_on_target_per_90_ub,shot_creating_actions_per_90_ub,goal_creating_actions_per_90_ub,goals_per_90_last3,assists_per_90_last3,real_contribution_per_90_last3,xg_per_90_last3,npxg_per_90_last3,xa_per_90_last3,xgi_per_90_last3,npxgi_per_90_last3,shots_per_90_last3,shots_on_target_per_90_last3,shot_creating_actions_per_90_last3,goal_creating_actions_per_90_last3,team_goals_scored,team_goals_conceded,rolling_goals_scored_last3,rolling_goals_conceded_last3,rolling_goals_scored_last6,rolling_goals_conceded_last6,rolling_goals_scored_ub,rolling_goals_conceded_ub
0,1005,15,2022-23,2,7,2330,2520,90,,,0,,0,,,0,0,0,0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.000000,0.0,0.000000,0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,2,1,0,1,0,1,0
1,1355,15,2022-23,2,7,2330,2520,90,0.0,0.0,0,0.0,0,1.0,0.0,0,0,0,0,98.0,2.0,5.0,0.0,1.0,0.0,86.0,73.0,3.0,64.0,2.0,1.0,1.0,90.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,98.0,2.0,5.0,0.0,1.0,0.0,73.0,86.0,3.0,64.0,2.0,1.0,1.0,90.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,98.0,2.0,5.0,0.0,1.0,0.0,73.0,86.0,3.0,64.0,2.0,1.0,1.0,0.000000,0.0,0.000000,0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,2,2,1,0,1,0,1,0
2,520,17,2022-23,2,3,2070,2230,90,,,0,,0,,,0,0,0,0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,90.0,,,,0.0,0.0,,0.0,0.0,,0.0,0.0,,,,,,,,,,,,,,0.000000,0.0,0.000000,0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,4,2,0,2,0,2,0
3,465,21,2022-23,3,1,2490,2190,90,0.4,0.4,1,0.1,1,2.0,1.0,0,0,0,0,53.0,0.0,0.0,1.0,3.0,1.0,47.0,40.0,4.0,34.0,2.0,0.0,0.0,90.0,0.4,0.4,0.1,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,3.0,1.0,40.0,47.0,4.0,34.0,2.0,0.0,0.0,90.0,0.4,0.4,0.1,1.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,53.0,0.0,0.0,1.0,3.0,1.0,40.0,47.0,4.0,34.0,2.0,0.0,0.0,2.500000,10.0,0.500000,2,0.5,0.5,4.000000,4.000000,2.500000,10.0,0.5,2,0.5,0.5,4.000000,4.000000,1.00000,1.000000,2.000000,0.400000,0.400000,0.100000,0.500000,0.500000,2.000000,1.000000,3.000000,1.000000,1.000000,1.000,2.000000,0.400000,0.400000,0.1000,0.500000,0.500000,2.000000,1.000000,3.000000,1.000000,3,0,6,2,6,2,6,2
4,237,21,2022-23,3,3,2070,2570,90,0.0,0.0,0,0.0,0,0.0,0.0,0,0,1,0,46.0,1.0,2.0,0.0,0.0,0.0,40.0,33.0,0.0,27.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,46.0,1.0,2.0,0.0,0.0,0.0,33.0,40.0,0.0,27.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,46.0,1.0,2.0,0.0,0.0,0.0,33.0,40.0,0.0,27.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,3,2,4,2,4,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13132,732,860,2024-25,10,24,2150,2460,407,0.9,0.9,2,0.1,0,6.0,5.0,0,0,1,0,101.0,2.0,1.0,3.0,10.0,0.0,58.0,34.0,3.0,43.0,1.0,0.0,0.0,258.0,0.9,0.9,0.0,2.0,0.0,4.0,0.0,0.0,4.0,1.0,0.0,67.0,2.0,0.0,3.0,8.0,0.0,19.0,37.0,1.0,29.0,0.0,0.0,0.0,407.0,0.9,0.9,0.1,2.0,0.0,6.0,0.0,0.0,5.0,1.0,0.0,101.0,2.0,1.0,3.0,10.0,0.0,34.0,58.0,3.0,43.0,1.0,0.0,0.0,2.222222,0.0,0.333333,2,1.0,1.0,2.000000,2.000000,2.222222,0.0,0.5,2,0.9,0.9,2.222222,2.222222,0.44226,0.000000,0.442260,0.199017,0.199017,0.022113,0.221130,0.221130,1.326781,1.105651,2.211302,0.000000,0.697674,0.000,0.697674,0.313953,0.313953,0.0000,0.313953,0.313953,1.395349,1.395349,2.790697,0.000000,2,2,6,9,9,16,12,25
13133,910,860,2024-25,10,24,2150,2460,495,0.4,0.4,0,0.0,1,6.0,3.0,0,0,2,0,250.0,23.0,11.0,7.0,6.0,1.0,167.0,134.0,12.0,99.0,1.0,5.0,1.0,225.0,0.2,0.2,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,112.0,8.0,4.0,3.0,2.0,0.0,61.0,74.0,6.0,39.0,0.0,1.0,0.0,495.0,0.4,0.4,0.0,0.0,1.0,6.0,0.0,0.0,3.0,2.0,0.0,250.0,23.0,11.0,7.0,6.0,1.0,134.0,167.0,12.0,99.0,1.0,5.0,1.0,0.000000,0.0,0.000000,1,0.4,0.4,2.500000,2.500000,0.000000,0.0,0.0,0,0.2,0.2,0.000000,0.000000,0.00000,0.181818,0.181818,0.072727,0.072727,0.000000,0.072727,0.072727,1.090909,0.545455,1.090909,0.181818,0.000000,0.000,0.000000,0.080000,0.080000,0.0000,0.080000,0.080000,1.600000,0.400000,0.800000,0.000000,2,2,6,9,9,16,12,25
13134,1308,860,2024-25,10,24,2150,2460,175,0.2,0.2,0,0.1,1,3.0,0.0,0,0,0,0,120.0,2.0,1.0,3.0,7.0,2.0,98.0,79.0,12.0,65.0,0.0,3.0,2.0,144.0,0.2,0.2,0.1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,94.0,2.0,1.0,3.0,5.0,2.0,65.0,75.0,10.0,52.0,0.0,3.0,2.0,175.0,0.2,0.2,0.1,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,120.0,2.0,1.0,3.0,7.0,2.0,79.0,98.0,12.0,65.0,0.0,3.0,2.0,0.000000,10.0,0.000000,1,0.3,0.3,3.333333,3.333333,0.000000,10.0,0.0,1,0.3,0.3,3.333333,3.333333,0.00000,0.514286,0.514286,0.102857,0.102857,0.051429,0.154286,0.154286,1.542857,0.000000,3.600001,1.028572,0.000000,0.625,0.625000,0.125000,0.125000,0.0625,0.187500,0.187500,0.625000,0.000000,3.125000,1.250000,2,2,6,9,9,16,12,25
13135,1211,860,2024-25,10,24,2150,2460,189,0.2,0.2,0,0.0,0,1.0,1.0,0,0,0,0,98.0,2.0,3.0,6.0,1.0,1.0,79.0,62.0,2.0,53.0,0.0,0.0,0.0,189.0,0.2,0.2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,98.0,2.0,3.0,6.0,1.0,1.0,62.0,79.0,2.0,53.0,0.0,0.0,0.0,189.0,0.2,0.2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,98.0,2.0,3.0,6.0,1.0,1.0,62.0,79.0,2.0,53.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,0.2,0.2,0.000000,0.000000,0.000000,0.0,0.0,0,0.2,0.2,0.000000,0.000000,0.00000,0.000000,0.000000,0.095238,0.095238,0.000000,0.095238,0.095238,0.476190,0.476190,0.476190,0.476190,0.000000,0.000,0.000000,0.095238,0.095238,0.0000,0.095238,0.095238,0.476190,0.476190,0.476190,0.476190,2,2,6,9,9,16,12,25


In [224]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop
        
    def fit(self, X, y= None):
        return self
    
    def transform(self, X):
        X = X.copy()
        return X.drop(columns= self.columns_to_drop, errors= "ignore")
    
    def get_feature_names_out(self, names= None):
        return
            
class SkewNormalizationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, skew_threshold=1.0, method='auto', verbose=False):
        self.skew_threshold = skew_threshold
        self.method = method
        self.verbose = verbose
        self.transformations_ = {}
    
    def _determine_transformation(self, series):
        skew = stats.skew(series.dropna())
        
        if self.method == 'log':
            return 'log'
        elif self.method == 'exp':
            return 'exp'
        elif self.method == 'yeo-johnson':
            return 'yeo-johnson'
        
        if abs(skew) > self.skew_threshold:
            if skew > 0:
                return 'log'
            else:
                return 'exp'
        return None
    
    def _create_transformer(self, transformation):
        if transformation == 'log':
            return FunctionTransformer(np.log1p)
        elif transformation == 'exp':
            return FunctionTransformer(np.expm1)
        elif transformation == 'yeo-johnson':
            return PowerTransformer(method='yeo-johnson')
        return None
    
    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        
        self.transformations_ = {}
        for col in X.columns:
            if not np.issubdtype(X[col].dtype, np.number):
                continue
            
            transformation = self._determine_transformation(X[col])
            
            # if transformation:
            transformer = self._create_transformer(transformation)
            
            if self.verbose:
                print(f"Column {col}: Skew = {stats.skew(X[col].dropna()):.2f}, "
                        f"Transformation = {transformation}")
            
            self.transformations_[col] = transformer
        
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X).copy()
        
        for col, transformer in self.transformations_.items():
            if transformer != None:
                X[col] = transformer.fit_transform(X[[col]])
        
        return X
    
    def get_feature_names_out(self, input_features=None):
        return list(self.transformations_.keys())
    
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy= "constant", fill_value= 0)),
    ("skew", SkewNormalizationTransformer()),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy= "most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown= "ignore", sparse_output= False))
])

column_transformer = ColumnTransformer([
    ("num", num_transformer, [col for col in df.select_dtypes(include= np.number) if col != 'fix_id' and col not in cols_to_drop]),
    ("cat", cat_transformer, make_column_selector(dtype_include= "object"))
], remainder= "passthrough")

col_dropper = ColumnDropper(cols_to_drop)
preprocessor = Pipeline([
    ("dropper", col_dropper),
    ("preprocessing", column_transformer)
])

X_preprocessed = pd.DataFrame(preprocessor.fit_transform(df), columns= preprocessor.get_feature_names_out())

outlier_detection = IsolationForest(random_state= 42)
outlier_pred = outlier_detection.fit_predict(X_preprocessed)
X_preprocessed = X_preprocessed[outlier_pred == 1].reset_index(drop= True)
labels = labels[outlier_pred == 1].reset_index(drop= True)

y_goals = labels["label_goals"]
y_assists = labels["label_assists"]

## Modeling

In [225]:
class CustomTimeSeriesCV(BaseCrossValidator):
    def __init__(self, unique_fixtures, folds=4):
        if not isinstance(unique_fixtures, list):
            raise TypeError("unique_fixtures must be a list")
        
        if len(unique_fixtures) == 0:
            raise ValueError("unique_fixtures cannot be empty")
        
        if folds <= 0:
            raise ValueError("Number of folds must be positive")
        
        try:
            self.fix_per_fold = max(1, len(unique_fixtures) // (folds + 1))
        except ZeroDivisionError:
            raise ValueError("Not enough unique fixtures for the specified number of folds")
        
        self.unique_fixtures = unique_fixtures
        self.folds = folds

    def __repr__(self):
        return (f"CustomTimeSeriesCV(n_fixtures={len(self.unique_fixtures)}, "
                f"folds={self.folds}, "
                f"fixtures_per_fold={self.fix_per_fold})")

    def __str__(self):
        return (f"Time Series Cross-Validator\n"
                f"Total Fixtures: {len(self.unique_fixtures)}\n"
                f"Number of Folds: {self.folds}\n"
                f"Fixtures per Fold: {self.fix_per_fold}")

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.folds

    def split(self, X, y=None, groups=None):
        fixture_id_col = 'remainder__fix_id'
        unique_fixt = self.unique_fixtures

        for fold in range(1, self.folds + 1):
            train_end = fold * self.fix_per_fold
            test_start = train_end
            test_end = test_start + self.fix_per_fold

            if test_end > len(unique_fixt):
                test_end = len(unique_fixt)
            
            train_fixtures = unique_fixt[:train_end]
            test_fixtures = unique_fixt[test_start:test_end]

            train_idx = X[X[fixture_id_col].isin(train_fixtures)].index.values
            test_idx = X[X[fixture_id_col].isin(test_fixtures)].index.values

            yield (train_idx, test_idx)

def output1_rmse(y_true, y_pred):
    y_true = y_true[:, 0] if isinstance(y_true, np.ndarray) else y_true.iloc[:, 0].to_numpy()
    y_pred = y_pred[:, 0] if isinstance(y_pred, np.ndarray) else y_pred.iloc[:, 0].to_numpy()
    return np.sqrt(mean_squared_error(y_true, y_pred))

def output2_rmse(y_true, y_pred):
    y_true = y_true[:, 1] if isinstance(y_true, np.ndarray) else y_true.iloc[:, 1].to_numpy()
    y_pred = y_pred[:, 1] if isinstance(y_pred, np.ndarray) else y_pred.iloc[:, 1].to_numpy()
    return np.sqrt(mean_squared_error(y_true, y_pred))

def output1_mae(y_true, y_pred):
    y_true = y_true[:, 0] if isinstance(y_true, np.ndarray) else y_true.iloc[:, 0].to_numpy()
    y_pred = y_pred[:, 0] if isinstance(y_pred, np.ndarray) else y_pred.iloc[:, 0].to_numpy()
    return mean_absolute_error(y_true, y_pred)

def output2_mae(y_true, y_pred):
    y_true = y_true[:, 1] if isinstance(y_true, np.ndarray) else y_true.iloc[:, 1].to_numpy()
    y_pred = y_pred[:, 1] if isinstance(y_pred, np.ndarray) else y_pred.iloc[:, 1].to_numpy()
    return mean_absolute_error(y_true, y_pred)



def train_models(model_pipelines, param_grids, X_train, y_train, n_iter, folds=10, refit="output1_rmse", n_jobs=-1):
    scorers = {
        'output1_rmse': make_scorer(output1_rmse, greater_is_better=False),
        'output1_mae': make_scorer(output1_mae, greater_is_better=False),
        'output2_rmse': make_scorer(output2_rmse, greater_is_better=False),
        'output2_mae': make_scorer(output2_mae, greater_is_better=False),
    }
    
    unique_fixtures = X_train["remainder__fix_id"].unique().tolist()
    cv = CustomTimeSeriesCV(unique_fixtures, folds=folds)
    grids = {}
    
    for model_name, pipeline in model_pipelines.items():
        print(f"Training and tuning {model_name}")
        
        grids[model_name] = RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=param_grids[model_name],
            n_iter=n_iter,
            cv=cv,
            scoring=scorers,
            verbose=3,
            n_jobs=n_jobs,
            random_state=42,
            refit=refit,
        )
        
        grids[model_name].fit(X_train, y_train)
        
        if isinstance(scorers, dict) or isinstance(scorers, list):
            print(f"Best parameters for {model_name} based on '{refit}': {grids[model_name].best_params_}")
            print(f"Best Score for {model_name} based on '{refit}': {grids[model_name].best_score_}\n")
        else:
            print(f"Best parameters for {model_name}: {grids[model_name].best_params_}")
            print(f"Best Score for {model_name}: {grids[model_name].best_score_}\n")
    
    return grids


def combine_grid_results(grids, sort_column=None):
    dfs = []
    for name, grid in grids.items():
        df = pd.DataFrame(grid.cv_results_)
        df["model"] = name
        dfs.append(df)
    
    if sort_column != None:
        combined_df = pd.concat(dfs).sort_values(sort_column, ascending=False).reset_index(drop=True)
    else:
        combined_df = pd.concat(dfs).reset_index(drop= True)
    return combined_df

In [226]:
model_num = 10

last_10_perc = len(X_preprocessed["remainder__fix_id"].unique()) // 10
test_fixtures = (X_preprocessed["remainder__fix_id"].tolist()[-last_10_perc:])

train_idx = X_preprocessed[~X_preprocessed["remainder__fix_id"].isin(test_fixtures)].index.values
test_idx = X_preprocessed[X_preprocessed["remainder__fix_id"].isin(test_fixtures)].index.values

X_train = X_preprocessed.iloc[train_idx]
y_train = labels.iloc[train_idx]

X_test = X_preprocessed.iloc[test_idx]
y_test = labels.iloc[test_idx]


model_pipelines = {
    "Linear Regression": Pipeline([
        # ("poly", PolynomialFeatures()),
        ("lr", MultiOutputRegressor(LinearRegression(), n_jobs= -1))
    ]),
    
    "Ridge": Pipeline([
        ("poly", PolynomialFeatures()),
        ("ridge", Ridge(random_state= 42))
    ]),
    
    "Lasso": Pipeline([
        ("poly", PolynomialFeatures()),
        ("lasso", Lasso(random_state= 42))
    ]),

    "Elastic Net": Pipeline([
        ("poly", PolynomialFeatures()),
        ("e_net", ElasticNet(random_state= 42))
    ]),
    
    # "SVR": Pipeline([
    #     ("poly", PolynomialFeatures()),
    #     ("SVC", SVR())
    # ]),

    **{f"Decision Tree {i}": Pipeline(
        [("dt", DecisionTreeRegressor(random_state=42, max_features= "sqrt"))]) for i in range(1, model_num + 1)}
}

param_grids = {
    "Linear Regression": {
        # "poly__degree": randint(1, 2),
    },

    "Ridge": {
        "poly__degree": randint(1, 2),
        "ridge__alpha": uniform(0.01, 10),
    },

    "Lasso": {
        "poly__degree": randint(1, 2),
        "lasso__alpha": uniform(0.01, 1),
    },

    "Elastic Net": {
        "poly__degree": randint(1, 2),
        "e_net__alpha": uniform(0.01, 1),
        "e_net__l1_ratio": uniform(0.01, 1),
    },

    "SVR": {
        "poly__degree": randint(1, 2),
        "SVC__C": uniform(0.1, 10),
        "SVC__epsilon": uniform(0.01, 1),
        "SVC__kernel": ["linear", "poly", "rbf"],
        "SVC__degree": randint(2, 5),
        "SVC__gamma": uniform(0.01, 1),
    },

    **{f"Decision Tree {i}": {
            "dt__max_depth": randint(1, 10 * i),
            "dt__min_samples_split": randint(2 * i, 10 * i),
            "dt__min_samples_leaf": randint(1, i+1),
        }
        for i in range(1, model_num + 1)
    },
}

# scoring = {
#     "neg_root_mean_squared_error": "neg_root_mean_squared_error",
#     "neg_mean_absolute_error": "neg_mean_absolute_error"
# }

base_model_grids = train_models(model_pipelines, param_grids, X_train, y_train, n_iter=10, folds=5)
base_results = combine_grid_results(base_model_grids)

Training and tuning Linear Regression
Fitting 5 folds for each of 1 candidates, totalling 5 fits


python(8095) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8096) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8097) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8098) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8099) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8100) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8101) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(8102) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 1/5] END  output1_mae: (test=-0.221) output1_rmse: (test=-0.344) output2_mae: (test=-0.147) output2_rmse: (test=-0.259) total time=   0.1s
[CV 2/5] END  output1_mae: (test=-0.168) output1_rmse: (test=-0.305) output2_mae: (test=-0.118) output2_rmse: (test=-0.252) total time=   0.1s
[CV 3/5] END  output1_mae: (test=-0.203) output1_rmse: (test=-0.629) output2_mae: (test=-0.141) output2_rmse: (test=-0.352) total time=   0.2s
[CV 4/5] END  output1_mae: (test=-0.172) output1_rmse: (test=-0.294) output2_mae: (test=-0.145) output2_rmse: (test=-0.274) total time=   0.3s
[CV 5/5] END  output1_mae: (test=-0.152) output1_rmse: (test=-0.294) output2_mae: (test=-0.126) output2_rmse: (test=-0.252) total time=   0.3s
Best parameters for Linear Regression based on 'output1_rmse': {}
Best Score for Linear Regression based on 'output1_rmse': -0.3733535678650357

Training and tuning Ridge
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END poly__degree=1, ridge__alpha=3.755401188

In [227]:
base_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_output1_rmse,split1_test_output1_rmse,split2_test_output1_rmse,split3_test_output1_rmse,split4_test_output1_rmse,mean_test_output1_rmse,std_test_output1_rmse,rank_test_output1_rmse,split0_test_output1_mae,split1_test_output1_mae,split2_test_output1_mae,split3_test_output1_mae,split4_test_output1_mae,mean_test_output1_mae,std_test_output1_mae,rank_test_output1_mae,split0_test_output2_rmse,split1_test_output2_rmse,split2_test_output2_rmse,split3_test_output2_rmse,split4_test_output2_rmse,mean_test_output2_rmse,std_test_output2_rmse,rank_test_output2_rmse,split0_test_output2_mae,split1_test_output2_mae,split2_test_output2_mae,split3_test_output2_mae,split4_test_output2_mae,mean_test_output2_mae,std_test_output2_mae,rank_test_output2_mae,model,param_poly__degree,param_ridge__alpha,param_lasso__alpha,param_e_net__alpha,param_e_net__l1_ratio,param_dt__max_depth,param_dt__min_samples_leaf,param_dt__min_samples_split
0,0.183251,0.076161,0.023396,0.003944,{},-0.344315,-0.304983,-0.629454,-0.293747,-0.294268,-0.373354,0.129381,1,-0.220658,-0.167931,-0.203306,-0.171752,-0.152151,-0.183159,0.025042,1,-0.258517,-0.252149,-0.352397,-0.273729,-0.251849,-0.277728,0.038170,1,-0.146963,-0.118303,-0.140856,-0.145490,-0.125841,-0.135491,0.011392,1,Linear Regression,,,,,,,,
1,0.063055,0.020426,0.010719,0.005190,"{'poly__degree': 1, 'ridge__alpha': 3.75540118...",-0.303620,-0.302440,-0.317721,-0.293124,-0.294052,-0.302191,0.008851,7,-0.194391,-0.164401,-0.190132,-0.170965,-0.151927,-0.174363,0.015899,7,-0.243009,-0.252046,-0.276041,-0.273098,-0.251759,-0.259191,0.013004,7,-0.134472,-0.116985,-0.135126,-0.143737,-0.126183,-0.131301,0.009061,7,Ridge,1.0,3.755401,,,,,,
2,0.050631,0.020251,0.011716,0.010132,"{'poly__degree': 1, 'ridge__alpha': 9.51714306...",-0.294374,-0.301544,-0.317158,-0.292813,-0.293969,-0.299972,0.009126,1,-0.186000,-0.162573,-0.189384,-0.170583,-0.151632,-0.172034,0.014170,1,-0.236671,-0.252104,-0.275781,-0.272960,-0.251708,-0.257845,0.014623,1,-0.129901,-0.116160,-0.134788,-0.143181,-0.126220,-0.130050,0.008972,1,Ridge,1.0,9.517143,,,,,,
3,0.033915,0.013139,0.011363,0.003042,"{'poly__degree': 1, 'ridge__alpha': 7.32993941...",-0.297156,-0.301811,-0.317326,-0.292906,-0.293990,-0.300638,0.008899,3,-0.188525,-0.163132,-0.189641,-0.170711,-0.151732,-0.172748,0.014646,3,-0.238642,-0.252088,-0.275837,-0.273004,-0.251720,-0.258258,0.014086,3,-0.131362,-0.116407,-0.134876,-0.143338,-0.126210,-0.130439,0.008963,3,Ridge,1.0,7.329939,,,,,,
4,0.043022,0.022942,0.021844,0.017448,"{'poly__degree': 1, 'ridge__alpha': 5.99658484...",-0.299217,-0.302008,-0.317451,-0.292974,-0.294008,-0.301132,0.008810,6,-0.190390,-0.163539,-0.189812,-0.170797,-0.151800,-0.173268,0.015023,6,-0.240067,-0.252074,-0.275884,-0.273035,-0.251730,-0.258558,0.013714,6,-0.132392,-0.116590,-0.134946,-0.143460,-0.126203,-0.130718,0.008978,6,Ridge,1.0,5.996585,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,0.154878,0.056451,0.009757,0.002798,"{'dt__max_depth': 38, 'dt__min_samples_leaf': ...",-0.342867,-0.330900,-0.346055,-0.353466,-0.331684,-0.340994,0.008640,8,-0.203220,-0.147992,-0.163646,-0.195950,-0.155651,-0.173292,0.022151,8,-0.270638,-0.275069,-0.291631,-0.317832,-0.271765,-0.285387,0.017895,9,-0.120580,-0.119575,-0.117209,-0.168276,-0.120408,-0.129209,0.019570,7,Decision Tree 10,,,,,,38.0,2.0,83.0
127,0.120184,0.074834,0.010278,0.007352,"{'dt__max_depth': 60, 'dt__min_samples_leaf': ...",-0.365341,-0.339579,-0.352287,-0.333610,-0.326325,-0.343428,0.013874,9,-0.208520,-0.173865,-0.170101,-0.163086,-0.155093,-0.174133,0.018349,9,-0.243958,-0.263940,-0.296044,-0.298569,-0.274244,-0.275351,0.020417,5,-0.116977,-0.115269,-0.123241,-0.145201,-0.124517,-0.125041,0.010681,4,Decision Tree 10,,,,,,60.0,5.0,52.0
128,0.074648,0.038353,0.004022,0.001482,"{'dt__max_depth': 76, 'dt__min_samples_leaf': ...",-0.320847,-0.326199,-0.343602,-0.329365,-0.328218,-0.329646,0.007565,7,-0.173554,-0.148259,-0.171814,-0.165680,-0.153962,-0.162654,0.009946,5,-0.261042,-0.273721,-0.315174,-0.286431,-0.271167,-0.281507,0.018677,8,-0.132868,-0.118164,-0.177056,-0.127863,-0.123878,-0.135966,0.021103,10,Decision Tree 10,,,,,,76.0,10.0,41.0
129,0.091167,0.040982,0.005042,0.002916,"{'dt__max_depth': 89, 'dt__min_samples_leaf': ...",-0.345786,-0.368088,-0.374980,-0.614095,-0.335184,-0.407627,0.104241,10,-0.201340,-0.177464,-0.186486,-0.256430,-0.156465,-0.195637,0.033695,10,-0.270484,-0.306158,-0.320986,-0.311779,-0.294199,-0.300721,0.017432,10,-0.128528,-0.130748,-0.143577,-0.132546,-0.132574,-0.133594,0.005207,9,Decision Tree 10,,,,,,89.0,1.0,78.0


In [240]:
ensemble_model_num = 10
base_estimators = [(name, model.best_estimator_) for name, model in base_model_grids.items()]
ensemble_pipelines = {
    # "VotingClassifier": Pipeline([
    #     ("voting", MultiOutputRegressor(VotingRegressor(estimators=base_estimators, n_jobs=-1)))
    # ]),
    
    # "StackingClassifier": Pipeline([
    #     ("stacking", StackingRegressor(estimators=base_estimators, n_jobs=-1, final_estimator= ElasticNet()))
    # ]),
    
    **{f"Bagging_DecisionTree_{i}": Pipeline([
        ("bc", BaggingRegressor(DecisionTreeRegressor(random_state=42), random_state=42))
        ])
    for i in range(1, ensemble_model_num)},

    **{f"XGBoost_{i}": XGBRegressor(random_state=42)
    for i in range(1, ensemble_model_num + 1)},
    
    **{f"RandomForest_{i}": Pipeline([
        ("rf", RandomForestRegressor(random_state=42, max_features= "sqrt"))
        ]) 
    for i in range(1, ensemble_model_num + 1)}
}

ensemble_param_grids = {
    "VotingClassifier": {
        "voting__estimator__weights": [[1] * len(base_estimators), 
                            [2] * len(base_estimators), 
                            [i for i in range(1, len(base_estimators) + 1)]],
    },

    "StackingClassifier": {
        "stacking__final_estimator__alpha": uniform(0.1, 20),
        # "stacking__final_estimator__poly__degree": randint(1, 2),
        "stacking__final_estimator__l1_ratio": uniform(0.01, 1),
    },

    **{
        f"Bagging_DecisionTree_{i}": {
            "bc__n_estimators": randint(10, 100 * i),
            "bc__max_samples": uniform(0.1, 0.1 + (1 - 0.1) / i),
            "bc__max_features": uniform(0.1, 0.1 + (1 - 0.1) / i),
            "bc__bootstrap": [True, False],
            "bc__bootstrap_features": [True, False]
        }
        for i in range(1, ensemble_model_num + 1)
    },

    **{
        f"XGBoost_{i}": {
            "max_depth": randint(3 , 10 * i),
            "learning_rate": uniform(0.01, 0.2 / i),
            "n_estimators": randint(50, 200 * i),
            "subsample": uniform(0.1, 0.1 + (1 - 0.1) / i),
            "colsample_bytree": uniform(0.1, 0.1 + (1 - 0.1) / i),
        }
        for i in range(1, ensemble_model_num + 1)
    },

    **{
        f"RandomForest_{i}": {
            "rf__n_estimators": randint(100, 300 * i), 
            "rf__max_depth": randint(5 * i, 20 * i),
            "rf__min_samples_split": randint(2 * i, 10 * i),
            "rf__min_samples_leaf": randint(1, 5 * i),
            "rf__bootstrap": [True, False],
        }
        for i in range(1, ensemble_model_num + 1)
    }
}

ensemble_model_grids = train_models(ensemble_pipelines, ensemble_param_grids, X_train, y_train, 10, 5)
ensemble_results = combine_grid_results(ensemble_model_grids)

Training and tuning Bagging_DecisionTree_1
Fitting 5 folds for each of 10 candidates, totalling 50 fits


python(23297) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23298) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23299) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23300) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23301) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23302) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23303) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23304) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 2/5] END bc__bootstrap=True, bc__bootstrap_features=False, bc__max_features=1.0507143064099163, bc__max_samples=0.8319939418114051, bc__n_estimators=70; output1_mae: (test=nan) output1_rmse: (test=nan) output2_mae: (test=nan) output2_rmse: (test=nan) total time=   0.0s
[CV 3/5] END bc__bootstrap=True, bc__bootstrap_features=False, bc__max_features=1.0507143064099163, bc__max_samples=0.8319939418114051, bc__n_estimators=70; output1_mae: (test=nan) output1_rmse: (test=nan) output2_mae: (test=nan) output2_rmse: (test=nan) total time=   0.0s
[CV 1/5] END bc__bootstrap=True, bc__bootstrap_features=False, bc__max_features=1.0507143064099163, bc__max_samples=0.8319939418114051, bc__n_estimators=70; output1_mae: (test=nan) output1_rmse: (test=nan) output2_mae: (test=nan) output2_rmse: (test=nan) total time=   0.0s
[CV 5/5] END bc__bootstrap=True, bc__bootstrap_features=False, bc__max_features=1.0507143064099163, bc__max_samples=0.8319939418114051, bc__n_estimators=70; output1_mae: (test=na

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/samiadam/minicond

Best parameters for Bagging_DecisionTree_1 based on 'output1_rmse': {'bc__bootstrap': True, 'bc__bootstrap_features': False, 'bc__max_features': 0.12306242504141576, 'bc__max_samples': 0.6247746602583891, 'bc__n_estimators': 51}
Best Score for Bagging_DecisionTree_1 based on 'output1_rmse': -0.3114683685173281

Training and tuning Bagging_DecisionTree_2
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bc__bootstrap=True, bc__bootstrap_features=True, bc__max_features=0.34520801406947516, bc__max_samples=0.1549862036999016, bc__n_estimators=97; output1_mae: (test=-0.230) output1_rmse: (test=-0.307) output2_mae: (test=-0.157) output2_rmse: (test=-0.238) total time=   1.0s
[CV 2/5] END bc__bootstrap=True, bc__bootstrap_features=True, bc__max_features=0.34520801406947516, bc__max_samples=0.1549862036999016, bc__n_estimators=97; output1_mae: (test=-0.202) output1_rmse: (test=-0.316) output2_mae: (test=-0.145) output2_rmse: (test=-0.257) total time=   1.8s
[CV 3/5] EN

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/xgboost/sklearn.py", line 1108, in fit
    self._Booster = train(
  File "/Users/samiadam/miniconda3/envs/fpl-project/lib/python3.10/site-packages/xgboost/core.

Best parameters for XGBoost_1 based on 'output1_rmse': {'colsample_bytree': 0.7116531604882809, 'learning_rate': 0.011413261043943482, 'max_depth': 3, 'n_estimators': 98, 'subsample': 0.6247746602583891}
Best Score for XGBoost_1 based on 'output1_rmse': -0.29281072637674327

Training and tuning XGBoost_2
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END colsample_bytree=0.18581025224334008, learning_rate=0.025599452033620268, max_depth=13, n_estimators=137, subsample=0.283539736126462; output1_mae: (test=-0.218) output1_rmse: (test=-0.303) output2_mae: (test=-0.148) output2_rmse: (test=-0.240) total time=   6.4s
[CV 1/5] END colsample_bytree=0.3059970653660494, learning_rate=0.10507143064099161, max_depth=13, n_estimators=121, subsample=0.4292621663083701; output1_mae: (test=-0.251) output1_rmse: (test=-0.352) output2_mae: (test=-0.200) output2_rmse: (test=-0.293) total time=  14.3s
[CV 2/5] END colsample_bytree=0.18581025224334008, learning_rate=0.0255994520336

KeyboardInterrupt: 

In [232]:
ensemble_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__bootstrap,param_rf__max_depth,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_output1_rmse,split1_test_output1_rmse,split2_test_output1_rmse,split3_test_output1_rmse,split4_test_output1_rmse,mean_test_output1_rmse,std_test_output1_rmse,rank_test_output1_rmse,split0_test_output1_mae,split1_test_output1_mae,split2_test_output1_mae,split3_test_output1_mae,split4_test_output1_mae,mean_test_output1_mae,std_test_output1_mae,rank_test_output1_mae,split0_test_output2_rmse,split1_test_output2_rmse,split2_test_output2_rmse,split3_test_output2_rmse,split4_test_output2_rmse,mean_test_output2_rmse,std_test_output2_rmse,rank_test_output2_rmse,split0_test_output2_mae,split1_test_output2_mae,split2_test_output2_mae,split3_test_output2_mae,split4_test_output2_mae,mean_test_output2_mae,std_test_output2_mae,rank_test_output2_mae,model
0,10.790712,5.921968,0.314331,0.242007,True,8,1,8,206,"{'rf__bootstrap': True, 'rf__max_depth': 8, 'r...",-0.289298,-0.302435,-0.319823,-0.297112,-0.292975,-0.300329,0.010682,7,-0.198508,-0.162347,-0.173356,-0.178515,-0.149459,-0.172437,0.016415,4,-0.226211,-0.251648,-0.277273,-0.273249,-0.250282,-0.255733,0.018378,5,-0.130457,-0.121454,-0.130142,-0.145066,-0.122893,-0.130003,0.008376,4,RandomForest_1
1,25.821606,10.040684,0.220999,0.135276,False,17,1,8,221,"{'rf__bootstrap': False, 'rf__max_depth': 17, ...",-0.313596,-0.309073,-0.319884,-0.306565,-0.293747,-0.308573,0.008686,10,-0.228426,-0.184281,-0.192096,-0.196757,-0.164743,-0.19326,0.020708,10,-0.233032,-0.25625,-0.279762,-0.278558,-0.254248,-0.26037,0.017371,10,-0.143439,-0.139684,-0.154654,-0.163718,-0.138114,-0.147922,0.009788,10,RandomForest_1
2,9.979763,4.373586,0.162154,0.102893,True,11,3,4,187,"{'rf__bootstrap': True, 'rf__max_depth': 11, '...",-0.288212,-0.301517,-0.320122,-0.296318,-0.291604,-0.299554,0.011218,3,-0.199985,-0.166264,-0.175288,-0.178009,-0.152018,-0.174313,0.015719,5,-0.223704,-0.251704,-0.277382,-0.272835,-0.251217,-0.255368,0.019095,4,-0.125406,-0.122903,-0.139888,-0.145116,-0.12713,-0.132089,0.008766,6,RandomForest_1
3,8.714122,4.12933,0.066019,0.02897,True,8,4,9,230,"{'rf__bootstrap': True, 'rf__max_depth': 8, 'r...",-0.28039,-0.30043,-0.319091,-0.29352,-0.291496,-0.296986,0.012791,2,-0.184664,-0.159629,-0.169728,-0.168798,-0.148124,-0.166189,0.012083,2,-0.222081,-0.250245,-0.276618,-0.272461,-0.250504,-0.254382,0.019477,2,-0.121269,-0.118574,-0.129643,-0.138397,-0.122515,-0.12608,0.007164,2,RandomForest_1
4,8.844819,2.805459,0.052489,0.012754,False,9,2,9,257,"{'rf__bootstrap': False, 'rf__max_depth': 9, '...",-0.289355,-0.301791,-0.318787,-0.29558,-0.292397,-0.299582,0.010449,5,-0.19778,-0.162626,-0.175371,-0.17301,-0.149513,-0.17166,0.015937,3,-0.223495,-0.251842,-0.277227,-0.27257,-0.25007,-0.255041,0.019129,3,-0.122476,-0.121269,-0.133517,-0.139192,-0.123407,-0.127972,0.007108,3,RandomForest_1
5,2.366542,1.055815,0.022957,0.004688,False,6,4,5,120,"{'rf__bootstrap': False, 'rf__max_depth': 6, '...",-0.277785,-0.299983,-0.31973,-0.292626,-0.294204,-0.296865,0.013577,1,-0.17944,-0.154392,-0.167432,-0.16309,-0.147494,-0.162369,0.010977,1,-0.22123,-0.250591,-0.277729,-0.272127,-0.250124,-0.25436,0.019958,1,-0.116768,-0.114463,-0.123173,-0.132853,-0.120025,-0.121456,0.006415,1,RandomForest_1
6,6.185487,3.243997,0.055053,0.009146,True,16,2,7,188,"{'rf__bootstrap': True, 'rf__max_depth': 16, '...",-0.298391,-0.303461,-0.319267,-0.301455,-0.292,-0.302915,0.009049,9,-0.215095,-0.175859,-0.183875,-0.191972,-0.158486,-0.185058,0.018666,9,-0.228024,-0.252716,-0.278533,-0.273504,-0.253743,-0.257304,0.017912,9,-0.136444,-0.133317,-0.150462,-0.153019,-0.134146,-0.141478,0.00848,9,RandomForest_1
7,7.889487,3.974906,0.079998,0.037923,True,15,3,8,269,"{'rf__bootstrap': True, 'rf__max_depth': 15, '...",-0.288164,-0.302457,-0.318777,-0.297273,-0.291194,-0.299573,0.010796,4,-0.201507,-0.172538,-0.178605,-0.180555,-0.156954,-0.178032,0.014368,7,-0.224724,-0.251165,-0.277049,-0.273745,-0.252017,-0.25574,0.018842,6,-0.128178,-0.128559,-0.141938,-0.150471,-0.130592,-0.135947,0.008837,8,RandomForest_1
8,5.378677,2.714079,0.044149,0.028793,False,16,4,9,114,"{'rf__bootstrap': False, 'rf__max_depth': 16, ...",-0.287355,-0.302499,-0.318893,-0.298396,-0.292399,-0.299908,0.010802,6,-0.195653,-0.168626,-0.178897,-0.179417,-0.158118,-0.176142,0.012493,6,-0.224157,-0.252965,-0.278359,-0.273586,-0.251515,-0.256116,0.019248,7,-0.121066,-0.125599,-0.137893,-0.145698,-0.129593,-0.13197,0.008817,5,RandomForest_1
9,5.668709,2.458649,0.044305,0.018442,False,18,3,7,150,"{'rf__bootstrap': False, 'rf__max_depth': 18, ...",-0.290867,-0.304222,-0.318533,-0.299038,-0.291693,-0.300871,0.010108,8,-0.200596,-0.173084,-0.181319,-0.182959,-0.157636,-0.179119,0.013991,8,-0.2243,-0.252545,-0.278461,-0.274494,-0.25293,-0.256546,0.019346,8,-0.122749,-0.128415,-0.142812,-0.15199,-0.132632,-0.13572,0.010451,7,RandomForest_1


In [239]:
for i in range(1, 20):
    print((0.1, 0.1 + (1 - 0.1) / i))

(0.1, 1.0)
(0.1, 0.55)
(0.1, 0.4)
(0.1, 0.325)
(0.1, 0.28)
(0.1, 0.25)
(0.1, 0.2285714285714286)
(0.1, 0.21250000000000002)
(0.1, 0.2)
(0.1, 0.19)
(0.1, 0.18181818181818182)
(0.1, 0.175)
(0.1, 0.16923076923076924)
(0.1, 0.1642857142857143)
(0.1, 0.16)
(0.1, 0.15625)
(0.1, 0.15294117647058825)
(0.1, 0.15000000000000002)
(0.1, 0.1473684210526316)
