In [1]:
import pandas as pd
df=pd.read_csv('fpl_fwd_data.csv')

In [2]:
df.columns

Index(['season', 'gameweek', 'name', 'position', 'team', 'xP', 'assists',
       'bonus', 'bps', 'clean_sheets', 'creativity', 'element', 'fixture',
       'goals_conceded', 'goals_scored', 'ict_index', 'influence',
       'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'expected_assists',
       'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'starts', 'future_points'],
      dtype='object')

In [3]:
df.shape

(9705, 43)

In [4]:
import pandas as pd
import numpy as np

# Load your data
fwd_df = pd.read_csv("fpl_fwd_data.csv")

# Sort by player and gameweek
fwd_df = fwd_df.sort_values(by=['element', 'gameweek']).reset_index(drop=True)

# 🔹 Rolling averages (last 3 GWs)
rolling_cols = ['goals_scored', 'assists', 'minutes', 'bonus', 'total_points']
for col in rolling_cols:
    fwd_df[f'{col}_last_3'] = fwd_df.groupby('element')[col].transform(lambda x: x.shift(1).rolling(window=3).mean())

# 🔹 Per 90 stats
fwd_df['goals_per_90'] = (fwd_df['goals_scored'] / (fwd_df['minutes'] + 1e-5)) * 90
fwd_df['assists_per_90'] = (fwd_df['assists'] / (fwd_df['minutes'] + 1e-5)) * 90
fwd_df['xg_per_90'] = (fwd_df['expected_goals'] / (fwd_df['minutes'] + 1e-5)) * 90
fwd_df['xa_per_90'] = (fwd_df['expected_assists'] / (fwd_df['minutes'] + 1e-5)) * 90

# 🔹 Involvements
fwd_df['involvements_per_90'] = ((fwd_df['goals_scored'] + fwd_df['assists']) / (fwd_df['minutes'] + 1e-5)) * 90
fwd_df['expected_involvements'] = fwd_df['expected_goal_involvements']

# 🔹 Goal efficiency (actual vs expected)
fwd_df['goal_efficiency'] = fwd_df['goals_scored'] - fwd_df['expected_goals']

# 🔹 Form features
fwd_df['attacking_threat'] = fwd_df['threat'] + fwd_df['creativity'] + fwd_df['influence']

# 🔹 Clean up: drop rows with NaNs from rolling
fwd_df = fwd_df.dropna()

# Save engineered version
fwd_df.to_csv("fpl_fwd_data.csv", index=False)

print("✅ FWD Feature Engineering Complete! New shape:", fwd_df.shape)


✅ FWD Feature Engineering Complete! New shape: (8924, 56)


In [5]:
fwd_df.corr(numeric_only=True)['future_points'].sort_values(ascending=False)

future_points                 1.000000
minutes                       0.519317
xP                            0.440234
starts                        0.434625
minutes_last_3                0.430538
attacking_threat              0.424709
ict_index                     0.423546
total_points                  0.400387
threat                        0.398073
total_points_last_3           0.392931
expected_goals_conceded       0.366349
influence                     0.355705
bps                           0.354085
creativity                    0.349143
selected                      0.341706
expected_goal_involvements    0.339566
expected_involvements         0.339566
value                         0.325884
goals_conceded                0.319210
expected_goals                0.318286
goals_scored_last_3           0.301094
goals_scored                  0.286042
transfers_in                  0.284600
bonus_last_3                  0.274750
bonus                         0.248919
clean_sheets             

In [6]:
fwd_df.drop(['team_h_score','team_a_score','was_home','own_goals','red_cards','fixture','round','element','saves','penalties_saved'],axis=1,inplace=True)

In [7]:
fwd_df.to_csv('fpl_fwd_data.csv',index=False)

# ML Model

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df=pd.read_csv('fpl_fwd_data.csv')

# 🔹 Drop non-model-useful columns
df = df.drop(columns=['season', 'name', 'position', 'team', 'kickoff_time'],axis=1)

# 🔹 Drop NaNs due to rolling features
df = df.dropna()

# 🔹 Sort by time (so future rows don't leak into training)
df = df.sort_values(by=['gameweek']).reset_index(drop=True)

# ✅ Time-based split: train on GWs 1–30, test on GWs 31–38
train_df = df[df['gameweek'] <= 30]
test_df  = df[df['gameweek'] > 30]

# 🔹 Split into features and target
X_train = train_df.drop(columns=['future_points'])
y_train = train_df['future_points']

X_test = test_df.drop(columns=['future_points'])
y_test = test_df['future_points']


models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "HistGradient":HistGradientBoostingRegressor(min_samples_leaf= 50, max_leaf_nodes= 15, max_iter= 150, max_depth= 3, learning_rate= 0.1, l2_regularization= 0.1),
    "Ridge": GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5),
    "Lasso": GridSearchCV(Lasso(max_iter=10000), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5),
    "ElasticNet": GridSearchCV(ElasticNet(max_iter=10000), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5),
}

# ===============================
# Train & Evaluate
# ===============================
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append({
        "Model": name,
        "MAE": round(mae, 4),
        "MSE": round(mse, 4),
        "R²": round(r2, 4)
    })

# ===============================
# Show Results
# ===============================
results_df = pd.DataFrame(results).sort_values(by="MAE")
print("📊 Model Performance Comparison:")
print(results_df.to_string(index=False))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001510 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4867
[LightGBM] [Info] Number of data points in the train set: 6688, number of used features: 39
[LightGBM] [Info] Start training from score 1.334181


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  mod

📊 Model Performance Comparison:
            Model    MAE    MSE     R²
     HistGradient 1.0326 4.6998 0.2958
Linear Regression 1.0431 4.6347 0.3055
            Ridge 1.0433 4.6320 0.3059
         LightGBM 1.0468 4.9136 0.2637
          XGBoost 1.0684 4.9407 0.2597
            Lasso 1.0925 4.6732 0.2997
         CatBoost 1.0982 5.0646 0.2411
       ElasticNet 1.1182 4.6785 0.2990
    Random Forest 1.1421 4.8684 0.2705


In [11]:
#Ridge regression model performs best at 0.306 

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df=pd.read_csv('fpl_fwd_data.csv')

# 🔹 Drop non-model-useful columns
df = df.drop(columns=['season', 'name', 'position', 'team', 'kickoff_time'],axis=1)

# 🔹 Drop NaNs due to rolling features
df = df.dropna()

# 🔹 Sort by time (so future rows don't leak into training)
df = df.sort_values(by=['gameweek']).reset_index(drop=True)

# ✅ Time-based split: train on GWs 1–30, test on GWs 31–38
train_df = df[df['gameweek'] <= 30]
test_df  = df[df['gameweek'] > 30]

# 🔹 Split into features and target
X_train = train_df.drop(columns=['future_points'])
y_train = train_df['future_points']

X_test = test_df.drop(columns=['future_points'])
y_test = test_df['future_points']

model=GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5)

model.fit(X_train, y_train)
preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)


print(f"\n📊 Test MAE: {mae:.4f}")
print(f"📈 Test R² Score: {r2:.4f}")

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T



📊 Test MAE: 1.0433
📈 Test R² Score: 0.3059


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [3]:
import joblib
joblib.dump(model,'fwd_model.pkl')

['fwd_model.pkl']