In [1]:
import pandas as pd
df=pd.read_csv('fpl_mid_data.csv')

In [2]:
df.isna().sum()

season                        0
gameweek                      0
name                          0
position                      0
team                          0
xP                            0
assists                       0
bonus                         0
bps                           0
clean_sheets                  0
creativity                    0
element                       0
fixture                       0
goals_conceded                0
goals_scored                  0
ict_index                     0
influence                     0
kickoff_time                  0
minutes                       0
opponent_team                 0
own_goals                     0
penalties_missed              0
penalties_saved               0
red_cards                     0
round                         0
saves                         0
selected                      0
team_a_score                  0
team_h_score                  0
threat                        0
total_points                  0
transfer

In [4]:
df.shape

(35452, 43)

In [1]:
import pandas as pd
import numpy as np

# Load your midfielder data
mid_df = pd.read_csv("fpl_mid_data.csv")

# Sort by player & gameweek to compute rolling stats
mid_df = mid_df.sort_values(by=['element', 'gameweek']).reset_index(drop=True)

# Rolling average of last 3 gameweeks
rolling_features = ['total_points', 'minutes', 'goals_scored', 'assists', 'bonus']
for col in rolling_features:
    mid_df[f'{col}_last_3'] = mid_df.groupby('element')[col].transform(lambda x: x.shift(1).rolling(window=3).mean())

# 🔹 Involvement per 90
mid_df['goals_per_90'] = (mid_df['goals_scored'] / (mid_df['minutes'] + 1e-5)) * 90
mid_df['assists_per_90'] = (mid_df['assists'] / (mid_df['minutes'] + 1e-5)) * 90
mid_df['involvements_per_90'] = ((mid_df['goals_scored'] + mid_df['assists']) / (mid_df['minutes'] + 1e-5)) * 90

# 🔹 Expected contribution
mid_df['expected_contribution'] = mid_df['expected_goals'] + mid_df['expected_assists']

# 🔹 Attacking intensity
mid_df['attacking_threat'] = mid_df['threat'] + mid_df['creativity']

# 🔹 Clean sheet bonus potential
mid_df['clean_sheet_bonus'] = mid_df['clean_sheets'] * 1  # MID gets 1pt for CS

# Drop rows with NaNs from rolling features
mid_df = mid_df.dropna()

# Save engineered version
mid_df.to_csv("fpl_mid_data.csv", index=False)

print("✅ Feature engineering for midfielders complete. New shape:", mid_df.shape)


✅ Feature engineering for midfielders complete. New shape: (33414, 54)


In [2]:
df=pd.read_csv('fpl_mid_data.csv')

In [3]:
df.corr(numeric_only=True)['future_points'].sort_values(ascending=False)

future_points                 1.000000
minutes                       0.504226
xP                            0.468975
ict_index                     0.454706
attacking_threat              0.451669
starts                        0.425444
bps                           0.415879
total_points                  0.405791
creativity                    0.395623
threat                        0.390117
influence                     0.386685
expected_contribution         0.368059
expected_goal_involvements    0.365097
value                         0.359462
expected_goals_conceded       0.352064
minutes_last_3                0.350374
selected                      0.327996
total_points_last_3           0.320647
expected_assists              0.308990
goals_conceded                0.302676
expected_goals                0.299347
transfers_in                  0.252877
clean_sheet_bonus             0.238502
clean_sheets                  0.238502
goals_scored                  0.214034
goals_scored_last_3      

In [6]:
cols_to_drop = [
    'penalties_saved',
    'saves',
    'element',
    'round',
    'fixture',
    'was_home',
    'red_cards',
    'opponent_team',
    'team_a_score',
    'team_h_score',
    'own_goals',
    'penalties_missed',
    'yellow_cards',
    'transfers_balance',         
    'involvements_per_90' ,         
    'goals_per_90',             
    'assists_per_90'              
]

# Drop irrelevant columns
df = df.drop(columns=cols_to_drop)

# Save cleaned version (optional)
df.to_csv("fpl_mid_data.csv", index=False)

print("✅ Columns dropped. New shape:", df.shape)



✅ Columns dropped. New shape: (33414, 54)


# Ml Model

In [1]:
import pandas as pd
df=pd.read_csv('fpl_mid_data.csv')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# 🔹 Drop non-model-useful columns
df = df.drop(columns=['season', 'name', 'position', 'team', 'kickoff_time'],axis=1)

# 🔹 Drop NaNs due to rolling features
df = df.dropna()

# 🔹 Sort by time (so future rows don't leak into training)
df = df.sort_values(by=['gameweek']).reset_index(drop=True)

# ✅ Time-based split: train on GWs 1–30, test on GWs 31–38
train_df = df[df['gameweek'] <= 30]
test_df  = df[df['gameweek'] > 30]

# 🔹 Split into features and target
X_train = train_df.drop(columns=['future_points'])
y_train = train_df['future_points']

X_test = test_df.drop(columns=['future_points'])
y_test = test_df['future_points']


models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "HistGradient":HistGradientBoostingRegressor(min_samples_leaf= 50, max_leaf_nodes= 15, max_iter= 150, max_depth= 3, learning_rate= 0.1, l2_regularization= 0.1)
}

# ===============================
# Train & Evaluate
# ===============================
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append({
        "Model": name,
        "MAE": round(mae, 4),
        "MSE": round(mse, 4),
        "R²": round(r2, 4)
    })

# ===============================
# Show Results
# ===============================
results_df = pd.DataFrame(results).sort_values(by="MAE")
print("📊 Model Performance Comparison:")
print(results_df.to_string(index=False))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3813
[LightGBM] [Info] Number of data points in the train set: 25357, number of used features: 31
[LightGBM] [Info] Start training from score 1.250621
📊 Model Performance Comparison:
            Model    MAE    MSE     R²
     HistGradient 0.9238 3.6862 0.3351
         LightGBM 0.9457 3.8100 0.3127
          XGBoost 0.9485 3.8217 0.3106
         CatBoost 0.9555 3.8553 0.3046
Linear Regression 0.9851 3.7507 0.3234
    Random Forest 1.0674 4.0518 0.2691


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
models={
    "LinearRegression": LinearRegression(),
    "Ridge": GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5),
    "Lasso": GridSearchCV(Lasso(max_iter=10000), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5),
    "ElasticNet": GridSearchCV(ElasticNet(max_iter=10000), {'alpha': [0.01, 0.1, 1.0, 10]}, cv=5),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    }

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append({
        "Model": name,
        "MAE": round(mae, 4),
        "MSE": round(mse, 4),
        "R²": round(r2, 4)
    })

# ===============================
# Show Results
# ===============================
results_df = pd.DataFrame(results).sort_values(by="MAE")
print("📊 Model Performance Comparison:")
print(results_df.to_string(index=False))

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


📊 Model Performance Comparison:
           Model    MAE    MSE     R²
LinearRegression 0.9851 3.7507 0.3234
           Ridge 0.9855 3.7494 0.3237
      ElasticNet 0.9938 3.7588 0.3220
           Lasso 0.9941 3.7640 0.3210
    RandomForest 1.0674 4.0518 0.2691


In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Define model
model = HistGradientBoostingRegressor(random_state=42)

# Hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_leaf_nodes': [15, 31, 63],
    'max_depth': [3, 5, 7, None],
    'min_samples_leaf': [10, 20, 30, 50],
    'l2_regularization': [0.0, 0.1, 0.5, 1.0]
}

# Randomized search with 5-fold CV on training data
search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=30,
                            scoring='neg_mean_absolute_error', cv=5,
                            random_state=42, n_jobs=-1, verbose=1)

# Fit on training data only
search.fit(X_train, y_train)

# Best model
best_model = search.best_estimator_
print("✅ Best Params:", search.best_params_)

# Predict on test set
y_pred = best_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n📊 Test MAE: {mae:.4f}")
print(f"📈 Test R² Score: {r2:.4f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
✅ Best Params: {'min_samples_leaf': 50, 'max_leaf_nodes': 15, 'max_iter': 100, 'max_depth': None, 'learning_rate': 0.2, 'l2_regularization': 1.0}

📊 Test MAE: 0.9232
📈 Test R² Score: 0.3249


# Best model found to HistGradientBoosting with R2 score of 0.34
min_samples_leaf= 50, max_leaf_nodes= 15, max_iter= 150, max_depth= 3, learning_rate= 0.1, l2_regularization= 0.1

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


# 🔹 Drop non-model-useful columns
df = df.drop(columns=['season', 'name', 'position', 'team', 'kickoff_time'],axis=1)

# 🔹 Drop NaNs due to rolling features
df = df.dropna()

# 🔹 Sort by time (so future rows don't leak into training)
df = df.sort_values(by=['gameweek']).reset_index(drop=True)

# ✅ Time-based split: train on GWs 1–30, test on GWs 31–38
train_df = df[df['gameweek'] <= 30]
test_df  = df[df['gameweek'] > 30]

# 🔹 Split into features and target
X_train = train_df.drop(columns=['future_points'])
y_train = train_df['future_points']

X_test = test_df.drop(columns=['future_points'])
y_test = test_df['future_points']

model=HistGradientBoostingRegressor(min_samples_leaf= 50, max_leaf_nodes= 15, max_iter= 150, max_depth= 3, learning_rate= 0.1, l2_regularization= 0.1)

model.fit(X_train, y_train)
preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

print(f"\n📊 Test MAE: {mae:.4f}")
print(f"📈 Test R² Score: {r2:.4f}")




📊 Test MAE: 0.9216
📈 Test R² Score: 0.3348


In [3]:
import joblib
joblib.dump(model,'mid_model.pkl')

['mid_model.pkl']