In [19]:
import lightgbm as lgb
import optuna as opt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [12]:
match_data = pd.read_csv('fbref_prem_data.csv')

In [13]:
train_data = match_data[match_data['Date'] < "2023-01-01"]
test_data = match_data[match_data['Date'] >= "2023-01-01"]

In [16]:
predictors = ['Team_Code', 'Opp_Code', 'Venue_Code', 'GF_rolling', 'GA_rolling', 'Poss_rolling', 'xG_rolling',
       'xGA_rolling', 'Sh_rolling', 'SoT_rolling', 'Dist_rolling',
       'SoTA_rolling', 'Cmp_rolling', 'Att_rolling', 'TotDist_rolling',
       'PrgDist_rolling', 'SCA_rolling', 'GCA_rolling']


In [17]:
scaler = StandardScaler(with_mean=False)
train_data[predictors] = scaler.fit_transform(train_data[predictors])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[predictors] = scaler.fit_transform(train_data[predictors])


In [24]:
def objective(trial):
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1e-1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }


    model = lgb.LGBMClassifier(**param)
    model.fit(train_data[predictors], train_data['Target'])
    predictions = model.predict(test_data[predictors])
    f1 = f1_score(test_data['Target'], predictions, average='weighted')

    return f1



In [25]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2024-10-01 11:24:45,101] A new study created in memory with name: no-name-1e976fd1-3cdb-45d2-a0cf-7ef333cee71a
[I 2024-10-01 11:24:46,258] Trial 0 finished with value: 0.21491628099802865 and parameters: {'lambda_l1': 0.5672617759531154, 'lambda_l2': 1.3884574763960698, 'learning_rate': 0.013173919817441627, 'num_leaves': 40, 'feature_fraction': 0.9389049915430647, 'bagging_fraction': 0.922806205509885, 'bagging_freq': 6, 'min_child_samples': 17}. Best is trial 0 with value: 0.21491628099802865.
[I 2024-10-01 11:24:47,159] Trial 1 finished with value: 0.21491628099802865 and parameters: {'lambda_l1': 1.619773725779366e-07, 'lambda_l2': 1.9702922135771813e-05, 'learning_rate': 0.0017325922047733574, 'num_leaves': 217, 'feature_fraction': 0.6176719784471181, 'bagging_fraction': 0.924462701466454, 'bagging_freq': 6, 'min_child_samples': 74}. Best is trial 0 with value: 0.21491628099802865.
[I 2024-10-01 11:24:47,669] Trial 2 finished with value: 0.21491628099802865 and parameters: {'la

In [26]:
best_params = {
    "objective": "multiclass",
    "metric": "multi_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "num_class": 103,
    "lambda_l1": study.best_params["lambda_l1"],
    "lambda_l2": study.best_params["lambda_l2"],
    "learning_rate": study.best_params["learning_rate"],
    "num_leaves": study.best_params["num_leaves"],
    "feature_fraction": study.best_params["feature_fraction"],
    "bagging_fraction": study.best_params["bagging_fraction"],
    "bagging_freq": study.best_params["bagging_freq"],
    "min_child_samples": study.best_params["min_child_samples"],
}

In [28]:
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(train_data[predictors], train_data['Target'])
predictions = final_model.predict(test_data[predictors])

In [29]:
accuracy = accuracy_score(test_data['Target'], predictions)
precision = precision_score(test_data['Target'], predictions, average='weighted')
recall = recall_score(test_data['Target'], predictions, average='weighted')
f1 = f1_score(test_data['Target'], predictions, average='weighted')

In [30]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Accuracy: 0.3943338437978561
Precision: 0.4473122541512776
Recall: 0.3943338437978561
F1: 0.24078703113646974
