In [64]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import optuna as opt
import pickle as pkl

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [19]:
rf = RandomForestClassifier(n_estimators=100, min_samples_split= 10 ,random_state=1)

In [20]:
match_data = pd.read_csv('fbref_prem_data.csv')

In [21]:
train_data = match_data[match_data['Date'] < "2023-01-01"]
test_data = match_data[match_data['Date'] >= "2023-01-01"]

In [35]:
predictors = ['Opp_Code', 'Venue_Code', 'GF_rolling', 'GA_rolling', 'Poss_rolling', 'xG_rolling',
       'xGA_rolling', 'Sh_rolling', 'SoT_rolling', 'Dist_rolling',
       'SoTA_rolling', 'Cmp_rolling', 'Att_rolling', 'TotDist_rolling',
       'PrgDist_rolling', 'SCA_rolling', 'GCA_rolling']



In [36]:
rf.fit(train_data[predictors], train_data['Simple Target'])

In [37]:
predictions = rf.predict(test_data[predictors])


In [42]:
accuracy = accuracy_score(test_data['Simple Target'], predictions)
precision = precision_score(test_data['Simple Target'], predictions, average='weighted')
recall = recall_score(test_data['Simple Target'], predictions, average='weighted')
f1 = f1_score(test_data['Simple Target'], predictions, average='weighted')


In [45]:
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)

Accuracy:  0.6516079632465543
Precision:  0.6376315978966559
Recall:  0.6516079632465543
F1:  0.6229808893442819


In [57]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 200, 500, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_terminal_nodes = trial.suggest_int("max_terminal_nodes", 2, 100)

    rf = RandomForestClassifier(n_estimators=n_estimators, 
                                min_samples_split=min_samples_split, 
                                max_depth=max_depth, 
                                min_samples_leaf=min_samples_leaf, 
                                #max_terminal_nodes=max_terminal_nodes,
                                random_state=1)
    rf.fit(train_data[predictors], train_data['Simple Target'])
    predictions = rf.predict(test_data[predictors])
    return f1_score(test_data['Simple Target'], predictions, average='weighted')

In [58]:
# Create study object
study = opt.create_study(direction="maximize")

# Run optimization process
study.optimize(objective, n_trials=20, show_progress_bar=True)

[I 2024-09-30 17:23:05,350] A new study created in memory with name: no-name-f44cf008-6397-4817-8adb-8958d6452f4c


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-09-30 17:23:07,391] Trial 0 finished with value: 0.6290342337883222 and parameters: {'n_estimators': 367, 'max_depth': 13, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_terminal_nodes': 17}. Best is trial 0 with value: 0.6290342337883222.
[I 2024-09-30 17:23:08,179] Trial 1 finished with value: 0.6205019479828062 and parameters: {'n_estimators': 289, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_terminal_nodes': 48}. Best is trial 0 with value: 0.6290342337883222.
[I 2024-09-30 17:23:10,581] Trial 2 finished with value: 0.6332032965497589 and parameters: {'n_estimators': 396, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_terminal_nodes': 27}. Best is trial 2 with value: 0.6332032965497589.
[I 2024-09-30 17:23:13,191] Trial 3 finished with value: 0.6267390215992906 and parameters: {'n_estimators': 458, 'max_depth': 24, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_terminal_nodes': 47}. Best is trial 2 with value: 0.6332

In [59]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=15, state=1, values=[0.6338373551540358], datetime_start=datetime.datetime(2024, 9, 30, 17, 23, 32, 85019), datetime_complete=datetime.datetime(2024, 9, 30, 17, 23, 34, 150955), params={'n_estimators': 327, 'max_depth': 19, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_terminal_nodes': 32}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=True, low=200, step=1), 'max_depth': IntDistribution(high=32, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=10, log=False, low=1, step=1), 'max_terminal_nodes': IntDistribution(high=100, log=False, low=2, step=1)}, trial_id=15, value=None)
Best hyperparameters: {'n_estimators': 327, 'max_depth': 19, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_terminal_nodes': 32}


In [60]:
#get the best params and put them in the model
best_params = study.best_params
rf = RandomForestClassifier(n_estimators=best_params['n_estimators'], 
                            min_samples_split=best_params['min_samples_split'], 
                            max_depth=best_params['max_depth'], 
                            min_samples_leaf=best_params['min_samples_leaf'], 
                            #max_terminal_nodes=max_terminal_nodes,
                            random_state=1)

In [61]:
#calculate the metrics
rf.fit(train_data[predictors], train_data['Simple Target'])
predictions = rf.predict(test_data[predictors])


In [62]:
precision = precision_score(test_data['Simple Target'], predictions, average='weighted')
recall = recall_score(test_data['Simple Target'], predictions, average='weighted')
f1 = f1_score(test_data['Simple Target'], predictions, average='weighted')
accuracy = accuracy_score(test_data['Simple Target'], predictions)

In [63]:
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)

Accuracy:  0.6653905053598775
Precision:  0.6569554675146235
Recall:  0.6653905053598775
F1:  0.6338373551540358


In [67]:
#Serialize the best params

with open('rf_model_params.pkl', 'wb') as file:
    pkl.dump(best_params, file)