In [1]:
''' 
Notes:
- since only looking at 2012-15, can just do cross-val. Don't expect temporal differences
(as opposed to if training from 1950-2023, where causal differences in league)
- want to rank all prospects, so no test set 
'''

" \nNotes:\n- since only looking at 2012-15, can just do cross-val. Don't expect temporal differences\n(as opposed to if training from 1950-2023, where causal differences in league)\n- only care about ranking current prospects, not how metrics generalize to unseen prospects.\nso no need for test set \n"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import pickle

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import cross_val_predict

In [4]:
import xgboost as xgb
from xgboost import plot_importance

In [5]:
dataset = pd.read_csv("model_data/input_dataset.csv", index_col=0)


In [9]:
# drop targets and NFL production
X = dataset.drop(["Score", "Success"] + ['num_seasons', 'GamesPlayed', 'GamesStarted', 'Plays', 'PositivePlays',
       'NegativePlays', 'GP%', 'GS%', 'PosPlay%', 'NegPlay%', 'NeutPlay%'],axis=1)
X["ProPosition"] = X["ProPosition"].astype("category")
X["IndyInvite"] = X["IndyInvite"].astype("category")

In [1]:
# return score and params
def grid_search(model, X, y, param_grid, num_folds, eval_scoring_str):
    clf = GridSearchCV(
        model,
        param_grid,
        verbose=1,
        cv=num_folds, 
        refit=True,
        scoring=eval_scoring_str  #accuracy, neg_log_loss, roc_auc
    )
    clf.fit(X, y)
    print(f"{clf.best_score_}")
    print(f"{clf.best_params_}")
    return clf

def run_grid_search():
    param_grid = {"max_depth":    [6, 4],
              "learning_rate": [0.1],
              "n_estimators": [100, 500, 600]
            }
    NUM_FOLDS=5
    grid_search_xg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True, 
                                        objective='reg:squarederror')
    grid_search_cv = grid_search(grid_search_xg, X, dataset['Score'], 
                                param_grid, num_folds=NUM_FOLDS, eval_scoring_str='neg_mean_squared_error')    # neg_mean_squared_error
    grid_search_model = grid_search_cv.best_estimator_
    grid_search_preds = cross_val_predict(grid_search_model, X, dataset['Score'], cv=X.shape[0])



In [10]:
def objective(trial):
    param = {"max_depth":    trial.suggest_categorical('max_depth', [2, 3, 4, 5, 6, 7, 8, 9]),
              "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
              "n_estimators": trial.suggest_int('n_estimators', 100, 1000,step=100),
              "subsample" : trial.suggest_float('subsample', 0.1, 1, step=0.1),
              "min_child_weight" : trial.suggest_int('min_child_weight', 1, 10, step=1), 
              "colsample_bytree" : trial.suggest_float('subsample', 0.1, 1, step=0.1),
            }
    
    clf = xgb.XGBRegressor(tree_method="hist", enable_categorical=True, 
                                      objective='reg:absoluteerror', **param)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, dataset['Score'], cv=kfold, scoring='neg_mean_absolute_error')
    score = np.mean(scores)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_trial)

[I 2024-04-07 13:52:01,022] A new study created in memory with name: no-name-54f33364-a79a-4c16-af7d-542f49eecd92


[I 2024-04-07 13:52:03,896] Trial 0 finished with value: -0.06296795641091547 and parameters: {'max_depth': 9, 'learning_rate': 0.05625242735857511, 'n_estimators': 300, 'subsample': 0.2, 'min_child_weight': 3}. Best is trial 0 with value: -0.06296795641091547.
[I 2024-04-07 13:52:07,757] Trial 1 finished with value: -0.05704431929763076 and parameters: {'max_depth': 5, 'learning_rate': 0.0160986838287145, 'n_estimators': 500, 'subsample': 1.0, 'min_child_weight': 5}. Best is trial 1 with value: -0.05704431929763076.
[I 2024-04-07 13:52:16,311] Trial 2 finished with value: -0.05967539246698013 and parameters: {'max_depth': 9, 'learning_rate': 0.24129133929841357, 'n_estimators': 700, 'subsample': 0.9, 'min_child_weight': 6}. Best is trial 1 with value: -0.05704431929763076.
[I 2024-04-07 13:52:16,740] Trial 3 finished with value: -0.0652410051602136 and parameters: {'max_depth': 3, 'learning_rate': 0.014483856214201587, 'n_estimators': 100, 'subsample': 0.5, 'min_child_weight': 4}. Bes

FrozenTrial(number=22, state=TrialState.COMPLETE, values=[-0.056016701589095205], datetime_start=datetime.datetime(2024, 4, 7, 13, 53, 14, 269439), datetime_complete=datetime.datetime(2024, 4, 7, 13, 53, 15, 758795), params={'max_depth': 2, 'learning_rate': 0.017640083858920422, 'n_estimators': 400, 'subsample': 0.8, 'min_child_weight': 7}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': CategoricalDistribution(choices=(2, 3, 4, 5, 6, 7, 8, 9)), 'learning_rate': FloatDistribution(high=0.5, log=True, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=100), 'subsample': FloatDistribution(high=1.0, log=False, low=0.1, step=0.1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1)}, trial_id=22, value=None)


In [11]:
study.best_value

-0.056016701589095205

In [12]:
study.best_params

{'max_depth': 2,
 'learning_rate': 0.017640083858920422,
 'n_estimators': 400,
 'subsample': 0.8,
 'min_child_weight': 7}

In [14]:
clf = xgb.XGBRegressor(tree_method="hist", enable_categorical=True, 
                                      objective='reg:squarederror', **study.best_params)
clf.fit(X, dataset['Score'])


In [16]:
model_pred = cross_val_predict(clf, X, dataset['Score'], cv=X.shape[0])

In [17]:
dataset_with_scores = dataset.copy(deep=True)
dataset_with_scores['Pred_Score'] = model_pred
dataset_with_scores['Pred_Error'] = dataset_with_scores.Score - dataset_with_scores.Pred_Score

In [18]:
dataset_with_scores = dataset_with_scores[['Pred_Score', 'Pred_Error'] + list(dataset.columns)]

In [19]:
final_mse = np.power(dataset_with_scores.Pred_Score - dataset_with_scores.Score, 2).mean()
print(f"MSE = {np.round(final_mse, 5)}")
print(f"RMSE = {np.round(np.sqrt(final_mse), 5)}")
final_mae = abs(dataset_with_scores.Pred_Score - dataset_with_scores.Score).mean()
print(f"MAE = {np.round(final_mae, 5)}")

MSE = 0.00519
RMSE = 0.07203
MAE = 0.05594


In [22]:
pickle.dump(clf, open("model_data/xgb_model.pkl", "wb"))

In [23]:
dataset_with_scores.to_csv("data/dataset_with_preds.csv")