In [None]:
# load necessary libraries
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import plot_importance

In [None]:
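# load 2022 data
# clean 2022 data
# TODO: this cell must define the feature matrix `X` and the target `y`
# that the train/test split below consumes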

In [None]:
# split data into train/test sets (80% train / 20% test)
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical on each Restart & Run All
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=20)

In [None]:
# baseline model: only the objective is set, no hyperparameter tuning yet
xgbr = xgb.XGBRegressor(
    objective='reg:squarederror',
)
xgbr.fit(X=xtrain, y=ytrain)

In [None]:
# calculate the RMSE and accuracy of the algorithm
ypred = xgbr.predict(xtest)
mse = mean_squared_error(ytest, ypred)
# RMSE is the square root of the mean squared error
print("RMSE: %.5f" % (mse**(1/2.0)))

# accuracy compares the predictions rounded to the nearest integer against
# the held-out targets (assumes ytest holds integer labels — TODO confirm)
predictions = [round(value) for value in ypred]
# score against `ytest`, the name produced by the train/test split
accuracy = accuracy_score(ytest, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# use grid search to tune hyperparameters
# NOTE: this grid holds roughly 13,000 combinations (6 * 4 * 5 * 6 * 6 * 3),
# and each one is refit once per cross-validation fold, so this cell is slow.
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}

# random_state is the documented seed parameter of the scikit-learn wrapper
xgbr = xgb.XGBRegressor(random_state=20)
clf = GridSearchCV(estimator=xgbr,
                   param_grid=params,
                   scoring='neg_mean_squared_error',
                   verbose=1)
# search on the training split only, so the held-out test rows never
# influence which hyperparameters get selected
clf.fit(xtrain, ytrain)
print("Best parameters:", clf.best_params_)
# best_score_ is a negated MSE; flip the sign before taking the square root
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

In [None]:
# update algorithm with tuned hyperparameters
# NOTE(review): the values below are hard-coded rather than read from
# clf.best_params_ — confirm they reflect the grid-search result.
# NOTE(review): assumes ytrain holds class labels, since the next cell scores
# accuracy — if the target is continuous, use xgb.XGBRegressor with
# 'reg:squarederror' instead.
xgb1 = xgb.XGBClassifier(
 learning_rate = 0.05,
 n_estimators= 150,
 max_depth=3,
 colsample_bytree=0.8,
 # a classifier needs a classification loss; 'reg:squarederror' is a
 # regression objective and does not belong on XGBClassifier
 objective= 'binary:logistic',
 # random_state is the documented seed parameter of the scikit-learn wrapper
 random_state=20)

xgb1.fit(xtrain, ytrain)

In [None]:
# calculate the RMSE and accuracy of the updated algorithm
ypred = xgb1.predict(xtest)
mse = mean_squared_error(ytest, ypred)
# RMSE is the square root of the mean squared error
print("RMSE: %.5f" % (mse**(1/2.0)))

# round this model's own predictions, not the list left over from an earlier
# cell; rounding is a no-op when ypred already contains integer labels
predictions = [round(value) for value in ypred]
# score the rounded predictions against the held-out targets
accuracy = accuracy_score(ytest, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# visualise the feature importances of the fitted xgb1 model
plot_importance(booster=xgb1)

In [None]:
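# load 2023 Spring Training data
# clean 2023 Spring Training data
# TODO: this cell must define `dfST`, the cleaned Spring Training frame that
# the prediction cell below passes to the model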

In [None]:
# use model to predict results for 2023 Spring Training data
# NOTE(review): dfST is not defined in any visible cell; presumably it is the
# cleaned Spring Training frame with the same feature columns that xgb1 was
# trained on — confirm before running.
ST_pred = xgb1.predict(dfST)