In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import time
import numpy as np

# Location of the processed matchup table.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
MATCHUPS_CSV = "/Users/ryansteele/my_repo2/data/processed/matchups/matchups_all_seasons.csv"

# Regression target. Rows where it is missing are dropped because they can
# neither be trained on nor scored.
target = "point_diff"
df = pd.read_csv(MATCHUPS_CSV).dropna(subset=[target])

# Model inputs, declared in small groups and concatenated in a fixed order.
# The group variable names are descriptive labels only; the resulting column
# order is what the models actually see.
elo_cols = [
    "home_elo_pre",
    "away_elo_pre",
    "diff_elo_pre",
]
misc_stat_cols = [
    "passingattempts",
    "rushingyardsperattempt",
    "quarterbackhits",
    "opp_rolling_win_rate_5",
    "timessackedyards",
    "opponentpenaltyyards",
]
rolling_3_cols = [
    "rolling_yards_total_3",
    "rolling_points_for_3",
    "rolling_points_against_3",
    "opp_rolling_yards_total_3",
    "opp_rolling_points_for_3",
    "opp_rolling_points_against_3",
]
epa_success_cols = [
    "off_epa_per_play_rolling_3",
    "off_dropback_epa_rolling_3",
    "def_epa_per_play_rolling_3",
    "def_dropback_epa_against_rolling_3",
    "def_success_rate_rolling_3",
    "def_rush_epa_against_rolling_3",
    "off_success_rate_rolling_3",
    "off_rush_epa_rolling_3",
    "epa_off_diff_rolling_3",
    "epa_def_diff_rolling_3",
    "off_epa_per_play_rolling_5",
    "def_epa_per_play_rolling_5",
]
feature_cols = elo_cols + misc_stat_cols + rolling_3_cols + epa_success_cols

X, y = df[feature_cols], df[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): this is a shuffled split. If rows are ordered by date/season
# and the rolling/Elo features look backwards in time, a chronological split
# may give a more honest estimate — confirm against how the CSV is built.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
def evaluate(model, X_val, y_val, train_time):
    """Score a fitted regressor on the validation set.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        An already-fitted estimator.
    X_val : feature matrix passed straight to ``model.predict``.
    y_val : true target values; must support ``> 0`` and ``.astype``
        (e.g. a pandas Series or NumPy array).
    train_time : value reported unchanged under ``"Train Time (s)"``.

    Returns
    -------
    dict
        Keys ``"MAE"``, ``"RMSE"``, ``"R2"``, ``"Win Accuracy"`` and
        ``"Train Time (s)"``.

    Notes
    -----
    "Win Accuracy" reduces both the true and the predicted target to a
    binary label (1 when the value is strictly positive, otherwise 0) and
    reports how often the two labels agree. A true value of exactly 0 is
    therefore labelled 0.
    """
    predicted = model.predict(X_val)

    # Sign of the margin -> binary outcome, for both truth and prediction.
    actual_outcome = (y_val > 0).astype(int)
    predicted_outcome = (predicted > 0).astype(int)

    metrics = {}
    metrics["MAE"] = mean_absolute_error(y_val, predicted)
    # RMSE is derived from MSE so it is expressed in the target's own units.
    metrics["RMSE"] = np.sqrt(mean_squared_error(y_val, predicted))
    metrics["R2"] = r2_score(y_val, predicted)
    metrics["Win Accuracy"] = accuracy_score(actual_outcome, predicted_outcome)
    metrics["Train Time (s)"] = train_time
    return metrics


In [17]:
# Gradient-boosted tree regressor for the point-differential target.
# Hyperparameters are fixed here (no search); random_state makes the
# row/column subsampling reproducible.
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=34,
)

# Time only the fit call. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals; time.time() is wall-clock
# time and can jump if the system clock is adjusted.
start = time.perf_counter()
xgb_model.fit(X_train, y_train)
xgb_time = time.perf_counter() - start

xgb_results = evaluate(xgb_model, X_val, y_val, xgb_time)

In [18]:
# Random forest baseline for the same target. Trees are grown to full depth
# (max_depth=None) and built in parallel on all available cores (n_jobs=-1);
# random_state makes the bootstrap/feature sampling reproducible.
rf_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=34,
    n_jobs=-1,
)

# Time only the fit call. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals; time.time() is wall-clock
# time and can jump if the system clock is adjusted.
start = time.perf_counter()
rf_model.fit(X_train, y_train)
rf_time = time.perf_counter() - start

rf_results = evaluate(rf_model, X_val, y_val, rf_time)


In [19]:
# One row per model: the display label followed by that model's metric dict.
labelled_results = [
    ("XGBoost", xgb_results),
    ("Random Forest", rf_results),
]
comparison = pd.DataFrame(
    [{"Model": label, **metrics} for label, metrics in labelled_results]
)

# Bare last expression so the notebook renders the table.
comparison


Unnamed: 0,Model,MAE,RMSE,R2,Win Accuracy,Train Time (s)
0,XGBoost,10.996313,13.617373,0.123517,0.605911,0.305751
1,Random Forest,11.060421,13.775733,0.103013,0.571429,0.939565


### Model Comparison: XGBoost vs. Random Forest

Though both models performed well, XGBoost performed slightly better across all
evaluation metrics (lower MAE and RMSE, higher R² and win accuracy) and had a
significantly shorter train time (about 0.31 s vs. 0.94 s).