In [1]:
import pandas as pd
import numpy as np

  from pandas.core import (


In [73]:
df = pd.read_csv('FINAL.csv', low_memory=False).drop(columns='Unnamed: 0')
df.shape

(6638, 284)

In [85]:
test = pd.Series([1,2,-4,7,10,52])
shift = df.SPREAD.mean() - test.mean()
round(test + shift)

0    -8.0
1    -7.0
2   -13.0
3    -2.0
4     1.0
5    43.0
dtype: float64

In [76]:
RM = [col for col in df.columns if "AVG" in col or "MOMENTUM" in col]
len(RM)

198

In [77]:
df = df.dropna(subset=RM)

In [78]:
df = df.dropna(subset=['H_WIN_PCT', 'A_WIN_PCT'])

In [79]:
lower_bound = -36.5
upper_bound = 39.5

# Cap the target variable (e.g., spread) within the defined bounds
df["SPREAD_CAPPED"] = df["SPREAD"].clip(lower=lower_bound, upper=upper_bound)

In [80]:
df.shape

(6194, 285)

In [81]:
features = [col for col in df.columns if "AVG" in col or "MOMENTUM" in col or "WIN_PCT" in col]
len(features)

200

## Dimensionality reduction

In [82]:
corr_matrix = df[features].corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

high_corr_features = [column for column in upper.columns if any(upper[column] > 0.9)]

df = df.drop(columns=high_corr_features)

df.shape

(6194, 245)

In [83]:
features = [col for col in df.columns if "AVG" in col or "MOMENTUM" in col or "WIN_PCT" in col]
len(features)

160

In [36]:
from scipy.stats import zscore

## Normalize numerical features

In [37]:
zscores = np.abs(df[features].apply(zscore))

## Handle outliers

In [38]:
df = df[(zscores < 5).all(axis=1)]

In [39]:
TRAIN = df[df.SEASON_YEAR != '2024-25']
TEST = df[df.SEASON_YEAR == '2024-25']

In [40]:
TRAIN.shape[0] / (TRAIN.shape[0] + TEST.shape[0]) * 100

89.42165101334652

In [41]:
X_train = TRAIN[features]
X_test = TEST[features]

X_train.shape, X_test.shape

((5427, 105), (642, 105))

In [42]:
y_train = TRAIN['SPREAD_CAPPED']
y_test = TEST['SPREAD_CAPPED']

y_train.shape[0], y_test.shape[0]

(5427, 642)

In [43]:
from sklearn.metrics import mean_absolute_error, r2_score

In [44]:
rfr = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=42, n_jobs=-1)
rfr.fit(X_train, y_train)

NameError: name 'RandomForestRegressor' is not defined

In [125]:
y_pred = rfr.predict(X_test)

In [126]:
mae = mean_absolute_error(y_test, y_pred)

In [127]:
print("Mean Absolute Error:", mae)

Mean Absolute Error: 11.449283489096574


## Further dimensionality reduction

In [128]:
feature_importances = pd.Series(rfr.feature_importances_, index=features)
top_features = feature_importances.nlargest(50).index

In [130]:
X_train = X_train[top_features]
X_test = X_test[top_features]

## Baseline Ridge Regression Model

In [133]:
from sklearn.linear_model import Ridge

In [134]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f"Baseline Ridge Regression MAE: {mae:.4f}")

Baseline Ridge Regression MAE: 11.0996


## Alpha parameter tuning

In [135]:
from sklearn.model_selection import GridSearchCV

alpha_values = np.logspace(-3, 3, 7)  # [0.001, 0.01, 0.1, 1, 10, 100, 1000]

ridge_grid = GridSearchCV(Ridge(), {'alpha': alpha_values}, scoring='neg_mean_absolute_error', cv=5)
ridge_grid.fit(X_train, y_train)

best_alpha = ridge_grid.best_params_['alpha']
best_ridge = ridge_grid.best_estimator_
y_pred = best_ridge.predict(X_test)
mae_tuned = mean_absolute_error(y_test, y_pred)

print(f"Best alpha: {best_alpha}")
print(f"Tuned Ridge Regression MAE: {mae_tuned:.4f}")


Best alpha: 10.0
Tuned Ridge Regression MAE: 11.0752


In [136]:
fine_alpha_values = np.linspace(best_alpha / 2, best_alpha * 2, 10)

ridge_grid_fine = GridSearchCV(Ridge(), {'alpha': fine_alpha_values}, scoring='neg_mean_absolute_error', cv=5)
ridge_grid_fine.fit(X_train, y_train)

best_alpha_fine = ridge_grid_fine.best_params_['alpha']
y_pred_fine = ridge_grid_fine.best_estimator_.predict(X_test)
mae_tuned_fine = mean_absolute_error(y_test, y_pred_fine)

print(f"Refined best alpha: {best_alpha_fine}")
print(f"Refined Ridge Regression MAE: {mae_tuned_fine:.4f}")


Refined best alpha: 16.666666666666668
Refined Ridge Regression MAE: 11.0705


## Trying tree models

In [45]:
from xgboost import XGBRegressor

In [46]:
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, objective="reg:absoluteerror", random_state=42)
xgb.fit(X_train, y_train)

In [47]:
y_pred_xgb = xgb.predict(X_test)

In [48]:
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("XGBoost MAE:", mae_xgb)

XGBoost MAE: 11.419951627780959


In [171]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1, 10],
    'reg_lambda': [0, 0.1, 1, 10]
}

# Grid Search
xgb = XGBRegressor(random_state=42)
grid_search = GridSearchCV(xgb, param_grid, scoring='neg_mean_absolute_error', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best model and evaluate
best_xgb = grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
mae_xgb_tuned = mean_absolute_error(y_test, y_pred_xgb)

print(f"Tuned XGBoost MAE: {mae_xgb_tuned:.4f}")
print(f"Best Params: {grid_search.best_params_}")


Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
Tuned XGBoost MAE: 11.2259
Best Params: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 10, 'subsample': 0.8}


In [49]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest MAE: {mae_rf:.4f}")


KeyboardInterrupt: 

## Back to linear regression

In [50]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

ridge_poly = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), Ridge(alpha=best_alpha_fine))
ridge_poly.fit(X_train, y_train)

# Predict and evaluate
y_pred_poly = ridge_poly.predict(X_test)
mae_poly = mean_absolute_error(y_test, y_pred_poly)

print(f"Ridge with Polynomial Features MAE: {mae_poly:.4f}")


NameError: name 'Ridge' is not defined

In [169]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

ridge_interaction = make_pipeline(interaction, Ridge(alpha=best_alpha_fine))
ridge_interaction.fit(X_train, y_train)

# Predict and evaluate
y_pred_interaction = ridge_interaction.predict(X_test)
mae_interaction = mean_absolute_error(y_test, y_pred_interaction)

print(f"Ridge with Interaction Terms MAE: {mae_interaction:.4f}")


Ridge with Interaction Terms MAE: 11.5551


## Optimizing Ridge Regression (SPREAD)

In [271]:
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.decomposition import PCA

In [272]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
coef_magnitudes = np.abs(ridge.coef_)
important_features = np.argsort(coef_magnitudes)[-100:]
X_train_ridge = X_train.iloc[:, important_features]
X_test_ridge = X_test.iloc[:, important_features]

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [273]:
rfe = RFE(Ridge(alpha=1.0), n_features_to_select=50)
rfe.fit(X_train, y_train)
X_train_rfe = X_train.iloc[:, rfe.support_]
X_test_rfe = X_test.iloc[:, rfe.support_]

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [274]:
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [295]:
selector = SelectKBest(f_regression, k=50)
X_train_kbest = selector.fit_transform(X_train, y_train)
X_test_kbest = selector.transform(X_test)

In [296]:
feature_sets = {
    'Ridge Coefficients': (X_train_ridge, X_test_ridge),
    'Recursive Feature Elimination': (X_train_rfe, X_test_rfe),
    'Principal Component Analysis': (X_train_pca, X_test_pca),
    'SelectKBest': (X_train_kbest, X_test_kbest),
}

In [297]:
mae_results = {}

for method, (X_train_fs, X_test_fs) in feature_sets.items():
    param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_absolute_error')
    grid_search.fit(X_train_fs, y_train)
    best_ridge = grid_search.best_estimator_
    y_pred = best_ridge.predict(X_test_fs)
    mae_results[method] = mean_absolute_error(y_test, y_pred)
    print(f"{method} - Best Alpha: {grid_search.best_params_['alpha']}, MAE: {mae_results[method]}")


Ridge Coefficients - Best Alpha: 10, MAE: 10.988955396021392
Recursive Feature Elimination - Best Alpha: 10, MAE: 11.077957803403292
Principal Component Analysis - Best Alpha: 0.01, MAE: 11.098252646040526


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

SelectKBest - Best Alpha: 1, MAE: 10.984891599972451


In [316]:
param_grid = {'alpha': [0.35]}
grid_search = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_kbest, y_train)

# Best model
best_ridge = grid_search.best_estimator_

# Predictions and evaluation
y_pred = best_ridge.predict(X_test_kbest)
mae = mean_absolute_error(y_test, y_pred)

# Display final results
best_ridge_params = grid_search.best_params_
print(f"Final Ridge Alpha: {best_ridge_params['alpha']}")
print(f"Final Mean Absolute Error: {mae}")

Final Ridge Alpha: 0.35
Final Mean Absolute Error: 10.98244153012179
