# College Football Spread Betting Model

## Imports

In [6]:
import sqlite3
import pandas as pd
import numpy as np
import os
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from opponent_adjustments import get_opponent_adjustments
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
import warnings
import math
from tqdm import tqdm
import optuna
import Model_Functions as MF

AttributeError: module 'pandas.errors' has no attribute 'SettingWithCopyWarning'

## Config Inputs

In [4]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable input for the run is set in this cell.
# ---------------------------------------------------------------------------

# --- Data sources ---
DB_PATH = "cfb_data.db"
PRE_GAME_ELO_CSV_PATH = "games_with_pregame_elo.csv"

# --- Returning production (RP) ---
RP_METRICS_TO_USE = ["usage", "percentPPA"]  # RP metrics to load and use
DEFAULT_RP_VALUE = 0.5  # default value for missing RP data (e.g., average)
RP_ACTIVE_WEEKS = 4  # how many weeks RP features should be active

# --- Betting parameters ---
BET_THRESHOLD = 0.5
WIN_PAYOUT = 0.909
LOSS_AMOUNT = 1

# --- EWMA parameters ---
EWMA_SPAN = 5
# Half the span (integer division), but never fewer than one period.
min_periods_for_ewma = max(EWMA_SPAN // 2, 1)

# --- Train / validation / test season boundaries ---
TRAIN_END_SEASON = 2020
VALIDATION_END_SEASON = 2022
# The test window starts the season right after validation ends.
TEST_START_SEASON = 1 + VALIDATION_END_SEASON

# --- XGBoost ---
# Reasonable defaults / lightly tuned values. Hyperparameters are NOT tuned
# with these settings; they are only used to compare candidate feature sets.
XGB_PARAMS = dict(
    objective="reg:squarederror",  # regression task
    eval_metric="rmse",            # evaluation metric for XGBoost internal use
    eta=0.1,                       # learning rate
    max_depth=5,                   # max tree depth (controls complexity)
    subsample=0.8,                 # fraction of samples used per tree
    colsample_bytree=0.8,          # fraction of features used per tree
    seed=42,
    nthread=-1,                    # use all available CPU threads
    device="cuda",
    # Original note: when using non-imputed data, a 'missing': np.nan entry
    # could be added here to enable XGBoost's internal NaN handling.
    # NOTE(review): confirm where `missing` must be supplied before enabling.
)
NUM_BOOST_ROUNDS = 100  # number of boosting rounds (trees)

# Columns required by the evaluation step.
VAL_REQUIRED_COLS = [
    "avg_opening_spread",
    "home_points",
    "away_points",
    "neutral_site",
    "id",
    "season",
    "week",
    "home_team",
    "away_team",
    "home_pregame_elo_calc",
    "away_pregame_elo_calc",
]

In [None]:
# RUN IF USING GOOGLE COLAB
# Invokes the Colab mount helper from Model_Functions; skip this cell when
# running outside Colab.
# NOTE(review): presumably mounts storage so DB_PATH / the ELO CSV resolve —
# confirm against Model_Functions.mount_with_colab.
MF.mount_with_colab()

## Phase 1: Data Foundation and Feature Engineering

In [5]:
# Build `master_df`, the single game-level frame used by every later cell.
# Each step below rebinds `master_df`, so the statements must run in this order.

# Load and Pre-Process Games Data
games_df = MF.preprocess_games_data(MF.load_games_data(DB_PATH))
# Load and pre-process Returning Production (RP) data for the configured metrics
rp_df = MF.preprocess_returning_prod_data(MF.load_returning_prod_data(DB_PATH, RP_METRICS_TO_USE))
# Load Pre-Calculated ELO Ratings
pre_game_elo_df = MF.load_ELO_ratings(PRE_GAME_ELO_CSV_PATH)
# Merge Games and ELO Data
master_df = MF.merge_elo_to_games(games_df, pre_game_elo_df)
# Merge Returning Production to Games (DEFAULT_RP_VALUE is passed for missing RP data)
master_df = MF.merge_returning_production_to_games(master_df, rp_df, RP_METRICS_TO_USE, DEFAULT_RP_VALUE)
# Add Opponent Adjustments to the Master DF
master_df = MF.add_opponent_adjustments(master_df)
# Drop Missing Targets and Sort Chronologically
master_df = MF.drop_missing_target_sort_chronologically(master_df)

NameError: name 'MF' is not defined

In [None]:
# Sanity-check the consolidated frame: schema summary first, then the first
# and last rows of a narrow column subset.
print("\n--- Master DataFrame Info (Now includes all stats) ---")
master_df.info()  # lists every column now present on the frame

# The full frame may be too wide to print usefully, so preview key columns only.
_preview_columns = [
    'id', 'season', 'week', 'home_team', 'away_team',
    'avg_closing_spread', 'avg_opening_spread',
    'home_pregame_elo_calc', 'away_pregame_elo_calc',
]

print("\n--- Master DataFrame Head (Key Columns) ---")
print(master_df[_preview_columns].head())

# The tail is printed to check the sort order.
print("\n--- Master DataFrame Tail (Check Sorting - Key Columns) ---")
print(master_df[_preview_columns].tail())

In [None]:
# Feature engineering: roll team stats into lagged EWMAs, merge them back onto
# the game-level frame, then derive matchup and returning-production features.
# `master_df` is rebound at several steps, so statement order matters.

# Define Target Variable and Basic Features
target_variable, basic_features, master_df = MF.define_target_variable_basic_features(master_df)
# Identify Stats to Roll
stats_to_roll = MF.identify_stats_to_roll(EWMA_SPAN)
# Reshape data to team-centric format
team_game_df = MF.reshape_to_team_centric(master_df, stats_to_roll)
# Calculate Lagged EWMAs (span and minimum periods come from the config cell)
team_game_df, ewma_cols_generated = MF.calculate_lagged_ewma(team_game_df, stats_to_roll, EWMA_SPAN, min_periods_for_ewma)
# Merge Back to Master DF
master_df = MF.merge_ewma_to_master_df(master_df, team_game_df, ewma_cols_generated)
# Create Matchup Features
master_df = MF.create_matchup_features(master_df, stats_to_roll)
# Create Returning Production Features
# Note: this call also rebinds `basic_features` and produces `potential_features`,
# the feature list used by the cells that follow.
master_df, potential_features, basic_features = MF.create_returning_prod_features(master_df, RP_METRICS_TO_USE, RP_ACTIVE_WEEKS)

In [None]:
# Quantify missing data across the candidate feature columns.
# `potential_features` is produced by the feature-engineering cell. If it is
# not available, it can be rebuilt like this:
#   all_engineered_features = [col for col in master_df.columns if '_ewma_lag1' in col or 'matchup_' in col]
#   potential_features = basic_features + all_engineered_features

print(f"Checking missing values for {len(potential_features)} potential features...")

# Share of null entries per feature, largest first, expressed as a percentage.
missing_summary = (
    master_df[potential_features]
    .isna()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
)
# Keep only the features that actually have missing entries.
missing_summary = missing_summary.loc[missing_summary.gt(0)]

print("\nFeatures with Missing Values (%):")
if not missing_summary.empty:
    # Lift the row cap so every affected feature is printed.
    with pd.option_context('display.max_rows', None):
        print(missing_summary)
else:
    print("No missing values found in the potential feature set.")

In [None]:
# Drop FCS Games
# Rebinds `master_df` to the frame returned by the helper; every later cell
# works on that result.
master_df = MF.drop_fcs_games(master_df)

## Phase 2: Feature Selection and Training

In [None]:
# Feature-selection pipeline: split by season, filter and rank features, build
# candidate feature sets, then evaluate each set with the fixed XGB_PARAMS.

# Prepare Data and do Temporal Split (boundaries come from TRAIN_END_SEASON / VALIDATION_END_SEASON)
y_train, X_train, y_val, X_val, X_train_analysis, X_val_analysis, val_df, train_df = MF.temporal_split(TRAIN_END_SEASON, VALIDATION_END_SEASON, master_df, target_variable, potential_features)
# Do Initial Filtering
current_features = MF.perform_initial_filtering(X_train_analysis, X_train, train_df)
# Perform the Correlation Analysis
features_to_consider_dropping_corr, correlations_abs = MF.perform_target_correlation_analysis(X_train_analysis, y_train, current_features, target_variable)
# Perform Model Based Importance Analysis
features_after_initial_analysis, feature_importance_df = MF.perform_model_based_importance(X_train, X_train_analysis, y_train, current_features)
# Define Candidate Feature Sets
candidate_feature_sets = MF.define_candidate_feature_sets(basic_features, features_after_initial_analysis, features_to_consider_dropping_corr, feature_importance_df, correlations_abs)
# Run Feature Set Evaluation (uses the untuned XGB_PARAMS and NUM_BOOST_ROUNDS from the config cell)
all_results = MF.run_feature_set_evaluation(candidate_feature_sets, XGB_PARAMS, NUM_BOOST_ROUNDS, X_train, y_train, X_val, y_val, val_df)
# Present Feature Set Evaluation Results
results_df = MF.present_feature_set_evaluation_results(all_results)

## Phase 3: Model Selection and Hyperparameter Tuning

In [None]:
# Select Best Feature Set
# Picks from `candidate_feature_sets` using the evaluation results in
# `results_df`; the chosen list is reused for tuning and the final model.
best_features = MF.select_best_feature_set(results_df, candidate_feature_sets, X_train, X_val)

In [None]:
# Run Optuna Study - LONG RUNNING CELL
# The study receives the train and validation splits, the selected features,
# and the betting parameters (WIN_PAYOUT, LOSS_AMOUNT, BET_THRESHOLD).
# NOTE(review): presumably the objective scores betting performance on the
# validation set — confirm in Model_Functions.run_optuna_study.
study_hp = MF.run_optuna_study(X_train, X_val, y_train, y_val, val_df, best_features, WIN_PAYOUT, LOSS_AMOUNT, BET_THRESHOLD)

In [None]:
# Identify Best Hyperparameters
# Derived from the finished Optuna study; passed to the final model training
# in Phase 4.
best_xgb_params = MF.identify_best_hyperparameters(study_hp)

## Phase 4: Final Evaluation

In [None]:
# Final evaluation pipeline: split off the test set, train on the rest with
# the tuned hyperparameters, then predict the test set.

# Define Train, Validation, and Test sets
train_val_df, test_df = MF.define_train_val_test_sets(master_df, VALIDATION_END_SEASON, TEST_START_SEASON)
# Prepare Data for Final Model
X_train_val_nan, y_train_val, X_test_nan, y_test, dtrain_val, dtest = MF.prepare_data_for_final_model(train_val_df, test_df, best_features, target_variable)
# Train Final XGBoost Model
# NOTE(review): `dtest` is passed into training. Confirm it is used only for
# monitoring and not for early stopping or model selection, which would leak
# test information into the final model.
final_model = MF.train_final_model(best_xgb_params, best_features, dtrain_val, dtest)
# Predict on the Test Set
predictions_test_series, y_pred_test = MF.predict_test_set(final_model, dtest, y_test)

In [None]:
# Evaluate Statistical Metrics on Test Set
# The return value is not captured here.
# Note: VAL_REQUIRED_COLS (named for validation) is reused as the required
# column list for `test_df`.
MF.evaluate_model_statistics(y_test, y_pred_test, predictions_test_series, test_df, VAL_REQUIRED_COLS)