In [1]:
import Model_Functions as MF
import Datapull_Functions as DP
import json

# API Setup
BASE_URL = 'https://api.collegefootballdata.com/'
# The key is read from a local text file rather than being written into the
# notebook. Surrounding whitespace is stripped because a trailing newline
# (added by most editors and by `echo`) would otherwise become part of the key.
with open("API_KEY.txt", "r") as file:
    API_KEY = file.read().strip()
# Database file handed to the MF loader functions.
DB_PATH = "cfb_data.db"

# Config - Major Inputs
PRE_GAME_ELO_CSV_PATH = 'games_with_pregame_elo.csv'

# Returning-production (RP) metrics to load and use
RP_METRICS_TO_USE = ['usage', 'percentPPA']
# Default value substituted for missing RP data (e.g., an average)
DEFAULT_RP_VALUE = 0.5
# Number of weeks for which the RP features are active
RP_ACTIVE_WEEKS = 4

# Betting parameters
BET_THRESHOLD = 1.5
WIN_PAYOUT = 0.909
LOSS_AMOUNT = 1

# EWMA parameters: the minimum period count is half the span, never below 1
EWMA_SPAN = 5
min_periods_for_ewma = max(EWMA_SPAN // 2, 1)

# Train / validation / test split years; testing starts the season after
# validation ends
TRAIN_END_SEASON = 2020
VALIDATION_END_SEASON = 2022
TEST_START_SEASON = 1 + VALIDATION_END_SEASON

# XGBoost parameters (reasonable defaults or lightly tuned values).
# Hyperparameters are NOT tuned here; the goal is to evaluate feature sets.
# To enable internal NaN handling on non-imputed data, a 'missing': np.nan
# entry could be added.
XGB_PARAMS = dict(
    objective='reg:squarederror',  # regression task
    eval_metric='rmse',            # metric for XGBoost internal use
    eta=0.1,                       # learning rate
    max_depth=5,                   # max tree depth (controls complexity)
    subsample=0.8,                 # fraction of samples used per tree
    colsample_bytree=0.8,          # fraction of features used per tree
    seed=42,
    nthread=-1,                    # use all available CPU threads
    device='cuda',
)
NUM_BOOST_ROUNDS = 100  # number of boosting rounds (trees)

# Columns required for validation
VAL_REQUIRED_COLS = [
    'avg_opening_spread',
    'home_points',
    'away_points',
    'neutral_site',
    'id',
    'season',
    'week',
    'home_team',
    'away_team',
    'home_pregame_elo_calc',
    'away_pregame_elo_calc',
]

In [3]:
# Feature pipeline: each step hands its result to the next MF helper, ending
# with the master_df used for prediction. What each helper does internally is
# defined in Model_Functions, not in this notebook.

# Games: load from the database, then pre-process
raw_games_df = MF.load_games_data(DB_PATH)
games_df = MF.preprocess_games_data(raw_games_df)

# Returning production: load the configured metrics, then pre-process
raw_rp_df = MF.load_returning_prod_data(DB_PATH, RP_METRICS_TO_USE)
rp_df = MF.preprocess_returning_prod_data(raw_rp_df, RP_METRICS_TO_USE, DEFAULT_RP_VALUE)

# Pre-calculated pre-game Elo ratings
pre_game_elo_df = MF.load_ELO_ratings(PRE_GAME_ELO_CSV_PATH)

# Build the master frame: games + Elo, then returning production
master_df = MF.merge_elo_to_games(games_df, pre_game_elo_df)
master_df = MF.merge_returning_production_to_games(
    master_df, rp_df, RP_METRICS_TO_USE, DEFAULT_RP_VALUE
)

# Opponent adjustments, then drop rows without a target and sort chronologically
master_df = MF.add_opponent_adjustments(master_df)
master_df = MF.drop_missing_target_sort_chronologically(master_df)

# Target variable and basic features (basic_features is reassigned further down)
target_variable, basic_features, master_df = MF.define_target_variable_basic_features(master_df)

# Lagged EWMAs: pick the stats, reshape to one row per team-game, compute,
# and merge the generated columns back onto master_df
stats_to_roll = MF.identify_stats_to_roll(EWMA_SPAN)
team_game_df = MF.reshape_to_team_centric(master_df, stats_to_roll)
team_game_df, ewma_cols_generated = MF.calculate_lagged_ewma(
    team_game_df, stats_to_roll, EWMA_SPAN, min_periods_for_ewma
)
master_df = MF.merge_ewma_to_master_df(master_df, team_game_df, ewma_cols_generated)

# Matchup features and returning-production features
master_df = MF.create_matchup_features(master_df, stats_to_roll)
master_df, potential_features, basic_features = MF.create_returning_prod_features(
    master_df, RP_METRICS_TO_USE, RP_ACTIVE_WEEKS
)

# Drop FCS games
master_df = MF.drop_fcs_games(master_df)

Connecting to database: cfb_data.db
Loading ALL games data (including advanced stats)...
Loaded 10287 completed games with 142 columns.
Database connection closed.
Converting relevant columns to numeric...
Missing value check (post-numeric conversion):
  Column 'avg_closing_spread' missing: 0.93%
  Column 'homePoints' missing: 0.00%
  Column 'awayPoints' missing: 0.00%
Loading returning production data (['usage', 'percentPPA'])...
Loaded 1420 returning production records.
Database connection closed.
Preprocessing returning production data...
Filling NaNs in RP data with default: 0.5
Loading pre-game Elo ratings from: games_with_pregame_elo.csv
Loaded Elo ratings for 10287 games.
Loaded pre-game Elo ratings.
Merging games data with pre-game Elo ratings...
Merging returning production data...
Filling NaNs in merged RP column 'home_rp_usage' with 0.5
Filling NaNs in merged RP column 'home_rp_percentPPA' with 0.5
Filling NaNs in merged RP column 'away_rp_usage' with 0.5
Filling NaNs in mer

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df[col].fillna(DEFAULT_RP_VALUE, inplace=True)



Creating hybrid league average columns...

Calculating lagged expanding team averages...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[prev_season_col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[rolling_avg_col].fillna(0, inplace=True) # Fill any remaining NaNs
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Filling initial team NaNs with 0.
DataFrame shape after revised merging: (10287, 298)

Applying Opponent Adjustments with Hybrid League Average...

Hybrid Opponent Adjustments Applied.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  team_centric_df[col].fillna(0, inplace=True)


Dropping potentially duplicated columns: ['id_y', 'seasonType_y', 'completed_y', 'neutralSite_y', 'conferenceGame_y', 'attendance_y', 'homeConference_y', 'homeClassification_y', 'homePoints_y', 'homePostgameWinProbability_y', 'homePregameElo_y', 'homePostgameElo_y', 'awayConference_y', 'awayClassification_y', 'awayPoints_y', 'awayPostgameWinProbability_y', 'awayPregameElo_y', 'awayPostgameElo_y', 'avg_closing_spread_y', 'avg_closing_total_y', 'avg_opening_spread_y', 'avg_opening_total_y', 'home_offense_plays_y', 'home_offense_drives_y', 'home_offense_ppa_y', 'home_offense_totalPPA_y', 'home_offense_successRate_y', 'home_offense_explosiveness_y', 'home_offense_powerSuccess_y', 'home_offense_stuffRate_y', 'home_offense_lineYards_y', 'home_offense_lineYardsTotal_y', 'home_offense_secondLevelYards_y', 'home_offense_secondLevelYardsTotal_y', 'home_offense_openFieldYards_y', 'home_offense_openFieldYardsTotal_y', 'home_offense_standardDowns_ppa_y', 'home_offense_standardDowns_successRate_y', 

In [4]:
import pandas as pd
import numpy as np
import json
import xgboost as xgb
import os
from IPython.display import display, HTML # For pretty printing later

# --- Configuration: artifact files read by this notebook ---
MODEL_FILENAME = "final_model.json"
BEST_FEATURES_FILENAME = "final_model_best_features.json"

# --- Configuration: prediction target ---
# Season of master_df to generate predictions for.
PREDICT_SEASON = 2024
# Optional single-week filter (currently disabled):
# PREDICT_WEEK = 5 # Set to None to predict all future games in PREDICT_SEASON

for banner_line in (
    "--- Prediction Notebook Setup ---",
    f"Model Path:     {MODEL_FILENAME}",
    f"Features Path:  {BEST_FEATURES_FILENAME}",
):
    print(banner_line)

# --- Feature list: JSON file holding the feature names ---
if os.path.exists(BEST_FEATURES_FILENAME):
    with open(BEST_FEATURES_FILENAME, 'r') as features_file:
        best_features = json.load(features_file)
else:
    raise FileNotFoundError(f"Best features file not found: {BEST_FEATURES_FILENAME}")
print(f"Successfully loaded {len(best_features)} feature names from {BEST_FEATURES_FILENAME}.")

# --- Trained XGBoost model: start from an empty Booster and load the file into it ---
if os.path.exists(MODEL_FILENAME):
    model = xgb.Booster()
    model.load_model(MODEL_FILENAME)
else:
    raise FileNotFoundError(f"Trained XGBoost model file not found: {MODEL_FILENAME}")
print(f"Successfully loaded trained XGBoost model from {MODEL_FILENAME}.")

--- Prediction Notebook Setup ---
Model Path:     final_model.json
Features Path:  final_model_best_features.json
Successfully loaded 92 feature names from final_model_best_features.json.
Successfully loaded trained XGBoost model from final_model.json.


In [5]:
# master_df comes from the feature-pipeline cell above.

# --- Select the games to predict ---
# Every master_df row whose season equals PREDICT_SEASON is kept; no filter on
# completion status or week is applied here.
season_mask = master_df['season'] == PREDICT_SEASON
games_to_predict_df = master_df.loc[season_mask].copy()

# Optional week filter (disabled):
# if PREDICT_WEEK is not None and 'week' in games_to_predict_df.columns:
#     games_to_predict_df = games_to_predict_df[games_to_predict_df['week'] == PREDICT_WEEK].copy()
#     print(f"Filtered for Season {PREDICT_SEASON}, Week {PREDICT_WEEK}.")

if not games_to_predict_df.empty:
    print(f"Found {len(games_to_predict_df)} games from Season {PREDICT_SEASON} to predict.")

    # --- Feature selection ---
    # Fail early, naming the offending columns, if any required feature is absent.
    available_columns = games_to_predict_df.columns
    missing_model_features = [name for name in best_features if name not in available_columns]
    if missing_model_features:
        raise ValueError(f"The following features required by the model are missing from master_df: {missing_model_features}")

    # Independent copy so later edits cannot touch games_to_predict_df.
    X_predict = games_to_predict_df.loc[:, best_features].copy()

    # --- NaN report ---
    # Nothing is imputed here; NaNs are only counted and reported. If training
    # used explicit imputation, the same values would have to be applied at
    # this point.
    nan_in_pred_features = X_predict.isna().sum().sum()
    if nan_in_pred_features == 0:
        print("No NaNs found in the selected features for prediction.")
    else:
        print(f"Warning: {nan_in_pred_features} NaN values found in features for prediction. XGBoost will attempt to handle.")

    # --- XGBoost input ---
    dpredict = xgb.DMatrix(X_predict)
    print("DMatrix for prediction created.")
else:
    # Nothing to predict; dpredict is intentionally left undefined.
    print(f"No games found in master_df for Season {PREDICT_SEASON} (and specified week, if any) to predict.")

Found 798 games from Season 2024 to predict.
No NaNs found in the selected features for prediction.
DMatrix for prediction created.


In [7]:
# Predict spreads for the selected games, derive a suggested bet from the gap
# between the prediction and the opening line, grade each bet against the
# final score where one exists, and render the result as a styled HTML table.
if not games_to_predict_df.empty:
    print("\n--- Making Predictions ---")
    predicted_spreads = model.predict(dpredict)
    print("Predictions complete.")

    # --- Prepare Results Table ---
    # This cell is written against camelCase game columns. The column dump at
    # the end of the notebook shows snake_case names, so when only the
    # snake_case variant is present it is renamed on a copy; frames that already
    # carry the camelCase names are used unchanged.
    column_aliases = {
        'homeTeam': 'home_team',
        'awayTeam': 'away_team',
        'homePoints': 'home_points',
        'awayPoints': 'away_points',
    }
    alias_renames = {
        snake_name: camel_name
        for camel_name, snake_name in column_aliases.items()
        if camel_name not in games_to_predict_df.columns
        and snake_name in games_to_predict_df.columns
    }
    games_for_display_df = games_to_predict_df.rename(columns=alias_renames)

    display_cols_base = ['id', 'season', 'week', 'homeTeam', 'awayTeam',
                         'homePoints', 'awayPoints']
    has_opening_spread = 'avg_opening_spread' in games_for_display_df.columns
    if has_opening_spread:
        display_cols_base.append('avg_opening_spread')

    results_display_df = games_for_display_df[display_cols_base].copy()
    results_display_df['predicted_closing_spread'] = predicted_spreads
    results_display_df['predicted_closing_spread'] = results_display_df['predicted_closing_spread'].round(2)

    # --- Model lean and suggested bet side ---
    if has_opening_spread:
        results_display_df['avg_opening_spread'] = pd.to_numeric(results_display_df['avg_opening_spread'], errors='coerce')
        results_display_df['model_lean_vs_open'] = (
            results_display_df['predicted_closing_spread'] - results_display_df['avg_opening_spread']
        ).round(2)
        results_display_df['abs_model_vs_open_discrepancy'] = results_display_df['model_lean_vs_open'].abs()

        # A lean below -BET_THRESHOLD suggests the home team, above +BET_THRESHOLD
        # the away team. NaN leans compare False on both sides and fall to the default.
        model_lean = results_display_df['model_lean_vs_open']
        edge_conditions = [model_lean < -BET_THRESHOLD, model_lean > BET_THRESHOLD]

        # Team NAME to bet on (for display)
        results_display_df['suggested_bet_on_team'] = np.select(
            edge_conditions,
            [results_display_df['homeTeam'], results_display_df['awayTeam']],
            default='No Edge'
        )
        # Bet SIDE ('home' or 'away') used for grading; None means no bet
        results_display_df['suggested_bet_side_for_grading'] = np.select(
            edge_conditions,
            ['home', 'away'],
            default=None
        )

        # Games without an opening line can never produce a bet
        mask_no_open = results_display_df['avg_opening_spread'].isna()
        results_display_df.loc[mask_no_open, 'model_lean_vs_open'] = np.nan
        results_display_df.loc[mask_no_open, 'suggested_bet_on_team'] = 'No Opening Line'
        results_display_df.loc[mask_no_open, 'suggested_bet_side_for_grading'] = None

        # Largest disagreement with the opening line first
        results_display_df = results_display_df.sort_values(
            by='abs_model_vs_open_discrepancy', ascending=False, na_position='last'
        )
    else:
        results_display_df['model_lean_vs_open'] = "N/A"
        results_display_df['suggested_bet_on_team'] = "N/A"
        results_display_df['suggested_bet_side_for_grading'] = None
        results_display_df['abs_model_vs_open_discrepancy'] = np.nan
        results_display_df = results_display_df.sort_values(by=['week', 'homeTeam'])

    # --- Grade Bets and Determine Actual Outcome ---
    # The margin is computed per game and is NaN wherever a score is missing,
    # so games that have been played are graded even if others have not.
    away_points = pd.to_numeric(results_display_df['awayPoints'], errors='coerce')
    home_points = pd.to_numeric(results_display_df['homePoints'], errors='coerce')
    results_display_df['actual_margin_away'] = away_points - home_points
    results_display_df['bet_result'] = "N/A (No Bet Placed/Scores)" # Default

    scored_mask = results_display_df['actual_margin_away'].notna()
    if scored_mask.any():
        print("Grading bets based on available scores...")
        if not scored_mask.all():
            print(f"{int((~scored_mask).sum())} game(s) have no final score yet and are left ungraded.")

        actual_margin = results_display_df['actual_margin_away']
        if has_opening_spread:
            opening_spread = results_display_df['avg_opening_spread']
        else:
            # No opening line column at all: nothing can be graded against a line
            opening_spread = pd.Series(np.nan, index=results_display_df.index)
        bet_side = results_display_df['suggested_bet_side_for_grading']

        bet_placed = bet_side.notna().to_numpy()
        gradable = bet_placed & opening_spread.notna().to_numpy() & actual_margin.notna().to_numpy()
        on_away = (bet_side == 'away').to_numpy()
        on_home = (bet_side == 'home').to_numpy()
        # The away side beats the line when the away margin exceeds the spread;
        # the home side beats it when the away margin falls short of the spread.
        away_beat_line = (actual_margin > opening_spread).to_numpy()
        home_beat_line = (actual_margin < opening_spread).to_numpy()

        # np.select takes the first matching condition, so order matters:
        # the no-bet and ungradable rows are filtered out before WIN/LOSS/PUSH.
        results_display_df['bet_result'] = np.select(
            [
                ~bet_placed,
                ~gradable,
                (on_away & away_beat_line) | (on_home & home_beat_line),
                (on_away & home_beat_line) | (on_home & away_beat_line),
                on_away | on_home,
            ],
            [
                "No Bet (No Edge)",
                "N/A (Missing Line/Score)",
                'WIN',
                'LOSS',
                'PUSH',
            ],
            default="N/A"
        )
    else:
        print("No final scores available; 'bet_result' will be 'N/A (No Bet Placed/Scores)'.")

    print(f"\n--- Predictions & Outcomes for Season {PREDICT_SEASON} ---")

    # --- Create "Pretty Table" ---
    def highlight_discrepancy_pred_outcome(val):
        """Return a green background for discrepancies above BET_THRESHOLD, else no style."""
        if pd.isna(val):
            return ''
        return 'background-color: lightgreen' if val > BET_THRESHOLD else ''

    def color_bet_result(val):
        """Return the background colour for a WIN / LOSS / PUSH cell, else no style."""
        if val == 'WIN':
            return 'background-color: palegreen'
        elif val == 'LOSS':
            return 'background-color: lightcoral'
        elif val == 'PUSH':
            return 'background-color: lightyellow'
        return ''

    # Columns shown in the styled table
    styled_table_cols = ['homeTeam', 'awayTeam']
    if has_opening_spread:
        styled_table_cols.extend(['avg_opening_spread', 'model_lean_vs_open', 'abs_model_vs_open_discrepancy'])
    styled_table_cols.extend(['predicted_closing_spread', 'suggested_bet_on_team', 'bet_result'])
    styled_table_cols.extend(['homePoints', 'awayPoints'])
    styled_table_cols = [col for col in styled_table_cols if col in results_display_df.columns]

    styled_df_outcome = results_display_df[styled_table_cols].style.set_properties(**{
        'border': '1px solid black', 'width': 'auto', 'text-align': 'center'
    }).set_caption(f"Season {PREDICT_SEASON} Predictions & Outcomes").format({
        'avg_opening_spread': "{:+.1f}".format,
        'predicted_closing_spread': "{:+.1f}".format,
        'model_lean_vs_open': "{:+.2f}".format,
        'abs_model_vs_open_discrepancy': "{:.2f}".format,
        'homePoints': "{:.0f}".format, # No decimals for scores
        'awayPoints': "{:.0f}".format
    }, na_rep="-")

    # Styler.applymap is deprecated in favour of Styler.map; use map when the
    # installed pandas provides it and fall back to applymap otherwise.
    elementwise_method = 'map' if hasattr(styled_df_outcome, 'map') else 'applymap'

    # Guard on the columns actually present in the styled table, not on the
    # wider results frame, so the subset never names an absent column.
    if 'abs_model_vs_open_discrepancy' in styled_table_cols:
        styled_df_outcome = getattr(styled_df_outcome, elementwise_method)(
            highlight_discrepancy_pred_outcome, subset=['abs_model_vs_open_discrepancy']
        )
    if 'bet_result' in styled_table_cols:
        styled_df_outcome = getattr(styled_df_outcome, elementwise_method)(
            color_bet_result, subset=['bet_result']
        )

    # Render the styled table as HTML in the cell output
    display(HTML(styled_df_outcome.to_html(escape=False)))

else:
    print("No predictions to display as no games were found for the specified criteria.")


--- Making Predictions ---
Predictions complete.
Grading bets based on available scores...

--- Predictions & Outcomes for Season 2024 ---


  styled_df_outcome = styled_df_outcome.applymap(highlight_discrepancy_pred_outcome, subset=['abs_model_vs_open_discrepancy'])
  styled_df_outcome = styled_df_outcome.applymap(color_bet_result, subset=['bet_result'])


Unnamed: 0,homeTeam,awayTeam,avg_opening_spread,model_lean_vs_open,abs_model_vs_open_discrepancy,predicted_closing_spread,suggested_bet_on_team,bet_result,homePoints,awayPoints
8383,Miami,Ball State,-36.7,20.86,20.86,-15.8,Ball State,LOSS,62,0
8344,New Mexico State,Liberty,21.3,-20.21,20.21,1.1,New Mexico State,WIN,24,30
8404,Fresno State,New Mexico State,-19.7,18.05,18.05,-1.6,New Mexico State,LOSS,48,0
8376,Oregon State,Oregon,19.0,-17.67,17.67,1.3,Oregon State,LOSS,14,49
8268,Florida,Tulane,-12.5,17.57,17.57,5.1,Tulane,LOSS,33,8
8317,Michigan,Texas,5.7,-16.02,16.02,-10.4,Michigan,LOSS,12,31
8435,South Florida,Miami,17.5,-15.66,15.66,1.8,South Florida,LOSS,15,50
8354,Memphis,Troy,-17.3,15.57,15.57,-1.8,Troy,LOSS,38,17
8246,Syracuse,Ohio,-17.2,15.4,15.4,-1.9,Ohio,WIN,38,22
8355,Florida Atlantic,Army,-3.2,14.78,14.78,11.6,Army,WIN,7,24


In [9]:
# Debug aid: list every column currently present in master_df.
master_df_columns = master_df.columns.tolist()
print(master_df_columns)

['id', 'season', 'week', 'season_type', 'completed', 'neutral_site', 'conference_game', 'attendance', 'home_team', 'home_conference', 'home_division', 'home_points', 'home_post_win_prob', 'home_pregame_elo', 'home_postgame_elo', 'away_team', 'away_conference', 'away_division', 'away_points', 'away_post_win_prob', 'away_pregame_elo', 'away_postgame_elo', 'avg_closing_spread', 'avg_closing_total', 'avg_opening_spread', 'avg_opening_total', 'home_offense_plays', 'home_offense_drives', 'home_offense_ppa', 'home_offense_totalPPA', 'home_offense_successRate', 'home_offense_explosiveness', 'home_offense_powerSuccess', 'home_offense_stuffRate', 'home_offense_lineYards', 'home_offense_lineYardsTotal', 'home_offense_secondLevelYards', 'home_offense_secondLevelYardsTotal', 'home_offense_openFieldYards', 'home_offense_openFieldYardsTotal', 'home_offense_standardDowns_ppa', 'home_offense_standardDowns_successRate', 'home_offense_standardDowns_explosiveness', 'home_offense_passingDowns_ppa', 'home_o