In [2]:
# Import packages
import sqlite3
import pandas as pd
import numpy as np
import os

In [3]:
# --- Configuration ---
# Both values are relative paths, so they are resolved against whatever the
# current working directory is at the moment the files are opened.
PRE_GAME_ELO_CSV_PATH = "games_with_pregame_elo.csv"  # CSV holding the calculated pre-game Elo ratings
DB_PATH = 'cfb_data.db'  # SQLite database file that contains the games table

In [4]:
# Mount Google Drive and switch into the project directory (Colab only).
# `google.colab` is importable only inside a Colab runtime, so the import is
# guarded: anywhere else the mount/chdir step is skipped and the notebook keeps
# its current working directory, against which the relative paths in the
# config cell are resolved.
try:
    from google.colab import drive
except ImportError:
    drive = None

# Define the path to your desired directory in Google Drive
drive_path = '/content/drive/MyDrive/Betting/BettingModels'

if drive is not None:
    drive.mount('/content/drive')
    # Change the current working directory to the desired location
    os.chdir(drive_path)
else:
    print("google.colab is not available; skipping Drive mount and keeping the current directory.")

# Verify the current working directory (optional)
print(f"Current working directory: {os.getcwd()}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/MyDrive/Betting/BettingModels


# Phase 1: Data Foundation and Feature Engineering

In [5]:
# Open the SQLite database file named by DB_PATH. The connection object is
# used by the pd.read_sql_query call later in this cell and closed afterwards.
print(f"Connecting to database: {DB_PATH}")
conn = sqlite3.connect(DB_PATH)

# Columns to pull from the 'games_full' table. They are named explicitly
# (rather than SELECT *) so the loaded frame has a known set and order of
# columns; the names must match the columns that exist in 'games_full'.
# The list is assembled from three parts and keeps this order:
#   1. game-level columns, 2. advanced stats (4 groups x 28 stats), 3. other stats.

# Game metadata, scores, betting lines and the database's own Elo / win-prob
# fields. Note: home/away_pregame_elo here are the DB values; later cells use
# the calculated Elo loaded from CSV instead.
_game_level_columns = [
    'id', 'season', 'week', 'season_type', 'completed', 'neutral_site',
    'conference_game', 'attendance',
    'home_team', 'home_conference', 'home_division', 'home_points',
    'home_post_win_prob', 'home_pregame_elo', 'home_postgame_elo',
    'away_team', 'away_conference', 'away_division', 'away_points',
    'away_post_win_prob', 'away_pregame_elo', 'away_postgame_elo',
    'avg_closing_spread', 'avg_closing_total',
    'avg_opening_spread', 'avg_opening_total',
]

# Every advanced-stat group (home/away x offense/defense) has the same 28
# stats in the same order, so the names are generated as '<group>_<stat>'.
_advanced_stat_suffixes = [
    'plays', 'drives', 'ppa', 'totalPPA', 'successRate', 'explosiveness',
    'powerSuccess', 'stuffRate',
    'lineYards', 'lineYardsTotal',
    'secondLevelYards', 'secondLevelYardsTotal',
    'openFieldYards', 'openFieldYardsTotal',
    'standardDowns_ppa', 'standardDowns_successRate', 'standardDowns_explosiveness',
    'passingDowns_ppa', 'passingDowns_successRate', 'passingDowns_explosiveness',
    'rushingPlays_ppa', 'rushingPlays_totalPPA', 'rushingPlays_successRate',
    'rushingPlays_explosiveness',
    'passingPlays_ppa', 'passingPlays_totalPPA', 'passingPlays_successRate',
    'passingPlays_explosiveness',
]
_advanced_stat_groups = ['home_offense', 'home_defense', 'away_offense', 'away_defense']
_advanced_stat_columns = [
    f'{group}_{suffix}'
    for group in _advanced_stat_groups
    for suffix in _advanced_stat_suffixes
]

# Other stats
_other_stat_columns = [
    'home_turnovers', 'home_possessionTime', 'away_turnovers', 'away_possessionTime',
]

all_feature_columns = _game_level_columns + _advanced_stat_columns + _other_stat_columns

# Construct the SQL query string dynamically.
# The column names come from the hard-coded all_feature_columns list (not from
# user input), so interpolating them into the statement is safe here.
select_clause = ",\n    ".join([f"g.{col}" for col in all_feature_columns])
games_query = f"""
SELECT
    {select_clause}
FROM
    games_full g
WHERE
    g.completed = 1 -- Only use completed games
ORDER BY
    g.season, g.week, g.id;
"""
# print(games_query) # Optional: Print the generated query to verify

print("Loading ALL games data (including advanced stats)...")
try:
    games_df = pd.read_sql_query(games_query, conn)
finally:
    # Always release the database handle, even when the query raises
    # (e.g. a column in the list does not exist in the table).
    conn.close()
print(f"Loaded {len(games_df)} completed games with {len(games_df.columns)} columns.")

Connecting to database: cfb_data.db
Loading ALL games data (including advanced stats)...
Loaded 9816 completed games with 142 columns.


### Pre-processing Games Data

In [6]:
# Cast the two flag columns to plain integers so models can consume them.
for flag_col in ('neutral_site', 'conference_game'):
    games_df[flag_col] = games_df[flag_col].astype(int)

# Columns to force to numeric: betting lines, scores, attendance, possession
# time, plus the DB's own Elo / win-probability fields (the calculated Elo is
# what gets used primarily later on).
numeric_cols = [
    'avg_closing_spread', 'avg_closing_total',
    'avg_opening_spread', 'avg_opening_total',
    'home_points', 'away_points',
    'attendance', 'home_possessionTime', 'away_possessionTime',
    'home_post_win_prob', 'home_pregame_elo', 'home_postgame_elo',
    'away_post_win_prob', 'away_pregame_elo', 'away_postgame_elo',
]

# Every column from 'home_offense_plays' through the end of the frame is
# treated as an advanced stat. This is a positional slice, so it relies on the
# column order of games_df.
first_adv_stat_col = 'home_offense_plays'
first_adv_stat_idx = games_df.columns.get_loc(first_adv_stat_col)
adv_stat_cols = games_df.columns[first_adv_stat_idx:]
numeric_cols += list(adv_stat_cols)

print("Converting relevant columns to numeric...")
# Values that cannot be parsed become NaN; names not present in the frame are skipped.
for col in numeric_cols:
    if col not in games_df.columns:
        continue
    games_df[col] = pd.to_numeric(games_df[col], errors='coerce')

# Report how much of the key target / score columns is missing after conversion.
check_missing_cols = ['avg_closing_spread', 'home_points', 'away_points']
print("Missing value check (post-numeric conversion):")
for col in check_missing_cols:
    if col not in games_df.columns:
        continue
    missing_pct = games_df[col].isnull().mean() * 100
    print(f"  Column '{col}' missing: {missing_pct:.2f}%")

# A game without both scores is unusable, so those rows are removed.
games_df.dropna(subset=['home_points', 'away_points'], inplace=True)

Converting relevant columns to numeric...
Missing value check (post-numeric conversion):
  Column 'avg_closing_spread' missing: 0.95%
  Column 'home_points' missing: 0.00%
  Column 'away_points' missing: 0.00%


### Load Pre-Calculated Elo Ratings

In [7]:
print(f"Loading pre-game Elo ratings from: {PRE_GAME_ELO_CSV_PATH}")

# Keep only the join key and the two ratings, and tag the ratings with a
# '_calc' suffix so they do not clash with the DB's own Elo columns when the
# two frames are merged.
elo_calc_names = {
    'home_pregame_elo': 'home_pregame_elo_calc',
    'away_pregame_elo': 'away_pregame_elo_calc',
}
pre_game_elo_df = (
    pd.read_csv(PRE_GAME_ELO_CSV_PATH)
    .loc[:, ['game_id', 'home_pregame_elo', 'away_pregame_elo']]
    .rename(columns=elo_calc_names)
)
print(f"Loaded Elo ratings for {len(pre_game_elo_df)} games.")

Loading pre-game Elo ratings from: games_with_pregame_elo.csv
Loaded Elo ratings for 9816 games.


### Merge Games and Elo Data

In [8]:
print("Merging games data with pre-game Elo ratings...")
# Left join keeps every game. validate='many_to_one' makes pandas raise a
# MergeError if a game_id appears more than once in the Elo frame, which would
# otherwise silently duplicate game rows in master_df.
master_df = pd.merge(
    games_df,
    pre_game_elo_df,
    left_on='id',
    right_on='game_id',
    how='left',
    validate='many_to_one'
)

# Check for games potentially missed by the merge (either rating missing)
missing_elo_count = (
    master_df[['home_pregame_elo_calc', 'away_pregame_elo_calc']]
    .isnull()
    .any(axis=1)
    .sum()
)
if missing_elo_count > 0:
    print(f"Warning: {missing_elo_count} games are missing calculated pre-game Elo ratings after merge.")
    # Depending on strategy, might drop these rows later if calc'd Elo is crucial
    # master_df.dropna(subset=['home_pregame_elo_calc', 'away_pregame_elo_calc'], inplace=True)

# 'game_id' duplicates 'id' after the join, so it is removed.
master_df.drop(columns=['game_id'], inplace=True)

Merging games data with pre-game Elo ratings...


### Critical: Sort Chronologically (rows with a missing target are dropped in Step 2)

In [9]:
print("Sorting final DataFrame chronologically...")
# Sorting on (season, week, id) alone is only chronological if week numbers
# never restart within a season.
# NOTE(review): assumes postseason rows are marked season_type == 'postseason'
# and may carry week numbers that restart at 1 (as in the CollegeFootballData
# feed) - confirm against the table. If weeks are already monotonic, the extra
# key does not change the order.
master_df['_season_phase'] = master_df['season_type'].eq('postseason').astype(int)
master_df.sort_values(by=['season', '_season_phase', 'week', 'id'], inplace=True)
master_df.drop(columns='_season_phase', inplace=True)  # helper key only; not a feature
master_df.reset_index(drop=True, inplace=True)

Sorting final DataFrame chronologically...


### Inspect Consolidated Data

In [10]:
print("\n--- Master DataFrame Info (Now includes all stats) ---")
master_df.info() # Will show many more columns now

# The full frame is too wide to print, so only identifying columns, the
# betting lines and the calculated Elo ratings are shown.
key_columns = [
    'id', 'season', 'week', 'home_team', 'away_team',
    'avg_closing_spread', 'avg_opening_spread',
    'home_pregame_elo_calc', 'away_pregame_elo_calc',
]
key_view = master_df[key_columns]

print("\n--- Master DataFrame Head (Key Columns) ---")
print(key_view.head())

# The tail is printed to eyeball that the most recent games come last.
print("\n--- Master DataFrame Tail (Check Sorting - Key Columns) ---")
print(key_view.tail())


--- Master DataFrame Info (Now includes all stats) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9816 entries, 0 to 9815
Columns: 144 entries, id to away_pregame_elo_calc
dtypes: float64(131), int64(6), object(7)
memory usage: 10.8+ MB

--- Master DataFrame Head (Key Columns) ---
          id  season  week       home_team         away_team  \
0  332410006    2013     1   South Alabama     Southern Utah   
1  332410023    2013     1  San José State  Sacramento State   
2  332410041    2013     1           UConn            Towson   
3  332410062    2013     1         Hawai'i               USC   
4  332410084    2013     1         Indiana     Indiana State   

   avg_closing_spread  avg_opening_spread  home_pregame_elo_calc  \
0          -41.000000                 NaN                 1500.0   
1          -33.000000                 NaN                 1500.0   
2          -20.000000                 NaN                 1500.0   
3           22.833333                 NaN            

## Step 2: Define Target Variable & Basic Features

In [11]:
# The model target is the average closing spread.
target_variable = 'avg_closing_spread'
print(f"\nTarget Variable: '{target_variable}'")

# Share of games with no target value, reported before those rows are removed.
target_missing_pct = master_df[target_variable].isnull().mean() * 100
print(f"Missing values in target ('{target_variable}'): {target_missing_pct:.2f}%")

# Rows without a target cannot be used; drop them and renumber the index.
master_df.dropna(subset=[target_variable], inplace=True)
master_df.reset_index(drop=True, inplace=True)

# Sanity check: the count printed here should be zero.
print(f"Missing values in target after dropping: {master_df[target_variable].isnull().sum()}")

# Home-minus-away difference of the CALCULATED Elo ratings.
master_df['elo_diff_calc'] = master_df['home_pregame_elo_calc'].sub(master_df['away_pregame_elo_calc'])

# Initial, basic feature set. It uses the calculated Elo columns; the
# original home/away_pregame_elo columns from the DB are left out.
basic_features = [
    'home_pregame_elo_calc',
    'away_pregame_elo_calc',
    'neutral_site',
    'conference_game',
    'season',
    'week',
    'elo_diff_calc',
]

print(f"\nBasic Features Selected ({len(basic_features)}):")
print(basic_features)

# Preview the target next to the basic features (calculated Elo may still be NaN here).
print("\n--- Target and Basic Features (Head) ---")
preview_columns = [target_variable, *basic_features]
print(master_df[preview_columns].head())


Target Variable: 'avg_closing_spread'
Missing values in target ('avg_closing_spread'): 0.95%
Missing values in target after dropping: 0

Basic Features Selected (7):
['home_pregame_elo_calc', 'away_pregame_elo_calc', 'neutral_site', 'conference_game', 'season', 'week', 'elo_diff_calc']

--- Target and Basic Features (Head) ---
   avg_closing_spread  home_pregame_elo_calc  away_pregame_elo_calc  \
0          -41.000000                 1500.0                 1200.0   
1          -33.000000                 1500.0                 1200.0   
2          -20.000000                 1500.0                 1200.0   
3           22.833333                 1500.0                 1500.0   
4          -30.500000                 1500.0                 1200.0   

   neutral_site  conference_game  season  week  elo_diff_calc  
0             1                0    2013     1          300.0  
1             0                0    2013     1          300.0  
2             0                0    2013     1     

## Step 3: Advanced Stat Feature Engineering

### Identify Stats for Rolling Averages

In [60]:
# Efficiency / explosiveness metrics that get a rolling (EWMA) version.
# This is a deliberately focused starting list; extend it later if needed.
# The same 16 metrics are taken for offense and for defense, in this order:
_rolled_metric_suffixes = [
    # Overall
    'ppa', 'successRate', 'explosiveness',
    # Rushing plays, plus line yards
    'rushingPlays_ppa', 'rushingPlays_successRate', 'rushingPlays_explosiveness',
    'lineYards',
    # Passing plays
    'passingPlays_ppa', 'passingPlays_successRate', 'passingPlays_explosiveness',
    # Standard downs
    'standardDowns_ppa', 'standardDowns_successRate', 'standardDowns_explosiveness',
    # Passing downs
    'passingDowns_ppa', 'passingDowns_successRate', 'passingDowns_explosiveness',
]
stats_to_roll = [
    f'{unit}_{suffix}'
    for unit in ('offense', 'defense')
    for suffix in _rolled_metric_suffixes
]
# 'turnovers' is a placeholder: the reshape step expands it into
# turnovers_committed / turnovers_forced.
stats_to_roll.append('turnovers')

# EWMA span: a smaller span puts more weight on recent games. pandas turns a
# span into the smoothing factor alpha = 2 / (span + 1), i.e. 1/3 for span=5.
ewma_span = 5
# Require at least half a span (never fewer than one) of observations before
# an EWMA value is produced.
min_periods_for_ewma = max(1, ewma_span // 2)

print(f"Selected {len(stats_to_roll)} stats for EWMA (span={ewma_span}).")

Selected 33 stats for EWMA (span=5).


### Reshape Data to Team-Centric Format

In [61]:
print("Reshaping data to team-centric format...")


def _build_team_view(side, other_side):
    """Return one row per game as seen from the `side` team ('home' or 'away').

    The `<side>_team` column becomes 'team' and `<other_side>_team` becomes
    'opponent'. Each entry of stats_to_roll is copied from its
    `<side>_<stat>` column under the generic stat name; a stat is skipped when
    either its home_ or its away_ column is absent from master_df. The
    'turnovers' entry is expanded into 'turnovers_committed' (the team's own
    turnovers) and 'turnovers_forced' (the opponent's turnovers).
    """
    view = master_df[['id', 'season', 'week', f'{side}_team', f'{other_side}_team']].copy()
    view.rename(columns={f'{side}_team': 'team', f'{other_side}_team': 'opponent'}, inplace=True)

    for stat_base in stats_to_roll:
        if stat_base == 'turnovers':
            view['turnovers_committed'] = master_df[f'{side}_turnovers']
            view['turnovers_forced'] = master_df[f'{other_side}_turnovers']
        elif f'home_{stat_base}' in master_df.columns and f'away_{stat_base}' in master_df.columns:
            view[stat_base] = master_df[f'{side}_{stat_base}']
    return view


# One frame per perspective, then stacked so every game appears twice
# (once for each participating team).
home_stats = _build_team_view('home', 'away')
away_stats = _build_team_view('away', 'home')
team_game_df = pd.concat([home_stats, away_stats], ignore_index=True)

Reshaping data to team-centric format...


In [62]:
# Spot check: every team-centric row for Alabama in the 2024 season.
alabama_2024_rows = (team_game_df['team'] == 'Alabama') & (team_game_df['season'] == 2024)
team_game_df[alabama_2024_rows]

Unnamed: 0,id,season,week,team,opponent,offense_ppa,offense_successRate,offense_explosiveness,offense_rushingPlays_ppa,offense_rushingPlays_successRate,...,defense_passingPlays_successRate,defense_passingPlays_explosiveness,defense_standardDowns_ppa,defense_standardDowns_successRate,defense_standardDowns_explosiveness,defense_passingDowns_ppa,defense_passingDowns_successRate,defense_passingDowns_explosiveness,turnovers_committed,turnovers_forced
8865,401628319,2024,1,Alabama,Western Kentucky,1.343144,0.47619,3.393672,0.848881,0.4,...,0.21875,1.626537,-0.334946,0.208333,0.993355,-0.111976,0.208333,1.77423,1.0,2.0
8950,401628335,2024,2,Alabama,South Florida,0.262745,0.402985,1.324513,0.222252,0.421053,...,0.230769,1.054071,-0.084724,0.375,0.79968,-0.018798,0.153846,2.022206,3.0,0.0
9156,401628374,2024,5,Alabama,Georgia,0.294012,0.471429,1.415743,0.088653,0.30303,...,0.411765,1.863592,0.30167,0.5,1.671896,0.199035,0.296296,1.396293,1.0,4.0
9259,401628385,2024,7,Alabama,South Carolina,0.139325,0.491803,1.11673,0.177998,0.558824,...,0.428571,1.765896,-0.021626,0.45098,0.847218,0.647312,0.4,2.744422,2.0,4.0
9370,401628400,2024,9,Alabama,Missouri,0.242476,0.442308,1.183641,0.246868,0.48,...,0.166667,0.819375,-0.238444,0.411765,0.623779,-0.290358,0.076923,1.885113,0.0,3.0
9532,401628425,2024,12,Alabama,Mercer,0.614156,0.567568,1.514592,0.559102,0.571429,...,0.466667,1.237692,-0.493196,0.363636,1.00434,-0.073049,0.4,1.198271,0.0,3.0
9646,401628437,2024,14,Alabama,Auburn,0.166806,0.487179,1.011816,0.004522,0.425926,...,0.395349,1.489217,-0.015114,0.454545,0.938314,0.42665,0.333333,1.985699,4.0,2.0
18751,401628350,2024,3,Alabama,Wisconsin,0.55854,0.487179,1.560085,0.51214,0.5,...,0.357143,1.559695,-0.00099,0.465116,0.890309,0.34637,0.375,1.360695,0.0,2.0
18938,401628384,2024,6,Alabama,Vanderbilt,0.464147,0.555556,1.499859,0.388686,0.473684,...,0.666667,2.091545,0.057245,0.42,1.052493,0.639085,0.416667,2.228449,2.0,0.0
19039,401628397,2024,8,Alabama,Tennessee,-0.066958,0.363636,1.040095,-0.06864,0.4,...,0.34375,2.31839,-0.190941,0.382979,0.73823,0.462074,0.307692,2.615161,2.0,3.0


### Critical: Sort for Rolling Calculation

In [63]:
print("Sorting team-centric data...")
# The lagged EWMA computed from this frame walks each team's rows in order, so
# the rows must be in true chronological order per team.
# NOTE(review): assumes postseason rows are marked season_type == 'postseason'
# in master_df and may carry week numbers that restart at 1 (as in the
# CollegeFootballData feed) - confirm against the table. Sorting on week alone
# would then place a bowl game next to week 1 and leak end-of-season stats
# into early-season features. If weeks are already monotonic, the extra key
# does not change the order.
postseason_game_ids = master_df.loc[master_df['season_type'].eq('postseason'), 'id']
team_game_df['_season_phase'] = team_game_df['id'].isin(postseason_game_ids).astype(int)
team_game_df.sort_values(by=['team', 'season', '_season_phase', 'week', 'id'], inplace=True)
team_game_df.drop(columns='_season_phase', inplace=True)  # helper key only; not a feature

Sorting team-centric data...


### Calculate Lagged EWMA

In [64]:
print(f"Calculating lagged EWMAs (span={ewma_span})...")
ewma_cols_generated = []

# Swap the 'turnovers' placeholder for the two columns the reshape step
# created. The list is rebuilt rather than mutated with append/remove so that
# re-running this cell neither raises (on a second .remove) nor duplicates
# entries; the resulting list is the same as after a single original run.
_turnover_stats = ['turnovers_committed', 'turnovers_forced']
stats_to_roll = [
    stat for stat in stats_to_roll
    if stat != 'turnovers' and stat not in _turnover_stats
] + _turnover_stats


def _lagged_ewma(series):
    """EWMA over one team's games, shifted by one row so that each game only
    sees values from that team's earlier rows (rows must already be sorted)."""
    smoothed = series.ewm(span=ewma_span, min_periods=min_periods_for_ewma, adjust=True).mean()
    return smoothed.shift(1)


for stat in stats_to_roll:
    if stat in team_game_df.columns: # Ensure stat column was created successfully
        ewma_col_name = f'{stat}_ewma_lag1'
        # transform keeps the result aligned with team_game_df's index
        team_game_df[ewma_col_name] = team_game_df.groupby('team')[stat].transform(_lagged_ewma)
        ewma_cols_generated.append(ewma_col_name)
    else:
        print(f"Skipping EWMA for '{stat}' as column not found in team_game_df.")

print(f"Generated {len(ewma_cols_generated)} EWMA columns.")

Calculating lagged EWMAs (span=5)...
Generated 34 EWMA columns.


### Merge back to Master DataFrame

In [65]:
print("Merging EWMA features back to master DataFrame...")

# Only the join keys and the generated EWMA columns are needed for the merge.
ewma_features_to_merge = team_game_df[['id', 'team'] + ewma_cols_generated].copy()


def _attach_team_ewma(games, side):
    """Left-join the EWMA columns of the `side` team ('home' or 'away') onto
    `games`, matching on game id and team name, and prefix the joined columns
    with `<side>_`. Returns the joined frame and the rename mapping used."""
    joined = pd.merge(
        games,
        ewma_features_to_merge,
        left_on=['id', f'{side}_team'],
        right_on=['id', 'team'],
        how='left',
        suffixes=('', '_y'),  # any clashing right-hand column would get '_y'
    )
    prefixed = {col: f'{side}_{col}' for col in ewma_cols_generated}
    # 'team' only served as the right-hand join key.
    joined = joined.rename(columns=prefixed).drop(columns=['team'])
    return joined, prefixed


# Home team first, then away team on top of the home-merged frame.
master_df_merged, home_ewma_rename_dict = _attach_team_ewma(master_df, 'home')
master_df_final, away_ewma_rename_dict = _attach_team_ewma(master_df_merged, 'away')

# Safety net: remove any '_y' columns a name clash would have produced.
cols_to_drop = [name for name in master_df_final.columns if name.endswith('_y')]
if cols_to_drop:
    print(f"Dropping potentially duplicated columns: {cols_to_drop}")
    master_df_final.drop(columns=cols_to_drop, inplace=True)

Merging EWMA features back to master DataFrame...


**Note: In FBS vs. FCS games, the lagged EWMA features for the FCS team are very likely to be NaN, because we usually have no previous games for that team in the data.**

### Create Matchup Features

In [66]:
print("Creating matchup features (differences)...")
matchup_features = []

# Each matchup feature is (offense-side EWMA) minus (defense-side EWMA of the
# equivalent defensive stat). Tuples: (offense side, defense side, name template).
# Note the two families are named differently: home-offense columns end in
# '_ewma_lag1', away-offense columns do not. The names are kept as they are.
_matchup_directions = [
    ('home', 'away', 'matchup_HO_v_AD_{}_ewma_lag1'),  # Home Offense vs Away Defense
    ('away', 'home', 'matchup_AO_v_HD_{}'),            # Away Offense vs Home Defense
]
_offense_stats = [stat for stat in stats_to_roll if stat.startswith('offense_')]

for off_side, def_side, name_template in _matchup_directions:
    for stat in _offense_stats:
        def_equiv_stat = stat.replace('offense_', 'defense_')  # e.g. offense_ppa -> defense_ppa
        off_col = f'{off_side}_{stat}_ewma_lag1'
        def_col = f'{def_side}_{def_equiv_stat}_ewma_lag1'
        # Skip the pair when either input column was never generated.
        if off_col not in master_df_final.columns or def_col not in master_df_final.columns:
            continue
        matchup_col_name = name_template.format(stat.replace("offense_", ""))
        master_df_final[matchup_col_name] = master_df_final[off_col] - master_df_final[def_col]
        matchup_features.append(matchup_col_name)


print(f"Generated {len(matchup_features)} matchup difference features.")

Creating matchup features (differences)...
Generated 32 matchup difference features.


### Update Master Dataframe

In [67]:
# From here on, master_df refers to the frame that carries the EWMA and
# matchup features.
master_df = master_df_final
# Drop the intermediate names. master_df still references the final frame,
# so only the other intermediates become collectable.
del master_df_merged, master_df_final
del team_game_df, home_stats, away_stats

### Inspect Engineered Features

In [70]:
print("\n--- Inspecting Engineered Features (Head) ---")
_id_columns = ['id', 'season', 'week', 'home_team', 'away_team']
# Engineered columns are recognised by name: lagged EWMAs or matchup differences.
engineered_feature_cols = [
    name for name in master_df.columns
    if 'matchup_' in name or '_ewma_lag1' in name
]
print(master_df[_id_columns + engineered_feature_cols].head())

# NaNs are expected where a team has no (or too few) earlier games.
# Despite the wording of the header, every engineered column is reported.
print("\n--- NaN check for first few EWMA features ---")
missing_pcts = [master_df[name].isnull().mean() * 100 for name in engineered_feature_cols]
for col, missing_pct in zip(engineered_feature_cols, missing_pcts):
    print(f"  Feature '{col}' missing: {missing_pct:.2f}%")


--- Inspecting Engineered Features (Head) ---
          id  season  week       home_team         away_team  \
0  332410006    2013     1   South Alabama     Southern Utah   
1  332410023    2013     1  San José State  Sacramento State   
2  332410041    2013     1           UConn            Towson   
3  332410062    2013     1         Hawai'i               USC   
4  332410084    2013     1         Indiana     Indiana State   

   home_offense_ppa_ewma_lag1  home_offense_successRate_ewma_lag1  \
0                         NaN                                 NaN   
1                         NaN                                 NaN   
2                         NaN                                 NaN   
3                         NaN                                 NaN   
4                         NaN                                 NaN   

   home_offense_explosiveness_ewma_lag1  \
0                                   NaN   
1                                   NaN   
2                       

### Define Final Feature List (Example)

In [71]:
# Candidate feature set = basic features followed by every engineered column
# (lagged EWMAs and matchup differences). It may be narrowed down later during
# feature selection.
all_engineered_features = list(
    filter(lambda name: '_ewma_lag1' in name or 'matchup_' in name, master_df.columns)
)
potential_features = [*basic_features, *all_engineered_features]
print(f"\nTotal potential features generated: {len(potential_features)}")
# print("Potential Features:", potential_features) # Uncomment to see full list

for status_line in (
    "\n--- Phase 1, Step 3 Complete ---",
    "Engineered features (lagged EWMAs, matchups) added.",
    "Next Steps: Handling Missing Values in Features, Feature Selection.",
):
    print(status_line)


Total potential features generated: 107

--- Phase 1, Step 3 Complete ---
Engineered features (lagged EWMAs, matchups) added.
Next Steps: Handling Missing Values in Features, Feature Selection.


## Step 4: Handling Missing Data

### Identify and Quantify Missing Feature Values

In [None]:
# Relies on 'potential_features' from the end of Step 3. If it is not defined,
# rebuild it first:
# all_engineered_features = [col for col in master_df.columns if '_ewma_lag1' in col or 'matchup_' in col]
# potential_features = basic_features + all_engineered_features # basic_features defined in step 2

print(f"Checking missing values for {len(potential_features)} potential features...")

# Percentage of NaN per candidate feature, worst first, keeping only the
# features that actually have missing values.
_missing_fraction = master_df[potential_features].isnull().mean()
missing_summary = _missing_fraction.sort_values(ascending=False) * 100
missing_summary = missing_summary.loc[missing_summary > 0]

print("\nFeatures with Missing Values (%):")
if not missing_summary.empty:
    # Lift the row limit so the full list is printed.
    with pd.option_context('display.max_rows', None):
        print(missing_summary)
else:
    print("No missing values found in the potential feature set.")

### Decide on Imputation Strategy

Option A: Let Model Handle (XGBoost/LightGBM) -- Chosen for Now

* These models can often learn optimal imputation strategy internally
* PROS: Simple, potentially optimal performance
* CONS: Requires model that supports it.

Option B: Mean/Median Imputation:


*   Replace NaN with mean/median calculated ONLY from the training set
*   PROS: Works for any model
*   CONS: Distorts feature distribution, leaks info if mean/median calculated globally

Option C: Zero Imputation
* Replace NaN with 0
* PROS: Simple
* CONS: Often inappropriate for rates/averages, changes feature meaning.



### Implement Strategy

In [None]:
# If we were doing Option B (Mean Imputation - Incorrect Global Example for Demo):
# print("\n--- Example: Global Mean Imputation (NOT RECOMMENDED - DATA LEAKAGE) ---")
# features_to_impute = missing_summary.index.tolist() # Columns identified with NaNs
# for col in features_to_impute:
#    mean_val = master_df[col].mean() # GLOBAL mean - incorrect for proper validation
#    master_df[col].fillna(mean_val, inplace=True)
#    print(f"Imputed '{col}' with global mean: {mean_val:.4f}")
# print("--- End Example ---")

### Verify

In [None]:
print("\nVerifying NaN status (Option A - NaNs should remain):")

# Recompute the missing-value summary and compare it with the one taken
# before the (skipped) imputation step; under Option A they should be equal.
_remaining_fraction = master_df[potential_features].isnull().mean()
remaining_missing_summary = _remaining_fraction.sort_values(ascending=False) * 100
remaining_missing_summary = remaining_missing_summary.loc[remaining_missing_summary > 0]

if remaining_missing_summary.empty:
    print("No missing values remain in potential features (unexpected for Option A).")
elif remaining_missing_summary.equals(missing_summary):
    print("NaN values remain in features as expected for Strategy A.")
    print(f"Top 5 features with most NaNs:\n{remaining_missing_summary.head()}")
else:
    print("Missing values changed unexpectedly without imputation.")
    print("\nRemaining Features with Missing Values (%):")
    with pd.option_context('display.max_rows', None):
        print(remaining_missing_summary)


for status_line in (
    "\n--- Phase 1, Step 4 Complete ---",
    "Missing data identified and strategy chosen (defer imputation for tree models).",
    "Next Steps: Feature Selection.",
):
    print(status_line)

# Phase 2: Model Selection and Training