In [1]:
import pandas as pd
import numpy as np
import warnings
import sqlite3
import os

# Warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Config
DB_PATH = "cfb_data.db"
PRE_GAME_ELO_CSV_PATH = 'games_with_pregame_elo.csv'
#Define RP metrics to load and use
RP_METRICS_TO_USE =['usage','percentPPA']
DEFAULT_RP_VALUE = 0.5
EPSILON = 1e-6
TEAM_NAN_HANDLING = 'zero'

In [2]:
from google.colab import drive
drive.mount('/content/drive')
# Define the path to your desired directory in Google Drive
drive_path = '/content/drive/MyDrive/Betting/BettingModels'

# Change the current working directory to the desired location
os.chdir(drive_path)

# Verify the current working directory (optional)
print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Betting/BettingModels


# Data Load and Pre-Process

### Load Games Data

In [3]:
print(f"Connecting to database: {DB_PATH}")
conn = sqlite3.connect(DB_PATH)

# Load games_full data - Select ALL potentially relevant columns
# Explicitly listing columns is generally better than SELECT *
# Make sure this list matches the columns in your 'games_full' table
# (Derived from your initial dataset_head.csv)
all_feature_columns = [
    'id', 'season', 'week', 'season_type', 'completed', 'neutral_site',
    'conference_game', 'attendance', 'home_team', 'home_conference',
    'home_division', 'home_points', 'home_post_win_prob', 'home_pregame_elo', # Note: We use our CALC'd elo later
    'home_postgame_elo', 'away_team', 'away_conference', 'away_division',
    'away_points', 'away_post_win_prob', 'away_pregame_elo', # Note: We use our CALC'd elo later
    'away_postgame_elo', 'avg_closing_spread', 'avg_closing_total',
    'avg_opening_spread', 'avg_opening_total',
    # Home Offense Stats
    'home_offense_plays', 'home_offense_drives', 'home_offense_ppa',
    'home_offense_totalPPA', 'home_offense_successRate', 'home_offense_explosiveness',
    'home_offense_powerSuccess', 'home_offense_stuffRate', 'home_offense_lineYards',
    'home_offense_lineYardsTotal', 'home_offense_secondLevelYards',
    'home_offense_secondLevelYardsTotal', 'home_offense_openFieldYards',
    'home_offense_openFieldYardsTotal', 'home_offense_standardDowns_ppa',
    'home_offense_standardDowns_successRate', 'home_offense_standardDowns_explosiveness',
    'home_offense_passingDowns_ppa', 'home_offense_passingDowns_successRate',
    'home_offense_passingDowns_explosiveness', 'home_offense_rushingPlays_ppa',
    'home_offense_rushingPlays_totalPPA', 'home_offense_rushingPlays_successRate',
    'home_offense_rushingPlays_explosiveness', 'home_offense_passingPlays_ppa',
    'home_offense_passingPlays_totalPPA', 'home_offense_passingPlays_successRate',
    'home_offense_passingPlays_explosiveness',
    # Home Defense Stats
    'home_defense_plays', 'home_defense_drives', 'home_defense_ppa',
    'home_defense_totalPPA', 'home_defense_successRate', 'home_defense_explosiveness',
    'home_defense_powerSuccess', 'home_defense_stuffRate', 'home_defense_lineYards',
    'home_defense_lineYardsTotal', 'home_defense_secondLevelYards',
    'home_defense_secondLevelYardsTotal', 'home_defense_openFieldYards',
    'home_defense_openFieldYardsTotal', 'home_defense_standardDowns_ppa',
    'home_defense_standardDowns_successRate', 'home_defense_standardDowns_explosiveness',
    'home_defense_passingDowns_ppa', 'home_defense_passingDowns_successRate',
    'home_defense_passingDowns_explosiveness', 'home_defense_rushingPlays_ppa',
    'home_defense_rushingPlays_totalPPA', 'home_defense_rushingPlays_successRate',
    'home_defense_rushingPlays_explosiveness', 'home_defense_passingPlays_ppa',
    'home_defense_passingPlays_totalPPA', 'home_defense_passingPlays_successRate',
    'home_defense_passingPlays_explosiveness',
    # Away Offense Stats (matches home offense structure)
    'away_offense_plays', 'away_offense_drives', 'away_offense_ppa',
    'away_offense_totalPPA', 'away_offense_successRate', 'away_offense_explosiveness',
    'away_offense_powerSuccess', 'away_offense_stuffRate', 'away_offense_lineYards',
    'away_offense_lineYardsTotal', 'away_offense_secondLevelYards',
    'away_offense_secondLevelYardsTotal', 'away_offense_openFieldYards',
    'away_offense_openFieldYardsTotal', 'away_offense_standardDowns_ppa',
    'away_offense_standardDowns_successRate', 'away_offense_standardDowns_explosiveness',
    'away_offense_passingDowns_ppa', 'away_offense_passingDowns_successRate',
    'away_offense_passingDowns_explosiveness', 'away_offense_rushingPlays_ppa',
    'away_offense_rushingPlays_totalPPA', 'away_offense_rushingPlays_successRate',
    'away_offense_rushingPlays_explosiveness', 'away_offense_passingPlays_ppa',
    'away_offense_passingPlays_totalPPA', 'away_offense_passingPlays_successRate',
    'away_offense_passingPlays_explosiveness',
    # Away Defense Stats (matches home defense structure)
    'away_defense_plays', 'away_defense_drives', 'away_defense_ppa',
    'away_defense_totalPPA', 'away_defense_successRate', 'away_defense_explosiveness',
    'away_defense_powerSuccess', 'away_defense_stuffRate', 'away_defense_lineYards',
    'away_defense_lineYardsTotal', 'away_defense_secondLevelYards',
    'away_defense_secondLevelYardsTotal', 'away_defense_openFieldYards',
    'away_defense_openFieldYardsTotal', 'away_defense_standardDowns_ppa',
    'away_defense_standardDowns_successRate', 'away_defense_standardDowns_explosiveness',
    'away_defense_passingDowns_ppa', 'away_defense_passingDowns_successRate',
    'away_defense_passingDowns_explosiveness', 'away_defense_rushingPlays_ppa',
    'away_defense_rushingPlays_totalPPA', 'away_defense_rushingPlays_successRate',
    'away_defense_rushingPlays_explosiveness', 'away_defense_passingPlays_ppa',
    'away_defense_passingPlays_totalPPA', 'away_defense_passingPlays_successRate',
    'away_defense_passingPlays_explosiveness',
    # Other stats
    'home_turnovers', 'home_possessionTime', 'away_turnovers', 'away_possessionTime'
]

# Construct the SQL query string dynamically
select_clause = ",\n    ".join([f"g.{col}" for col in all_feature_columns])
games_query = f"""
SELECT
    {select_clause}
FROM
    games_full g
WHERE
    g.completed = 1 -- Only use completed games
ORDER BY
    g.season, g.week, g.id;
"""
# print(games_query) # Optional: Print the generated query to verify

print("Loading ALL games data (including advanced stats)...")
games_df = pd.read_sql_query(games_query, conn)
print(f"Loaded {len(games_df)} completed games with {len(games_df.columns)} columns.")

Connecting to database: cfb_data.db
Loading ALL games data (including advanced stats)...
Loaded 9816 completed games with 142 columns.


### Load RP Data

In [4]:
# --- Load Returning Production Data ---
rp_cols_select = ['season', 'team'] + RP_METRICS_TO_USE
rp_cols_str = ", ".join(rp_cols_select)
rp_query = f"SELECT {rp_cols_str} FROM returning_production;"
print(f"Loading returning production data ({RP_METRICS_TO_USE})...")
rp_df = pd.read_sql_query(rp_query, conn)
print(f"Loaded {len(rp_df)} returning production records.")


conn.close()
print("Database connection closed.")

Loading returning production data (['usage', 'percentPPA'])...
Loaded 1420 returning production records.
Database connection closed.


### Pre-Process Games

In [5]:
# Convert boolean-like columns to integers (0 or 1) for models
games_df['neutral_site'] = games_df['neutral_site'].astype(int)
games_df['conference_game'] = games_df['conference_game'].astype(int)

# Convert spread/total/score columns to numeric, coercing errors to NaN
numeric_cols = ['avg_closing_spread', 'avg_closing_total', 'avg_opening_spread',
                'avg_opening_total', 'home_points', 'away_points',
                'attendance', 'home_possessionTime', 'away_possessionTime',
                # Add Elo/win prob if needed, though we'll use our own Elo primarily
                'home_post_win_prob', 'home_pregame_elo', 'home_postgame_elo',
                'away_post_win_prob', 'away_pregame_elo', 'away_postgame_elo']

# Convert all advanced stat columns to numeric
# Identify the first advanced stat column to loop from there
first_adv_stat_col = 'home_offense_plays'
first_adv_stat_idx = games_df.columns.get_loc(first_adv_stat_col)
adv_stat_cols = games_df.columns[first_adv_stat_idx:]

numeric_cols.extend(adv_stat_cols)

print("Converting relevant columns to numeric...")
for col in numeric_cols:
    if col in games_df.columns: # Check if column exists (robustness)
        games_df[col] = pd.to_numeric(games_df[col], errors='coerce')

# Report missing values for key targets/inputs after conversion
check_missing_cols = ['avg_closing_spread', 'home_points', 'away_points']
print("Missing value check (post-numeric conversion):")
for col in check_missing_cols:
     if col in games_df.columns:
        missing_pct = games_df[col].isnull().mean() * 100
        print(f"  Column '{col}' missing: {missing_pct:.2f}%")

# Drop rows where essential score data might be missing after conversion
games_df.dropna(subset=['home_points', 'away_points'], inplace=True)

Converting relevant columns to numeric...
Missing value check (post-numeric conversion):
  Column 'avg_closing_spread' missing: 0.95%
  Column 'home_points' missing: 0.00%
  Column 'away_points' missing: 0.00%


### Pre-Processing Returning Production Data


In [6]:
print("Preprocessing returning production data...")
# Convert RP metrics to numeric
for col in RP_METRICS_TO_USE:
    rp_df[col] = pd.to_numeric(rp_df[col], errors='coerce')
# Fill missing RP values *before* merging
print(f"Filling NaNs in RP data with default: {DEFAULT_RP_VALUE}")
rp_df.fillna({col: DEFAULT_RP_VALUE for col in RP_METRICS_TO_USE}, inplace=True)
# Prepare for merge - RP data for season S applies to season S games
rp_df.rename(columns={'season': 'rp_season'}, inplace=True) # Avoid collision with game season

Preprocessing returning production data...
Filling NaNs in RP data with default: 0.5


### Load Pre-Calculated Elo Ratings

In [7]:
print(f"Loading pre-game Elo ratings from: {PRE_GAME_ELO_CSV_PATH}")
pre_game_elo_df = pd.read_csv(PRE_GAME_ELO_CSV_PATH)
pre_game_elo_df = pre_game_elo_df[['game_id', 'home_pregame_elo', 'away_pregame_elo']]
print(f"Loaded Elo ratings for {len(pre_game_elo_df)} games.")

# Rename columns to avoid conflict with original Elo cols and clarify source
pre_game_elo_df.rename(columns={
    'home_pregame_elo': 'home_pregame_elo_calc',
    'away_pregame_elo': 'away_pregame_elo_calc'
}, inplace=True)

Loading pre-game Elo ratings from: games_with_pregame_elo.csv
Loaded Elo ratings for 9816 games.


### Merge Games and Elo Data

In [8]:
print("Merging games data with pre-game Elo ratings...")
master_df = pd.merge(
    games_df,
    pre_game_elo_df,
    left_on='id',
    right_on='game_id',
    how='left'
)

# Check for games potentially missed by the merge
missing_elo_count = master_df['home_pregame_elo_calc'].isnull().sum()
if missing_elo_count > 0:
    print(f"Warning: {missing_elo_count} games are missing calculated pre-game Elo ratings after merge.")
    # Depending on strategy, might drop these rows later if calc'd Elo is crucial
    # master_df.dropna(subset=['home_pregame_elo_calc', 'away_pregame_elo_calc'], inplace=True)

master_df.drop(columns=['game_id'], inplace=True)

Merging games data with pre-game Elo ratings...


### Merge RP Data

In [9]:
print("Merging returning production data...")
# Merge for Home Team
master_df = pd.merge(
    master_df,
    rp_df,
    left_on=['season', 'home_team'],
    right_on=['rp_season', 'team'],
    how='left',
    suffixes=('', '_rp_home_temp')
)
# Rename home RP columns and drop temporary merge keys
home_rp_rename = {col: f'home_rp_{col}' for col in RP_METRICS_TO_USE}
master_df.rename(columns=home_rp_rename, inplace=True)
master_df.drop(columns=['rp_season', 'team'], inplace=True, errors='ignore')

# Merge for Away Team
master_df = pd.merge(
    master_df,
    rp_df,
    left_on=['season', 'away_team'],
    right_on=['rp_season', 'team'],
    how='left',
    suffixes=('', '_rp_away_temp')
)
# Rename away RP columns and drop temporary merge keys
away_rp_rename = {col: f'away_rp_{col}' for col in RP_METRICS_TO_USE}
master_df.rename(columns=away_rp_rename, inplace=True)
master_df.drop(columns=['rp_season', 'team'], inplace=True, errors='ignore')

# Fill NaNs for teams potentially missing in RP data *after* merge (use default)
home_rp_cols = list(home_rp_rename.values())
away_rp_cols = list(away_rp_rename.values())
rp_cols_merged = home_rp_cols + away_rp_cols
for col in rp_cols_merged:
    if master_df[col].isnull().any():
        print(f"Filling NaNs in merged RP column '{col}' with {DEFAULT_RP_VALUE}")
        master_df[col].fillna(DEFAULT_RP_VALUE, inplace=True)

Merging returning production data...
Filling NaNs in merged RP column 'home_rp_usage' with 0.5
Filling NaNs in merged RP column 'home_rp_percentPPA' with 0.5
Filling NaNs in merged RP column 'away_rp_usage' with 0.5
Filling NaNs in merged RP column 'away_rp_percentPPA' with 0.5


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df[col].fillna(DEFAULT_RP_VALUE, inplace=True)


### Critical: Drop Target Variable Missing Data and Sort Chronologically

In [10]:
print("Sorting final DataFrame chronologically...")
master_df.sort_values(by=['season', 'week', 'id'], inplace=True)
master_df.reset_index(drop=True, inplace=True)

df = master_df.copy()

Sorting final DataFrame chronologically...


# Opponent Adjustment

### Identify Stats to Adjust

In [11]:
base_stats_offense = [
    'ppa', 'successRate', 'explosiveness',
    'standardDowns_ppa', 'standardDowns_successRate', 'standardDowns_explosiveness',
    'passingDowns_ppa', 'passingDowns_successRate', 'passingDowns_explosiveness',
    'rushingPlays_ppa', 'rushingPlays_successRate',
    'rushingPlays_explosiveness', 'passingPlays_ppa',
    'passingPlays_successRate', 'passingPlays_explosiveness'
]

base_stats_defense = base_stats_offense.copy()

# Generate full column names
home_offense_cols = [f'home_offense_{stat}' for stat in base_stats_offense]
away_offense_cols = [f'away_offense_{stat}' for stat in base_stats_offense]
home_defense_cols = [f'home_defense_{stat}' for stat in base_stats_defense]
away_defense_cols = [f'away_defense_{stat}' for stat in base_stats_defense]
all_stat_cols = home_offense_cols + away_offense_cols + home_defense_cols + away_defense_cols

# Ensure stats are numeric
for col in all_stat_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    else:
        print(f"Warning: Column '{col}' not found in DataFrame.")



## Calculate Averages Needed for Adjustment

### League Averages per Season

In [12]:
final_season_averages = {}
league_weekly_data_list = []

for season, season_df in df.groupby('season'):
    season_stats = {}
    for stat in base_stats_offense:
        home_col = f'home_offense_{stat}'
        away_col = f'away_offense_{stat}'
        avg_col_name_off = f'league_avg_off_{stat}'
        if home_col in season_df.columns and away_col in season_df.columns:
            valid_data = pd.concat([season_df[home_col], season_df[away_col]]).dropna()
            # Calculate final average for the whole season
            final_avg = valid_data.mean() if not valid_data.empty else 0
            season_stats[f'{avg_col_name_off}_final'] = final_avg

            # Aggregate weekly data for rolling average calculation later
            weekly_sum = season_df.groupby('week')[home_col].sum().fillna(0) + season_df.groupby('week')[away_col].sum().fillna(0)
            weekly_count = season_df.groupby('week')[home_col].count() + season_df.groupby('week')[away_col].count() # Counts non-NaN implicitly? Let's be explicit.
            weekly_count_h = season_df.groupby('week')[home_col].apply(lambda x: x.notna().sum())
            weekly_count_a = season_df.groupby('week')[away_col].apply(lambda x: x.notna().sum())
            weekly_count = weekly_count_h.add(weekly_count_a, fill_value=0)

            weekly_avg = weekly_sum / weekly_count.replace(0, 1)
            weekly_avg.name = avg_col_name_off # Name series for merging
            league_weekly_data_list.append(weekly_avg.reset_index().assign(season=season))


    for stat in base_stats_defense:
        home_col = f'home_defense_{stat}'
        away_col = f'away_defense_{stat}'
        avg_col_name_def = f'league_avg_def_{stat}'
        if home_col in season_df.columns and away_col in season_df.columns:
            valid_data = pd.concat([season_df[home_col], season_df[away_col]]).dropna()
            final_avg = valid_data.mean() if not valid_data.empty else 0
            season_stats[f'{avg_col_name_def}_final'] = final_avg

            # Aggregate weekly data
            weekly_sum = season_df.groupby('week')[home_col].sum().fillna(0) + season_df.groupby('week')[away_col].sum().fillna(0)
            weekly_count_h = season_df.groupby('week')[home_col].apply(lambda x: x.notna().sum())
            weekly_count_a = season_df.groupby('week')[away_col].apply(lambda x: x.notna().sum())
            weekly_count = weekly_count_h.add(weekly_count_a, fill_value=0)

            weekly_avg = weekly_sum / weekly_count.replace(0, 1)
            weekly_avg.name = avg_col_name_def # Name series for merging
            league_weekly_data_list.append(weekly_avg.reset_index().assign(season=season))


    final_season_averages[season] = season_stats

# Convert final averages dict to DataFrame for easier mapping
final_season_averages_df = pd.DataFrame.from_dict(final_season_averages, orient='index').reset_index().rename(columns={'index': 'season'})

# Combine all weekly league averages
league_weekly_df = pd.concat(league_weekly_data_list)
# Pivot to have stats as columns, indexed by season and week
league_weekly_pivot = league_weekly_df.pivot_table(index=['season', 'week'], values=league_weekly_df.columns.drop(['season', 'week']), aggfunc='mean') # Use mean in case of duplicates (shouldn't happen)

### Lagged Rolling 3-Week Average

In [13]:
rolling_avg_cols = {}
for col in league_weekly_pivot.columns:
    # Calculate rolling mean within each season, then shift
    rolling_avg_col_name = f'{col}_rolling_3wk_lag1'
    # Apply rolling calculation within each season group
    # Reset index to allow grouping on season, then set index back for potential joins
    temp_pivot = league_weekly_pivot.reset_index()
    temp_pivot[rolling_avg_col_name] = temp_pivot.groupby('season')[col].transform(
        lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
    )
    league_weekly_pivot = temp_pivot.set_index(['season', 'week']) # Set index back
    rolling_avg_cols[col] = rolling_avg_col_name # Store new column names

### Map Previous Season's Final Average

In [14]:
df['prev_season'] = df['season'] - 1
df = pd.merge(df, final_season_averages_df, left_on='prev_season', right_on='season', how='left', suffixes=('', '_prev_final'))
df.drop(columns=['season_prev_final', 'prev_season'], inplace=True) # Clean up merge columns

### Map Current Season's Lagged Rolling 3-Week Average

In [15]:
rolling_cols_to_merge = [col for col in league_weekly_pivot.columns if '_rolling_3wk_lag1' in col]
df = pd.merge(df, league_weekly_pivot[rolling_cols_to_merge], on=['season', 'week'], how='left')

### Create a hybrid League Average Column

In [16]:
print("\nCreating hybrid league average columns...")
FIRST_SEASON_FALLBACK = 'zero'
hybrid_league_avg_cols = []
first_season = df['season'].min()

for stat in base_stats_offense:
    base_name = f'league_avg_off_{stat}'
    prev_season_col = f'{base_name}_final'
    rolling_avg_col = f'{base_name}_rolling_3wk_lag1'
    hybrid_col = f'{base_name}_hybrid_lag1'
    hybrid_league_avg_cols.append(hybrid_col)

    if prev_season_col in df.columns and rolling_avg_col in df.columns:
        # --- Handle first season fallback ---
        # Option 1: Fill previous season NaN with 0
        if FIRST_SEASON_FALLBACK == 'zero':
            df[prev_season_col].fillna(0, inplace=True)
        # Option 2: Fill with the rolling average value that starts in week 4
        elif FIRST_SEASON_FALLBACK == 'use_rolling_avg_wk4':
             # Find the value used in week 4 for that season and stat
             wk4_values = df.loc[df['week'] == 4, ['season', rolling_avg_col]].set_index('season')
             # Map these week 4 values to the NaNs in the prev_season_col for the first season
             is_first_season = df['season'] == first_season
             df.loc[is_first_season, prev_season_col] = df.loc[is_first_season, 'season'].map(wk4_values[rolling_avg_col])
             # Still might have NaNs if wk4 value itself was NaN, fill remaining with 0
             df[prev_season_col].fillna(0, inplace=True)
        else: # Default to zero if option invalid
             df[prev_season_col].fillna(0, inplace=True)

        # Fill NaNs in rolling average (especially early weeks) - using forward fill within season? Or 0? Let's use 0 for now.
        # A better approach might be to backfill from week 4's value for weeks 1-3 if needed.
        df[rolling_avg_col] = df.groupby('season')[rolling_avg_col].ffill().bfill() # Fill within season
        df[rolling_avg_col].fillna(0, inplace=True) # Fill any remaining NaNs


        # Apply conditional logic
        df[hybrid_col] = np.where(
            df['week'] <= 3,
            df[prev_season_col],  # Use previous season's final average
            df[rolling_avg_col]   # Use current season's lagged rolling 3-week average
        )
    else:
        print(f"Warning: Missing columns for hybrid avg: {prev_season_col} or {rolling_avg_col}")
        df[hybrid_col] = 0 # Assign default if columns missing


for stat in base_stats_defense:
    base_name = f'league_avg_def_{stat}'
    prev_season_col = f'{base_name}_final'
    rolling_avg_col = f'{base_name}_rolling_3wk_lag1'
    hybrid_col = f'{base_name}_hybrid_lag1'
    hybrid_league_avg_cols.append(hybrid_col)

    if prev_season_col in df.columns and rolling_avg_col in df.columns:
        # Handle first season fallback
        if FIRST_SEASON_FALLBACK == 'zero':
            df[prev_season_col].fillna(0, inplace=True)
        elif FIRST_SEASON_FALLBACK == 'use_rolling_avg_wk4':
             wk4_values = df.loc[df['week'] == 4, ['season', rolling_avg_col]].set_index('season')
             is_first_season = df['season'] == first_season
             df.loc[is_first_season, prev_season_col] = df.loc[is_first_season, 'season'].map(wk4_values[rolling_avg_col])
             df[prev_season_col].fillna(0, inplace=True) # Fill remaining with 0
        else:
             df[prev_season_col].fillna(0, inplace=True)

        # Fill NaNs in rolling average
        df[rolling_avg_col] = df.groupby('season')[rolling_avg_col].ffill().bfill() # Fill within season
        df[rolling_avg_col].fillna(0, inplace=True)

        # Apply conditional logic
        df[hybrid_col] = np.where(
            df['week'] <= 3,
            df[prev_season_col],
            df[rolling_avg_col]
        )
    else:
        print(f"Warning: Missing columns for hybrid avg: {prev_season_col} or {rolling_avg_col}")
        df[hybrid_col] = 0


Creating hybrid league average columns...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[prev_season_col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[rolling_avg_col].fillna(0, inplace=True) # Fill any remaining NaNs
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

## Calculate Lagged Expanding Team Averages





In [17]:
print("\nCalculating lagged expanding team averages...")
# Create the team-centric view again
home_view = df[['id', 'season', 'week', 'home_team', 'away_team'] + home_offense_cols + home_defense_cols].copy()
away_view = df[['id', 'season', 'week', 'away_team', 'home_team'] + away_offense_cols + away_defense_cols].copy()
# ... (rest of team_centric_df creation and column renaming is identical to previous version) ...
home_view.rename(columns={'home_team': 'team', 'away_team': 'opponent'}, inplace=True)
home_view.columns = [col.replace('home_offense_', 'offense_') for col in home_view.columns]
home_view.columns = [col.replace('home_defense_', 'defense_') for col in home_view.columns]
home_view['is_home'] = 1
away_view.rename(columns={'away_team': 'team', 'home_team': 'opponent'}, inplace=True)
away_view.columns = [col.replace('away_offense_', 'offense_') for col in away_view.columns]
away_view.columns = [col.replace('away_defense_', 'defense_') for col in away_view.columns]
away_view['is_home'] = 0
team_centric_df = pd.concat([home_view, away_view], ignore_index=True)
team_centric_df.sort_values(by=['season', 'week', 'id', 'is_home'], inplace=True)

# Calculate lagged expanding means *within each team's season*
team_avg_cols = []
calculated_team_global_means = {} # For potential global mean filling if needed
for stat in base_stats_offense:
    col = f'offense_{stat}'
    if col in team_centric_df.columns:
        avg_col_name = f'avg_{col}_gained_exp_lag1'
        team_centric_df[avg_col_name] = team_centric_df.groupby(['season', 'team'])[col].transform(
            lambda x: x.expanding(min_periods=1).mean().shift(1)
        )
        team_avg_cols.append(avg_col_name)
        calculated_team_global_means[avg_col_name] = team_centric_df[avg_col_name].mean()


for stat in base_stats_defense:
    col = f'defense_{stat}'
    if col in team_centric_df.columns:
        avg_col_name = f'avg_{col}_allowed_exp_lag1'
        team_centric_df[avg_col_name] = team_centric_df.groupby(['season', 'team'])[col].transform(
            lambda x: x.expanding(min_periods=1).mean().shift(1)
        )
        team_avg_cols.append(avg_col_name)
        calculated_team_global_means[avg_col_name] = team_centric_df[avg_col_name].mean()

# Handle NaNs for team averages
if TEAM_NAN_HANDLING == 'zero':
    print("Filling initial team NaNs with 0.")
    for col in team_avg_cols:
         team_centric_df[col].fillna(0, inplace=True)
elif TEAM_NAN_HANDLING == 'global_mean':
     print("Filling initial team NaNs with global mean of that stat.")
     for col in team_avg_cols:
         fill_val = calculated_team_global_means.get(col, 0)
         team_centric_df[col].fillna(fill_val, inplace=True)


Calculating lagged expanding team averages...
Filling initial team NaNs with 0.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  team_centric_df[col].fillna(0, inplace=True)


### Merge Lagged Team Averages Back

In [18]:
cols_to_merge = ['id', 'season', 'week', 'team'] + team_avg_cols

# Prepare and Merge Home Perspective Averages
home_avg_slice = team_centric_df[team_centric_df['is_home'] == 1][cols_to_merge].copy()
# Define renaming dict for clarity
home_rename_dict = {col: f"{col}_home_perspective" for col in team_avg_cols}
home_avg_slice.rename(columns=home_rename_dict, inplace=True)

# Perform the merge
df = pd.merge(
    df,
    home_avg_slice,
    left_on=['id', 'season', 'week', 'home_team'],
    right_on=['id', 'season', 'week', 'team'],
    how='left'
)
#drop the redundant 'team' column from the right side of the merge
df.drop(columns=['team'], inplace=True, errors='ignore')

# Prepare and Merge Away Perspective Averages
away_avg_slice = team_centric_df[team_centric_df['is_home'] == 0][cols_to_merge].copy()
# Define renaming dict
away_rename_dict = {col: f"{col}_away_perspective" for col in team_avg_cols}
away_avg_slice.rename(columns=away_rename_dict, inplace=True)

# Perform the merge
df = pd.merge(
    df,
    away_avg_slice,
    left_on=['id', 'season', 'week', 'away_team'],
    right_on=['id', 'season', 'week', 'team'],
    how='left'
)
# Drop the redundant 'team' column
df.drop(columns=['team'], inplace=True, errors='ignore')


print(f"DataFrame shape after revised merging: {df.shape}")

DataFrame shape after revised merging: (9816, 298)


## Apply Opponent Adjustment

In [19]:
print("\nApplying Opponent Adjustments with Hybrid League Average...")
adj_prefix = 'adj_hybrid_' # New prefix for clarity

# Adjust Home Offense vs Away Defense
for stat in base_stats_offense:
    game_stat_col = f'home_offense_{stat}'
    opponent_avg_col = f'avg_defense_{stat}_allowed_exp_lag1_away_perspective' # Away team's lagged DEFENSE ALLOWED
    league_avg_col = f'league_avg_off_{stat}_hybrid_lag1' # Use the HYBRID league average
    adj_col_name = f'{adj_prefix}{game_stat_col}'

    if all(c in df.columns for c in [game_stat_col, opponent_avg_col, league_avg_col]):
        df[adj_col_name] = df[game_stat_col] - df[opponent_avg_col] + df[league_avg_col]
    # else: print(f"Skipping {adj_col_name}: Missing columns")

# Adjust Away Offense vs Home Defense
for stat in base_stats_offense:
    game_stat_col = f'away_offense_{stat}'
    opponent_avg_col = f'avg_defense_{stat}_allowed_exp_lag1_home_perspective' # Home team's lagged DEFENSE ALLOWED
    league_avg_col = f'league_avg_off_{stat}_hybrid_lag1' # Use the HYBRID league average
    adj_col_name = f'{adj_prefix}{game_stat_col}'

    if all(c in df.columns for c in [game_stat_col, opponent_avg_col, league_avg_col]):
      df[adj_col_name] = df[game_stat_col] - df[opponent_avg_col] + df[league_avg_col]
    # else: print(f"Skipping {adj_col_name}: Missing columns")

# Adjust Home Defense vs Away Offense
for stat in base_stats_defense:
    game_stat_col = f'home_defense_{stat}'
    opponent_avg_col = f'avg_offense_{stat}_gained_exp_lag1_away_perspective' # Away team's lagged OFFENSE GAINED
    league_avg_col = f'league_avg_def_{stat}_hybrid_lag1' # Use the HYBRID league average
    adj_col_name = f'{adj_prefix}{game_stat_col}'

    if all(c in df.columns for c in [game_stat_col, opponent_avg_col, league_avg_col]):
       df[adj_col_name] = df[game_stat_col] - df[opponent_avg_col] + df[league_avg_col]
    # else: print(f"Skipping {adj_col_name}: Missing columns")

# Adjust Away Defense vs Home Offense
for stat in base_stats_defense:
    game_stat_col = f'away_defense_{stat}'
    opponent_avg_col = f'avg_offense_{stat}_gained_exp_lag1_home_perspective' # Home team's lagged OFFENSE GAINED
    league_avg_col = f'league_avg_def_{stat}_hybrid_lag1' # Use the HYBRID league average
    adj_col_name = f'{adj_prefix}{game_stat_col}'

    if all(c in df.columns for c in [game_stat_col, opponent_avg_col, league_avg_col]):
      df[adj_col_name] = df[game_stat_col] - df[opponent_avg_col] + df[league_avg_col]
    # else: print(f"Skipping {adj_col_name}: Missing columns")

print("\nHybrid Opponent Adjustments Applied.")


Applying Opponent Adjustments with Hybrid League Average...

Hybrid Opponent Adjustments Applied.


## Display Results

In [20]:
print("\nShowing original and hybrid adjusted PPA columns (Weeks 2-5 of a sample season):")
sample_season = df['season'].unique()[1] if len(df['season'].unique()) > 1 else df['season'].min() # Pick second season if available
sample_df = df[(df['season'] == sample_season) & (df['week'] >= 2) & (df['week'] <= 5)]

adjusted_cols_preview = [
    'season', 'week', 'home_team', 'away_team',
    'home_offense_ppa', f'{adj_prefix}home_offense_ppa',
    'away_offense_ppa', f'{adj_prefix}away_offense_ppa',
    'home_defense_ppa', f'{adj_prefix}home_defense_ppa',
    'away_defense_ppa', f'{adj_prefix}away_defense_ppa',
    'league_avg_off_ppa_hybrid_lag1', # Show the hybrid value used
    'league_avg_def_ppa_hybrid_lag1'
]
adjusted_cols_preview = [col for col in adjusted_cols_preview if col in df.columns]
print(sample_df[adjusted_cols_preview].head(15))


print(f"\nFinal DataFrame shape with hybrid adjusted columns: {df.shape}")


Showing original and hybrid adjusted PPA columns (Weeks 2-5 of a sample season):
     season  week       home_team             away_team  home_offense_ppa  \
897    2014     2           UConn           Stony Brook         -0.119283   
898    2014     2  South Carolina         East Carolina          0.232510   
899    2014     2         Houston             Grambling          0.485744   
900    2014     2            UCLA               Memphis          0.211255   
901    2014     2     North Texas                   SMU          0.055086   
902    2014     2   South Florida              Maryland         -0.142242   
903    2014     2          Temple                  Navy          0.128995   
904    2014     2          Tulane          Georgia Tech          0.013387   
905    2014     2           Tulsa              Oklahoma         -0.003274   
906    2014     2  Boston College            Pittsburgh          0.035952   
907    2014     2         Clemson  South Carolina State          0.3224

In [21]:
df.to_csv('opponent_adjustments.csv')