In [None]:
import pandas as pd
import numpy as np

# Basketball-specific constants
FOUR_FACTOR_WEIGHTS = {
    'shooting': 0.35,  # Reduced from 0.4
    'turnovers': 0.3,   # Increased from 0.25
    'rebounding': 0.25, # Increased from 0.2
    'free_throws': 0.1  # Reduced from 0.15
}

# Load data with absolute paths to prevent leaks
game_data = pd.read_csv('games_2022.csv')
teams_regions = pd.read_csv('Team Region Groups.csv')

# %%
def calculate_possessions(group):
    """Calculate possessions using NCAA formula"""
    group['possessions'] = group['FGA_2'] + group['FGA_3'] + 0.44 * group['FTA'] - group['OREB'] + group['TOV_team']
    return group

# Process games with True Possessions
games = game_data.groupby('game_id', group_keys=False).apply(calculate_possessions)

# %%
def team_efficiency_metrics(df):
    """Calculate season-long Four Factors metrics"""
    agg_stats = df.groupby('team').agg({
        'FGM_2': 'sum',
        'FGA_2': 'sum',
        'FGM_3': 'sum', 
        'FGA_3': 'sum',
        'OREB': 'sum',
        'DREB': 'sum',
        'TOV_team': 'mean',
        'FTA': 'sum',
        'team_score': 'mean',
        'possessions': 'sum'
    })
    
    # Four Factors Calculations
    agg_stats['eFG%'] = (agg_stats['FGM_2'] + 1.5*agg_stats['FGM_3']) / (agg_stats['FGA_2'] + agg_stats['FGA_3'])
    agg_stats['TOV%'] = agg_stats['TOV_team'] / agg_stats['possessions']
    agg_stats['ORB%'] = agg_stats['OREB'] / (agg_stats['OREB'] + agg_stats['DREB'])
    agg_stats['FTRate'] = agg_stats['FTA'] / (agg_stats['FGA_2'] + agg_stats['FGA_3'])
    
    return agg_stats

team_stats = team_efficiency_metrics(games)

# Composite Four Factors Score
team_stats['Composite'] = (
    FOUR_FACTOR_WEIGHTS['shooting'] * team_stats['eFG%'] +
    FOUR_FACTOR_WEIGHTS['turnovers'] * (1 - team_stats['TOV%']) +
    FOUR_FACTOR_WEIGHTS['rebounding'] * team_stats['ORB%'] +
    FOUR_FACTOR_WEIGHTS['free_throws'] * team_stats['FTRate']
)

# Merge with regions
team_rankings = pd.merge(
    team_stats.reset_index(),
    teams_regions,
    on='team', how='inner'
)

# Generate regional rankings
for region in ['North', 'South', 'West']:
    region_teams = team_rankings[team_rankings['region'] == region] \
        .sort_values('Composite', ascending=False) \
        .head(16) \
        [['team', 'Composite']]
    
    print(f"\n--- {region} Region Rankings ---")
    print(region_teams.round(3))


--- North Region Rankings ---
                             team  Composite
49      jackson_state_lady_tigers      0.592
108    stephen_f_austin_ladyjacks      0.590
112  tennessee_tech_golden_eagles      0.589
99       south_carolina_gamecocks      0.585
34        georgia_southern_eagles      0.584
11                 belmont_bruins      0.582
3            alabama_crimson_tide      0.581
30           freed_hardeman_lions      0.580
89                ole_miss_rebels      0.578
94                      rice_owls      0.577
105     southern_miss_lady_eagles      0.577
28      florida_gulf_coast_eagles      0.577
125                   ucf_knights      0.575
63                     lsu_tigers      0.575
32   gardner_webb_runnin_bulldogs      0.574
119             tulane_green_wave      0.574

--- South Region Rankings ---
                              team  Composite
23              depaul_blue_demons      0.592
134  western_kentucky_lady_toppers      0.591
70             michigan_wolverines 

  games = game_data.groupby('game_id', group_keys=False).apply(calculate_possessions)


In [2]:

# %% [markdown]
# ## Phase 1b: Four Factors Predictions

# %%
east_matchups = pd.read_csv('East Regional Games to predict.csv')

def matchup_predictor(row):
    """Calculate winning probability using Four Factors comparison"""
    home_team = team_stats.loc[row['team_home']]
    away_team = team_stats.loc[row['team_away']]
    
    # Four Factors Differential
    shooting_diff = home_team['eFG%'] - away_team['eFG%']
    tov_diff = away_team['TOV%'] - home_team['TOV%']  # Higher opponent TOV% is better
    orb_diff = home_team['ORB%'] - away_team['ORB%']
    ft_diff = home_team['FTRate'] - away_team['FTRate']
    
    # Weighted Advantage Score
    raw_score = (
        FOUR_FACTOR_WEIGHTS['shooting'] * shooting_diff +
        FOUR_FACTOR_WEIGHTS['turnovers'] * tov_diff +
        FOUR_FACTOR_WEIGHTS['rebounding'] * orb_diff +
        FOUR_FACTOR_WEIGHTS['free_throws'] * ft_diff
    )
    
    # Replace logistic scaling
    base_prob = 0.5 + (raw_score * 0.8)  # Tuned multiplier
    base_prob = np.clip(base_prob, 0.2, 0.8)
    
    # Contextual Adjustments
    rest_adj = 0.03 * (row['rest_days_Home'] - row['rest_days_Away'])
    travel_adj = -0.02 * (row['travel_dist_Away'] / 500)
    neutral_adj = 0.05 if row['home_away_NS'] == 0 else 0
    
    adj_score = base_prob + rest_adj + travel_adj + neutral_adj
    
    # Logistic conversion to probability
    # return round(1 / (1 + np.exp(-adj_score*10)), 4)  # Scaled for proper probability distribution
    return adj_score

# Generate predictions
east_matchups['WINNING %'] = east_matchups.apply(matchup_predictor, axis=1)
# east_matchups['WINNING %'] = east_matchups['WINNING %'].clip(0.01, 0.99)

# Validate and export
print("\nFinal Predictions:")
print(east_matchups[['game_id', 'team_home', 'team_away', 'WINNING %']])
# east_matchups.to_csv('FourFactors_predictions.csv', index=False)

# %% [markdown]
# ## Validation Suite


Final Predictions:
     game_id                   team_home                 team_away  WINNING %
0   G_East_1           rhode_island_rams  north_carolina_tar_heels   0.663934
1   G_East_2           nc_state_wolfpack         rhode_island_rams   0.737499
2   G_East_3           nc_state_wolfpack  north_carolina_tar_heels   0.701433
3   G_East_4              liberty_flames            bucknell_bison   0.477245
4   G_East_5              drexel_dragons        delaware_blue_hens   0.576590
5   G_East_6   massachusetts_minutewomen          princeton_tigers   0.615532
6   G_East_7               buffalo_bulls     stony_brook_seawolves   0.420689
7   G_East_8             fairfield_stags             towson_tigers   0.467475
8   G_East_9               uconn_huskies  campbell_fighting_camels   0.443326
9  G_East_10  american_university_eagles            columbia_lions   0.498138


In [3]:

# %%
# Backtest on Regular Season
# %% [markdown]
# ## Validation Suite (Fixed Backtest)

def get_opponent_stats(game_row, full_data):
    """Retrieve opponent's stats from the same game_id"""
    game_id = game_row.name  # Using index with game_id
    game_group = full_data[full_data['game_id'] == game_id]
    return game_group[game_group['team'] != game_row['team']].iloc[0]

# Sample complete games (both opponents)
unique_games = games['game_id'].unique()
sampled_game_ids = np.random.choice(unique_games, 500, replace=False)
sample_games = games[games['game_id'].isin(sampled_game_ids)]

# Create paired records
predictions = []
for game_id, game_group in sample_games.groupby('game_id'):
    home_teams = game_group[game_group['home_away_NS'] == 1]
    away_teams = game_group[game_group['home_away_NS'] == -1]
    
    # Skip if we don't have both home and away teams
    if home_teams.empty or away_teams.empty:
        print(f"Skipping {game_id} - missing home or away team")
        continue
        
    home_team = home_teams.iloc[0]
    away_team = away_teams.iloc[0]
    
    try:
        pred = matchup_predictor(pd.Series({
            'team_home': home_team['team'],
            'team_away': away_team['team'],
            'rest_days_Home': home_team['rest_days'],
            'rest_days_Away': away_team['rest_days'],
            'travel_dist_Away': away_team['travel_dist'],
            'home_away_NS': 1  # Proper home/away context
        }))
        
        actual_winner_home = home_team['team_score'] > away_team['team_score']
        predictions.append({
            'pred': pred,
            'actual': actual_winner_home
        })
        print(f"Processed {game_id}: Home - {home_team['team']} vs Away - {away_team['team']}")
    except Exception as e:
        print(f"Skipping {game_id} due to error: {str(e)}")

# Calculate metrics
pred_df = pd.DataFrame(predictions)
# Fix the comparison by using parentheses to properly group operations
print(f"\nValidation Accuracy: {((pred_df['pred'] > 0.5) == pred_df['actual']).mean():.2%}")
print(f"Brier Score: {((pred_df['pred'] - pred_df['actual'].astype(float))**2).mean():.3f}")


Processed game_2022_1001: Home - creighton_bluejays vs Away - northern_iowa_panthers
Skipping game_2022_1005 - missing home or away team
Processed game_2022_1035: Home - stony_brook_seawolves vs Away - st_francis_brooklyn_terriers
Processed game_2022_1047: Home - oral_roberts_golden_eagles vs Away - little_rock_trojans
Skipping game_2022_1061 - missing home or away team
Skipping game_2022_1069 - missing home or away team
Processed game_2022_1076: Home - providence_friars vs Away - monmouth_hawks
Processed game_2022_1085: Home - alabama_state_lady_hornets vs Away - south_alabama_jaguars
Processed game_2022_1103: Home - ohio_bobcats vs Away - florida_am_rattlers
Processed game_2022_1139: Home - ualbany_great_danes vs Away - bryant_bulldogs
Processed game_2022_1147: Home - quinnipiac_bobcats vs Away - yale_bulldogs
Processed game_2022_1148: Home - buffalo_bulls vs Away - niagara_purple_eagles
Processed game_2022_1149: Home - presbyterian_blue_hose vs Away - unc_greensboro_spartans
Process