In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

# Load and filter data
games = pd.read_csv('games_2022.csv').sort_values('game_date')
games = games[games['notD1_incomplete'] == False]

# --- Feature Engineering with Time Awareness ---
def temporal_features(df):
    # Cumulative team stats (avoid future leakage)
    df['cum_FGM2'] = df.groupby('team')['FGM_2'].cumsum()
    df['cum_FGM3'] = df.groupby('team')['FGM_3'].cumsum()
    df['cum_FGA2'] = df.groupby('team')['FGA_2'].cumsum()
    df['cum_FGA3'] = df.groupby('team')['FGA_3'].cumsum()
    
    # Rolling performance metrics
    df['last3_wins'] = df.groupby('team')['team_score'].transform(
        lambda x: x.rolling(3).apply(lambda s: (s > s.shift()).mean())
    )
    
    # Temporal efficiency calculations
    df['off_eff'] = (df['cum_FGM2'] + 1.5*df['cum_FGM3']) / (df['cum_FGA2'] + df['cum_FGA3'] + 1e-6)  # Avoid zero-division
    df['def_eff'] = df.groupby('team')['opponent_team_score'].expanding().mean().reset_index(level=0, drop=True)
    
    # Contextual features
    df['home_strength'] = df['home_away_NS'] * np.log1p(df['attendance'])/10
    df['rest_impact'] = np.clip(df['rest_days'], 0, 7)/7 - 0.5  # Centered rest days
    
    return df.fillna(0.5)  # Neutral fill for missing values

games = temporal_features(games)

In [None]:
features = [
    'home_strength',
    'rest_impact',
    'off_eff',
    'def_eff',
    'last3_wins'
]

X = games[features]
y = (games['team_score'] > games['opponent_team_score']).astype(int)

# Temporal cross-validation
tscv = TimeSeriesSplit(n_splits=5)

model = LogisticRegressionCV(
    cv=tscv,
    penalty='l2',
    solver='saga',
    scoring='neg_log_loss',
    max_iter=10000,
    n_jobs=-1
).fit(X, y)

# Probability calibration
calibrator = CalibratedClassifierCV(model, cv='prefit', method='isotonic')
calibrator.fit(X[:int(len(X)*0.8)], y[:int(len(X)*0.8)])  # Holdout 20% for calibration

# Prepare team state for prediction date (2022-03-15 assumed)
team_stats = games.groupby('team').last().reset_index()

def predict_playoff_game(row):
    """Per-game prediction with latest team state"""
    home = team_stats[team_stats['team'] == row['team_home']].iloc[0]
    away = team_stats[team_stats['team'] == row['team_away']].iloc[0]
    
    feature_vals = [
        row['home_away_NS'],
        (row['rest_days_Home']/7 - 0.5) - (row['rest_days_Away']/7 - 0.5),
        home['off_eff'],
        away['def_eff'],
        home['last3_wins'] - away['last3_wins']
    ]
    
    raw_prob = calibrator.predict_proba([feature_vals])[0][1]
    final_prob = np.clip(raw_prob, 0.15, 0.85)  # Conservative bounds
    return round(final_prob, 4)

# Final predictions
east_games = pd.read_csv('East Regional Games to predict.csv')
east_games['WINNING %'] = east_games.apply(predict_playoff_game, axis=1)
# east_games.to_csv('calibrated_predictions.csv', index=False)

In [3]:

# %% [markdown]
# ## 5. Validation & Sanity Checks

# Feature importance
print("Model Coefficients:")
print(pd.DataFrame({'feature': features, 'weight': model.coef_[0]}))

# Probability distribution check
print("\nPrediction Distribution:")
print(east_games['WINNING %'].describe())

# Sample outputs
print("\nExample Predictions:")
print(east_games[['game_id', 'team_home', 'team_away', 'WINNING %']])


Model Coefficients:
         feature    weight
0  home_strength  0.534121
1    rest_impact -0.087213
2        off_eff  9.720645
3        def_eff -0.095307
4     last3_wins  2.657278

Prediction Distribution:
count    10.000000
mean      0.537700
std       0.206325
min       0.236000
25%       0.346050
50%       0.562650
75%       0.690400
max       0.850000
Name: WINNING %, dtype: float64

Example Predictions:
     game_id                   team_home                 team_away  WINNING %
0   G_East_1           rhode_island_rams  north_carolina_tar_heels     0.3029
1   G_East_2           nc_state_wolfpack         rhode_island_rams     0.4755
2   G_East_3           nc_state_wolfpack  north_carolina_tar_heels     0.2360
3   G_East_4              liberty_flames            bucknell_bison     0.8500
4   G_East_5              drexel_dragons        delaware_blue_hens     0.3029
5   G_East_6   massachusetts_minutewomen          princeton_tigers     0.6559
6   G_East_7               buffalo_bulls