In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

In [None]:
# Generate sample in-game data (replace with your actual play-by-play data)
def load_ingame_data(n_samples=5000, seed=42):
    """Simulate a table of in-game snapshots for exercising the pipeline.

    Every column is drawn from a seeded NumPy ``Generator`` so repeated calls
    (and a fresh kernel "Run All") produce the same frame.

    Args:
        n_samples: Number of simulated snapshots (rows) to generate.
        seed: Seed for the random generator. Pass ``None`` to get a different
            sample on every call.

    Returns:
        DataFrame with the raw columns ``score_diff``, ``seconds_remaining``,
        ``home_elo``, ``away_elo``, ``quarter``, ``home_possession``,
        ``home_rest``, ``away_rest`` and the binary target ``home_team_win``.

    Note:
        The target is drawn independently of every feature, so a model fitted
        on this placeholder data cannot be expected to beat chance.
    """
    rng = np.random.default_rng(seed)

    game_seconds = 2880  # 48 min = 2880 sec
    quarter_seconds = game_seconds // 4

    # Seconds left in game, uniform over the whole game.
    seconds_remaining = rng.uniform(0, game_seconds, n_samples)
    # Derive the quarter from the clock so the two columns can never
    # contradict each other (e.g. "quarter 1" with 100 seconds left).
    quarter = (4 - np.floor(seconds_remaining / quarter_seconds)).astype(int).clip(1, 4)

    data = pd.DataFrame({
        'score_diff': rng.normal(0, 10, n_samples),  # Current score difference (home - away)
        'seconds_remaining': seconds_remaining,
        'home_elo': rng.normal(1500, 200, n_samples),  # Home team strength
        'away_elo': rng.normal(1500, 200, n_samples),  # Away team strength
        'quarter': quarter,  # Current quarter, consistent with seconds_remaining
        'home_possession': rng.integers(0, 2, n_samples),  # 1 if home has ball
        'home_rest': rng.integers(0, 5, n_samples),  # Days rest
        'away_rest': rng.integers(0, 5, n_samples),  # Days rest
        # Target: 1 if home team wins, 0 if away team wins
        'home_team_win': rng.integers(0, 2, n_samples),
    })
    return data

In [None]:
# Feature engineering
def engineer_features(df):
    """Add derived columns to a raw game-state frame and list the model inputs.

    The input frame is left untouched; the derived columns are added to a copy.

    Args:
        df: DataFrame containing at least ``home_elo``, ``away_elo``,
            ``seconds_remaining``, ``score_diff`` and ``quarter``.

    Returns:
        A ``(df_out, features)`` tuple: ``df_out`` is a copy of ``df`` with the
        columns ``elo_diff``, ``time_remaining_pct``, ``points_per_second`` and
        ``is_endgame`` added; ``features`` is the list of the twelve column
        names used as model inputs, in model-input order.
    """
    regulation_seconds = 2880  # 48 min = 2880 sec
    endgame_seconds = 300      # "endgame" = fewer than 300 seconds left in Q4

    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Add useful derived features
    df['elo_diff'] = df['home_elo'] - df['away_elo']
    df['time_remaining_pct'] = df['seconds_remaining'] / regulation_seconds

    # Clip elapsed time to at least one second to avoid dividing by zero at tip-off.
    elapsed_seconds = (regulation_seconds - df['seconds_remaining']).clip(lower=1)
    df['points_per_second'] = df['score_diff'] / elapsed_seconds

    # Parenthesise the whole conjunction before casting: attribute access binds
    # tighter than `&`, so without the outer parentheses only the quarter test
    # is cast and the flag ends up boolean instead of 0/1.
    in_final_stretch = df['seconds_remaining'] < endgame_seconds
    in_fourth_quarter = df['quarter'] == 4
    df['is_endgame'] = (in_final_stretch & in_fourth_quarter).astype(int)

    features = [
        'score_diff',
        'seconds_remaining',
        'home_elo',
        'away_elo',
        'quarter',
        'home_possession',
        'home_rest',
        'away_rest',
        'elo_diff',
        'time_remaining_pct',
        'points_per_second',
        'is_endgame'
    ]

    return df, features

In [None]:
def prepare_data(df, features):
    """Separate model inputs from the win label and split them for training.

    Args:
        df: DataFrame holding every column named in ``features`` plus the
            ``home_team_win`` label column.
        features: Names of the columns to use as model inputs.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from a split that holds out 20%
        of the rows, is stratified on the label, and uses a fixed random state
        of 42.
    """
    predictors = df[features]
    target = df['home_team_win']

    # Stratify on the label so both partitions keep the same win/loss mix.
    partitions = train_test_split(
        predictors,
        target,
        stratify=target,
        random_state=42,
        test_size=0.2,
    )
    X_train, X_test, y_train, y_test = partitions

    return X_train, X_test, y_train, y_test

In [None]:
def train_catboost_model(X_train, X_test, y_train, y_test):
    """Fit a CatBoost classifier on the training split and return it.

    ``quarter``, ``home_possession`` and ``is_endgame`` are declared as
    categorical features. The pool built from ``X_test``/``y_test`` is passed
    as the eval set with ``early_stopping_rounds=50`` and
    ``use_best_model=True``.

    NOTE(review): because that eval pool steers early stopping and
    best-iteration selection, metrics later computed on the same
    ``X_test``/``y_test`` are not an untouched hold-out estimate.

    Args:
        X_train: Training feature frame.
        X_test: Feature frame used as the eval set during fitting.
        y_train: Training labels.
        y_test: Labels for the eval set.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    # Columns CatBoost should treat as categorical rather than numeric.
    categorical_columns = ['quarter', 'home_possession', 'is_endgame']

    # Wrap both splits in CatBoost pools that carry the categorical declaration.
    fit_pool = Pool(X_train, y_train, cat_features=categorical_columns)
    holdout_pool = Pool(X_test, y_test, cat_features=categorical_columns)

    # Hyperparameters gathered in one place; add task_type="GPU" here to
    # request GPU training.
    hyperparams = dict(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=42,
        verbose=100,
        use_best_model=True,
    )
    model = CatBoostClassifier(**hyperparams)

    # Fit against the training pool while monitoring the eval pool.
    model.fit(fit_pool, eval_set=holdout_pool, early_stopping_rounds=50)

    return model

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy and log loss for ``model`` on the given split.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature frame to score.
        y_test: True binary labels for ``X_test``.

    Returns:
        The predicted probabilities taken from column 1 of ``predict_proba``.
    """
    # Column 1 of predict_proba holds the positive-class probability.
    home_win_proba = model.predict_proba(X_test)[:, 1]
    # Hard labels: call a home win only when the probability exceeds 0.5.
    hard_calls = (home_win_proba > 0.5).astype(int)

    # Compute both metrics before printing anything.
    accuracy, logloss = (
        accuracy_score(y_test, hard_calls),
        log_loss(y_test, home_win_proba),
    )

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Log Loss: {logloss:.4f}")

    return home_win_proba

In [None]:
def predict_live_game(model, game_state):
    """Return the model's positive-class probability for one game state.

    Args:
        model: Fitted classifier exposing ``predict_proba``. If it also exposes
            ``feature_names_`` (CatBoost) or ``feature_names_in_``
            (scikit-learn), the input columns are arranged in that order.
        game_state: Mapping of feature name to value for a single snapshot.

    Returns:
        The probability from column 1 of ``predict_proba`` for the single row.

    Raises:
        ValueError: If the model records its feature names and ``game_state``
            lacks one of them.
    """
    # Convert game state to a one-row DataFrame.
    game_df = pd.DataFrame([game_state])

    # A dict's key order is arbitrary from the model's point of view, so line
    # the columns up with the order the model was trained on when it is known.
    expected = getattr(model, 'feature_names_', None)
    if expected is None:
        expected = getattr(model, 'feature_names_in_', None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in game_df.columns]
        if missing:
            raise ValueError(
                f"game_state is missing features required by the model: {missing}"
            )
        game_df = game_df[expected]

    return model.predict_proba(game_df)[:, 1][0]

In [None]:
# Load and process data
df = load_ingame_data()
df, features = engineer_features(df)
X_train, X_test, y_train, y_test = prepare_data(df, features)

# Train model
model = train_catboost_model(X_train, X_test, y_train, y_test)

# Evaluate
predictions = evaluate_model(model, X_test, y_test)

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': model.get_feature_importance()
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Example live prediction.
# Only the raw game state is written by hand; the derived columns come from
# engineer_features so the live input is built exactly like the training rows
# (same formulas, same dtypes, same column order).
raw_game_state = {
    'score_diff': 5,  # Home team up by 5
    'seconds_remaining': 600,  # 10 minutes left
    'home_elo': 1600,
    'away_elo': 1550,
    'quarter': 4,
    'home_possession': 1,
    'home_rest': 2,
    'away_rest': 1,
}
sample_df, _ = engineer_features(pd.DataFrame([raw_game_state]))
# to_dict('records') keeps each column's own type (ints stay ints), unlike
# pulling the row out as a single Series.
sample_game_state = sample_df[features].to_dict('records')[0]

win_prob = predict_live_game(model, sample_game_state)
print(f"\nHome team win probability: {win_prob:.4f}")