In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Read the games table from a CSV file into a DataFrame
file_path = 'games.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Keep regular-season games only. The explicit .copy() yields an independent frame, so the
# later column assignment (data['gametime'] = ...) does not trigger SettingWithCopyWarning
# on a filtered slice.
data = data[data['game_type'] == 'REG'].copy()

In [None]:
# Convert game time from an "hours:minutes" string to decimal hours (e.g. "13:30" -> 13.5)

def convert_gametime(gametime_str):
  """Convert an "hours:minutes" game-time string to decimal hours.

  Args:
    gametime_str: Time of day written as two colon-separated integers,
      e.g. "13:30".

  Returns:
    hours + minutes / 60 as a float (e.g. 13.5), or None when the value is
    not a string of exactly two colon-separated integers (None, NaN/float,
    "13:30:00", "noon", ...).
  """
  try:
    hours, minutes = map(int, gametime_str.split(':'))
  except (AttributeError, TypeError, ValueError):
    # AttributeError: value has no .split (None, NaN, already-numeric input).
    # ValueError: wrong number of parts, or a part that is not an integer.
    # Only these input errors map to None; anything else (e.g.
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    return None
  return hours + minutes / 60


# Convert only while the column is still non-numeric. Re-running this cell on the
# already-converted floats would send every value down convert_gametime's error path
# (floats have no .split) and replace the whole column with None.
if not pd.api.types.is_numeric_dtype(data['gametime']):
    data['gametime'] = data['gametime'].apply(convert_gametime)

In [None]:
# Input columns taken from `data`, and the regression target:
# the combined home and away score of each game
features = [
    'week', 'weekday', 'gametime',
    'away_team', 'home_team', 'div_game',
    'roof', 'surface', 'temp', 'wind',
]
target = data['home_score'].add(data['away_score'])

In [None]:
# Expand the categorical columns into indicator columns; drop_first=True omits the
# first level of each encoded column. Columns not listed here are passed through unchanged.
data_encoded = pd.get_dummies(
    data[features],
    columns=['weekday', 'away_team', 'home_team', 'div_game', 'roof', 'surface'],
    drop_first=True,
)

In [None]:
# Train-test split
# Rows with a missing home or away score have a NaN target and must not reach the model.
# Dropping them from `data` alone is not enough: `data_encoded` and `target` were built
# in earlier cells and still contain those rows, so both are filtered here with the
# same non-null mask before splitting.
data = data.dropna(subset=['home_score', 'away_score'])
has_score = target.notna()
X_train, X_test, y_train, y_test = train_test_split(
    data_encoded.loc[has_score],
    target.loc[has_score],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap both splits as LightGBM Dataset objects; the evaluation set is created with
# reference=train_data so that it is tied to the training Dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [None]:
# LightGBM hyperparameters
params = dict(
    objective='regression',
    metric='mae',  # Mean Absolute Error
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
)

# Boost for at most 1000 rounds, stopping early once the validation metric has not
# improved for 50 consecutive rounds; the metric is logged every 50 rounds.
# NOTE(review): the validation set used for early stopping is `test_data`, i.e. the
# same rows the final MAE is computed on, so that MAE is likely optimistic.
# Consider carving a separate validation split out of the training data.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[test_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.log_evaluation(period=50, show_stdv=False),
    ],
)

[50]	valid_0's l1: 11.2465


In [None]:
# Predict on the held-out features. `y_pred` is not assigned in any other visible cell,
# so without this line the checks below fail with a NameError on a fresh kernel.
# With num_iteration left unset, Booster.predict uses the best iteration found by
# early stopping when one exists.
y_pred = model.predict(X_test)

print(y_test.isna().sum())  # Check for NaNs in true values
print(pd.isna(y_pred).sum())  # Check for NaNs in predictions


32
0


In [None]:
# Keep only the positions where neither the true value nor the prediction is missing
non_nan_indices = ~(pd.isna(y_test) | pd.isna(y_pred))
y_test_clean = y_test[non_nan_indices]
y_pred_clean = y_pred[non_nan_indices]

# Mean absolute error over the remaining (true, predicted) pairs
mae = mean_absolute_error(y_true=y_test_clean, y_pred=y_pred_clean)
print(f"Mean Absolute Error: {mae}")


Mean Absolute Error: 10.966525379319727
