In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [90]:
folder_path = 'Desktop/nbaModel/archive (1)/stats'

dataframes = []

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# Combine all dataframes into a single dataframe
df = pd.concat(dataframes, ignore_index=True)

df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], format='mixed')

# Check the minimum and maximum dates
print("Minimum GAME_DATE:", df['GAME_DATE'].min())
print("Maximum GAME_DATE:", df['GAME_DATE'].max())

df['HOME/AWAY'] = df['MATCHUP'].apply(lambda x: 'Home' if 'vs.' in x else 'Away')
df['OPPONENT'] = df['MATCHUP'].apply(lambda x: x.split()[-1])

encoder = OneHotEncoder(sparse_output=False)
encoded_cols = pd.DataFrame(encoder.fit_transform(df[['HOME/AWAY', 'WL', 'OPPONENT']]), columns=encoder.get_feature_names_out())
df = pd.concat([df, encoded_cols], axis=1)

df.drop(columns=['MATCHUP', 'WL', 'HOME/AWAY', 'OPPONENT'], inplace=True)

df['REST_DAYS'] = df.groupby('Player_ID')['GAME_DATE'].diff().dt.days.fillna(0)

# Example of rolling averages: Points per game (over the last 3 games for each player)
rolling_columns = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'PTS', 'REB', 'AST']

window_size = 5
for col in rolling_columns:
    df[f'ROLLING_{col}'] = df.groupby('Player_ID')[col].rolling(window=window_size, min_periods=1).mean().reset_index(0, drop=True)


Minimum GAME_DATE: 2023-10-24 00:00:00
Maximum GAME_DATE: 2024-04-14 00:00:00


In [91]:
# Process the combined dataframe
print(df)
print(df.head())
print(df.info())

       SEASON_ID  Player_ID   Game_ID  GAME_DATE  MIN  FGM  FGA  FG_PCT  FG3M  \
0          22023     203991  22301178 2024-04-12   26    6    8   0.750     0   
1          22023     203991  22301159 2024-04-10   21    7   10   0.700     0   
2          22023     203991  22301147 2024-04-09   37    3    8   0.375     0   
3          22023     203991  22301130 2024-04-06   30    5    6   0.833     0   
4          22023     203991  22301124 2024-04-04   29    3    7   0.429     0   
...          ...        ...       ...        ...  ...  ...  ...     ...   ...   
26396      22023    1626167  22300001 2023-11-03   31    9   14   0.643     4   
26397      22023    1626167  22300118 2023-11-01   22    4   12   0.333     0   
26398      22023    1626167  22300102 2023-10-30   32    6   14   0.429     3   
26399      22023    1626167  22300091 2023-10-28   29    7   14   0.500     1   
26400      22023    1626167  22300064 2023-10-25   23    5    8   0.625     0   

       FG3A  ...  OPPONENT_

In [92]:
df.sort_values(['Player_ID', 'GAME_DATE'], inplace = True)
# Ensure df is not a slice
df = df.copy()

# Proceed with your operations
df['TARGET_PTS'] = df.groupby('Player_ID')['PTS'].shift(-1)


# Drop rows where 'TARGET_PTS' is NaN
df = df.dropna(subset=['TARGET_PTS']).reset_index(drop=True)


# Include rolling averages and other features
feature_columns = [
    'REST_DAYS',
    'ROLLING_MIN',
    'ROLLING_FGM',
    'ROLLING_FGA',
    'ROLLING_FG_PCT',
    'ROLLING_PTS',
    'ROLLING_REB',
    'ROLLING_AST',
    # Add one-hot encoded columns
] + list(encoded_cols.columns)

# Prepare the feature matrix X and target vector y
X = df[feature_columns].copy()
y = df['TARGET_PTS'].copy()

print(X.isnull().sum())

X.fillna(0)

# Convert 'GAME_DATE' to datetime if not already
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Sort by date
df.sort_values('GAME_DATE', inplace=True)

# Reset the index to ensure alignment
df.reset_index(drop=True, inplace=True)

# Reassign 'X' and 'y' after sorting
X = df[feature_columns].copy()
y = df['TARGET_PTS'].copy()

df = df.copy()

cutoff_date = '2024-01-01'  # Change to an earlier date if necessary

# Re-split the data
X_train = X[df['GAME_DATE'] < cutoff_date]
X_test = X[df['GAME_DATE'] >= cutoff_date]
y_train = y[df['GAME_DATE'] < cutoff_date]
y_test = y[df['GAME_DATE'] >= cutoff_date]


print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

print("Total NaNs in y:", y.isnull().sum())
print("Total NaNs in y_train:", y_train.isnull().sum())
print("Total NaNs in y_test:", y_test.isnull().sum())

REST_DAYS         0
ROLLING_MIN       0
ROLLING_FGM       0
ROLLING_FGA       0
ROLLING_FG_PCT    0
ROLLING_PTS       0
ROLLING_REB       0
ROLLING_AST       0
HOME/AWAY_Away    0
HOME/AWAY_Home    0
WL_L              0
WL_W              0
OPPONENT_ATL      0
OPPONENT_BKN      0
OPPONENT_BOS      0
OPPONENT_CHA      0
OPPONENT_CHI      0
OPPONENT_CLE      0
OPPONENT_DAL      0
OPPONENT_DEN      0
OPPONENT_DET      0
OPPONENT_GSW      0
OPPONENT_HOU      0
OPPONENT_IND      0
OPPONENT_LAC      0
OPPONENT_LAL      0
OPPONENT_MEM      0
OPPONENT_MIA      0
OPPONENT_MIL      0
OPPONENT_MIN      0
OPPONENT_NOP      0
OPPONENT_NYK      0
OPPONENT_OKC      0
OPPONENT_ORL      0
OPPONENT_PHI      0
OPPONENT_PHX      0
OPPONENT_POR      0
OPPONENT_SAC      0
OPPONENT_SAS      0
OPPONENT_TOR      0
OPPONENT_UTA      0
OPPONENT_WAS      0
dtype: int64
Shape of X: (25829, 42)
Shape of y: (25829,)
Shape of X_train: (10298, 42)
Shape of y_train: (10298,)
Shape of X_test: (15531, 42)
Shape of y_test:

In [93]:
# Initialize the model
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

Mean Absolute Error (MAE): 3.844450453930848
Root Mean Squared Error (RMSE): 5.123688883927297
R² Score: 0.6735034859411158
