In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib

In [3]:
folder_path = 'archive (1)/stats'

dataframes = []

for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        dataframes.append(df)

# Combine all dataframes into a single dataframe
df = pd.concat(dataframes, ignore_index=True)

df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], format='mixed')

# Check the minimum and maximum dates
print("Minimum GAME_DATE:", df['GAME_DATE'].min())
print("Maximum GAME_DATE:", df['GAME_DATE'].max())






# Example of rolling averages: Points per game (over the last 3 games for each player)
rolling_columns = ['MIN', 'FGM', 'FGA', 'FG_PCT', 'PTS', 'REB', 'AST']

window_size = 5
for col in rolling_columns:
    df[f'ROLLING_{col}'] = df.groupby('Player_ID')[col].rolling(window=window_size, min_periods=1).mean().reset_index(0, drop=True)


Minimum GAME_DATE: 2023-10-24 00:00:00
Maximum GAME_DATE: 2024-04-14 00:00:00


In [4]:
# Process the combined dataframe
print(df)
print(df.head())
print(df.info())

       SEASON_ID  Player_ID   Game_ID  GAME_DATE      MATCHUP WL  MIN  FGM  \
0          22023     203991  22301178 2024-04-12    ATL @ MIN  L   26    6   
1          22023     203991  22301159 2024-04-10  ATL vs. CHA  L   21    7   
2          22023     203991  22301147 2024-04-09  ATL vs. MIA  L   37    3   
3          22023     203991  22301130 2024-04-06    ATL @ DEN  L   30    5   
4          22023     203991  22301124 2024-04-04    ATL @ DAL  L   29    3   
...          ...        ...       ...        ...          ... ..  ...  ...   
26396      22023    1626167  22300001 2023-11-03  IND vs. CLE  W   31    9   
26397      22023    1626167  22300118 2023-11-01    IND @ BOS  L   22    4   
26398      22023    1626167  22300102 2023-10-30  IND vs. CHI  L   32    6   
26399      22023    1626167  22300091 2023-10-28    IND @ CLE  W   29    7   
26400      22023    1626167  22300064 2023-10-25  IND vs. WAS  W   23    5   

       FGA  FG_PCT  ...  PTS  PLUS_MINUS  VIDEO_AVAILABLE  ROLL

In [12]:
df.sort_values(['Player_ID', 'GAME_DATE'], inplace = True)
# Ensure df is not a slice
df = df.copy()

# Proceed with your operations
df['TARGET_PTS'] = df.groupby('Player_ID')['PTS'].shift(-1)


# Drop rows where 'TARGET_PTS' is NaN
df = df.dropna(subset=['TARGET_PTS']).reset_index(drop=True)


# Include rolling averages and other features
feature_columns = [
    
    'ROLLING_MIN',
    'ROLLING_FGM',
    'ROLLING_FGA',
    'ROLLING_FG_PCT',
    'ROLLING_PTS',
    'ROLLING_REB',
    'ROLLING_AST',
    # Add one-hot encoded columns
] 

# Prepare the feature matrix X and target vector y
X = df[feature_columns].copy()
y = df['TARGET_PTS'].copy()

print(X.isnull().sum())

X.fillna(0)

# Convert 'GAME_DATE' to datetime if not already
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Sort by date
df.sort_values('GAME_DATE', inplace=True)

# Reset the index to ensure alignment
df.reset_index(drop=True, inplace=True)

# Reassign 'X' and 'y' after sorting
X = df[feature_columns].copy()
y = df['TARGET_PTS'].copy()

df = df.copy()

cutoff_date = '2024-01-01'  # Change to an earlier date if necessary

# Re-split the data
X_train = X[df['GAME_DATE'] < cutoff_date]
X_test = X[df['GAME_DATE'] >= cutoff_date]
y_train = y[df['GAME_DATE'] < cutoff_date]
y_test = y[df['GAME_DATE'] >= cutoff_date]


print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

print("Total NaNs in y:", y.isnull().sum())
print("Total NaNs in y_train:", y_train.isnull().sum())
print("Total NaNs in y_test:", y_test.isnull().sum())

ROLLING_MIN       0
ROLLING_FGM       0
ROLLING_FGA       0
ROLLING_FG_PCT    0
ROLLING_PTS       0
ROLLING_REB       0
ROLLING_AST       0
dtype: int64
Shape of X: (23087, 7)
Shape of y: (23087,)
Shape of X_train: (10185, 7)
Shape of y_train: (10185,)
Shape of X_test: (12902, 7)
Shape of y_test: (12902,)
Total NaNs in y: 0
Total NaNs in y_train: 0
Total NaNs in y_test: 0


In [13]:
# Initialize the model
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# After preparing your training data
model_feature_names = X.columns.tolist()

# Save the model feature names
joblib.dump(model_feature_names, 'model_feature_names.joblib')


# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

Mean Absolute Error (MAE): 3.9384592926407374
Root Mean Squared Error (RMSE): 5.225134437541815
R² Score: 0.6604039112156335


In [14]:
joblib.dump(model, 'nbaPerformanceModel.joblib')

['nbaPerformanceModel.joblib']