In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib  # For saving the model

### Load DATA

In [4]:
# Load the data
csv_file_path = os.path.join('..', 'Data', 'csv_file', 'combined_data', 'train_data.csv')
data = pd.read_csv(csv_file_path)

In [9]:
features = [
    'MIN', 'PTS_LAST_5', 'PLAYER_HOME_AVG_PTS', 'PLAYER_AWAY_AVG_PTS', 'USG_PCT', 'PER',
    'TS_PCT', 'USG_PCT_LAST_5', 'USG_DRTG_INTERACTION', 'OFF_RATING',
    'TEAM_OFF_RATING', 'TEAM_PACE', 'TEAM_PTS', 'TEAM_AST', 'TEAM_FGA',
    'OPP_DEF_RATING', 'OPP_PACE', 'GAME_PACE'
]

X = data[features]
y = data['PTS']

# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.0,random_state=42)

xgb_model = XGBRegressor()
xgb_model.fit(X,y)



In [10]:
from sklearn.model_selection import cross_val_score

# Evaluate the model using cross-validation
cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)

print(f"Cross-validated RMSE: {rmse_scores.mean():.4f} ± {rmse_scores.std():.4f}")

Cross-validated RMSE: 1.4204 ± 0.0374


In [21]:
csv_file_path = os.path.join('..', 'Data', 'csv_file', '2024', 'players_df_2024.csv')
data_2024 = pd.read_csv(csv_file_path)
df = data[features]
df.corr()



Unnamed: 0,MIN,PTS_LAST_5,PLAYER_HOME_AVG_PTS,PLAYER_AWAY_AVG_PTS,USG_PCT,PER,TS_PCT,USG_PCT_LAST_5,USG_DRTG_INTERACTION,OFF_RATING,TEAM_OFF_RATING,TEAM_PACE,TEAM_PTS,TEAM_AST,TEAM_FGA,OPP_DEF_RATING,OPP_PACE,GAME_PACE
MIN,1.0,0.745403,0.694897,0.68618,0.290175,0.28899,0.28421,0.387442,0.28102,0.283692,-0.015665,0.01372,-0.008717,-0.030753,0.01558,-0.015665,0.015007,0.014886
PTS_LAST_5,0.745403,1.0,0.917888,0.917056,0.534746,0.460256,0.251758,0.72436,0.533361,0.226633,0.042013,0.022461,0.049989,0.013294,0.011838,0.042013,0.026094,0.025161
PLAYER_HOME_AVG_PTS,0.694897,0.917888,1.0,0.930224,0.516302,0.412199,0.188819,0.704446,0.513204,0.202082,0.036429,0.016924,0.042066,0.014437,0.002312,0.036429,0.022185,0.020267
PLAYER_AWAY_AVG_PTS,0.68618,0.917056,0.930224,1.0,0.515843,0.415323,0.189715,0.704875,0.514689,0.196384,0.04245,0.015171,0.047101,0.015559,0.002696,0.04245,0.020441,0.018452
USG_PCT,0.290175,0.534746,0.516302,0.515843,1.0,0.594225,0.192334,0.735046,0.972037,0.089445,-0.015726,0.006499,-0.011377,0.004734,0.006584,-0.015726,-0.000679,0.003019
PER,0.28899,0.460256,0.412199,0.415323,0.594225,1.0,0.574471,0.468669,0.614733,0.344452,0.136173,0.049327,0.151145,0.130294,0.066889,0.136173,0.062683,0.05804
TS_PCT,0.28421,0.251758,0.188819,0.189715,0.192334,0.574471,1.0,0.124616,0.225489,0.401502,0.168786,6.6e-05,0.158126,0.116322,-0.044633,0.168786,0.023218,0.012057
USG_PCT_LAST_5,0.387442,0.72436,0.704446,0.704875,0.735046,0.468669,0.124616,1.0,0.714903,0.100051,-0.007976,0.009716,-0.002942,0.004187,0.000223,-0.007976,0.007353,0.008846
USG_DRTG_INTERACTION,0.28102,0.533361,0.513204,0.514689,0.972037,0.614733,0.225489,0.714903,1.0,0.173476,0.19585,-0.016555,0.175636,0.122929,0.015238,0.19585,0.002137,-0.007478
OFF_RATING,0.283692,0.226633,0.202082,0.196384,0.089445,0.344452,0.401502,0.100051,0.173476,1.0,0.386908,-0.007585,0.358954,0.218224,0.055563,0.386908,0.010786,0.001652


In [22]:
predictions_2024 = xgb_model.predict(df)

# Assuming 'PTS' is the actual column in data_2024
if 'PTS' in data_2024.columns:
    y_2024 = data_2024['PTS']
    
    # Calculate the error
    error_2024 = y_2024 - predictions_2024

    # Create a DataFrame with actual, predicted, and error
    results_df = pd.DataFrame({
        'Actual': y_2024,
        'Predicted': predictions_2024,
        'Error': error_2024
    })

    # Print the first few rows of the results
    print(results_df.head(20))

    # Calculate and print RMSE and R²
    rmse_2024 = np.sqrt(mean_squared_error(y_2024, predictions_2024))
    r2_2024 = r2_score(y_2024, predictions_2024)
    print(f"2024 Season RMSE: {rmse_2024:.4f}, R²: {r2_2024:.4f}")
else:
    print("Actual 'PTS' column not found in the 2024 data.")

    Actual  Predicted     Error
0       21  19.705875  1.294125
1       21  22.347961 -1.347961
2       27  32.193604 -5.193604
3       19  20.720934 -1.720934
4       35  35.748077 -0.748077
5       24  25.940968 -1.940968
6       30  29.566498  0.433502
7       18  19.347097 -1.347097
8       32  35.095814 -3.095814
9       16  20.580879 -4.580879
10      28  31.547688 -3.547688
11      35  34.172123  0.827877
12      37  35.099075  1.900925
13      17  19.031713 -2.031713
14      26  27.362623 -1.362623
15      22  24.012886 -2.012886
16      18  16.206938  1.793062
17      25  25.759964 -0.759964
18      21  24.455912 -3.455912
19      16  16.538231 -0.538231
2024 Season RMSE: 1.3559, R²: 0.9772


In [18]:
results_df.describe()

Unnamed: 0,Actual,Predicted,Error
count,26401.0,26401.0,26401.0
mean,10.642021,10.639983,0.002039
std,8.983887,8.906435,1.35591
min,0.0,-4.872216,-9.783688
25%,3.0,3.465735,-0.602466
50%,9.0,8.803712,0.007992
75%,16.0,15.755449,0.674982
max,73.0,62.852348,14.702698
