In [45]:
# Import Libraries and Load All Player CSVs


In [46]:
import os
import pandas as pd

# Get all CSV files in the data directory
csv_dir = '../data'
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

dfs = []
for file in csv_files:
    # Extract player name from filename (remove _gamelog... or _gamelogs... and .csv)
    base = file.replace('_gamelog_','_').replace('_gamelogs_','_').replace('.csv','')
    player_name = base.split('_')[0] if '_' in base else base
    df = pd.read_csv(os.path.join(csv_dir, file))
    df['player_name'] = player_name
    dfs.append(df)

# Concatenate all DataFrames into one
all_data = pd.concat(dfs, ignore_index=True)

# Display the first few rows
all_data.head()

  all_data = pd.concat(dfs, ignore_index=True)


Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,player_name,opponent_team,is_home_game,rolling_pts_5,over_fake_line
0,22023.0,1628983.0,22301196.0,"APR 14, 2024",OKC vs. DAL,W,16.0,5.0,11.0,0.455,...,0.0,1.0,15,32.0,1.0,Shai,,,,
1,22023.0,1628983.0,22301179.0,"APR 12, 2024",OKC vs. MIL,W,29.0,7.0,21.0,0.333,...,0.0,1.0,23,10.0,1.0,Shai,,,,
2,22023.0,1628983.0,22301163.0,"APR 10, 2024",OKC vs. SAS,W,26.0,10.0,18.0,0.556,...,1.0,1.0,26,26.0,1.0,Shai,,,,
3,22023.0,1628983.0,22301153.0,"APR 09, 2024",OKC vs. SAC,W,38.0,11.0,21.0,0.524,...,5.0,2.0,40,6.0,1.0,Shai,,,,
4,22023.0,1628983.0,22301082.0,"MAR 31, 2024",OKC @ NYK,W,35.0,7.0,16.0,0.438,...,2.0,4.0,19,-15.0,1.0,Shai,,,,


# Extract Opponent and Home/Away

From the `MATCHUP` column, extract:
- `opponent_team`: last 3 letters after 'vs.' or '@'
- `is_home_game`: True if 'vs.' in matchup, False if '@'

In [47]:
def extract_opponent_and_home(matchup):
    if 'vs.' in matchup:
        opponent = matchup.split('vs.')[-1].strip()[:3]
        is_home = True
    elif '@' in matchup:
        opponent = matchup.split('@')[-1].strip()[:3]
        is_home = False
    else:
        opponent = ''
        is_home = None
    return pd.Series([opponent, is_home])

all_data[['opponent_team', 'is_home_game']] = all_data['MATCHUP'].apply(extract_opponent_and_home)

# Display the first few rows to verify
all_data[['MATCHUP', 'opponent_team', 'is_home_game']].head()

Unnamed: 0,MATCHUP,opponent_team,is_home_game
0,OKC vs. DAL,DAL,True
1,OKC vs. MIL,MIL,True
2,OKC vs. SAS,SAS,True
3,OKC vs. SAC,SAC,True
4,OKC @ NYK,NYK,False


# Sort and Calculate Rolling Average (Fake Line)

For each player:
- Sort games by `GAME_DATE`
- Calculate rolling average of `PTS` over last 5 games
- Shift that average down by 1 (so it's prior to the current game)

In [48]:
# Ensure GAME_DATE is datetime
df = all_data.copy()
df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])

# Sort and calculate rolling average for each player
df = df.sort_values(['player_name', 'GAME_DATE'])
df['rolling_pts_5'] = (
    df.groupby('player_name')['PTS']
      .transform(lambda x: x.rolling(window=5, min_periods=1).mean().shift(1))
)

# Display the first few rows to verify
cols = ['player_name', 'GAME_DATE', 'PTS', 'rolling_pts_5']
df[cols].head(10)

  df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'])


Unnamed: 0,player_name,GAME_DATE,PTS,rolling_pts_5
6699,Brandon,2023-10-25,19,
6698,Brandon,2023-10-28,26,19.0
6697,Brandon,2023-11-04,16,22.5
6696,Brandon,2023-11-06,22,20.333333
6695,Brandon,2023-11-08,24,20.75
6694,Brandon,2023-11-10,31,21.4
6693,Brandon,2023-11-12,20,23.8
6692,Brandon,2023-11-14,25,22.6
6691,Brandon,2023-11-17,21,24.4
6690,Brandon,2023-11-18,30,24.2


In [49]:
# Create Binary Label: Did player go over their fake line?
df['over_fake_line'] = (df['PTS'] > df['rolling_pts_5']).astype(int)

# Display a preview
cols = ['player_name', 'GAME_DATE', 'PTS', 'rolling_pts_5', 'over_fake_line']
df[cols].head(10)

# Save the final dataset to CSV
df.to_csv('../data/final_dataset.csv', index=False)
print('Final dataset saved to data/final_dataset.csv')

Final dataset saved to data/final_dataset.csv


# Import Modeling Libraries
Import the necessary libraries for modeling, such as scikit-learn and numpy.

In [50]:
# Import modeling libraries
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
# Import libraries for modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Select Features and Labels
Define the features and label for modeling. You can add more features later if needed.

In [51]:
features = ['is_home_game', 'rolling_pts_5']
X = df[features]
y = df['PTS']

# Handle NaNs
X = X.dropna()
y = y.loc[X.index]

from sklearn.linear_model import LogisticRegression


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.2f}")

results = X_test.copy()
results['actual_pts'] = y_test
results['predicted_pts'] = y_pred
results['fake_line'] = results['rolling_pts_5']

# Did we "bet" the right side?
results['bet_win'] = (results['predicted_pts'] > results['fake_line']) == (results['actual_pts'] > results['fake_line'])
print("Simulated betting accuracy:", results['bet_win'].mean())

import joblib

# Save the trained model to the data directory (adjust path if needed)
joblib.dump(model, '../data/random_forest_model.pkl')
print("Model saved to ../data/random_forest_model.pkl")


Mean Absolute Error: 7.43
R² Score: 0.05
Simulated betting accuracy: 0.6254442075337597
Model saved to ../data/random_forest_model.pkl
