Player Stats
Data Science Process:
1) Formulate a data science question:
Predict the number of points, assists, and rebounds a position might get.
Use location, back to back factor, opponent's average number of points conceded from each position.

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
import player_position_scraper
import plotly.express as px
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import commonplayerinfo, teamgamelog, playergamelog, boxscoretraditionalv2, leaguegamefinder, LeagueGameFinder, playbyplayv2
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
player_position_scraper.player_pos_extracts()
def player_stats(player_name):
    player = players.find_players_by_full_name(player_name)[0]
    player_id = player['id']
    game_log = playergamelog.PlayerGameLog(player_id=player_id, season='2024-25', season_type_all_star='Regular Season')
    game_log_df = game_log.get_data_frames()[0]
    

    # Print the game log
    return game_log_df


Summary Statistics

Average number of points, rebounds and assists

In [3]:
player_df = player_stats("Dillon Brooks")
player_df[['PTS', 'AST', 'REB']].describe()

Unnamed: 0,PTS,AST,REB
count,21.0,21.0,21.0
mean,12.571429,1.52381,3.619048
std,6.867106,1.209093,1.802115
min,0.0,0.0,1.0
25%,7.0,1.0,2.0
50%,14.0,1.0,4.0
75%,17.0,2.0,5.0
max,28.0,4.0,7.0


In [4]:
player_df['GAME_DATE'] = pd.to_datetime(player_df['GAME_DATE'])
player_df['Location'] = np.where(player_df['MATCHUP'].str.contains('vs'), 'Home', 'Away')
player_df['days_diff'] = player_df['GAME_DATE'].diff(periods=-1).dt.days
player_df = player_df.rename(columns={'Game_ID': 'GAME_ID'})
player_df

Unnamed: 0,SEASON_ID,Player_ID,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Location,days_diff
0,22024,1628415,22400310,2024-12-01,HOU vs. OKC,W,29,6,11,0.545,2,4,0.5,2,2,1.0,2,5,7,3,0,0,0,5,16,0,1,Home,4.0
1,22024,1628415,22400290,2024-11-27,HOU @ PHI,W,36,0,11,0.0,0,8,0.0,0,0,0.0,0,1,1,4,0,2,2,3,0,3,1,Away,1.0
2,22024,1628415,22400037,2024-11-26,HOU @ MIN,W,41,8,14,0.571,4,7,0.571,2,2,1.0,2,1,3,2,0,0,2,3,22,6,1,Away,3.0
3,22024,1628415,22400267,2024-11-23,HOU vs. POR,L,31,5,11,0.455,1,3,0.333,2,2,1.0,1,4,5,1,1,0,0,3,13,0,1,Home,1.0
4,22024,1628415,22400031,2024-11-22,HOU vs. POR,W,25,10,15,0.667,6,8,0.75,2,4,0.5,3,3,6,1,0,0,2,3,28,13,1,Home,2.0
5,22024,1628415,22400254,2024-11-20,HOU vs. IND,W,22,1,10,0.1,0,7,0.0,2,2,1.0,1,0,1,3,0,1,1,2,4,8,1,Home,2.0
6,22024,1628415,22400248,2024-11-18,HOU @ MIL,L,21,2,8,0.25,1,3,0.333,0,0,0.0,2,2,4,1,1,0,1,5,5,-3,1,Away,1.0
7,22024,1628415,22400241,2024-11-17,HOU @ CHI,W,28,7,13,0.538,1,3,0.333,0,1,0.0,0,3,3,3,2,0,1,1,15,29,1,Away,2.0
8,22024,1628415,22400016,2024-11-15,HOU vs. LAC,W,33,6,12,0.5,5,9,0.556,0,0,0.0,0,1,1,2,0,0,2,5,17,37,1,Home,2.0
9,22024,1628415,22400222,2024-11-13,HOU vs. LAC,W,30,5,9,0.556,4,6,0.667,1,2,0.5,1,4,5,2,0,0,1,3,15,6,1,Home,2.0


Average points as days_diff increases

In [5]:
avg_points_per_diff = player_df.groupby(['days_diff','Location'])['PTS'].mean().reset_index()

# Create a scatter plot with Plotly
# fig = px.scatter(avg_points_per_diff, x='days_diff', y='PTS', color='Location',
#                  title='Average Points Scored vs Days Between Games (Home vs Away)',
#                  labels={'days_diff': 'Days Between Games', 'points_scored': 'Average Points Scored'},
#                  symbol='Location',  # Different symbols for Home and Away
#                  trendline='ols'  # Optional: Adds a trendline
# )
fig = px.bar(avg_points_per_diff, x='days_diff', y='PTS', color='Location',
                 title='Average Points Scored vs Days Between Games (Home vs Away)',
                 labels={'days_diff': 'Days Between Games', 'points_scored': 'Average Points Scored'},
)

# Show the plot
fig.show()

In [6]:
avg_rebounds_per_diff = player_df.groupby(['days_diff','Location'])['REB'].mean().reset_index()

# Create a scatter plot with Plotly
# fig = px.scatter(avg_rebounds_per_diff, x='days_diff', y='PTS', color='Location',
#                  title='Average rebounds Scored vs Days Between Games (Home vs Away)',
#                  labels={'days_diff': 'Days Between Games', 'rebounds_scored': 'Average rebounds Scored'},
#                  symbol='Location',  # Different symbols for Home and Away
#                  trendline='ols'  # Optional: Adds a trendline
# )
fig = px.bar(avg_rebounds_per_diff, x='days_diff', y='REB', color='Location',
                 title='Average rebounds Scored vs Days Between Games (Home vs Away)',
                 labels={'days_diff': 'Days Between Games', 'rebounds_scored': 'Average rebounds Scored'},
)

# Show the plot
fig.show()

In [7]:
nba_teams = teams.get_teams()

# Create a dictionary with team ID as the key and team name as the value
team_id_name_dict = {team['full_name']: team['id'] for team in nba_teams}

print(team_id_name_dict)

{'Atlanta Hawks': 1610612737, 'Boston Celtics': 1610612738, 'Cleveland Cavaliers': 1610612739, 'New Orleans Pelicans': 1610612740, 'Chicago Bulls': 1610612741, 'Dallas Mavericks': 1610612742, 'Denver Nuggets': 1610612743, 'Golden State Warriors': 1610612744, 'Houston Rockets': 1610612745, 'Los Angeles Clippers': 1610612746, 'Los Angeles Lakers': 1610612747, 'Miami Heat': 1610612748, 'Milwaukee Bucks': 1610612749, 'Minnesota Timberwolves': 1610612750, 'Brooklyn Nets': 1610612751, 'New York Knicks': 1610612752, 'Orlando Magic': 1610612753, 'Indiana Pacers': 1610612754, 'Philadelphia 76ers': 1610612755, 'Phoenix Suns': 1610612756, 'Portland Trail Blazers': 1610612757, 'Sacramento Kings': 1610612758, 'San Antonio Spurs': 1610612759, 'Oklahoma City Thunder': 1610612760, 'Toronto Raptors': 1610612761, 'Utah Jazz': 1610612762, 'Memphis Grizzlies': 1610612763, 'Washington Wizards': 1610612764, 'Detroit Pistons': 1610612765, 'Charlotte Hornets': 1610612766}


In [85]:
team_name = 'Brooklyn Nets'
target_team_id = team_id_name_dict[team_name] #Select the team

# Step 1: Find recent games involving the target team
# Using leaguegamefinder to get recent game IDs involving the target team
gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=target_team_id)
games = gamefinder.get_data_frames()[0]

# Extract the game for this season only
games_df = games[games["GAME_DATE"] >= '2024-10-22']
game_ids = games_df['GAME_ID'].tolist()
#games_df = games[['GAME_ID', 'GAME_DATE', 'MATCHUP']]
games_df['Location'] = ['home' if 'vs' in matchup else 'away' for matchup in games_df['MATCHUP']]
games_df['Opponents'] = [matchup[len(matchup)-4:] for matchup in games_df['MATCHUP']]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [86]:
#Overall Game Stats
#avg_points_per_location_conceded = games_df.groupby(['Location'])['PTS'].mean().reset_index()

# Create a scatter plot with Plotly
# fig = px.bar(avg_points_per_location_conceded, x='Location', y='PTS', color='Location',
#                 barmode='group',
#                  title='Average Points Scored based on location)',
#                  labels={'PTS': 'Average Points Scored'},
# 
average_y = games_df['PTS'].mean()
# average_y_home = games_df[games_df['Location'] == 'home']['PTS'].mean()
# average_y_away = games_df[games_df['Location'] == 'away']['PTS'].mean()
fig = px.line(games_df, x='GAME_DATE', y='PTS', color='Location', title=f'Total Number of Points Scored Over Time by {team_name}', text='Opponents', markers = True)
fig.add_hline(y=average_y, line_dash="dash", line_color="black", annotation_text=f"Overall Average:{average_y}", annotation_position="top left")
# fig.add_hline(y=average_y_home, line_dash="dash", line_color="blue", annotation_text="Home Average", annotation_position="top right")
# fig.add_hline(y=average_y_away, line_dash="dash", line_color="orange", annotation_text="Away Average", annotation_position="bottom right")
fig.update_yaxes(range=[0, 150])

# Show the plot
fig.show()

In [87]:
opponent_boxscore = []

for game_id in game_ids:
    # Retrieve box score data for each game
    boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
    boxscore_df = boxscore.get_data_frames()[0]
    
    # Filter box score data to keep only opponents of the target team
    opponent_boxscore.append(boxscore_df[boxscore_df['TEAM_ID'] != target_team_id])
    

# Concatenate all box scores into a single DataFrame
box_scores_df = pd.concat(opponent_boxscore, ignore_index=True)

player_pos_df = pd.read_csv('player_positions.csv')
merge_df = pd.merge(box_scores_df, player_pos_df, on = "PLAYER_NAME", how='left')
games_gameid_df = games_df[['GAME_ID', 'GAME_DATE', 'MATCHUP','Location', 'Opponents']]
opponents_df = pd.merge(merge_df, games_gameid_df, on = "GAME_ID", how = "left")
opponents_df = opponents_df[opponents_df['Location'].notna()] #Removes the preseason games as we are merging with player data which does not have preseason data
opponents_df['MIN'] = opponents_df['MIN'].str.split(':').str[0]
opponents_df_more_20_mins = opponents_df[opponents_df['MIN'].astype(float) > 20] #Selecting players who played at least 20 minutes

In [88]:
#Opponents points conceded over time
total_points_per_location_conceded = opponents_df.groupby(['Location','GAME_DATE', 'TEAM_ABBREVIATION'])['PTS'].sum().reset_index()
average_y = total_points_per_location_conceded['PTS'].mean()
fig = px.line(total_points_per_location_conceded, x='GAME_DATE', y='PTS', color='Location', 
                title=f'Total Number of Points Conceded by {team_name} Over Time',text='TEAM_ABBREVIATION', markers = True)
fig.add_hline(y=average_y, line_dash="dash", line_color="black", annotation_text=f"Overall Average:{average_y}", annotation_position="top left")
fig.update_yaxes(range=[0, 150])

In [89]:
avg_points_per_pos_conceded = opponents_df_more_20_mins.groupby(['Position','Location'])['PTS'].mean().reset_index()

# Create a scatter plot with Plotly
fig = px.bar(avg_points_per_pos_conceded, x='Position', y='PTS', color='Location',
                barmode='group',
                 title=f'Average Points Scored by Position (Home vs Away) against {team_name}',
                 labels={'Position': 'Player Position', 'PTS': 'Average Points Scored'},
)

# Show the plot
fig.show()
opponents_df_more_20_mins.groupby('Position')['PTS'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,22.0,15.272727,8.558938,3.0,8.0,13.0,23.0,30.0
PF,23.0,16.782609,9.652868,2.0,8.5,16.0,23.5,36.0
PG,27.0,18.148148,7.769287,8.0,12.0,17.0,23.5,37.0
SF,21.0,16.619048,8.102322,3.0,11.0,18.0,22.0,29.0
SG,33.0,13.212121,6.416568,3.0,9.0,12.0,16.0,31.0


Adding a note here to add another metric that shows where a player ranks among other players in similar position on average. For example if 75% of players score at least 13 points for a specific position, compare the average points of those players in 25% and the player you want to bet on. If the player averages higher than the 25%, it means you should bet on the player.

In [90]:
avg_assists_per_pos_conceded = opponents_df_more_20_mins.groupby(['Position','Location'])['AST'].mean().reset_index()

# Create a scatter plot with Plotly
fig = px.bar(avg_assists_per_pos_conceded, x='Position', y='AST', color='Location',
                barmode='group',
                 title=f'Average Assists Scored by Position (Home vs Away)against {team_name}',
                 labels={'Position': 'Player Position', 'AST': 'Average Assists Scored'},
)

# Show the plot
fig.show()
opponents_df_more_20_mins.groupby('Position')['AST'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,22.0,2.909091,3.490246,0.0,1.0,2.0,3.75,16.0
PF,23.0,3.695652,2.867289,0.0,1.5,3.0,6.0,10.0
PG,27.0,5.851852,3.072254,0.0,4.0,5.0,7.5,12.0
SF,21.0,3.380952,2.539216,0.0,1.0,3.0,4.0,9.0
SG,33.0,3.151515,2.611527,0.0,1.0,3.0,4.0,10.0


In [91]:
avg_rebounds_per_pos_conceded = opponents_df_more_20_mins.groupby(['Position','Location'])['REB'].mean().reset_index()

# Create a scatter plot with Plotly
fig = px.bar(avg_rebounds_per_pos_conceded, x='Position', y='REB', color='Location',
                barmode='group',
                 title=f'Average Rebounds Scored by Position (Home vs Away) against {team_name}',
                 labels={'Position': 'Player Position', 'REB': 'Average Rebounds Scored'},
)

# Show the plot
fig.show()
opponents_df_more_20_mins.groupby('Position')['REB'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,22.0,9.0,5.09902,1.0,4.25,9.0,11.75,18.0
PF,23.0,7.304348,3.00658,3.0,5.5,7.0,8.5,16.0
PG,27.0,3.851852,2.160906,0.0,2.5,4.0,5.0,9.0
SF,21.0,5.952381,2.819152,0.0,5.0,6.0,8.0,12.0
SG,33.0,3.242424,2.305297,0.0,1.0,3.0,5.0,9.0


In [92]:
avg_3pa_per_pos_conceded = opponents_df_more_20_mins.groupby(['Position','Location'])['FG3A'].mean().reset_index()

# Create a scatter plot with Plotly
fig = px.bar(avg_3pa_per_pos_conceded, x='Position', y='FG3A', color='Location',
                barmode='group',
                 title=f'Average Three Points Attempted by Position (Home vs Away) against {team_name}',
                 labels={'Position': 'Player Position', 'FG3A': 'Average Three Point Attempts'},
)

# Show the plot
fig.show()
opponents_df_more_20_mins.groupby('Position')['FG3A'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,22.0,1.727273,2.394076,0.0,0.0,0.5,3.0,9.0
PF,23.0,4.869565,3.401929,0.0,2.5,4.0,6.5,13.0
PG,27.0,5.666667,2.828427,1.0,4.0,6.0,7.0,16.0
SF,21.0,5.238095,2.586595,1.0,3.0,5.0,7.0,10.0
SG,33.0,5.545455,2.450649,2.0,4.0,5.0,7.0,11.0


In [93]:
#opponents_df_more_20_mins_more_20_mins[(opponents_df_more_20_mins_more_20_mins['Location'] == 'home') & (opponents_df_more_20_mins_more_20_mins['Position'] == 'PG')]
#opponents_df_more_20_mins[(opponents_df_more_20_mins['Location'] == 'home') & (opponents_df_more_20_mins['Position'] == 'SG')]
opponents_df_more_20_mins[(opponents_df_more_20_mins['Position'] == 'PF')]

Unnamed: 0.1,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS,Unnamed: 0,Position,GAME_DATE,MATCHUP,Location,Opponents
27,22400297,1610612756,PHX,Phoenix,201142,Kevin Durant,Kevin,F,,33.0,10.0,18.0,0.556,3.0,6.0,0.5,7.0,10.0,0.7,0.0,8.0,8.0,2.0,0.0,3.0,7.0,3.0,30.0,-2.0,60.0,PF,2024-11-27,BKN @ PHX,away,PHX
41,22400284,1610612744,GSW,Golden State,203110,Draymond Green,Draymond,F,,28.0,2.0,6.0,0.333,1.0,3.0,0.333,1.0,4.0,0.25,0.0,6.0,6.0,7.0,0.0,0.0,2.0,3.0,6.0,1.0,168.0,PF,2024-11-25,BKN @ GSW,away,GSW
54,22400276,1610612758,SAC,Sacramento,1631099,Keegan Murray,Keegan,F,,36.0,7.0,14.0,0.5,1.0,5.0,0.2,6.0,8.0,0.75,3.0,5.0,8.0,2.0,1.0,1.0,2.0,4.0,21.0,8.0,86.0,PF,2024-11-24,BKN @ SAC,away,SAC
67,22400027,1610612755,PHI,Philadelphia,1627824,Guerschon Yabusele,Guerschon,C,,31.0,4.0,12.0,0.333,2.0,4.0,0.5,0.0,0.0,0.0,3.0,8.0,11.0,3.0,1.0,1.0,2.0,3.0,10.0,10.0,162.0,PF,2024-11-22,BKN @ PHI,away,PHI
79,22400022,1610612766,CHA,Charlotte,1628970,Miles Bridges,Miles,F,,33.0,8.0,14.0,0.571,5.0,9.0,0.556,0.0,0.0,0.0,0.0,3.0,3.0,2.0,0.0,0.0,1.0,1.0,21.0,-15.0,159.0,PF,2024-11-19,BKN vs. CHA,home,CHA
80,22400022,1610612766,CHA,Charlotte,1629684,Grant Williams,Grant,C,,33.0,5.0,8.0,0.625,0.0,0.0,0.0,6.0,6.0,1.0,4.0,3.0,7.0,4.0,0.0,3.0,3.0,1.0,16.0,-12.0,171.0,PF,2024-11-19,BKN vs. CHA,home,CHA
90,22400240,1610612752,NYK,New York,1628384,OG Anunoby,OG,F,,40.0,9.0,17.0,0.529,3.0,7.0,0.429,3.0,5.0,0.6,3.0,5.0,8.0,0.0,0.0,1.0,0.0,4.0,24.0,3.0,49.0,PF,2024-11-17,BKN @ NYK,away,NYK
103,22400014,1610612752,NYK,New York,1628384,OG Anunoby,OG,F,,38.0,10.0,17.0,0.588,1.0,8.0,0.125,4.0,5.0,0.8,1.0,7.0,8.0,1.0,2.0,0.0,3.0,6.0,25.0,10.0,49.0,PF,2024-11-15,BKN @ NYK,away,NYK
116,22400218,1610612738,BOS,Boston,1628369,Jayson Tatum,Jayson,F,,35.0,12.0,19.0,0.632,5.0,6.0,0.833,7.0,9.0,0.778,1.0,8.0,9.0,10.0,0.0,0.0,1.0,3.0,36.0,24.0,2.0,PF,2024-11-13,BKN vs. BOS,home,BOS
123,22400218,1610612738,BOS,Boston,1630573,Sam Hauser,Sam,,,25.0,4.0,10.0,0.4,4.0,10.0,0.4,0.0,0.0,0.0,1.0,5.0,6.0,1.0,0.0,1.0,0.0,0.0,12.0,10.0,213.0,PF,2024-11-13,BKN vs. BOS,home,BOS


In [94]:
##Random Forest Model
#Predict the number of points a position will score 
rf_df = opponents_df_more_20_mins.dropna(subset=['Position'])
X = rf_df[['Position','Location','Opponents']]
y = rf_df[['PTS']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train = pd.get_dummies(X_train, drop_first=True)
X_test =  pd.get_dummies(X_test, drop_first=True)


#push to the mongo db how well the model does based on date, teams and position

In [95]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names seen at fit time, yet now missing:
- Opponents_ DET
- Opponents_ NOP




ValueError: X has 18 features, but RandomForestRegressor is expecting 20 features as input.