## Install the packages

In [1]:
# %pip install nba_api
# %pip install pandas

## Import the libraries

In [27]:
import pandas as pd
import time

from nba_api.stats.static import players
from nba_api.stats.static import teams
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.endpoints import playernextngames
from nba_api.stats.endpoints import commonplayerinfo


## Basic search parameters

In [28]:
season = '2020-21'
season_type = 'Regular Season'

## Get active players IDs

In [29]:
active_players = players.get_active_players()
active_players[0:2]

[{'id': 1630173,
  'full_name': 'Precious Achiuwa',
  'first_name': 'Precious',
  'last_name': 'Achiuwa',
  'is_active': True},
 {'id': 203500,
  'full_name': 'Steven Adams',
  'first_name': 'Steven',
  'last_name': 'Adams',
  'is_active': True}]

In [30]:
players_ids = [str(player['id']) for player in active_players]
players_ids[0:2]

['1630173', '203500']

In [31]:
len(players_ids)

587

## Get teams IDs

In [26]:
nba_teams_full_info = teams.get_teams()
nba_teams_full_info[:2]

[{'id': 1610612737,
  'full_name': 'Atlanta Hawks',
  'abbreviation': 'ATL',
  'nickname': 'Hawks',
  'city': 'Atlanta',
  'state': 'Atlanta',
  'year_founded': 1949},
 {'id': 1610612738,
  'full_name': 'Boston Celtics',
  'abbreviation': 'BOS',
  'nickname': 'Celtics',
  'city': 'Boston',
  'state': 'Massachusetts',
  'year_founded': 1946}]

In [27]:
nba_teams_short_info = [{'id': team['id'], 'full_name': team['full_name'], 'abbreviation': team['abbreviation']} for team in nba_teams_full_info]
nba_teams_short_info[0:2]

[{'id': 1610612737, 'full_name': 'Atlanta Hawks', 'abbreviation': 'ATL'},
 {'id': 1610612738, 'full_name': 'Boston Celtics', 'abbreviation': 'BOS'}]

In [28]:
nba_teams_ids = [team['id'] for team in nba_teams_full_info]
nba_teams_ids[0:2]

[1610612737, 1610612738]

## Get stats by game (player total)

In [10]:
columns = ['SEASON_ID', 'Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE']

df = pd.DataFrame(columns=columns)

In [11]:
for idx, id in enumerate(players_ids):
    try:
        data = playergamelog.PlayerGameLog(
            player_id=id,
            season=season,
            season_type_all_star=season_type,
            league_id_nullable='00',
            timeout=10)
    
        df = pd.concat([df, data.get_data_frames()[0]])
        
        # Preventing timeout exceptions
        time.sleep(.600)
    except:
        print(f'There was some problem while gathering data.\nPlayer id: {id}\nIteration #{idx}\n')

df.head(3)

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE
0,22020,1630173,22001069,"MAY 16, 2021",MIA @ DET,W,42,10,16,0.625,...,7,10,0,2,1,2,3,23,6,1
1,22020,1630173,22001062,"MAY 15, 2021",MIA @ MIL,L,2,1,2,0.5,...,0,0,1,0,0,0,0,2,3,1
2,22020,1630173,22001050,"MAY 13, 2021",MIA vs. PHI,W,2,0,0,0.0,...,0,0,0,0,0,0,0,0,-5,1


In [12]:
df.shape

(21349, 27)

In [17]:
print(f"The final dataset contais stats about {df['Player_ID'].nunique()} players")

The final dataset contais stats about 453 players


This difference in relation to the length of players_ids is acceptable, due various reasons, like the players who dont play any game during the season, or players who got hurt in the pre-season.

## Save the DataFrame

In [30]:
# Pickle
# df.to_pickle('df_all_players_raw.pkl')

# Unpickle
df = pd.read_pickle('df_all_players_raw.pkl')

## Adjust the columns names

In [31]:
df.columns

Index(['SEASON_ID', 'Player_ID', 'Game_ID', 'GAME_DATE', 'MATCHUP', 'WL',
       'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'PLUS_MINUS', 'VIDEO_AVAILABLE'],
      dtype='object')

In [32]:
new_column_names = ['Season ID', 'Player ID', 'Game ID', 'Game Date', 'Matchup', 'Won or Lost',
       'Minutes played', 'Field Goals Made', 'Field Goals Attempted', 'Field Goals %', 
       'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt', 'Field Goal 3 Points %', 
       'Free Throws Made', 'Free Throws Attempt', 'Free Throw %', 'Offensive Rebounds', 'Defensive Rebounds', 'Rebounds', 
       'Assists', 'Steals', 'Blocks', 'Turnovers', 'Personal Fouls',
       'Points', 'Plus Minus', 'Video Available']
    
df.columns = new_column_names
df.columns

Index(['Season ID', 'Player ID', 'Game ID', 'Game Date', 'Matchup',
       'Won or Lost', 'Minutes played', 'Field Goals Made',
       'Field Goals Attempted', 'Field Goals %', 'Field Goals 3 Points Made',
       'Field Goal 3 Points Attempt', 'Field Goal 3 Points %',
       'Free Throws Made', 'Free Throws Attempt', 'Free Throw %',
       'Offensive Rebounds', 'Defensive Rebounds', 'Rebounds', 'Assists',
       'Steals', 'Blocks', 'Turnovers', 'Personal Fouls', 'Points',
       'Plus Minus', 'Video Available'],
      dtype='object')

## Drop the irrelevant features

In [33]:
columns_to_drop = ['Field Goals %', 'Field Goal 3 Points %', 
'Free Throw %', 'Rebounds', 'Plus Minus', 'Video Available']

df.drop(columns=columns_to_drop, inplace=True)
df.columns

Index(['Season ID', 'Player ID', 'Game ID', 'Game Date', 'Matchup',
       'Won or Lost', 'Minutes played', 'Field Goals Made',
       'Field Goals Attempted', 'Field Goals 3 Points Made',
       'Field Goal 3 Points Attempt', 'Free Throws Made',
       'Free Throws Attempt', 'Offensive Rebounds', 'Defensive Rebounds',
       'Assists', 'Steals', 'Blocks', 'Turnovers', 'Personal Fouls', 'Points'],
      dtype='object')

In [34]:
df.head(2)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won or Lost,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Free Throws Made,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points
0,22020,1630173,22001069,"MAY 16, 2021",MIA @ DET,W,42,10,16,0,...,3,7,3,7,0,2,1,2,3,23
1,22020,1630173,22001062,"MAY 15, 2021",MIA @ MIL,L,2,1,2,0,...,0,0,0,0,1,0,0,0,0,2


## Checking the missing values

In [35]:
df.isna().sum().sum()

0

## Check for duplicated rows

In [10]:
df.duplicated().sum()

0

## Reset the index

In [36]:
df.reset_index(drop=True, inplace=True)

## Adjust the data types

### Adjust Game Date

In [37]:
df['Game Date'] =  pd.to_datetime(df['Game Date'])
df.head(2)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won or Lost,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Free Throws Made,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points
0,22020,1630173,22001069,2021-05-16,MIA @ DET,W,42,10,16,0,...,3,7,3,7,0,2,1,2,3,23
1,22020,1630173,22001062,2021-05-15,MIA @ MIL,L,2,1,2,0,...,0,0,0,0,1,0,0,0,0,2


### Adjust Won or Lost

In [38]:
df['Won or Lost'].replace({'W': 1, 'L': 0}, inplace=True)
df.rename(columns={'Won or Lost': 'Won'}, inplace=True)
df.head(2)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Free Throws Made,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points
0,22020,1630173,22001069,2021-05-16,MIA @ DET,1,42,10,16,0,...,3,7,3,7,0,2,1,2,3,23
1,22020,1630173,22001062,2021-05-15,MIA @ MIL,0,2,1,2,0,...,0,0,0,0,1,0,0,0,0,2


### Adjust the numeric features types

In [39]:
df.dtypes

Season ID                              object
Player ID                              object
Game ID                                object
Game Date                      datetime64[ns]
Matchup                                object
Won                                     int64
Minutes played                         object
Field Goals Made                       object
Field Goals Attempted                  object
Field Goals 3 Points Made              object
Field Goal 3 Points Attempt            object
Free Throws Made                       object
Free Throws Attempt                    object
Offensive Rebounds                     object
Defensive Rebounds                     object
Assists                                object
Steals                                 object
Blocks                                 object
Turnovers                              object
Personal Fouls                         object
Points                                 object
dtype: object

In [40]:
columns_to_int = ['Minutes played', 'Field Goals Made', 'Field Goals Attempted',
       'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt',
       'Free Throws Made', 'Free Throws Attempt', 'Offensive Rebounds',
       'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
       'Personal Fouls', 'Points']

columns_to_str = ['Season ID', 'Player ID', 'Game ID', 'Matchup']

In [41]:
df[columns_to_int] = df[columns_to_int].apply(pd.to_numeric)
df[columns_to_str] = df[columns_to_str].apply(lambda x: x.astype(str))
df.dtypes

Season ID                              object
Player ID                              object
Game ID                                object
Game Date                      datetime64[ns]
Matchup                                object
Won                                     int64
Minutes played                          int64
Field Goals Made                        int64
Field Goals Attempted                   int64
Field Goals 3 Points Made               int64
Field Goal 3 Points Attempt             int64
Free Throws Made                        int64
Free Throws Attempt                     int64
Offensive Rebounds                      int64
Defensive Rebounds                      int64
Assists                                 int64
Steals                                  int64
Blocks                                  int64
Turnovers                               int64
Personal Fouls                          int64
Points                                  int64
dtype: object

In [42]:
df.head(2)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Free Throws Made,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points
0,22020,1630173,22001069,2021-05-16,MIA @ DET,1,42,10,16,0,...,3,7,3,7,0,2,1,2,3,23
1,22020,1630173,22001062,2021-05-15,MIA @ MIL,0,2,1,2,0,...,0,0,0,0,1,0,0,0,0,2


## Include aditional features

### Include player name

In [43]:
players_ids[0:3]

['1630173', '203500', '1628389']

In [44]:
active_players[0:2]

[{'id': 1630173,
  'full_name': 'Precious Achiuwa',
  'first_name': 'Precious',
  'last_name': 'Achiuwa',
  'is_active': True},
 {'id': 203500,
  'full_name': 'Steven Adams',
  'first_name': 'Steven',
  'last_name': 'Adams',
  'is_active': True}]

In [45]:
players_ids.index('203500')

1

In [46]:
active_players[players_ids.index('203500')]['full_name']

'Steven Adams'

In [47]:
df['Player Name'] = df['Player ID'].apply(lambda x: str(active_players[players_ids.index(str(x))]['full_name']))
df.head(2)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Name
0,22020,1630173,22001069,2021-05-16,MIA @ DET,1,42,10,16,0,...,7,3,7,0,2,1,2,3,23,Precious Achiuwa
1,22020,1630173,22001062,2021-05-15,MIA @ MIL,0,2,1,2,0,...,0,0,0,1,0,0,0,0,2,Precious Achiuwa


In [48]:
df.tail(2)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Name
21347,22020,1627826,22000009,2020-12-25,LAC @ DEN,1,19,2,2,0,...,11,0,0,0,0,1,2,4,12,Ivica Zubac
21348,22020,1627826,22000002,2020-12-22,LAC @ LAL,1,27,4,5,0,...,4,3,3,1,1,1,1,4,11,Ivica Zubac


### Include the players team name

#### Games

In [49]:
game_finder = leaguegamefinder.LeagueGameFinder(season_nullable=season, season_type_nullable=season_type)
games = game_finder.get_data_frames()[0]
games.head(2)

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22020,1610612758,SAC,Sacramento Kings,22001077,2021-05-16,SAC vs. UTA,L,241,99,...,0.6,5,34,39,24,10,5,12,22,-22.0
1,22020,1610612743,DEN,Denver Nuggets,22001076,2021-05-16,DEN @ POR,L,240,116,...,0.933,10,26,36,20,8,2,6,20,-16.0


In [50]:
games = games[['SEASON_ID', 'TEAM_ABBREVIATION', 'GAME_ID', 'WL']]
games.head(2)

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION,GAME_ID,WL
0,22020,SAC,22001077,L
1,22020,DEN,22001076,L


In [51]:
def get_team_name(players_df, player_id, games_df, game_id):
    if players_df[(players_df['Player ID'] == player_id) & (players_df['Game ID'] == game_id)]['Won'].values[0]:
        return games_df[(games_df['GAME_ID'] == game_id) & (games_df['WL'] == 'W')]['TEAM_ABBREVIATION'].values[0]
    else:
        return games_df[(games_df['GAME_ID'] == game_id) & (games_df['WL'] == 'L')]['TEAM_ABBREVIATION'].values[0]

In [52]:
# Player ID = 1627826: Ivica Zubac
# Expected return: LAC
get_team_name(df, '1627826', games, '0022000002')

'LAC'

In [53]:
df['Player Team'] = ''

for idx, row in df.iterrows():
    df['Player Team'].iloc[idx] = get_team_name(df, row['Player ID'], games, row['Game ID'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Player Team'].iloc[idx] = get_team_name(df, row['Player ID'], games, row['Game ID'])


In [54]:
df[(df['Player ID'] == '1627826') & (df['Won'] == 0)].head(1)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Name,Player Team
21277,22020,1627826,22001074,2021-05-16,LAC @ OKC,0,0,0,0,0,...,0,0,0,0,0,0,1,0,Ivica Zubac,LAC


In [55]:
df[df['Player Name'] == 'LeBron James'].head(1)

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Name,Player Team
9873,22020,2544,22001072,2021-05-16,LAL @ NOP,1,27,11,22,2,...,1,0,6,3,0,2,0,25,LeBron James,LAL


### Include Home x Away

In [56]:
df[df['Game ID'] == '0022001072']

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Name,Player Team
209,22020,1629638,22001072,2021-05-16,NOP vs. LAL,0,37,5,15,2,...,1,3,5,1,0,4,0,14,Nickeil Alexander-Walker,NOP
2231,22020,202339,22001072,2021-05-16,NOP vs. LAL,0,22,5,9,2,...,0,4,2,0,2,0,0,13,Eric Bledsoe,NOP
3608,22020,203484,22001072,2021-05-16,LAL @ NOP,1,27,4,8,2,...,0,1,6,1,0,0,3,10,Kentavious Caldwell-Pope,LAL
4008,22020,1627936,22001072,2021-05-16,LAL @ NOP,1,19,4,8,1,...,1,2,5,0,1,1,2,9,Alex Caruso,LAL
4918,22020,203076,22001072,2021-05-16,LAL @ NOP,1,30,5,12,0,...,0,5,1,1,0,1,0,14,Anthony Davis,LAL
5560,22020,203083,22001072,2021-05-16,LAL @ NOP,1,21,6,11,0,...,5,8,0,2,2,2,5,13,Andre Drummond,LAL
6342,22020,1629117,22001072,2021-05-16,NOP vs. LAL,0,20,1,4,0,...,0,4,0,0,1,0,6,4,Wenyen Gabriel,NOP
8207,22020,1629637,22001072,2021-05-16,NOP vs. LAL,0,20,4,7,0,...,2,5,3,1,0,0,2,10,Jaxson Hayes,NOP
8389,22020,1626195,22001072,2021-05-16,NOP vs. LAL,0,30,8,11,0,...,4,9,1,0,2,2,4,19,Willy Hernangomez,NOP
9024,22020,1629659,22001072,2021-05-16,LAL @ NOP,1,14,2,6,1,...,0,1,1,2,0,1,1,6,Talen Horton-Tucker,LAL


#### The '@' in the 'Matchup' feature indicates that the game was in the second location

In [57]:
df['Home'] = df['Matchup'].apply(lambda x: 0 if x[4] == '@' else 1)

In [58]:
df[df['Game ID'] == '0022001072'][['Game ID', 'Matchup', 'Player Name', 'Player Team', 'Home']]

Unnamed: 0,Game ID,Matchup,Player Name,Player Team,Home
209,22001072,NOP vs. LAL,Nickeil Alexander-Walker,NOP,1
2231,22001072,NOP vs. LAL,Eric Bledsoe,NOP,1
3608,22001072,LAL @ NOP,Kentavious Caldwell-Pope,LAL,0
4008,22001072,LAL @ NOP,Alex Caruso,LAL,0
4918,22001072,LAL @ NOP,Anthony Davis,LAL,0
5560,22001072,LAL @ NOP,Andre Drummond,LAL,0
6342,22001072,NOP vs. LAL,Wenyen Gabriel,NOP,1
8207,22001072,NOP vs. LAL,Jaxson Hayes,NOP,1
8389,22001072,NOP vs. LAL,Willy Hernangomez,NOP,1
9024,22001072,LAL @ NOP,Talen Horton-Tucker,LAL,0


### Include opponent

In [59]:
df['Opponent'] = df['Matchup'].apply(lambda x: x[-3:])
df.head()

Unnamed: 0,Season ID,Player ID,Game ID,Game Date,Matchup,Won,Minutes played,Field Goals Made,Field Goals Attempted,Field Goals 3 Points Made,...,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Name,Player Team,Home,Opponent
0,22020,1630173,22001069,2021-05-16,MIA @ DET,1,42,10,16,0,...,0,2,1,2,3,23,Precious Achiuwa,MIA,0,DET
1,22020,1630173,22001062,2021-05-15,MIA @ MIL,0,2,1,2,0,...,1,0,0,0,0,2,Precious Achiuwa,MIA,0,MIL
2,22020,1630173,22001050,2021-05-13,MIA vs. PHI,1,2,0,0,0,...,0,0,0,0,0,0,Precious Achiuwa,MIA,1,PHI
3,22020,1630173,22000986,2021-05-04,MIA vs. DAL,0,6,1,2,0,...,1,0,1,0,0,2,Precious Achiuwa,MIA,1,DAL
4,22020,1630173,22000969,2021-05-02,MIA @ CHA,1,1,0,1,0,...,0,0,0,0,0,0,Precious Achiuwa,MIA,0,CHA


### Rearrange columns order

In [60]:
df.columns

Index(['Season ID', 'Player ID', 'Game ID', 'Game Date', 'Matchup', 'Won',
       'Minutes played', 'Field Goals Made', 'Field Goals Attempted',
       'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt',
       'Free Throws Made', 'Free Throws Attempt', 'Offensive Rebounds',
       'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
       'Personal Fouls', 'Points', 'Player Name', 'Player Team', 'Home',
       'Opponent'],
      dtype='object')

In [61]:
new_columns_order = ['Season ID', 'Player ID', 'Player Name',
                    'Game ID', 'Game Date', 'Matchup', 'Player Team', 'Opponent', 'Home', 'Won',
                    'Minutes played', 'Field Goals Made', 'Field Goals Attempted',
                    'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt',
                    'Free Throws Made', 'Free Throws Attempt', 'Offensive Rebounds',
                    'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
                    'Personal Fouls', 'Points']

In [62]:
df = df[new_columns_order]
df.head(2)

Unnamed: 0,Season ID,Player ID,Player Name,Game ID,Game Date,Matchup,Player Team,Opponent,Home,Won,...,Free Throws Made,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points
0,22020,1630173,Precious Achiuwa,22001069,2021-05-16,MIA @ DET,MIA,DET,0,1,...,3,7,3,7,0,2,1,2,3,23
1,22020,1630173,Precious Achiuwa,22001062,2021-05-15,MIA @ MIL,MIA,MIL,0,0,...,0,0,0,0,1,0,0,0,0,2


### Include PER (Player Efficiency Ratings)

PER = (FGM x 85.910 + Steals x 53.897 + 3PTM x 51.757 + FTM x 46.845 + Blocks x 39.190 + Offensive_Reb x 39.190 + Assists x 34.677 + Defensive_Reb x 14.707 — Foul x 17.174 — FT_Miss x 20.091 — FG_Miss x 39.190 — TO x 53.897) x (1 / Minutes)

Reference: https://towardsdatascience.com/predicting-the-outcome-of-nba-games-with-machine-learning-a810bb768f20

In [65]:
df.columns

Index(['Season ID', 'Player ID', 'Player Name', 'Game ID', 'Game Date',
       'Matchup', 'Player Team', 'Opponent', 'Home', 'Won', 'Minutes played',
       'Field Goals Made', 'Field Goals Attempted',
       'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt',
       'Free Throws Made', 'Free Throws Attempt', 'Offensive Rebounds',
       'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
       'Personal Fouls', 'Points'],
      dtype='object')

In [74]:
df['Player Efficiency Ratings'] = ''

for idx, row in df.iterrows():
    if row['Minutes played'] == 0:
        per = 0
    else:
        per = round((row['Field Goals Made'] * 85.910 + row['Steals'] * 53.897 + row['Field Goals 3 Points Made'] * 51.757 + row['Free Throws Made'] * 46.845 + row['Blocks'] * 39.190 + row['Offensive Rebounds'] * 39.190 + row['Assists'] * 34.677 + row['Defensive Rebounds'] * 14.707 - row['Personal Fouls'] * 17.174 - (row['Free Throws Attempt'] - row['Free Throws Made']) * 20.091 - (row['Free Throws Attempt'] - row['Free Throws Made']) * 39.190 - row['Turnovers'] * 53.897) * (1 / row['Minutes played']), 3)
    df['Player Efficiency Ratings'].iloc[idx] = per

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Player Efficiency Ratings'].iloc[idx] = per


In [75]:
df.head(2)

Unnamed: 0,Season ID,Player ID,Player Name,Game ID,Game Date,Matchup,Player Team,Opponent,Home,Won,...,Free Throws Attempt,Offensive Rebounds,Defensive Rebounds,Assists,Steals,Blocks,Turnovers,Personal Fouls,Points,Player Efficiency Ratings
0,22020,1630173,Precious Achiuwa,22001069,2021-05-16,MIA @ DET,MIA,DET,0,1,...,7,3,7,0,2,1,2,3,23,23.112
1,22020,1630173,Precious Achiuwa,22001062,2021-05-15,MIA @ MIL,MIA,MIL,0,0,...,0,0,0,1,0,0,0,0,2,60.293


## Save the DataFrame

In [11]:
import pandas as pd

# Pickle
# df.to_pickle('final_df.pkl')

# Unpickle
df = pd.read_pickle('final_df.pkl')

## Get the next games

In [35]:
playernextngames.PlayerNextNGames(
    number_of_games=3,
    player_id=players_ids[0], 
    season_all='2021-22', 
    season_type_all_star=season_type).get_data_frames()[0]

Unnamed: 0,GAME_ID,GAME_DATE,HOME_TEAM_ID,VISITOR_TEAM_ID,HOME_TEAM_NAME,VISITOR_TEAM_NAME,HOME_TEAM_ABBREVIATION,VISITOR_TEAM_ABBREVIATION,HOME_TEAM_NICKNAME,VISITOR_TEAM_NICKNAME,GAME_TIME,HOME_WL,VISITOR_WL
0,22100790,"FEB 04, 2022",1610612761,1610612737,Toronto,Atlanta,TOR,ATL,Raptors,Hawks,07:30 PM,27-23,25-26
1,22100810,"FEB 07, 2022",1610612766,1610612761,Charlotte,Toronto,CHA,TOR,Hornets,Raptors,07:00 PM,28-24,27-23
2,22100827,"FEB 09, 2022",1610612760,1610612761,Oklahoma City,Toronto,OKC,TOR,Thunder,Raptors,08:00 PM,16-34,27-23


## Function to retrieve the mean value of players stats

In [3]:
def get_mean_player_stats(df, player_id, ref_date, n_days, opponent='Any'):
    """
    df: Pandas DataFrame with the data
    ref_date: String with the reference date (YYYY-MM-DD)
    n_days: Number of records to be used in the mean
    opponent: String of the opponenet name, with 3 leters
    """
    columns_to_get_mean = ['Minutes played',
       'Field Goals Made', 'Field Goals Attempted',
       'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt',
       'Free Throws Made', 'Free Throws Attempt', 'Offensive Rebounds',
       'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
       'Points', 'Player Efficiency Ratings']
    opponent = opponent.upper()
    ref_date = pd.to_datetime(ref_date)
    
    if opponent == 'ANY':
        result = df[(df['Player ID'] == player_id) & (df['Game Date'] < ref_date)].sort_values('Game Date', ascending=False).loc[:][0:n_days]
        if result.shape[0] == 0:
            print("The search did not retrieve any result")
            return None
    else:
        result = df[(df['Player ID'] == player_id) & (df['Game Date'] < ref_date) & (df['Opponent'] == opponent)].sort_values('Game Date', ascending=False).loc[:][0:n_days]
        if result.shape[0] == 0:
            print("The search did not retrieve any result")
            return None
    return result[columns_to_get_mean].mean()

In [4]:
# Function test
player_id = '2544' # LeBron James
date = '2021-12-28'
opponent = 'GSW'
n_days = 4

get_mean_player_stats(df, player_id, date, n_days, opponent)

Minutes played                 30.666667
Field Goals Made                7.000000
Field Goals Attempted          13.666667
Field Goals 3 Points Made       2.333333
Field Goal 3 Points Attempt     5.333333
Free Throws Made                3.666667
Free Throws Attempt             6.000000
Offensive Rebounds              1.000000
Defensive Rebounds              6.000000
Assists                         6.666667
Steals                          1.000000
Blocks                          0.666667
Turnovers                       4.333333
Points                         20.000000
Player Efficiency Ratings      32.185667
dtype: float64

## Get Player Position

In [5]:
test = commonplayerinfo.CommonPlayerInfo(player_id='2544').get_data_frames()[0]
test.head()

Unnamed: 0,PERSON_ID,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,...,PLAYERCODE,FROM_YEAR,TO_YEAR,DLEAGUE_FLAG,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,GREATEST_75_FLAG
0,2544,LeBron,James,LeBron James,"James, LeBron",L. James,lebron-james,1984-12-30T00:00:00,St. Vincent-St. Mary HS (OH),USA,...,lebron_james,2003,2021,N,Y,Y,2003,1,1,Y


In [10]:
test.columns

Index(['PERSON_ID', 'FIRST_NAME', 'LAST_NAME', 'DISPLAY_FIRST_LAST',
       'DISPLAY_LAST_COMMA_FIRST', 'DISPLAY_FI_LAST', 'PLAYER_SLUG',
       'BIRTHDATE', 'SCHOOL', 'COUNTRY', 'LAST_AFFILIATION', 'HEIGHT',
       'WEIGHT', 'SEASON_EXP', 'JERSEY', 'POSITION', 'ROSTERSTATUS',
       'GAMES_PLAYED_CURRENT_SEASON_FLAG', 'TEAM_ID', 'TEAM_NAME',
       'TEAM_ABBREVIATION', 'TEAM_CODE', 'TEAM_CITY', 'PLAYERCODE',
       'FROM_YEAR', 'TO_YEAR', 'DLEAGUE_FLAG', 'NBA_FLAG', 'GAMES_PLAYED_FLAG',
       'DRAFT_YEAR', 'DRAFT_ROUND', 'DRAFT_NUMBER', 'GREATEST_75_FLAG'],
      dtype='object')

In [9]:
test['POSITION'].values[0]

'Forward'

In [12]:
df.columns

Index(['Season ID', 'Player ID', 'Player Name', 'Game ID', 'Game Date',
       'Matchup', 'Player Team', 'Opponent', 'Home', 'Won', 'Minutes played',
       'Field Goals Made', 'Field Goals Attempted',
       'Field Goals 3 Points Made', 'Field Goal 3 Points Attempt',
       'Free Throws Made', 'Free Throws Attempt', 'Offensive Rebounds',
       'Defensive Rebounds', 'Assists', 'Steals', 'Blocks', 'Turnovers',
       'Personal Fouls', 'Points', 'Player Efficiency Ratings'],
      dtype='object')

In [70]:
def get_player_position(id_list) -> dict:
    player_id_position = {}
    for idx, id in enumerate(id_list):
        info = commonplayerinfo.CommonPlayerInfo(player_id=id).get_data_frames()[0]
        player_id_position[id] = info['POSITION'].values[0]
        # Preventing timeout exceptions
        time.sleep(.600)
    return player_id_position

In [None]:
player_positions = get_player_position(players_ids)

In [52]:
# Player ID: '2544'
# Player Name: LeBron James
# Expected Value: Forward

player_positions['2544']

'Forward'

In [56]:
# Count players with missing possition
missing = []
for key, value in player_positions.items():
    if value == '':
        missing.append(key)

len(missing)

17

In [67]:
for player in active_players:
    if str(player['id']) in missing:
        print(player['full_name'])

Derrick Alston Jr.
Mitch Ballock
D.J. Carton
Matt Coleman III
Johnny Hamilton
Daulton Hommes
Feron Hunt
AJ Lawson
Matt Lewis
Isaiah Miller
Matt Ryan
Aamir Simms
Dru Smith
DJ Steward
DJ Stewart
MaCio Teague
Ethan Thompson


Only irrelevant players. Possible solutions:
- Remove them from the dataset
- Define a "default" position for them
- Insert manually

To be decided.

In [68]:
import pickle

# Pickle
# with open('player_positions.pkl', 'wb') as f:
#     pickle.dump(player_positions, f)

# Unpickle
# with open('player_positions.pkl', 'rb') as f:
    # player_positions = pickle.load(f)


## Get players' salary and position from DraftKings

Reference: https://swishanalytics.com/optimus/nba/daily-fantasy-salary-changes

In [None]:
# %pip install requests beautifulsoup4
# %pip install lxml

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
url = "https://swishanalytics.com/optimus/nba/daily-fantasy-salary-changes"

html_content = requests.get(url).text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")
print(soup.prettify()[:200]) # print the parsed data of html

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <style>
  </style>
  <script type="text/javascript">
   (function(e,b){if(!b.__SV){var a,f,i,g;window.mixpanel=b;b._i=[];b.init=function(a,e,d)


In [3]:
scripts = soup.findAll('script')

In [6]:
scripts[21]

<script>
      function Model(){
        this.players_dk = [{"player_id":"956574","player_name":"Jarrett Allen","nickname":"Cavaliers","pos_main":"C","fantasy_pts":"24.07","avg_pts":"36.94","fpts_diff":"-12.87","date":"2022-02-20","salary":"7,600","salary_diff":"4600","salary_diff_percentage":"153.3","salary_change_html":"<td class=\"width-15 green\" id=\"salary-col\">+$4600 (153.3%)<\/td>","salary_change":"153.3"},{"player_id":"1134262","player_name":"Darius Garland","nickname":"Cavaliers","pos_main":"PG","fantasy_pts":"38.63","avg_pts":"38.90","fpts_diff":"-0.27","date":"2022-02-20","salary":"7,900","salary_diff":"4100","salary_diff_percentage":"107.9","salary_change_html":"<td class=\"width-15 green\" id=\"salary-col\">+$4100 (107.9%)<\/td>","salary_change":"107.9"},{"player_id":"837030","player_name":"Karl-Anthony Towns","nickname":"Timberwolves","pos_main":"C","fantasy_pts":"46.21","avg_pts":"47.15","fpts_diff":"-0.94","date":"2022-02-20","salary":"9,800","salary_diff":"5000","sal

In [7]:
table = scripts[21].contents[0]

In [8]:
table = table.split('{')
table

['\n      function Model()',
 '\n        this.players_dk = [',
 '"player_id":"956574","player_name":"Jarrett Allen","nickname":"Cavaliers","pos_main":"C","fantasy_pts":"24.07","avg_pts":"36.94","fpts_diff":"-12.87","date":"2022-02-20","salary":"7,600","salary_diff":"4600","salary_diff_percentage":"153.3","salary_change_html":"<td class=\\"width-15 green\\" id=\\"salary-col\\">+$4600 (153.3%)<\\/td>","salary_change":"153.3"},',
 '"player_id":"1134262","player_name":"Darius Garland","nickname":"Cavaliers","pos_main":"PG","fantasy_pts":"38.63","avg_pts":"38.90","fpts_diff":"-0.27","date":"2022-02-20","salary":"7,900","salary_diff":"4100","salary_diff_percentage":"107.9","salary_change_html":"<td class=\\"width-15 green\\" id=\\"salary-col\\">+$4100 (107.9%)<\\/td>","salary_change":"107.9"},',
 '"player_id":"837030","player_name":"Karl-Anthony Towns","nickname":"Timberwolves","pos_main":"C","fantasy_pts":"46.21","avg_pts":"47.15","fpts_diff":"-0.94","date":"2022-02-20","salary":"9,800","sa

In [9]:
table[-36]

'"player_id":"1142902","player_name":"Neemias Queta","nickname":"Kings","pos_main":"C","fantasy_pts":"0.94","avg_pts":"6.08","fpts_diff":"-5.14","date":"2022-02-16","salary":"3,500","salary_diff":"0","salary_diff_percentage":"0.0","salary_change_html":"<td class=\\"width-15 green\\" id=\\"salary-col\\">+$0 (0.0%)<\\/td>","salary_change":"0.0"}];\n        this.players_ya = [];\n        this.current_site = \'dk\';\n      }\n      Model.prototype = '

In [10]:
table[-35]

'\n        determineSortFunction: function(type)'

In [11]:
table[2]

'"player_id":"956574","player_name":"Jarrett Allen","nickname":"Cavaliers","pos_main":"C","fantasy_pts":"24.07","avg_pts":"36.94","fpts_diff":"-12.87","date":"2022-02-20","salary":"7,600","salary_diff":"4600","salary_diff_percentage":"153.3","salary_change_html":"<td class=\\"width-15 green\\" id=\\"salary-col\\">+$4600 (153.3%)<\\/td>","salary_change":"153.3"},'

In [12]:
table[1]

'\n        this.players_dk = ['

In [13]:
table = table[2:-35]

In [14]:
table[0]

'"player_id":"956574","player_name":"Jarrett Allen","nickname":"Cavaliers","pos_main":"C","fantasy_pts":"24.07","avg_pts":"36.94","fpts_diff":"-12.87","date":"2022-02-20","salary":"7,600","salary_diff":"4600","salary_diff_percentage":"153.3","salary_change_html":"<td class=\\"width-15 green\\" id=\\"salary-col\\">+$4600 (153.3%)<\\/td>","salary_change":"153.3"},'

In [15]:
table[-1]

'"player_id":"1142902","player_name":"Neemias Queta","nickname":"Kings","pos_main":"C","fantasy_pts":"0.94","avg_pts":"6.08","fpts_diff":"-5.14","date":"2022-02-16","salary":"3,500","salary_diff":"0","salary_diff_percentage":"0.0","salary_change_html":"<td class=\\"width-15 green\\" id=\\"salary-col\\">+$0 (0.0%)<\\/td>","salary_change":"0.0"}];\n        this.players_ya = [];\n        this.current_site = \'dk\';\n      }\n      Model.prototype = '

In [106]:
import re
def get_name_position_and_salary(player_info: list) -> dict:
    def convert_salary_float(salary):
        if "," in salary:
            salary = salary.replace(',', '')
        return float(salary)

    try:
        player_name = re.search('"player_name":"[\w].+[\w][",]', player_info)[0].split(':')[1].split(',')[0]
        position = re.search('"pos_main":"[\w]*"', player_info)[0].split(':')[1][1:-1]
        salary = convert_salary_float(re.search('"salary":"[\w]*[,]*[\w]*"', player_info)[0].split(':')[1][1:-1])
    except:
        print(f"It was not possible to get player's info. \nInput: \n{player_info}")

    return({'player_name': player_name, 'position': position, 'salary': salary})
    
get_name_position_and_salary(table[-1])


{'player_name': '"Neemias Queta"', 'position': 'C', 'salary': 3500.0}

In [108]:
position_and_salary = [get_name_position_and_salary(entry) for entry in table]
position_and_salary

[{'player_name': '"Jarrett Allen"', 'position': 'C', 'salary': 7600.0},
 {'player_name': '"Darius Garland"', 'position': 'PG', 'salary': 7900.0},
 {'player_name': '"Karl-Anthony Towns"', 'position': 'C', 'salary': 9800.0},
 {'player_name': '"Trae Young"', 'position': 'G', 'salary': 10.0},
 {'player_name': '"Devin Booker"', 'position': 'G', 'salary': 27.0},
 {'player_name': '"Chris Paul"', 'position': 'G', 'salary': 30.0},
 {'player_name': '"Deandre Ayton"', 'position': 'C', 'salary': 55.0},
 {'player_name': '"Clint Capela"', 'position': 'C', 'salary': 48.0},
 {'player_name': '"John Collins"', 'position': 'C', 'salary': 59.0},
 {'player_name': '"Bogdan Bogdanovic"', 'position': 'F', 'salary': 89.0},
 {'player_name': '"Mikal Bridges"', 'position': 'F', 'salary': 140.0},
 {'player_name': '"Kevin Huerter"', 'position': 'G', 'salary': 127.0},
 {'player_name': '"Danilo Gallinari"', 'position': 'F', 'salary': 137.0},
 {'player_name': '"Jae Crowder"', 'position': 'F', 'salary': 150.0},
 {'play

In [109]:
import pickle

# Pickle
# with open('positions_and_salary.pkl', 'wb') as f:
#     pickle.dump(position_and_salary, f)

# Unpickle
# with open('positions_and_salary.pkl', 'rb') as f:
    # position_and_salary = pickle.load(f)
