# NBA Player Stats by Game

* Choose a start date and an end date
* Scrape https://basketball.realgm.com to pull all player-level game stats for that period
* Organize the results into a pandas DataFrame

(Patterned after scraping code found here: https://github.com/jacobbaruch/NBA_data_scraping_and_analysis)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import ast

In [2]:
# specify the date range we're interested in
# both bounds are inclusive and are compared as plain strings against dates
# formatted "YYYY-MM-DD", so keep them zero-padded in that same format
start_date = "2018-11-06"
end_date = "2018-11-07"

In [3]:
def convert_date(date_list):
    """Format an api date triple as a zero-padded `YYYY-MM-DD` string.

    The api supplies each date as `[month, day, year]`, so for example
    `[11, 6, 2018]` becomes `"2018-11-06"`.
    """
    # positions in date_list: 0 = month, 1 = day, 2 = year
    return "{:0>4}-{:0>2}-{:0>2}".format(date_list[2], date_list[0], date_list[1])

In [4]:
# retrieve the available dates for boxscores
showdates_url = 'https://basketball.realgm.com/ajax/nba_scoreboard.phtml?action=showdates'
# a timeout keeps the notebook from hanging forever on an unresponsive server
dates_resp = requests.get(showdates_url, timeout=30)
# fail fast on an HTTP error instead of trying to parse an error page below
dates_resp.raise_for_status()
# the body is parsed as a Python literal (literal_eval does not execute code);
# the steps below expect it to be a list of [month, day, year] lists
dates_all = ast.literal_eval(dates_resp.text)

# reformat from [[month, day, year]] to ["YYYY-MM-DD"]
dates_all = [convert_date(d) for d in dates_all]

# filter to the dates of interest (both bounds inclusive); zero-padded
# YYYY-MM-DD strings sort chronologically, so plain string comparison works
dates = [d for d in dates_all if start_date <= d <= end_date]

In [5]:
# for each date of interest, collect (box score href, date) pairs
links = []
for date in dates:
    # load e.g.: "https://basketball.realgm.com/nba/scores/{}/All".format("2018-11-01")
    # a timeout keeps the loop from hanging forever on an unresponsive server
    page_resp = requests.get(
        "https://basketball.realgm.com/nba/scores/{}/All".format(date),
        timeout=30,
    )
    # fail fast on an HTTP error instead of silently finding no links in an error page
    page_resp.raise_for_status()
    soup = BeautifulSoup(page_resp.text, 'html.parser')

    # within that page, find tables with class="game played"
    # within those tables, find the a href with text of "Box Score"
    links.extend(
        (link.get('href'), date)
        for link in soup.select('table.game.played a')
        if link.text == 'Box Score'
    )

In [6]:
def parse_made_attempted(val):
    """Split a hyphen separated "{made}-{attempted}" stat into two ints.

    Some stats, e.g. FGM-A (field goals made-attempted), arrive as a single
    string; returning discrete values allows detail calcs.

    Returns a `(made, attempted)` tuple, e.g. "6-12" gives `(6, 12)`.
    """
    made, attempted = val.split('-')
    return int(made), int(attempted)

In [7]:
def parse_minutes_played(val):
    """Convert a "minutes:seconds" string to minutes as a float.

    For example "40:30" gives 40.5.
    """
    whole_minutes, extra_seconds = map(int, val.split(':'))
    return whole_minutes + extra_seconds / 60

In [8]:
def parse_team_stats(team_name, date, stat_rows):
    """Build one stats dict per player row of a team's box score table.

    team_name and date are copied unchanged into every dict. Each item of
    stat_rows must offer `find_all('td')`, returning the row's cells in
    table order; the text of each cell is read from its `.string`.

    Returns a list of dicts, one per row, in the order the rows were given.
    """
    # (made key, attempted key, cell index) for the "made-attempted" cells
    shooting_columns = (
        ('FGM', 'FGA', 5),    # field goals
        ('3FGM', '3FGA', 6),  # three-point field goals
        ('FTM', 'FTA', 7),    # free throws
    )
    # (key, cell index) for the cells that hold plain integers
    counting_columns = (
        ('DREB', 10),  # defensive rebounds
        ('REB', 11),   # total rebounds
        ('AST', 12),   # assists
        ('PF', 13),    # personal fouls
        ('STL', 14),   # steals
        ('BLK', 15),   # blocks
        ('TO', 16),    # turnovers
        ('PTS', 17),   # points
    )

    players = []
    for row in stat_rows:
        cells = row.find_all('td')

        record = {
            'date': date,
            'team': team_name,
            'name': cells[1].string,
            'number': int(cells[0].string),
            'status': cells[2].string,
            'position': cells[3].string,
            'minutes': parse_minutes_played(cells[4].string),
        }

        for made_key, attempted_key, index in shooting_columns:
            made, attempted = parse_made_attempted(cells[index].string)
            record[attempted_key] = attempted
            record[made_key] = made

        record['FIC'] = float(cells[8].string)  # floor impact counter
        # offensive rebounds: parsed as a float first, then converted to int
        record['OREB'] = int(float(cells[9].string))

        for key, index in counting_columns:
            record[key] = int(cells[index].string)

        players.append(record)

    return players

In [9]:
# follow each link to its box score page and collect the per-player stats
# e.g. https://basketball.realgm.com/nba/boxscore/2018-11-01/LA-Clippers-at-Philadelphia/308916
stats = []
for (link, date) in links:
    url = 'https://basketball.realgm.com' + link
    print('retrieving stats for {}'.format(url))

    # a timeout keeps the loop from hanging forever on an unresponsive server
    page_resp = requests.get(url, timeout=30)
    # fail fast on an HTTP error instead of indexing into an error page below
    page_resp.raise_for_status()
    soup = BeautifulSoup(page_resp.text, 'html.parser')

    # pull all h2 tags to get team names
    headers = soup.find_all('h2')
    tables = soup.find_all('table')

    # the fixed positions below depend on the box score page layout:
    # h2 tags 1 and 2 are used as the team names, tables 3 and 4 as the
    # matching player stat tables
    teamA_players = parse_team_stats(headers[1].text, date, tables[3].find('tbody').find_all('tr'))
    teamB_players = parse_team_stats(headers[2].text, date, tables[4].find('tbody').find_all('tr'))

    stats.extend(teamA_players)
    stats.extend(teamB_players)

retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-06/Atlanta-at-Charlotte/308952
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-06/Washington-at-Dallas/308953
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-06/Brooklyn-at-Phoenix/308954
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-06/Milwaukee-at-Portland/308955


In [10]:
# control the column order in the generated dataframe
columns = ['date', 'team', 'name', 'number', 'status',
    'position', 'minutes', 'FGA', 'FGM', '3FGA', '3FGM',
    'FTA', 'FTM', 'FIC', 'OREB', 'DREB', 'REB',
    'AST', 'PF', 'STL', 'BLK', 'TO', 'PTS']

# passing `columns` to the constructor selects and orders the columns in one
# step, and still yields an empty frame with these columns when `stats` is
# empty (selecting them afterwards with .loc raises KeyError in that case)
df = pd.DataFrame(stats, columns=columns)

In [11]:
# preview the first rows of the assembled dataframe
df.head()

Unnamed: 0,date,team,name,number,status,position,minutes,FGA,FGM,3FGA,...,FIC,OREB,DREB,REB,AST,PF,STL,BLK,TO,PTS
0,2018-11-06,Atlanta Hawks,Kent Bazemore,24,Starter,SG,28.066667,12,6,7,...,9.9,0,2,2,1,3,4,2,3,16
1,2018-11-06,Atlanta Hawks,Trae Young,11,Starter,PG,29.816667,19,8,7,...,9.5,1,2,3,10,0,0,0,6,18
2,2018-11-06,Atlanta Hawks,Kevin Huerter,1,Starter,SF,32.216667,5,2,2,...,11.6,1,4,5,3,1,3,1,1,7
3,2018-11-06,Atlanta Hawks,Omari Spellman,6,Starter,PF,16.75,6,1,3,...,1.5,2,2,4,1,1,0,0,0,2
4,2018-11-06,Atlanta Hawks,Alex Len,25,Starter,C,19.083333,11,6,3,...,3.5,0,3,3,1,1,1,0,4,12
