In [10]:
# patterned after scraping code found here: https://github.com/jacobbaruch/NBA_data_scraping_and_analysis

In [34]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import ast

In [48]:
# Specify the date range we're interested in.
# Both bounds are (year, month, day) tuples; they are compared inclusively
# against the available boxscore dates when those are filtered.
start_date = (2018, 11, 4)
end_date = (2018, 11, 6)

In [49]:
# retrieve the available dates for boxscores
showdates_url = 'https://basketball.realgm.com/ajax/nba_scoreboard.phtml?action=showdates'

# a timeout keeps the notebook from hanging forever on a stalled connection;
# raise_for_status surfaces HTTP errors here rather than as a confusing
# literal-parsing failure on an error page further down
dates_resp = requests.get(showdates_url, timeout=30)
dates_resp.raise_for_status()

# the response body is parsed as a Python literal; the reformatting below
# expects a list of [month, day, year] entries
raw_dates = ast.literal_eval(dates_resp.text)

# reformat from [[month, day, year]] to [(year, month, day)] so that plain
# tuple comparison orders the dates chronologically
dates_all = [(d[2], d[0], d[1]) for d in raw_dates]

# filter to the dates of interest (both bounds inclusive)
dates = [d for d in dates_all if start_date <= d <= end_date]

In [71]:
# for each date of interest, collect (box score href, date string) pairs
links = []
for date in dates:
    # load e.g.: "https://basketball.realgm.com/nba/scores/{}/All".format("2018-11-01")
    date_str = "{:0>4}-{:0>2}-{:0>2}".format(date[0], date[1], date[2])
    scores_url = "https://basketball.realgm.com/nba/scores/{}/All".format(date_str)

    # bound the wait on each request and fail loudly on an HTTP error instead
    # of silently parsing an error page that contains no game tables
    page_resp = requests.get(scores_url, timeout=30)
    page_resp.raise_for_status()
    soup = BeautifulSoup(page_resp.text, 'html.parser')

    # within that page, find tables with class="game played"
    # within those tables, find the a href with text of "Box Score"
    links.extend(
        (link.get('href'), date_str)
        for link in soup.select('table.game.played a')
        if link.text == 'Box Score'
    )

In [72]:
# show the collected (href, date string) pairs
links

[('/nba/boxscore/2018-11-04/Sacramento-at-Milwaukee/308936', '2018-11-04'),
 ('/nba/boxscore/2018-11-04/Philadelphia-at-Brooklyn/308937', '2018-11-04'),
 ('/nba/boxscore/2018-11-04/New-York-at-Washington/308938', '2018-11-04'),
 ('/nba/boxscore/2018-11-04/Orlando-at-San-Antonio/308939', '2018-11-04'),
 ('/nba/boxscore/2018-11-04/Memphis-at-Phoenix/308940', '2018-11-04'),
 ('/nba/boxscore/2018-11-04/Minnesota-at-Portland/308941', '2018-11-04'),
 ('/nba/boxscore/2018-11-04/Toronto-at-LA-Lakers/308942', '2018-11-04'),
 ('/nba/boxscore/2018-11-05/Houston-at-Indiana/308944', '2018-11-05'),
 ('/nba/boxscore/2018-11-05/Miami-at-Detroit/308943', '2018-11-05'),
 ('/nba/boxscore/2018-11-05/Cleveland-at-Orlando/308945', '2018-11-05'),
 ('/nba/boxscore/2018-11-05/Chicago-at-New-York/308946', '2018-11-05'),
 ('/nba/boxscore/2018-11-05/New-Orleans-at-Oklahoma-City/308947',
  '2018-11-05'),
 ('/nba/boxscore/2018-11-05/Toronto-at-Utah/308949', '2018-11-05'),
 ('/nba/boxscore/2018-11-05/Boston-at-Denve

In [74]:
def parse_made_attempted(val):
    """Split a hyphen separated "{made}-{attempted}" stat into two integers.

    Some box score stats, e.g. FGM-A (field goals made-attempted), arrive as a
    single string; breaking them into discrete values allows detail calcs.

    Returns a ``(made, attempted)`` tuple of ints. Raises ``ValueError`` when
    the text does not split into exactly two integer parts.
    """
    pieces = val.split('-')
    made, attempted = pieces
    return int(made), int(attempted)

In [79]:
# (key, td index) for cells stored as text ahead of the shooting columns
_TEXT_COLUMNS_BEFORE_SHOTS = (
    ('name', 1), ('status', 2), ('position', 3), ('minutes', 4),
)
# (made key, attempted key, td index) for hyphenated "{made}-{attempted}" cells
_MADE_ATTEMPTED_COLUMNS = (
    ('fgm', 'fga', 5), ('3pm', '3pa', 6), ('ftm', 'fta', 7),
)
# (key, td index) for cells stored as text after the shooting columns
_TEXT_COLUMNS_AFTER_SHOTS = (
    ('fic', 8), ('off', 9), ('def', 10), ('reb', 11), ('ast', 12),
    ('pf', 13), ('stl', 14), ('blk', 15), ('to', 16), ('pts', 17),
)


def _cell_text(cell):
    """Return ``cell.string`` copied to a plain ``str``; None is passed through."""
    text = cell.string
    # bs4 hands back NavigableString objects, which keep a reference to the
    # whole parse tree; copying to str lets each page's tree be freed once
    # parsing is done instead of living on inside the collected records
    return None if text is None else str(text)


# todo: clean up column names
# todo: order columns intuitively
# todo: parse to correct types
def parse_team_stats(team_name, date, stat_rows):
    """Turn one team's box score rows into a list of per-player dicts.

    Parameters
    ----------
    team_name : value stored under the 'team' key of every record.
    date : value stored under the 'date' key of every record.
    stat_rows : iterable of row elements; each must support ``find_all('td')``
        and yield at least 18 cells whose ``.string`` holds the cell text.

    Returns
    -------
    list of dict
        One dict per row. 'number' and the made/attempted pairs (fgm/fga,
        3pm/3pa, ftm/fta) are ints; every other stat is kept as text.

    Raises
    ------
    IndexError if a row has fewer than 18 cells; ValueError/TypeError if the
    jersey number or a made-attempted cell cannot be parsed.
    """
    players = []
    for row in stat_rows:
        cols = row.find_all('td')

        player = {
            'team': team_name,
            'date': date,
            'number': int(cols[0].string),
        }

        for key, idx in _TEXT_COLUMNS_BEFORE_SHOTS:
            player[key] = _cell_text(cols[idx])

        for made_key, attempted_key, idx in _MADE_ATTEMPTED_COLUMNS:
            made, attempted = parse_made_attempted(cols[idx].string)
            player[made_key] = made
            player[attempted_key] = attempted

        for key, idx in _TEXT_COLUMNS_AFTER_SHOTS:
            player[key] = _cell_text(cols[idx])

        players.append(player)

    return players

In [77]:
# follow link to box score page
#https://basketball.realgm.com/nba/boxscore/2018-11-01/LA-Clippers-at-Philadelphia/308916
stats = []
for (link, date) in links:
    url = 'https://basketball.realgm.com' + link
    print('retrieving stats for {}'.format(url))

    # bound the wait on each request and stop on an HTTP error rather than
    # trying to pick tables out of an error page
    page_resp = requests.get(url, timeout=30)
    page_resp.raise_for_status()
    soup = BeautifulSoup(page_resp.text, 'html.parser')

    # pull all h2 tags to get team names
    headers = soup.find_all('h2')
    tables = soup.find_all('table')

    # the positional lookups below need h2 #1-#2 and table #3-#4 to exist;
    # name the offending page instead of failing with a bare IndexError
    if len(headers) < 3 or len(tables) < 5:
        raise IndexError(
            'unexpected box score layout at {}: found {} h2 tags and {} tables'.format(
                url, len(headers), len(tables)))

    teamA_players = parse_team_stats(headers[1].text, date, tables[3].find('tbody').find_all('tr'))
    teamB_players = parse_team_stats(headers[2].text, date, tables[4].find('tbody').find_all('tr'))

    stats.extend(teamA_players)
    stats.extend(teamB_players)

retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/Sacramento-at-Milwaukee/308936
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/Philadelphia-at-Brooklyn/308937
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/New-York-at-Washington/308938
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/Orlando-at-San-Antonio/308939
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/Memphis-at-Phoenix/308940
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/Minnesota-at-Portland/308941
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-04/Toronto-at-LA-Lakers/308942
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-05/Houston-at-Indiana/308944
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-11-05/Miami-at-Detroit/308943
retrieving stats for https://basketball.realgm.com/nba/boxscore/2018-

In [78]:
# build a DataFrame from the collected per-player stat dicts (one row per dict)
df = pd.DataFrame(stats)
df

Unnamed: 0,3pa,3pm,ast,blk,date,def,fga,fgm,fic,fta,...,number,off,pf,position,pts,reb,status,stl,team,to
0,5,1,2,0,2018-11-04,7,8,2,3.8,0,...,9,0.0,3,SF,5,7,Starter,0,Sacramento Kings,1
1,0,0,1,2,2018-11-04,3,3,2,3.2,6,...,0,1.0,5,C,6,4,Starter,1,Sacramento Kings,3
2,5,2,1,0,2018-11-04,2,15,8,7.9,1,...,24,2.0,2,SG,19,4,Starter,0,Sacramento Kings,3
3,5,3,6,0,2018-11-04,6,14,6,12.5,0,...,5,0.0,1,PG,15,6,Starter,0,Sacramento Kings,2
4,1,0,1,0,2018-11-04,3,8,2,0.8,0,...,88,1.0,1,PF,4,4,Starter,0,Sacramento Kings,1
5,0,0,3,0,2018-11-04,3,2,1,6.8,0,...,41,2.0,4,C,2,5,Bench,1,Sacramento Kings,0
6,1,0,1,0,2018-11-04,0,1,0,0.5,2,...,7,0.0,2,C,2,0,Bench,1,Sacramento Kings,1
7,0,0,0,0,2018-11-04,1,6,2,1.2,0,...,20,1.0,0,PF,4,2,Bench,1,Sacramento Kings,1
8,7,4,1,0,2018-11-04,0,12,9,13.0,0,...,25,0.0,0,SF,22,0,Bench,0,Sacramento Kings,1
9,5,0,6,1,2018-11-04,2,8,3,7.2,2,...,10,0.0,1,PG,6,2,Bench,1,Sacramento Kings,1
