In [1]:
!python3 -m pip install beautifulsoup4 tqdm lxml

# alternative to selenium
# !python3 -m pip install playwright

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m


In [11]:
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import requests
import urllib.request
import pandas as pd
import time

# Seasons to scrape: 2000 through 2023 inclusive (the range end is exclusive).
SEASONS = [*range(2000, 2024)]

def scrape_season(season, stat_type="per_game", delay=3.2):
    """Scrape one season of NBA player stats from basketball-reference.com.

    Downloads ``NBA_{season}_{stat_type}.html``, finds the table whose class
    attribute is ``sortable stats_table`` and converts its ``full_table`` rows
    into a DataFrame.

    Args:
        season: Season identifier used in the page URL (e.g. ``2023``); must be
            convertible with ``int()``.
        stat_type: Page suffix inserted into the URL. Defaults to
            ``"per_game"``, which reproduces the original hard-coded URL.
        delay: Seconds to sleep after a successful scrape so that repeated
            calls stay under the site's request limit. Defaults to 3.2.

    Returns:
        pandas.DataFrame with one row per ``full_table`` row. Column names are
        the texts of the ``<th>`` cells in the table head plus ``'Season'``.
        The first value of every row is the row's 0-based position among the
        ``full_table`` rows (it lands under the first header), the following
        values are the ``<td>`` texts as strings, and the last value is
        ``int(season)``.

    Raises:
        ValueError: If the expected table, its ``<thead>`` or its ``<tbody>``
            is not present in the downloaded page.
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # grab html contents; the context manager closes the HTTP response
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_{stat_type}.html"
    hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=hdr)
    with urllib.request.urlopen(req) as response:
        html = response.read()

    # access relevant parts of the page via beautiful soup
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', class_='sortable stats_table')
    if table is None:
        raise ValueError(f"no 'sortable stats_table' table found at {url}")
    header_row = table.find('thead')
    body = table.find('tbody')
    if header_row is None or body is None:
        raise ValueError(f"stats table at {url} is missing its thead or tbody")

    # header texts become the column names; 'Season' is appended as the last column
    headers = [*[th.get_text() for th in header_row.find_all('th')], 'Season']

    # iterate through table row by row and scrape player data; only the <td>
    # cells are read, so the enumerate index fills the slot of the first header
    data = [
        [i, *[td.get_text() for td in row.find_all('td')], int(season)]
        for i, row in enumerate(body.find_all('tr', class_='full_table'))
    ]

    # NOTE: sports reference does not allow more than 20 requests per minute,
    # i.e. at most one request every 3 seconds; the default delay of 3.2 s
    # keeps consecutive calls just under that limit
    time.sleep(delay)
    return pd.DataFrame(data, columns=headers)

# Scrape each season's player stats, then combine the per-season frames with a
# single concat. Collecting the frames in a list first avoids re-copying the
# accumulated data on every iteration and avoids concatenating onto an empty
# DataFrame (which recent pandas versions warn about).
season_frames = [scrape_season(season) for season in tqdm(SEASONS)]
# pd.concat raises on an empty list, so fall back to an empty frame as before
df = pd.concat(season_frames, ignore_index=True, axis=0) if season_frames else pd.DataFrame()

df

100%|██████████| 24/24 [01:36<00:00,  4.03s/it]


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,0,Tariq Abdul-Wahad,SG,25,TOT,61,56,25.9,4.5,10.6,...,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4,2000
1,1,Shareef Abdur-Rahim,SF,23,VAN,82,82,39.3,7.2,15.6,...,2.7,7.4,10.1,3.3,1.1,1.1,3.0,3.0,20.3,2000
2,2,Cory Alexander,PG,26,DEN,29,2,11.3,1.0,3.4,...,0.3,1.2,1.4,2.0,0.8,0.1,1.0,1.3,2.8,2000
3,3,Ray Allen*,SG,24,MIL,82,82,37.4,7.8,17.2,...,1.0,3.4,4.4,3.8,1.3,0.2,2.2,2.3,22.1,2000
4,4,Rafer Alston,PG,23,MIL,27,0,13.4,1.0,3.5,...,0.2,0.7,0.9,2.6,0.4,0.0,1.1,1.1,2.2,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11521,534,Thaddeus Young,PF,34,TOR,54,9,14.7,2.0,3.7,...,1.3,1.8,3.1,1.4,1.0,0.1,0.8,1.6,4.4,2023
11522,535,Trae Young,PG,24,ATL,73,73,34.8,8.2,19.0,...,0.8,2.2,3.0,10.2,1.1,0.1,4.1,1.4,26.2,2023
11523,536,Omer Yurtseven,C,24,MIA,9,0,9.2,1.8,3.0,...,0.9,1.7,2.6,0.2,0.2,0.2,0.4,1.8,4.4,2023
11524,537,Cody Zeller,C,30,MIA,15,2,14.5,2.5,3.9,...,1.7,2.6,4.3,0.7,0.2,0.3,0.9,2.2,6.5,2023


In [12]:
# Order rows by player name, then by season within each player (both
# ascending), renumber the index from 0, and store Season as an integer column.
df = (
    df.sort_values(by=['Player', 'Season'])
    .reset_index(drop=True)
    .assign(Season=lambda frame: frame['Season'].astype(int))
)

In [13]:
# Preview the first 20 rows of the sorted frame.
df.head(n=20)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,153,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,...,2.0,4.0,5.9,1.0,0.6,0.2,0.6,1.5,5.0,2000
1,151,A.C. Green,PF,37,MIA,82,1,17.2,1.8,4.0,...,1.3,2.5,3.8,0.5,0.4,0.1,0.5,1.5,4.5,2001
2,46,A.J. Bramlett,C,23,CLE,8,0,7.6,0.5,2.6,...,1.5,1.3,2.8,0.0,0.1,0.0,0.4,1.6,1.0,2000
3,173,A.J. Green,SG,23,MIL,35,1,9.9,1.5,3.6,...,0.2,1.1,1.3,0.6,0.2,0.0,0.3,0.9,4.4,2023
4,154,A.J. Guyton,PG,22,CHI,33,8,19.1,2.4,5.8,...,0.3,0.8,1.1,1.9,0.3,0.2,0.7,1.1,6.0,2001
5,159,A.J. Guyton,PG,23,CHI,45,6,13.5,2.0,5.4,...,0.3,0.7,1.0,1.8,0.2,0.2,0.8,0.5,5.4,2002
6,145,A.J. Guyton,PG,24,GSW,2,0,4.5,0.0,2.0,...,0.0,0.0,0.0,1.0,0.5,0.0,0.5,0.0,0.0,2003
7,170,A.J. Hammons,C,24,DAL,22,0,7.4,0.8,1.9,...,0.4,1.3,1.6,0.2,0.0,0.6,0.5,1.0,2.2,2017
8,283,A.J. Lawson,SG,22,TOT,15,0,7.2,1.5,2.9,...,0.4,1.0,1.4,0.1,0.1,0.0,0.2,0.7,3.7,2023
9,334,A.J. Price,PG,23,IND,56,2,15.4,2.6,6.3,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,0.9,7.3,2010


In [14]:
# Write the per-game stats to CSV; the DataFrame index is not written.
df.to_csv(path_or_buf='./../data/player_seasonal_stats.csv', index=False)

In [15]:
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import requests
import urllib.request
import pandas as pd
import time

# Seasons to scrape: 2000 through 2023 inclusive (the range end is exclusive).
SEASONS = [*range(2000, 2024)]

def scrape_season(season, stat_type="advanced", delay=3.2):
    """Scrape one season of NBA player stats from basketball-reference.com.

    Downloads ``NBA_{season}_{stat_type}.html``, finds the table whose class
    attribute is ``sortable stats_table`` and converts its ``full_table`` rows
    into a DataFrame.

    NOTE: this definition replaces any ``scrape_season`` defined earlier in
    the notebook; from this cell on the default page is the advanced table.

    Args:
        season: Season identifier used in the page URL (e.g. ``2023``); must be
            convertible with ``int()``.
        stat_type: Page suffix inserted into the URL. Defaults to
            ``"advanced"``, which reproduces the original hard-coded URL.
        delay: Seconds to sleep after a successful scrape so that repeated
            calls stay under the site's request limit. Defaults to 3.2.

    Returns:
        pandas.DataFrame with one row per ``full_table`` row. Column names are
        the texts of the ``<th>`` cells in the table head plus ``'Season'``.
        The first value of every row is the row's 0-based position among the
        ``full_table`` rows (it lands under the first header), the following
        values are the ``<td>`` texts as strings, and the last value is
        ``int(season)``.

    Raises:
        ValueError: If the expected table, its ``<thead>`` or its ``<tbody>``
            is not present in the downloaded page.
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # grab html contents; the context manager closes the HTTP response
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_{stat_type}.html"
    hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=hdr)
    with urllib.request.urlopen(req) as response:
        html = response.read()

    # access relevant parts of the page via beautiful soup
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', class_='sortable stats_table')
    if table is None:
        raise ValueError(f"no 'sortable stats_table' table found at {url}")
    header_row = table.find('thead')
    body = table.find('tbody')
    if header_row is None or body is None:
        raise ValueError(f"stats table at {url} is missing its thead or tbody")

    # header texts become the column names; 'Season' is appended as the last column
    headers = [*[th.get_text() for th in header_row.find_all('th')], 'Season']

    # iterate through table row by row and scrape player data; only the <td>
    # cells are read, so the enumerate index fills the slot of the first header
    data = [
        [i, *[td.get_text() for td in row.find_all('td')], int(season)]
        for i, row in enumerate(body.find_all('tr', class_='full_table'))
    ]

    # NOTE: sports reference does not allow more than 20 requests per minute,
    # i.e. at most one request every 3 seconds; the default delay of 3.2 s
    # keeps consecutive calls just under that limit
    time.sleep(delay)
    return pd.DataFrame(data, columns=headers)

# Scrape each season's player stats, then combine the per-season frames with a
# single concat. Collecting the frames in a list first avoids re-copying the
# accumulated data on every iteration and avoids concatenating onto an empty
# DataFrame (which recent pandas versions warn about).
season_frames = [scrape_season(season) for season in tqdm(SEASONS)]
# pd.concat raises on an empty list, so fall back to an empty frame as before
df = pd.concat(season_frames, ignore_index=True, axis=0) if season_frames else pd.DataFrame()

df

100%|██████████| 24/24 [01:35<00:00,  4.00s/it]


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 16,OBPM,DBPM,BPM,VORP,Season
0,0,Tariq Abdul-Wahad,SG,25,TOT,61,1578,13.6,.477,.036,...,0.4,1.8,2.2,.068,,-1.2,-0.1,-1.2,0.3,2000
1,1,Shareef Abdur-Rahim,SF,23,VAN,82,3223,20.2,.547,.075,...,6.2,2.6,8.8,.132,,2.6,-0.4,2.2,3.4,2000
2,2,Cory Alexander,PG,26,DEN,29,329,8.8,.381,.357,...,-0.5,0.4,-0.1,-0.012,,-4.1,1.8,-2.4,0.0,2000
3,3,Ray Allen*,SG,24,MIL,82,3070,20.6,.570,.288,...,9.0,1.0,10.1,.157,,4.7,-1.1,3.6,4.3,2000
4,4,Rafer Alston,PG,23,MIL,27,361,4.3,.310,.147,...,-0.7,0.0,-0.7,-0.095,,-5.0,-2.3,-7.3,-0.5,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11521,534,Thaddeus Young,PF,34,TOR,54,795,14.1,.573,.172,...,0.7,1.1,1.8,.109,,-1.8,1.9,0.1,0.4,2023
11522,535,Trae Young,PG,24,ATL,73,2541,22.0,.573,.331,...,5.3,1.4,6.7,.126,,5.3,-2.0,3.3,3.4,2023
11523,536,Omer Yurtseven,C,24,MIA,9,83,16.7,.675,.259,...,0.2,0.1,0.3,.159,,-2.5,-1.5,-3.9,0.0,2023
11524,537,Cody Zeller,C,30,MIA,15,217,16.4,.659,.034,...,0.4,0.3,0.7,.147,,-2.0,-0.7,-2.8,0.0,2023


In [16]:
# Write the advanced stats to CSV; the DataFrame index is not written.
df.to_csv(path_or_buf='./../data/player_seasonal_stats_advanced.csv', index=False)