In [1]:
!python3 -m pip install beautifulsoup4 tqdm lxml

# alternative to selenium
# !python3 -m pip install playwright



In [86]:
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import requests
import urllib.request
import pandas as pd
import time

SEASONS = list(range(2000,2023))

def scrape_season(season):
    # grab html contents
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html"
    hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
    req = urllib.request.Request(url, headers=hdr)
    html = urllib.request.urlopen(req)

    # access relevant parts of the page via beautiful soup
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', class_='sortable stats_table')
    header_row = table.find('thead')
    body = table.find('tbody')

    # use getText()to extract the headers into a list
    headers = [*[th.getText() for th in header_row.findAll('th')], 'Season']

    # iterate through table row by row and scrape player data
    data = [[i, *[td.getText() for td in row.findAll('td')], int(season)] for i, row in enumerate(body.findAll('tr', class_='full_table'))]

    # NOTE: sports reference does not allow more than 20 requests per minute
    # so we must sleep for 5 seconds
    time.sleep(5)
    return pd.DataFrame(data, columns=headers)

# iteratively scrape each season worth of player stats and concatenate data into 1 dataframe
df = pd.DataFrame(columns=headers)
for season in tqdm(SEASONS):
    df = pd.concat([df, scrape_season(season)], ignore_index=True, axis=0)

df

100%|██████████| 23/23 [02:20<00:00,  6.11s/it]


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,0,Tariq Abdul-Wahad,SG,25,TOT,61,56,25.9,4.5,10.6,...,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4,2000.0
1,1,Shareef Abdur-Rahim,SF,23,VAN,82,82,39.3,7.2,15.6,...,2.7,7.4,10.1,3.3,1.1,1.1,3.0,3.0,20.3,2000.0
2,2,Cory Alexander,PG,26,DEN,29,2,11.3,1.0,3.4,...,0.3,1.2,1.4,2.0,0.8,0.1,1.0,1.3,2.8,2000.0
3,3,Ray Allen*,SG,24,MIL,82,82,37.4,7.8,17.2,...,1.0,3.4,4.4,3.8,1.3,0.2,2.2,2.3,22.1,2000.0
4,4,Rafer Alston,PG,23,MIL,27,0,13.4,1.0,3.5,...,0.2,0.7,0.9,2.6,0.4,0.0,1.1,1.1,2.2,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,600,Thaddeus Young,PF,33,TOT,52,1,16.3,2.7,5.2,...,1.5,2.5,4.0,2.0,1.0,0.3,1.0,1.6,6.2,2022.0
10983,601,Trae Young,PG,23,ATL,76,76,34.9,9.4,20.3,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,2022.0
10984,602,Omer Yurtseven,C,23,MIA,56,12,12.6,2.3,4.4,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,2022.0
10985,603,Cody Zeller,C,29,POR,27,0,13.1,1.9,3.3,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,2022.0


In [91]:
# sort the data by player, then by season
df = df.sort_values(by=['Player', 'Season'], ascending=[True, True]).reset_index(drop=True)
df['Season'] = df['Season'].astype(int)

In [92]:
df.head(20)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Season
0,153,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,...,2.0,4.0,5.9,1.0,0.6,0.2,0.6,1.5,5.0,2000
1,151,A.C. Green,PF,37,MIA,82,1,17.2,1.8,4.0,...,1.3,2.5,3.8,0.5,0.4,0.1,0.5,1.5,4.5,2001
2,46,A.J. Bramlett,C,23,CLE,8,0,7.6,0.5,2.6,...,1.5,1.3,2.8,0.0,0.1,0.0,0.4,1.6,1.0,2000
3,154,A.J. Guyton,PG,22,CHI,33,8,19.1,2.4,5.8,...,0.3,0.8,1.1,1.9,0.3,0.2,0.7,1.1,6.0,2001
4,159,A.J. Guyton,PG,23,CHI,45,6,13.5,2.0,5.4,...,0.3,0.7,1.0,1.8,0.2,0.2,0.8,0.5,5.4,2002
5,145,A.J. Guyton,PG,24,GSW,2,0,4.5,0.0,2.0,...,0.0,0.0,0.0,1.0,0.5,0.0,0.5,0.0,0.0,2003
6,170,A.J. Hammons,C,24,DAL,22,0,7.4,0.8,1.9,...,0.4,1.3,1.6,0.2,0.0,0.6,0.5,1.0,2.2,2017
7,334,A.J. Price,PG,23,IND,56,2,15.4,2.6,6.3,...,0.2,1.4,1.6,1.9,0.6,0.1,1.1,0.9,7.3,2010
8,338,A.J. Price,PG,24,IND,50,0,15.9,2.3,6.4,...,0.3,1.1,1.4,2.2,0.6,0.0,1.1,1.2,6.5,2011
9,356,A.J. Price,PG,25,IND,44,1,12.9,1.3,4.0,...,0.3,1.1,1.4,2.0,0.5,0.0,0.7,0.7,3.9,2012


In [93]:
df.to_csv('player_seasonal_stats.csv', index=False)

In [6]:
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import requests
import urllib.request
import pandas as pd
import time

SEASONS = list(range(2000,2023))

def scrape_season(season):
    # grab html contents
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_advanced.html"
    hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
    req = urllib.request.Request(url, headers=hdr)
    html = urllib.request.urlopen(req)

    # access relevant parts of the page via beautiful soup
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', class_='sortable stats_table')
    header_row = table.find('thead')
    body = table.find('tbody')

    # use getText()to extract the headers into a list
    headers = [*[th.getText() for th in header_row.findAll('th')], 'Season']
    print("headers", headers)

    # iterate through table row by row and scrape player data
    data = [[i, *[td.getText() for td in row.findAll('td')], int(season)] for i, row in enumerate(body.findAll('tr', class_='full_table'))]

    # NOTE: sports reference does not allow more than 20 requests per minute
    # so we must sleep for 5 seconds
    time.sleep(5)
    return pd.DataFrame(data, columns=headers)

# iteratively scrape each season worth of player stats and concatenate data into 1 dataframe
df = pd.DataFrame()
for season in tqdm(SEASONS):
    df = pd.concat([df, scrape_season(season)], ignore_index=True, axis=0)

df

  0%|          | 0/23 [00:00<?, ?it/s]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


  4%|▍         | 1/23 [00:06<02:26,  6.66s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


  9%|▊         | 2/23 [00:14<02:38,  7.55s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 13%|█▎        | 3/23 [00:22<02:27,  7.40s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 17%|█▋        | 4/23 [00:28<02:11,  6.91s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 22%|██▏       | 5/23 [00:34<02:01,  6.78s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 26%|██▌       | 6/23 [00:41<01:52,  6.60s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 30%|███       | 7/23 [00:48<01:48,  6.79s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 35%|███▍      | 8/23 [00:54<01:40,  6.67s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 39%|███▉      | 9/23 [01:02<01:36,  6.90s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 43%|████▎     | 10/23 [01:08<01:29,  6.87s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 48%|████▊     | 11/23 [01:15<01:21,  6.81s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 52%|█████▏    | 12/23 [01:23<01:17,  7.06s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 57%|█████▋    | 13/23 [01:29<01:09,  6.93s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 61%|██████    | 14/23 [01:36<01:00,  6.74s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 65%|██████▌   | 15/23 [01:42<00:52,  6.56s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 70%|██████▉   | 16/23 [01:50<00:48,  6.96s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 74%|███████▍  | 17/23 [01:57<00:41,  6.99s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 78%|███████▊  | 18/23 [02:04<00:35,  7.05s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 83%|████████▎ | 19/23 [02:12<00:29,  7.29s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 87%|████████▋ | 20/23 [02:18<00:21,  7.03s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 91%|█████████▏| 21/23 [02:26<00:14,  7.22s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


 96%|█████████▌| 22/23 [02:33<00:07,  7.29s/it]

headers ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', '\xa0', 'OWS', 'DWS', 'WS', 'WS/48', '\xa0', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Season']


100%|██████████| 23/23 [02:40<00:00,  7.00s/it]


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 16,OBPM,DBPM,BPM,VORP,Season
0,0,Tariq Abdul-Wahad,SG,25,TOT,61,1578,13.6,.477,.036,...,0.4,1.8,2.2,.068,,-1.2,-0.1,-1.2,0.3,2000
1,1,Shareef Abdur-Rahim,SF,23,VAN,82,3223,20.2,.547,.075,...,6.2,2.6,8.8,.132,,2.6,-0.4,2.2,3.4,2000
2,2,Cory Alexander,PG,26,DEN,29,329,8.8,.381,.357,...,-0.5,0.4,-0.1,-0.012,,-4.1,1.8,-2.4,0.0,2000
3,3,Ray Allen*,SG,24,MIL,82,3070,20.6,.570,.288,...,9.0,1.0,10.1,.157,,4.7,-1.1,3.6,4.3,2000
4,4,Rafer Alston,PG,23,MIL,27,361,4.3,.310,.147,...,-0.7,0.0,-0.7,-0.095,,-5.0,-2.3,-7.3,-0.5,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10982,600,Thaddeus Young,PF,33,TOT,52,845,17.0,.548,.176,...,0.9,1.3,2.2,.126,,0.1,2.1,2.2,0.9,2022
10983,601,Trae Young,PG,23,ATL,76,2652,25.4,.603,.395,...,9.0,1.0,10.0,.181,,7.1,-2.0,5.2,4.8,2022
10984,602,Omer Yurtseven,C,23,MIA,56,706,17.4,.546,.045,...,0.8,1.4,2.1,.145,,-1.4,0.4,-1.0,0.2,2022
10985,603,Cody Zeller,C,29,POR,27,355,17.2,.627,.044,...,0.9,0.2,1.1,.143,,-1.2,-1.0,-2.1,0.0,2022
