In [74]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time

In [86]:
# get list of urls for players stats
players_url = 'https://www.basketball-reference.com/players/'
r = requests.get(players_url)
bs = BeautifulSoup(r.text)
r.ok

True

In [87]:
# get all last name directory links
    # links found in 'ul' tag with 'page_index' class
    # directory links will have a text with length 1 (e.g. "A")
    # base url must be added to directory path
letter_links = bs.find('ul', {'class': 'page_index'}).find_all('a')
# letter_links = ['https://www.basketball-reference.com' + link.get('href') for link in letter_links if len(link.text) == 1]
letter_links = ['https://www.basketball-reference.com' + link.get('href') for link in letter_links if link.text == 'D']
print(len(letter_links))

1


In [88]:
# get all player links from last name directory links

# initialize empty list of player links
player_links = []

# player links found in 'th' tag w/specific scope and class
# active players with 'strong' tag
# add base url
for letter_link in letter_links:
    time.sleep(2.5)
    r = requests.get(letter_link)
    bs = BeautifulSoup(r.text)
    player_tags = bs.find('tbody').find_all('th', {'scope': 'row', 'class': 'left'})
    player_links += ['https://www.basketball-reference.com' + player_tag.find('a').get('href') for player_tag in player_tags if player_tag.find('strong')]

print(len(player_links))

43


In [89]:
# build dataframe of all players' stats by season

# initialize empty list to which dfs will be appended
appended_data = []

for player_link in tqdm(player_links):

    time.sleep(5)

    # get tables from player page
    dfs = pd.read_html(player_link)
    # save df as second table in html table code
    df = dfs[0]

    # add player name as col
    r = requests.get(player_link)
    bs = BeautifulSoup(r.text)
    name = bs.find("div", {"id": "info", "class": "players"}).find("h1").text
    name = name.split('\n')[1]

    df['Name'] = name
    # add years of experience (use index + 1)
    df['Exp'] = df.index + 1

    # append individual df to list
    appended_data.append(df)

# combine all dfs together
nba = pd.concat(appended_data)

100%|██████████| 43/43 [04:06<00:00,  5.73s/it]


In [90]:
# move Exp and Name to be beginning cols
first = nba.pop('Exp')
nba.insert(0, 'Exp', first)

first = nba.pop('Name')
nba.insert(0, 'Name', first)

# drop all rows where Age is NaN (these are not specific years, could be career or specific team average)
nba = nba.dropna(subset=['Age'])

# reset index
nba = nba.reset_index(drop=True)

In [91]:
nba.head()

Unnamed: 0,Name,Exp,Season,Age,Tm,Lg,Pos,G,GS,MP,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Dyson Daniels,1,2022-23,19.0,NOP,NBA,PG,7.0,0.0,11.9,...,0.667,0.7,2.7,3.4,1.3,0.6,0.6,0.4,1.4,4.0
1,Troy Daniels,1,2013-14,22.0,HOU,NBA,SG,5.0,1.0,15.0,...,,0.0,0.8,0.8,1.0,0.0,0.0,0.6,1.2,8.4
2,Troy Daniels,2,2014-15,23.0,TOT,NBA,SG,47.0,0.0,8.4,...,0.846,0.2,0.5,0.7,0.5,0.1,0.0,0.4,0.9,3.7
3,Troy Daniels,3,2014-15,23.0,HOU,NBA,SG,17.0,0.0,6.4,...,0.75,0.0,0.4,0.4,0.2,0.0,0.0,0.3,0.8,2.7
4,Troy Daniels,4,2014-15,23.0,MIN,NBA,SG,19.0,0.0,8.1,...,1.0,0.3,0.7,1.0,0.7,0.2,0.0,0.4,0.7,2.8


In [92]:
nba.shape

(260, 32)

In [93]:
# check to see # unique players, should be 859
nba.Name.nunique()

43

In [94]:
# read csv's, concatatenate, write out
all = pd.read_csv('nba.csv')

nba = pd.concat([all, nba])

nba.to_csv('nba.csv')

In [None]:
# 11-18
# A-D, next up: E