In [None]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Download the players index page, which links to one directory page per
# last-name initial.
players_url = 'https://www.basketball-reference.com/players/'

# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.get(players_url, timeout=30)

# Fail here on a 4xx/5xx response rather than parsing an error page and
# crashing later with an unrelated AttributeError.
r.raise_for_status()

# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed (and to avoid bs4's
# GuessedAtParserWarning).
bs = BeautifulSoup(r.text, 'html.parser')

# Display the HTTP status of the index request as the cell output.
r.status_code

In [None]:
# Collect the URL of every last-name directory page.
#   - the anchors live inside the 'ul' tag carrying the 'page_index' class
#   - a directory anchor is recognised by its one-character text (e.g. "A")
#   - hrefs are paths, so the base url is prepended to each of them
index_anchors = bs.find('ul', {'class': 'page_index'}).find_all('a')

letter_links = []
for anchor in index_anchors:
    # skip anything in the index list that is not a single-letter entry
    if len(anchor.text) != 1:
        continue
    letter_links.append('https://www.basketball-reference.com' + anchor.get('href'))

print(len(letter_links))

In [None]:
# Get the profile link of every active player from the last-name directory pages.

# Re-initialised on every run so that re-executing this cell never
# accumulates duplicate links.
player_links = []

for letter_link in letter_links:
    # pause between requests to stay polite to the site
    time.sleep(2.5)

    # Use cell-specific names here: reusing `r` / `bs` would overwrite the
    # index-page objects that the directory-link cell reads, breaking a
    # re-run of that cell.
    letter_resp = requests.get(letter_link, timeout=30)
    # Stop with a clear HTTP error (e.g. 429 when rate-limited) instead of
    # an AttributeError on a missing <tbody> further down.
    letter_resp.raise_for_status()
    letter_soup = BeautifulSoup(letter_resp.text, 'html.parser')

    # player cells are the 'th' tags with scope="row" and class "left"
    player_tags = letter_soup.find('tbody').find_all('th', {'scope': 'row', 'class': 'left'})

    # active players are the ones wrapped in a 'strong' tag; their hrefs are
    # paths, so the base url is prepended
    player_links += [
        'https://www.basketball-reference.com' + player_tag.find('a').get('href')
        for player_tag in player_tags
        if player_tag.find('strong')
    ]

print(len(player_links))

In [None]:
# Build one dataframe holding every player's stats by season.

# per-player frames are collected here and concatenated once after the loop
appended_data = []

for player_link in tqdm(player_links):

    # throttle: one request per player every 5 seconds
    time.sleep(5)

    # Download the player page ONCE and reuse the HTML for both the stats
    # table and the player name. Previously the page was fetched twice per
    # player (pd.read_html(url) + requests.get), doubling the request rate.
    r = requests.get(player_link, timeout=30)
    # surface HTTP errors (e.g. 429 rate limiting) immediately
    r.raise_for_status()

    # Take the FIRST table found on the page. The HTML is wrapped in StringIO
    # because passing a literal HTML string to read_html is deprecated in
    # recent pandas versions.
    df = pd.read_html(StringIO(r.text))[0]

    # The player name is the <h1> text inside the div with id "info" and
    # class "players"; strip() removes the surrounding newlines/indentation
    # without assuming the text starts with a newline.
    bs = BeautifulSoup(r.text, 'html.parser')
    name = bs.find("div", {"id": "info", "class": "players"}).find("h1").text.strip()

    df['Name'] = name

    # years of experience = row position + 1
    # NOTE(review): this numbers every row of the table, including the
    # non-season rows that a later cell drops via Age NaN; if a season can
    # span several rows, Exp will over-count — TODO confirm against the data.
    df['Exp'] = df.index + 1

    # keep the per-player frame for the final concat
    appended_data.append(df)

# Combine all per-player frames; pd.concat raises ValueError if no player
# pages were collected.
nba = pd.concat(appended_data)

In [None]:
# Bring 'Name' and 'Exp' to the front of the frame. Each column is popped and
# re-inserted at position 0, so processing 'Exp' first and 'Name' second
# leaves the final order as: Name, Exp, <remaining columns>.
for col in ('Exp', 'Name'):
    nba.insert(0, col, nba.pop(col))

# Rows with no Age are not individual seasons (per the original note they are
# career or per-team summary lines), so discard them and renumber the rows
# from 0.
nba = nba.dropna(subset=['Age']).reset_index(drop=True)

In [None]:
# preview the first five rows of the cleaned season table
nba.head(n=5)

In [None]:
# (rows, columns) of the cleaned season table
n_rows, n_cols = nba.shape
(n_rows, n_cols)

In [None]:
# Sanity check: number of distinct player names in the table.
# The original run expected 859 — TODO confirm, since this depends on when
# the pages were scraped. Players sharing a name are counted once.
nba['Name'].nunique()

In [None]:
# Persist the cleaned season table to disk. The row index is written too
# (pandas' default), so it appears as the first, unnamed column of the file.
nba.to_csv('nbaA.csv', index=True)