In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

In [24]:
def format_pay(pay_str):
    # Remove non-numeric characters
    numeric_string = ''.join(c for c in pay_str if c.isdigit())
    numeric_value = int(numeric_string)
    return numeric_value

In [25]:
def get_all_salaries_from_page(page_num, year):
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    url = "https://www.espn.com/nba/salaries/_/year/" + str(year) + "/page/" + str(page_num)

    response = requests.get(url, headers=headers)

    content = response.content

    soup = BeautifulSoup(content)
    player_rows = soup.find_all("tr", {"class": "evenrow"})
    player_rows.extend(soup.find_all("tr", {"class": "oddrow"}))
    
    names = []
    positions = []
    pays = []

    if len(player_rows) == 0: # no results
        return None

    for row in player_rows:
        player_soup = BeautifulSoup(str(row))
        name =  player_soup.find_all("a")[0].get_text()
        pos = player_soup.find_all("td")[1].get_text().split(", ")[1]
        pay = format_pay(player_soup.find_all("td")[-1].get_text())
        names.append(name)
        if pos == "G":
            pos = "SG"
        if pos == "F":
            pos = "PF"
        positions.append(pos)
        pays.append(pay)
    
    years = np.tile(year, reps=len(pays))

    return pd.DataFrame({"name":names, "position":positions, "annual_salary":pays, "season":years})

In [26]:
def get_all_salaries(start_year=2004, end_year=2024):
    out_df = pd.DataFrame({"name":[], "position":[], "annual_salary":[]})
    for year in np.arange(start_year, end_year):
        print(year)
        page = 1
        old_len = len(out_df)
        while True:
            time.sleep(1) # trying to mitigate network latency for max data retrieval
            res = get_all_salaries_from_page(page, year)
            if res is not None:
                out_df = pd.concat([out_df, res.reset_index(drop=True)])
                page += 1
            else:
                break
        print(len(out_df)-old_len)
    return out_df.sort_values(["annual_salary", "season"], ascending=[False, True]).reset_index(drop=True)

In [27]:
all_salaries = get_all_salaries()
all_salaries

2004
180
2005
120
2006
436
2007
469
2008
488
2009
481
2010
478
2011
537
2012
613
2013
600
2014
426
2015
515
2016
508
2017
577
2018
584
2019
493
2020
520
2021
555
2022
496
2023
528


Unnamed: 0,name,position,annual_salary,season
0,Stephen Curry,PG,48070014.0,2023.0
1,Stephen Curry,PG,45780966.0,2022.0
2,LeBron James,SF,44474988.0,2023.0
3,James Harden,SG,44310840.0,2022.0
4,John Wall,PG,44310840.0,2022.0
...,...,...,...,...
9599,Aaron Jackson,PF,4608.0,2018.0
9600,Billy Thomas,SG,4533.0,2008.0
9601,Linton Johnson,PF,4533.0,2008.0
9602,Oliver Lafayette,SG,2692.0,2010.0


In [28]:
all_salaries.to_csv("data/all_salaries.csv")

In [29]:
avged = all_salaries.loc[:, ["name", "position", "annual_salary"]].groupby(["name", "position"]).mean().sort_values("annual_salary", ascending=False).reset_index()
avged = avged[~avged.name.duplicated()] # retain only one record per player (in case they have played multiple positions)
avged

Unnamed: 0,name,position,annual_salary
0,LeBron James,SF,2.331801e+07
1,Stephen Curry,PG,2.205026e+07
2,Kobe Bryant,SF,2.192588e+07
3,James Harden,SG,2.178189e+07
4,Damian Lillard,PG,2.141555e+07
...,...,...,...
1802,Larry Owens,SF,2.228700e+04
1803,Moses Brown,C,1.918600e+04
1804,Andre Ingram,SG,1.382400e+04
1805,Xavier Sneed,PF,8.558000e+03


In [30]:
avged.to_csv("data/avg_player_salary.csv")