# Notebook 1:  Web Scraping

In this notebook we scrape www.basketball-reference.com for NBA player stats and salaries.
Our scraping workflow has two parts:
1.  **Past Years Data**: Using free agent lists for the last five years, scrape each listed player's previous season stats and his salary the following year.  These will form the features and targets, respectively, for our player market value model.
2.  **Current Year Data**: Using the entire list of players who have played in the 2021-22 season, scrape each player's current season stats and salary.  The stats will be used in our web app to predict each player's current market value based on their stats, which can be compared with his actual current season salary to calculate his surplus value.

## Imports 

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup, Comment
import requests
import lxml
import unicodedata
import json, pickle

## Utility Functions

In [3]:
def strip_accents_and_punctuation(text):
    '''Normalize player name spellings'''
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text).replace('.','').replace(',','').replace("'",'')


def update_row_with_dict(df,d,idx):
    '''Update the entries of a row in a dataframe, 
       using a dictionary to supply the vlues'''
    for key in d.keys():
        df.loc[idx, key] = d.get(key)

# (1) Scrape Player Stats and Salaries: Past Years


## Scrape individual player stats and salaries


In [4]:
def scrape_player(playerurl, fa_year):
    '''Get salaries and per-game stats from basketball-reference.com
       Edge case if a player played on multiple teams in a year: 
           — stats: his total stats are given across all teams ('Tm'='TOT') 
           — salary: his salaries are concatenated (eg: "$27,957,238$794,536")
    '''
    
    d = {}
    prev_season = str(fa_year-1) + '-' + str(fa_year-2000)
    next_season = str(fa_year) + '-' + str(fa_year-2000+1)
    
    playerresponse = requests.get(playerurl)
    playerpage = playerresponse.text
    playersoup = BeautifulSoup(playerpage, "lxml")

    ## Get Height and Weight:
    
    try:
        script_text = playersoup.find('script',{'type': 'application/ld+json'}).getText()
        biodata = json.loads(script_text) # a dictionary!
        weight = biodata['weight']['value'].replace('lbs','').strip()
        height = biodata['height']['value']
    except:
        weight = np.nan
        height = np.nan
        
    d['Weight']= weight
    d['Height']= height

    ## Get Basic Per-Game Stats:
    
    dfpergame = pd.read_html(str(playersoup.find(id='per_game')))[0]
    dfpergame = dfpergame.drop_duplicates(subset=['Season'])
    dfpergame = dfpergame.set_index('Season')

    featurelist = ['Age', 'Tm', 'G', 'GS', 'MP', 'FG%', '3P', '3P%', '3PA', 'FT', 'FT%','FTA', 'ORB', 'TRB', 
                    'AST', 'STL', 'BLK', 'TOV', 'PTS']
    
    for feature in featurelist:
        try: 
            d[feature] = dfpergame.loc[prev_season, feature] 
        except: 
            d[feature] = np.nan

    ## Get Advanced Per-Game Stats:  

    dfadvanced = pd.read_html(str(playersoup.find(id='advanced')))[0]
    dfadvanced = dfadvanced.drop_duplicates(subset=['Season'])
    dfadvanced = dfadvanced.set_index('Season')
    
    featurelist = ['USG%', 'TS%', 'PER', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    
    for feature in featurelist:
        try: 
            d[feature] = dfadvanced.loc[prev_season, feature] 
        except: 
            d[feature] = np.nan

    ## Get previous and next year's salary
    #  — If the player was on one team, it will be a string that looks like "$27,093,019"
    #  — If the player was on two teams, it will be a string that looks like "$27,957,238$794,536"
    #  — For 2021 free agents, Next Salary is incorrect (need 2021-22 from current stats)
    
    placeholder = playersoup.select_one('#all_all_salaries .placeholder')
    comment = next(elem for elem in placeholder.next_siblings if isinstance(elem, Comment))
    table = BeautifulSoup(comment, 'lxml')

    dfsalaries = pd.read_html(str(table.find(id="all_salaries")))[0]
    
    try: 
        d['PrevSal'] = dfsalaries[dfsalaries['Season'] == prev_season]['Salary'].sum()
    except: 
        d['PrevSal'] = np.nan
    try: 
        d['NextSal'] = dfsalaries[dfsalaries['Season'] == next_season]['Salary'].sum()
    except: 
        d['NextSal'] = np.nan
               
    return d

### Example player:

In [5]:
playerurl = 'https://www.basketball-reference.com/players/d/davisan02.html'

fayear = 2019    # we get his stats for the previous season 2018-2019
d = scrape_player(playerurl, fayear)
print(d)

{'Weight': '253', 'Height': '6-10', 'Age': 25.0, 'Tm': 'NOP', 'G': 56.0, 'GS': 56.0, 'MP': 33.0, 'FG%': 0.517, '3P': 0.9, '3P%': 0.331, '3PA': 2.6, 'FT': 6.1, 'FT%': 0.794, 'FTA': 7.7, 'ORB': 3.1, 'TRB': 12.0, 'AST': 3.9, 'STL': 1.6, 'BLK': 2.4, 'TOV': 2.0, 'PTS': 25.9, 'USG%': 29.5, 'TS%': 0.597, 'PER': 30.3, 'OWS': 6.4, 'DWS': 3.1, 'WS': 9.5, 'WS/48': 0.247, 'OBPM': 7.1, 'DBPM': 2.3, 'BPM': 9.4, 'VORP': 5.3, 'PrevSal': '$25,434,262', 'NextSal': '$27,093,019'}


## Scrape stats and salaries for all free agents in a given year

In [6]:
def scrape_fa_year(fa_year):
    '''Find all free agents in a given year and call the function "scrape_player" 
       for each of them to get their previous season stats
    '''

    FA_url = 'https://www.basketball-reference.com/friv/free_agents.cgi?year=' + str(fa_year) 
    prev_season = str(fa_year-1) + '-' + str(fa_year-2000)

    response = requests.get(FA_url)
    page = response.text
    FAsoup = BeautifulSoup(page, "lxml")

    table = FAsoup.find('table')
    rows = [row for row in table.find_all('tr')]  # tr tag is for rows
    rows_data = [[td.getText() for td in rows[i].findAll('td')]
                    for i in range(len(rows))]
        
    df = pd.DataFrame()

    for i in range(1,len(rows)):
        try:
            name = rows_data[i][0]
            name_year = (name + ' ' + str(fa_year-1)).replace(' ','_')  #prev year
            name_year = strip_accents_and_punctuation(name_year)
             
            pos = rows_data[i][1]         #Pos
            fatype = rows_data[i][3]     #Type (UFA, RFA)
            oldteam = rows_data[i][4]    #OTm 
            prevstats = rows_data[i][5]  #Previous Year Stats  ('Did not play' if didn't play)
            newteam = rows_data[i][7]    #NTm

            nameid = rows[i].find_all('td')[0].find('a')['href']

            d = {'Name': name,  'Pos': pos, 'Type': fatype, 'OTm': oldteam, 
                 'PrevStats': prevstats, 'NTm' : newteam, 'ID': nameid}
            update_row_with_dict(df,d,name_year)
        
        
            playerurl = 'https://www.basketball-reference.com' + nameid
            playerdict = scrape_player(playerurl, fa_year)
            
            update_row_with_dict(df,playerdict,name_year)
            
        except:
            continue
            
    df['PrevYear'] = fa_year-1
    df.index.name = 'NameYear'
    
    return df

## Scrape last 5 years of free agent stats and salaries

In [7]:
# This takes 1-2 minutes per year

df2016 = scrape_fa_year(2017)
df2017 = scrape_fa_year(2018)
df2018 = scrape_fa_year(2019)
df2019 = scrape_fa_year(2020)
df2020 = scrape_fa_year(2021)

df2020.head(3)
# Note that for df2020 in particular, NextSal data cannot be obtained this way and is just listed as "0.0".  
# We will obtain in in a different way in Notebook 2 and populate the field in Notebook 3

Unnamed: 0_level_0,PrevYear
NameYear,Unnamed: 1_level_1


# (2) Scrape Player Stats and Salaries:  Current Year

## Scrape player stats:  current year

In [8]:
##  Per-Game Stats

url_pergame = 'https://www.basketball-reference.com/leagues/NBA_2022_per_game.html'

def scrape_current_season_stats_pergame(url):
    '''Get current season stats for all players from basketball-reference.com
    '''
    d = {}
  
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    df = pd.read_html(str(soup.find(id='all_per_game_stats')))[0]
    return df

##  Advanced Stats

url_advanced = 'https://www.basketball-reference.com/leagues/NBA_2022_advanced.html'

def scrape_current_season_stats_advanced(url):
    '''Get current season stats for all players from basketball-reference.com
    '''
    d = {}
  
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    df = pd.read_html(str(soup.find(id='advanced_stats')))[0]
    return df

dfpergame = scrape_current_season_stats_pergame(url_pergame)
dfadvanced = scrape_current_season_stats_advanced(url_advanced)

## Merge per-game and advanced stats

cols_to_use = dfadvanced.columns.difference(dfpergame.columns)
dfcurrentstats = pd.merge(dfpergame, dfadvanced[cols_to_use], left_index=True, right_index=True, how='outer')
dfcurrentstats = dfcurrentstats.drop(['Unnamed: 19','Unnamed: 24'], axis=1)
print(dfcurrentstats.columns)
dfcurrentstats.sample(5)

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '3PAr',
       'AST%', 'BLK%', 'BPM', 'DBPM', 'DRB%', 'DWS', 'FTr', 'OBPM', 'ORB%',
       'OWS', 'PER', 'STL%', 'TOV%', 'TRB%', 'TS%', 'USG%', 'VORP', 'WS',
       'WS/48'],
      dtype='object')


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48
562,402,Svi Mykhailiuk,SF,24,TOR,56,5,12.8,1.6,4.2,...,0.1,9.6,1.8,11.3,7.0,0.509,16.9,-0.2,0.8,0.052
116,93,Clint Capela,C,27,ATL,74,73,27.6,5.0,8.2,...,5.6,21.4,1.3,6.1,23.5,0.604,15.5,2.1,8.3,0.195
620,452,Michael Porter Jr.,SF,23,DEN,9,9,29.4,4.1,11.4,...,-0.5,7.5,1.9,10.1,12.4,0.416,19.7,-0.1,-0.2,-0.032
503,360,Miles McBride,PG,21,NYK,40,2,9.3,0.8,2.7,...,-0.1,6.1,1.9,4.2,6.1,0.393,13.9,-0.2,0.2,0.032
392,281,Cameron Johnson,PF,25,PHO,66,16,26.2,4.2,9.2,...,3.2,15.2,1.6,6.7,8.5,0.625,17.5,2.1,5.6,0.156


## Scrape player salaries:  current year

In [9]:
def scrape_team(team):
    '''Returns dataframe with 2021-22 salary for all players from basketball-reference.com
    '''
    
    url = 'https://www.basketball-reference.com/contracts/' + team + '.html'
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    df = pd.read_html(str(soup.find(id='contracts')))[0]
        
    df.columns = df.columns.to_flat_index()
    df.rename(columns= {df.columns[0]: 'Name', df.columns[2]: 'CurrentSalary'}, inplace=True)
    df['CurrentTeam'] = team

    df = df.drop(df.tail(1).index)
    df = df[['Name','CurrentSalary','CurrentTeam']]
    
    df = df.dropna()
    
    return df

test = scrape_team('BOS')
display(test)

Unnamed: 0,Name,CurrentSalary,CurrentTeam
0,Jayson Tatum,"$30,351,780",BOS
1,Jaylen Brown,"$28,741,071",BOS
2,Al Horford,"$26,500,000",BOS
3,Malcolm Brogdon,"$22,600,000",BOS
4,Marcus Smart,"$17,207,142",BOS
5,Derrick White,"$16,892,857",BOS
6,Robert Williams,"$10,937,502",BOS
7,Danilo Gallinari,"$6,479,000",BOS
8,Grant Williams,"$4,306,281",BOS
9,Mike Muscala,"$3,500,000",BOS


In [10]:
tm_to_team =  {
 'TOR': 'Toronto Raptors',         'MEM': 'Memphis Grizzlies',
 'MIA': 'Miami Heat',              'BRK': 'Brooklyn Nets',
 'NOP': 'New Orleans Pelicans',    'MIL': 'Milwaukee Bucks',
 'CLE': 'Cleveland Cavaliers' ,    'LAL': 'Los Angeles Lakers',
 'ORL': 'Orlando Magic',           'HOU': 'Houston Rockets' ,
 'WAS': 'Washington Wizards' ,     'PHO': 'Phoenix Suns',
 'UTA': 'Utah Jazz',               'SAC': 'Sacramento Kings',
 'CHO': 'Charlotte Hornets',       'CHI': 'Chicago Bulls' ,
 'NYK': 'New York Knicks',         'DEN': 'Denver Nuggets' ,
 'PHI': 'Philadephia 76ers' ,      'SAS': 'San Antonio Spurs' ,
 'LAC': 'Los Angeles Clippers',    'OKC': 'Oklahoma City Thunder' ,
 'MIN': 'Minnesota Timberwolves',  'DET': 'Detroit Pistons' ,
 'IND': 'Indiana Pacers',          'GSW': 'Golden State Warriors' ,
 'POR': 'Portland Trailblazers',   'ATL': 'Atlanta Hawks',
 'BOS': 'Boston Celtics',          'DAL':'Dallas Mavericks',
 }

teams = list(tm_to_team.keys())  #We just need the team names as used in the urls

df = pd.DataFrame(columns=['Name', 'CurrentSalary', 'CurrentTeam'])

for team in teams:
    df_team = scrape_team(team)
    df = pd.concat([df, df_team])
    
df.sample(5)

Unnamed: 0,Name,CurrentSalary,CurrentTeam
2,Lonzo Ball,"$19,534,884",CHI
2,D'Angelo Russell,"$31,377,750",LAL
7,Joe Ingles,"$6,479,000",MIL
14,Will Barton,"$432,642",TOR
2,James Harden,"$33,000,000",PHI


There are a 16 duplicate names (same player, multiple teams/salaries).  We deal with it manually, keeping the entry corresponding to the actual current team.

In [11]:
# df[df.duplicated(subset = ['Name'])].sort_values("Name")
ids = df["Name"]
df[ids.isin(ids[ids.duplicated()])].sort_values("Name")

Unnamed: 0,Name,CurrentSalary,CurrentTeam
19,Charles Bassey,"$74,742",PHI
9,Charles Bassey,"$2,600,000",SAS
9,Danny Green,"$2,000,000",CLE
18,Danny Green,"$9,710,528",HOU
12,DeAndre Jordan,"$1,836,090",DEN
19,DeAndre Jordan,"$7,827,907",DET
20,Dewayne Dedmon,"$2,866,667",DET
14,Dewayne Dedmon,"$580,373",PHI
17,Dewayne Dedmon,"$4,700,000",SAS
18,Goga Bitadze,"$4,765,339",IND


In [12]:
mask1 = ((df.Name == 'Armoni Brooks') & ~(df.CurrentTeam == 'TOR'))
mask2 = ((df.Name == 'Blake Griffin') & ~(df.CurrentTeam == 'BRK'))
mask3 = ((df.Name == 'D.J. Augustin') & ~(df.CurrentTeam == 'LAL'))
mask4 = ((df.Name == 'Danuel House Jr.') & ~(df.CurrentTeam == 'UTA'))
mask5 = ((df.Name == 'DeAndre Jordan') & ~(df.CurrentTeam == 'PHI'))
mask6 = ((df.Name == "DeAndre' Bembry") & ~(df.CurrentTeam == 'MIL'))
mask7 = ((df.Name == 'DeMarcus Cousins') & ~(df.CurrentTeam == 'DEN'))
mask8 = ((df.Name == 'Dewayne Dedmon') & ~(df.CurrentTeam == 'MIA'))
mask9 = ((df.Name == 'Goran Dragić') & ~(df.CurrentTeam == 'BRK'))
mask10 = ((df.Name == 'Isaiah Hartenstein') & ~(df.CurrentTeam == 'LAC'))
mask11 = ((df.Name == 'Jevon Carter') & ~(df.CurrentTeam == 'MIL'))
mask12 = ((df.Name == 'Kemba Walker') & ~(df.CurrentTeam == 'NYK'))
mask13 = ((df.Name == 'Moses Brown') & ~(df.CurrentTeam == 'CLE'))
mask14 = ((df.Name == 'Nicolas Batum') & ~(df.CurrentTeam == 'LAC'))
mask15 = ((df.Name == 'Tomáš Satoranský') & ~(df.CurrentTeam == 'WAS'))
mask16 = ((df.Name == 'Tristan Thompson') & ~(df.CurrentTeam == 'CHI'))

df = df[~mask1 & ~mask2 & ~mask3 & ~mask4 & ~mask5 & ~mask6 & ~mask7  & ~mask8 & ~mask9
           & ~mask10 & ~mask11 & ~mask12 & ~mask13 & ~mask14  & ~mask15 & ~mask16] 

dfcurrentsalaries = df.copy()
dfcurrentsalaries.shape

(473, 3)

## Create player-to-url dictionary

* Get the player page url for any player who played in a game this past season (this is mostly useful for easily creating hyperlinks in the final web app, and could also potentially disambiguate identically named players)


In [13]:
url = 'https://www.basketball-reference.com/leagues/NBA_2022_per_game.html'

player_to_url = {}

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")


table = soup.find('table')
rows = [row for row in table.find_all('tr')]  # tr tag is for rows

for row in rows[1:]:
    try:
        name = row.findAll('td')[0].getText()
        nameid = row.findAll('td')[0].contents[0]['href']
        if name not in player_to_url:
            player_to_url[name] = nameid
    except:
        pass

{k: player_to_url[k] for k in list(player_to_url)[:5]}

{'Precious Achiuwa': '/players/a/achiupr01.html',
 'Steven Adams': '/players/a/adamsst01.html',
 'Bam Adebayo': '/players/a/adebaba01.html',
 'Santi Aldama': '/players/a/aldamsa01.html',
 'LaMarcus Aldridge': '/players/a/aldrila01.html'}

In [14]:
dfplayer_to_url = pd.Series(player_to_url).to_frame().reset_index()
dfplayer_to_url.rename(columns= {'index': 'Name',0: 'ID'}, inplace=True)
dfplayer_to_url.head(5)

Unnamed: 0,Name,ID
0,Precious Achiuwa,/players/a/achiupr01.html
1,Steven Adams,/players/a/adamsst01.html
2,Bam Adebayo,/players/a/adebaba01.html
3,Santi Aldama,/players/a/aldamsa01.html
4,LaMarcus Aldridge,/players/a/aldrila01.html


# Save All Scraped Data

In [15]:
df2016.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/df2016_raw.csv')
df2017.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/df2017_raw.csv')
df2018.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/df2018_raw.csv')
df2019.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/df2019_raw.csv')
df2020.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/df2020_raw.csv')

dfcurrentstats.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/dfcurrentstats.csv')
dfcurrentsalaries.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/dfcurrentsalaries.csv')

dfplayer_to_url.to_csv(r'/Users/richardsihombing/Documents/BigDataNBA/data/dfplayer_to_url.csv')

OSError: Cannot save file into a non-existent directory: '/Users/andrei/Dropbox/Metis/HoopsHero/data'