# Notebook Web Scraping:  Approach to scrape data from a statistics website

We try to use the data from www.basketball-reference.com and scrape for the respective statitics of the basketball players.
With the gained data we will analyze if a player is underpaid, overpaid or paid fairly in regard to their performance.

As the scraping was a more sophisticated approach, we tried to make it work. However at the end we had to resolve it by using the export function of the basketball-reference.com website, to get out data in a dataframe.
At one point the scraping did work, but very inconsistently. So we changed some of the code in order to make it reproducable every time. Resulting in it to not even completely work occasionally. The current status is that most part of the code is working correctly for the current year and at some stages even display the correct information. However the data at the very end is not being displayed due to an error. Also the scraping code for the multiple years prior to 2023 is running without any errors, but does not seem to actually scrape. Once calling the table, it is either empty or not existing.


## Imports 

In [35]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import unicodedata
import json, pickle
from bs4 import BeautifulSoup, Comment
import requests
import lxml
import html5lib

## Setting up the scraping for players data


In [36]:
def strip_accents_and_punctuation(text):
    '''preprocessing the player names'''
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return f"{text}".replace('.','').replace(',','').replace("'",'')

def update_row_with_dict(df, data, idx):
# updating cells with new entries by using a dictionary
    for key, value in data.items():
        df.at[idx, key] = value


## Scrape player statistics


In [37]:
year = 2022

In [38]:
def scrape_player(playerurl, year):
# Scraping data from basketball-reference.com by using the respective URL for each player.
    
    data = {}
    prev_season = f"{year-1}-{year-2000}"
    next_season = f"{year}-{year-2000+1}"
    
    playerresponse = requests.get(playerurl)
    playerpage = playerresponse.text
    playersoup = BeautifulSoup(playerpage, "lxml")

    ## Get additonal data like Height and Weight:
    
    try:
        script_text = playersoup.find('script', {'type': 'application/ld+json'}).getText()
        biodata = json.loads(script_text)
        weight = biodata.get('weight', {}).get('value', '').replace('lbs', '').strip()
        height = biodata.get('height', {}).get('value', '')
    except (AttributeError, KeyError, json.JSONDecodeError):
        weight = np.nan
        height = np.nan
        
    data['Weight']= weight
    data['Height']= height

    ## Getting most common game statistics:
    
    dfpergame = pd.read_html(f"{playersoup.find(id='per_game')}")[0]
    dfpergame = dfpergame.drop_duplicates(subset=['Season'])
    dfpergame = dfpergame.set_index('Season')

    featurelist = ['Age', 'Tm', 'G', 'GS', 'MP', 'FG%', '3P', '3P%', '3PA', 'FT', 'FT%', 'FTA', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']

    for feature in featurelist:
        try:
            data[feature] = dfpergame.at[prev_season, feature]
        except KeyError:
            data[feature] = np.nan

    ## Get detailed game statistics:  

    dfadvanced = pd.read_html(f"{playersoup.find(id='advanced')}")[0]
    dfadvanced = dfadvanced.drop_duplicates(subset=['Season'])
    dfadvanced = dfadvanced.set_index('Season')
    
    featurelist = ['USG%', 'TS%', 'PER', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
    
    for feature in featurelist:
        try: 
            data[feature] = dfadvanced.at[prev_season, feature] 
        except KeyError: 
            data[feature] = np.nan

    # Get last year's and next year's salary
    # If a player has played for more than one team, his salary is concatenated

    placeholder = playersoup.select_one('#all_all_salaries .placeholder')
    comment = next(elem for elem in placeholder.next_siblings if isinstance(elem, Comment))
    table = BeautifulSoup(comment, 'lxml')

    dfsalaries = pd.read_html(f"{table.find(id='all_salaries')}")[0]

    try:
        data['PrevSal'] = dfsalaries.loc[dfsalaries['Season'] == prev_season, 'Salary'].sum()
    except (KeyError, TypeError):
        data['PrevSal'] = np.nan

    try:
        data['NextSal'] = dfsalaries.loc[dfsalaries['Season'] == next_season, 'Salary'].sum()
    except (KeyError, TypeError):
        data['NextSal'] = np.nan
               
    return data

## Example player: Lebron James

In [39]:
playerurl = 'https://www.basketball-reference.com/players/j/jamesle01.html'

fayear = 2023 
data = scrape_player(playerurl, fayear)
print(data)

{'Weight': '250', 'Height': '6-9', 'Age': 38.0, 'Tm': 'LAL', 'G': 55.0, 'GS': 54.0, 'MP': 35.5, 'FG%': 0.5, '3P': 2.2, '3P%': 0.321, '3PA': 6.9, 'FT': 4.6, 'FT%': 0.768, 'FTA': 5.9, 'ORB': 1.2, 'TRB': 8.3, 'AST': 6.8, 'STL': 0.9, 'BLK': 0.6, 'TOV': 3.2, 'PTS': 28.9, 'USG%': 33.3, 'TS%': 0.583, 'PER': 23.9, 'OWS': 3.2, 'DWS': 2.4, 'WS': 5.6, 'WS/48': 0.138, 'OBPM': 5.5, 'DBPM': 0.6, 'BPM': 6.1, 'VORP': 4.0, 'PrevSal': '$44,474,988', 'NextSal': 0}


## Scrape stats and salaries for all free agents in a given year

In [40]:
def update_row_with_dict(df, data, idx):
    for key in data.keys():
        df.loc[idx, key] = data.get(key)

def scrape_year(year):
    # For calculating if a player is over or underpaid, we look at so called "free agents", as these are players
    # who do not have a contract with a team for the next season. Meaning they will have to negotiate their salary based on their performance. 
    # Giving us a baseline of how much a player should earn for their performance. 

    free_agent_url = 'https://www.basketball-reference.com/contracts/players.html' + f"{year}"
    # Checking which players do not have a contract for the next season

    prev_season = f"{year-1}-{year-2000}"
    response = requests.get(free_agent_url)
    page = response.text
    FAsoup = BeautifulSoup(page, "lxml")

    table = FAsoup.find('table')
    rows = [row for row in table.find_all('tr')]  # tr tag is for rows
    rows_data = [[td.get_text() for td in row.find_all('td')] for row in rows]

    df = pd.DataFrame()

    for i in range(1, len(rows)):
        try:
            print("Scraping row", i)

            name = rows_data[i][0]
            name_year = f"{name} {year - 1}".replace(' ', '_')  # past year
            name_year = strip_accents_and_punctuation(name_year)

            pos = rows_data[i][1]        # Position/ role of the player
            fatype = rows_data[i][3]     # Type (UFA, RFA)
            oldteam = rows_data[i][4]    # As "OTm" in the dataframe
            prevstats = rows_data[i][5]  # Previous Year Stats ('Did not play' if didn't play)
            newteam = rows_data[i][7]    # As "NTm" in the dataframe

            nameid = rows[i].find_all('td')[0].find('a')['href']

            data = {
                'Name': name, 'Pos': pos, 'Type': fatype, 'OTm': oldteam,
                'PrevStats': prevstats, 'NTm': newteam, 'ID': nameid
            }
            update_row_with_dict(df, data, name_year)

            playerurl = 'https://www.basketball-reference.com' + nameid
            playerdict = scrape_player(playerurl, year) 
            update_row_with_dict(df, playerdict, name_year)

        except Exception as e:
            print(f"Error occurred for row {i}: {str(e)}")
            continue

    df['PrevYear'] = year - 1
    df.index.name = 'NameYear'

    return df


In [41]:
def update_row_with_dict(df, data, idx):
    for key in data.keys():
        df.loc[idx, key] = data.get(key)

def scrape_year(year):
    free_agent_url = 'https://www.basketball-reference.com/contracts/players.html' + f"{year}"
    # Checking which players do not have a contract for the next season

    prev_season = f"{year-1}-{year-2000}"
    response = requests.get(free_agent_url)
    page = response.text
    FAsoup = BeautifulSoup(page, "lxml")

    table = FAsoup.find('table')
    if table is None:
        print(f"Table not found for year {year}")
        return pd.DataFrame()

    rows = [row for row in table.find_all('tr')]  # tr tag is for rows
    rows_data = [[td.get_text() for td in row.find_all('td')] for row in rows]

    df = pd.DataFrame()

    for i in range(1, len(rows)):
        try:
            print("Scraping row", i)

            name = rows_data[i][0]
            name_year = f"{name} {year - 1}".replace(' ', '_')  # past year
            name_year = strip_accents_and_punctuation(name_year)

            pos = rows_data[i][1]        # Position/ role of the player
            fatype = rows_data[i][3]     # Type (UFA, RFA)
            oldteam = rows_data[i][4]    # As "OTm" in the dataframe
            prevstats = rows_data[i][5]  # Previous Year Stats ('Did not play' if didn't play)
            newteam = rows_data[i][7]    # As "NTm" in the dataframe

            nameid = rows[i].find_all('td')[0].find('a')['href']

            data = {
                'Name': name, 'Pos': pos, 'Type': fatype, 'OTm': oldteam,
                'PrevStats': prevstats, 'NTm': newteam, 'ID': nameid
            }
            update_row_with_dict(df, data, name_year)

            playerurl = 'https://www.basketball-reference.com' + nameid
            playerdict = scrape_player(playerurl, year)  # Assuming 'scrape_player' function is defined elsewhere
            update_row_with_dict(df, playerdict, name_year)

        except Exception as e:
            print(f"Error occurred for row {i}: {str(e)}")
            continue

    df['PrevYear'] = year - 1
    df.index.name = 'NameYear'

    return df

# Example usage
df2016 = scrape_year(2017)
df2017 = scrape_year(2018)
df2018 = scrape_year(2019)
df2019 = scrape_year(2020)
df2020 = scrape_year(2021)

#added the following 2 lines of code
df2021 = scrape_year(2022)
df2022 = scrape_year(2023)

df2022.head(3)


Table not found for year 2017
Table not found for year 2018
Table not found for year 2019
Table not found for year 2020
Table not found for year 2021
Table not found for year 2022
Table not found for year 2023


## Scrape last years of free agent stats and salaries

In [42]:
df2020 = pd.read_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2020_merged.csv')
df2021 = pd.read_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2021_merged.csv')
df2022 = pd.read_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2022_merged.csv')


In [43]:
df2016 = scrape_year(2017)
df2017 = scrape_year(2018)
df2018 = scrape_year(2019)
df2019 = scrape_year(2020)
df2020 = scrape_year(2021)

#added the following 2 lines of code
df2021 = scrape_year(2022)
df2022 = scrape_year(2023)

df2022.head(3)
# Note that for df2020 in particular, NextSal data cannot be obtained this way and is just listed as "0.0".  
# We will obtain in in a different way in Notebook 2 and populate the field in Notebook 3

Table not found for year 2017
Table not found for year 2018
Table not found for year 2019
Table not found for year 2020
Table not found for year 2021
Table not found for year 2022
Table not found for year 2023


# (2) Scrape Player Stats and Salaries:  Current Year

## Scrape player stats:  current year

In [44]:
##  Per-Game Stats

url_pergame = 'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'

def scrape_current_season_stats_pergame(url):
    '''Get current season stats for all players from basketball-reference.com
    '''
    data = {}
  
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    df = pd.read_html(f"{soup.find(id='all_per_game_stats')}")[0]
    return df

##  Advanced Stats

url_advanced = 'https://www.basketball-reference.com/leagues/NBA_2023_advanced.html'

def scrape_current_season_stats_advanced(url):
    '''Get current season stats for all players from basketball-reference.com
    '''
    data = {}
  
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    df = pd.read_html(f"{soup.find(id='advanced_stats')}")[0]
    return df

dfpergame = scrape_current_season_stats_pergame(url_pergame)
dfadvanced = scrape_current_season_stats_advanced(url_advanced)

## Merge per-game and advanced stats

cols_to_use = dfadvanced.columns.difference(dfpergame.columns)
dfcurrentstats = pd.merge(dfpergame, dfadvanced[cols_to_use], left_index=True, right_index=True, how='outer')
dfcurrentstats = dfcurrentstats.drop(['Unnamed: 19','Unnamed: 24'], axis=1)
print(dfcurrentstats.columns)
dfcurrentstats.sample(5)

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '3PAr',
       'AST%', 'BLK%', 'BPM', 'DBPM', 'DRB%', 'DWS', 'FTr', 'OBPM', 'ORB%',
       'OWS', 'PER', 'STL%', 'TOV%', 'TRB%', 'TS%', 'USG%', 'VORP', 'WS',
       'WS/48'],
      dtype='object')


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48
692,529,Justise Winslow,PF,26,POR,29,11,26.8,2.8,6.8,...,-0.1,9.3,1.9,17.0,10.7,0.466,14.2,-0.2,0.4,0.022
15,14,OG Anunoby,SF,25,TOR,67,67,35.6,6.3,13.2,...,1.8,14.6,2.7,12.1,7.9,0.586,19.5,1.5,4.7,0.094
183,138,Jalen Duren,C,19,DET,67,31,24.9,3.9,5.9,...,2.9,17.3,1.3,16.5,19.7,0.655,14.3,0.5,4.5,0.129
200,148,Simone Fontecchio,SF,27,UTA,52,6,14.7,2.2,6.0,...,-0.4,8.0,0.9,10.5,6.1,0.495,20.2,-0.6,-0.1,-0.004
680,518,Jalen Williams,SG,21,OKC,75,62,30.3,5.5,10.6,...,3.0,15.6,2.1,12.3,7.9,0.601,18.4,1.3,5.6,0.119


## Scrape player salaries:  current year

In [45]:
def scrape_team(team):
    
    url = 'https://www.basketball-reference.com/contracts/' + team + '.html'
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    df = pd.read_html(f"{soup.find(id='contracts')}")[0]
        
    df.columns = df.columns.to_flat_index()
    df.rename(columns= {df.columns[0]: 'Name', df.columns[2]: 'CurrentSalary'}, inplace=True)
    df['CurrentTeam'] = team

    df = df.drop(df.tail(1).index)
    df = df[['Name','CurrentSalary','CurrentTeam']]
    
    df = df.dropna()
    
    return df

In [46]:
test = scrape_team('LAL')
display(test)

Unnamed: 0,Name,CurrentSalary,CurrentTeam
0,LeBron James,"$47,607,350",LAL
1,Anthony Davis,"$40,600,080",LAL
2,D'Angelo Russell,"$17,307,693",LAL
3,Rui Hachimura,"$15,740,741",LAL
4,Austin Reaves,"$12,015,150",LAL
5,Gabe Vincent,"$10,500,000",LAL
6,Jarred Vanderbilt,"$4,698,000",LAL
7,Taurean Prince,"$4,516,000",LAL
8,Jalen Hood-Schifino,"$3,695,040",LAL
9,Jaxson Hayes,"$2,165,000",LAL


In [47]:
def scrape_team(team):
    url = f'https://www.basketball-reference.com/contracts/{team}.html'
    response = requests.get(url)
    if response.status_code == 404:
        print(f"Table not found for team: {team}")
        return pd.DataFrame(columns=['Name', 'CurrentSalary', 'CurrentTeam'])

    page = response.text
    soup = BeautifulSoup(page, "lxml")
    df = pd.read_html(f"{soup.find(id='contracts')}")[0]
    df.columns = df.columns.to_flat_index()
    df.rename(columns={df.columns[0]: 'Name', df.columns[2]: 'CurrentSalary'}, inplace=True)
    df['CurrentTeam'] = team_short [team]
    

    team_short =  {
    'IND': 'Indiana Pacers',          'GSW': 'Golden State Warriors' ,
    'TOR': 'Toronto Raptors',         'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',              'BRK': 'Brooklyn Nets',
    'POR': 'Portland Trailblazers',   'PHO': 'Phoenix Suns',
    'NOP': 'New Orleans Pelicans',    'MIL': 'Milwaukee Bucks',
    'DET': 'Detroit Pistons' ,        'LAL': 'Los Angeles Lakers',
    'ORL': 'Orlando Magic',           'HOU': 'Houston Rockets' ,
    'WAS': 'Washington Wizards' ,     'ATL': 'Atlanta Hawks',
    'UTA': 'Utah Jazz',               'SAC': 'Sacramento Kings',
    'NYK': 'New York Knicks',         'DEN': 'Denver Nuggets' ,
    'PHI': 'Philadephia 76ers' ,      'SAS': 'San Antonio Spurs' ,
    'LAC': 'Los Angeles Clippers',    'OKC': 'Oklahoma City Thunder' ,
    'MIN': 'Minnesota Timberwolves',  'CLE': 'Cleveland Cavaliers' ,
    'CHO': 'Charlotte Hornets',       'CHI': 'Chicago Bulls' ,
    'BOS': 'Boston Celtics',          'DAL':'Dallas Mavericks',
    }

    teams = list(team_short.keys())  #We just need the team names as used in the urls

    df = pd.DataFrame(columns=['Name', 'CurrentSalary', 'CurrentTeam'])

    for team in teams:
        df_team = scrape_team(team)
        df = pd.concat([df, df_team])
    
    df.sample(5)


## Create player-to-url dictionary


In [48]:
url = 'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'

player_to_url = {}

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")


table = soup.find('table')
rows = [row for row in table.find_all('tr')]  # tr tag is for rows

for row in rows[1:]:
    try:
        name = row.findAll('td')[0].getText()
        nameid = row.findAll('td')[0].contents[0]['href']
        if name not in player_to_url:
            player_to_url[name] = nameid
    except:
        pass

{k: player_to_url[k] for k in list(player_to_url)[:5]}




{'Precious Achiuwa': '/players/a/achiupr01.html',
 'Steven Adams': '/players/a/adamsst01.html',
 'Bam Adebayo': '/players/a/adebaba01.html',
 'Ochai Agbaji': '/players/a/agbajoc01.html',
 'Santi Aldama': '/players/a/aldamsa01.html'}

In [49]:
dfplayer_to_url = pd.Series(player_to_url).to_frame().reset_index()
dfplayer_to_url.rename(columns= {'index': 'Name',0: 'ID'}, inplace=True)
dfplayer_to_url.head(5)

Unnamed: 0,Name,ID
0,Precious Achiuwa,/players/a/achiupr01.html
1,Steven Adams,/players/a/adamsst01.html
2,Bam Adebayo,/players/a/adebaba01.html
3,Ochai Agbaji,/players/a/agbajoc01.html
4,Santi Aldama,/players/a/aldamsa01.html


## Saving Data

In [50]:
df.sample(10)
#check for correct data gathering

NameError: name 'df' is not defined

In [None]:
df2020.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2020_raw.csv')

#added the following 3 lines of code
df2021.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2021_raw.csv')
df2022.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\df2022_raw.csv')


dfcurrentstats.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\dfcurrentstats.csv')
df.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\dfcurrentsalaries.csv')

dfplayer_to_url.to_csv(r'C:\Users\Vincenzo\Documents\Master 2. Semester\GutHub_Sofi\BigDataNBA\data\dfplayer_to_url.csv')