### This file creates a DataFrame containing stats and salaries for all NBA players in the specified years.

- NBA salary cap by year: https://basketball.realgm.com/nba/info/salary_cap
- NBA player salary by year: http://www.espn.com/nba/salaries/_/year/2019/page/1/seasontype/1
- NBA player stats: https://www.basketball-reference.com/leagues/NBA_2019_per_game.html

In [2]:
# Import modules
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import numpy as np
import unicodedata  # Used because basketball-reference has accented names, ESPN does not
from nameparser import HumanName  # Used because some basketball-reference doesn't have suffixes
import pickle  # Pickle dataframe to use in other project file

# Show every column when a DataFrame is displayed (None removes the limit)
pd.options.display.max_columns = None

In [3]:
# Strip accents from names
def strip_accents(text):
    """Return ``text`` with accents removed and non-ASCII characters dropped.

    The text is decomposed with Unicode NFD normalization, which splits an
    accented letter into its base letter followed by combining marks. Encoding
    to ASCII with ``errors='ignore'`` then discards the marks, along with any
    other character that has no ASCII representation.

    Args:
        text: A text string, or UTF-8 encoded bytes.

    Returns:
        str: The ASCII-only version of ``text``.
    """
    # unicodedata.normalize() only accepts text, so decode byte strings first.
    # (On Python 2 this also covers plain ``str`` input, which is bytes.)
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')

    return str(ascii_bytes.decode('utf-8'))

In [4]:
# category_list = list of category (pos, name, age, etc.)
# category_str = name of category in string form
# data_type = dtype of the category's stats (int, float, etc.)
# iterator = i

def get_single_data(category_list, category_str, data_type, iterator):
    """Parse one cell of a stats-table row and append it to ``category_list``.

    The row is ``full_table[iterator]``, where ``full_table`` is a module-level
    sequence of table rows that ``get_stats_data()`` assigns before calling
    this function.

    Args:
        category_list: List that receives the parsed value (mutated in place).
        category_str: Value of the cell's ``data-stat`` attribute. With
            ``data_type == 'str'`` the special value ``'left'`` selects the
            first ``<td class="left">`` cell instead (the player name).
        data_type: ``'str'``, ``'int'`` or ``'float'``.
        iterator: Index of the row inside ``full_table``.

    Raises:
        ValueError: If ``data_type`` is not one of the supported values.
            Silently appending nothing would leave the caller's lists with
            different lengths.
    """
    row = full_table[iterator]

    if data_type == 'str':
        if category_str == 'left':  # Names
            raw_name = row.find('td', {'class': 'left'}).text
            # Upper-case, drop commas/periods and strip accent characters so
            # the name can be compared with names from other sources
            value = strip_accents(raw_name.upper().replace(',', '').replace('.', ''))
        else:
            value = row.find('td', {'data-stat': category_str}).text.upper()
            if category_str == 'pos':  # Positions
                # Split any players with multiple positions, and only keep
                # their primary position (first string). Ex: PF-SF becomes PF
                value = value.split('-')[0]
        category_list.append(value)

    elif data_type in ('int', 'float'):
        convert = int if data_type == 'int' else float
        try:
            value = convert(row.find('td', {'data-stat': category_str}).text)
        except (AttributeError, ValueError):
            # AttributeError: the row has no such cell (find() returned None).
            # ValueError: the cell is empty or not numeric.
            # Either way, fill the value with null.
            value = np.nan
        category_list.append(value)

    else:
        raise ValueError('Unsupported data_type: {!r}'.format(data_type))

In [5]:
# Pages scraped for every season, and the columns read from each page.
# Each page entry is (stat_type used in the URL, column specs); each column
# spec is (DataFrame column name, key passed to get_single_data(), data type).
# The order here is the column order of the resulting DataFrame.
_STAT_PAGES = (
    # Per game stats
    ('per_game', (
        ('name', 'left', 'str'),
        ('pos', 'pos', 'str'),
        ('age', 'age', 'int'),
        ('team_id', 'team_id', 'str'),
        ('g', 'g', 'int'),
        ('gs', 'gs', 'int'),
    ) + tuple((stat, stat, 'float') for stat in (
        'mp_per_g', 'fg_per_g', 'fga_per_g', 'fg_pct', 'fg3_per_g', 'fg3a_per_g',
        'fg3_pct', 'fg2_per_g', 'fg2a_per_g', 'fg2_pct', 'efg_pct', 'ft_per_g',
        'fta_per_g', 'ft_pct', 'orb_per_g', 'drb_per_g', 'trb_per_g', 'ast_per_g',
        'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g', 'pts_per_g',
    ))),
    # Per 36 minutes stats
    ('per_minute', tuple((stat, stat, 'float') for stat in (
        'fg_per_mp', 'fga_per_mp', 'fg3_per_mp', 'fg3a_per_mp', 'fg2_per_mp',
        'fg2a_per_mp', 'ft_per_mp', 'fta_per_mp', 'orb_per_mp', 'drb_per_mp',
        'trb_per_mp', 'ast_per_mp', 'stl_per_mp', 'blk_per_mp', 'tov_per_mp',
        'pf_per_mp', 'pts_per_mp',
    ))),
    # Per 100 possession stats
    ('per_poss', tuple((stat, stat, 'float') for stat in (
        'fg_per_poss', 'fga_per_poss', 'fg3_per_poss', 'fg3a_per_poss',
        'fg2_per_poss', 'fg2a_per_poss', 'ft_per_poss', 'fta_per_poss',
        'orb_per_poss', 'drb_per_poss', 'trb_per_poss', 'ast_per_poss',
        'stl_per_poss', 'blk_per_poss', 'tov_per_poss', 'pf_per_poss',
        'pts_per_poss',
    )) + (
        ('off_rtg', 'off_rtg', 'int'),
        ('def_rtg', 'def_rtg', 'int'),
    )),
    # Stat totals
    ('totals', tuple((stat, stat, 'int') for stat in (
        'fg', 'fga', 'fg3', 'fg3a', 'fg2', 'fg2a', 'ft', 'fta', 'orb', 'drb',
        'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
    ))),
    # Advanced stats
    ('advanced', tuple((stat, stat, 'float') for stat in (
        'per', 'ts_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct',
        'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct',
        'usg_pct', 'ows', 'dws', 'ws', 'ws_per_48', 'obpm', 'dbpm', 'bpm', 'vorp',
    ))),
)


# Return dataframe with stats info for all players in the year passed
def get_stats_data(year):
    """Scrape every stats page of one season into a single DataFrame.

    One page per entry of ``_STAT_PAGES`` is downloaded from
    basketball-reference. Every ``<tr class="full_table">`` row of a page
    contributes one value to each column listed for that page.

    Args:
        year: Season to download; it is inserted into the page URL and stored
            in the ``year`` column.

    Returns:
        pandas.DataFrame: One row per table row, with ``year`` as the first
        column followed by the columns of ``_STAT_PAGES`` in order.

    Raises:
        ValueError: If a page does not answer with HTTP status 200.
    """
    # get_single_data() reads the rows of the page currently being processed
    # from this module-level name, so it has to be assigned as a global.
    global full_table

    # One empty list per output column, created in final column order
    columns = {column: [] for _, specs in _STAT_PAGES for column, _, _ in specs}

    for stat_type, specs in _STAT_PAGES:
        # Get HTML code from URL
        url = 'https://www.basketball-reference.com/leagues/NBA_{}_{}.html'.format(str(year), stat_type)
        response = requests.get(url, timeout=30)  # Never wait forever on a stalled connection
        if response.status_code != 200:  # 200 = success
            print(response.status_code)
            raise ValueError('Could not get response from URL!')

        page_soup = soup(response.text, 'lxml')

        # Find all <tr tags that contain class 'full_table'
        full_table = page_soup.find_all('tr', {'class': 'full_table'})

        # NOTE(review): values from different pages are combined purely by row
        # position, which assumes every page lists the same players in the
        # same order -- confirm against the site.
        for row_index in range(len(full_table)):  # len(full_table) = # of players in that season
            for column, lookup_key, data_type in specs:
                get_single_data(columns[column], lookup_key, data_type, row_index)

    stats_df = pd.DataFrame(columns)
    stats_df.insert(0, 'year', year)  # Insert a column with the year

    return stats_df

In [11]:
# Return dict in {'player_name': salary} format
def get_salary_data(year):
    """Scrape the ESPN salary pages of one season.

    Player names are upper-cased, stripped of commas and periods, and reduced
    to ``'<first> <last>'`` as parsed by ``HumanName`` (other name parts, such
    as suffixes, are dropped). If two rows reduce to the same name, the later
    row overwrites the earlier one.

    Args:
        year: Season to download; it is inserted into the page URL.

    Returns:
        dict: Player name mapped to salary as an ``int``.

    Raises:
        ValueError: If a page does not answer with HTTP status 200.
    """

    def fetch_page(page_num):
        # Download one salary page and return it as a soup object
        url = ('http://www.espn.com/nba/salaries/_/year/{}/page/{}/seasontype/1'
               .format(str(year), str(page_num))
              )
        response = requests.get(url, timeout=30)  # Never wait forever on a stalled connection
        if response.status_code != 200:  # 200 = success
            print(response.status_code)
            raise ValueError('Could not get response from URL!')
        return soup(response.text, 'lxml')

    # Page 1 tells us how many pages there are: its 'page-numbers' text is
    # split on whitespace and the last token is taken as the page count
    first_page = fetch_page(1)
    num_pages = int(first_page.find('div', {'class': 'page-numbers'}).text.split()[-1])

    # Dictionary to be filled with player names and salaries
    salary_info = {}

    for page_num in range(1, num_pages + 1):  # Page numbers start at 1
        # Page 1 has already been downloaded, so reuse it
        page_soup = first_page if page_num == 1 else fetch_page(page_num)

        # Every <tr tag with class 'oddrow' or 'evenrow' is one player
        for row in page_soup.find_all('tr', {'class': ['oddrow', 'evenrow']}):
            raw_name = row.find('a').text.strip().upper().replace(',', '').replace('.', '')
            parsed_name = HumanName(raw_name)
            name = parsed_name.first + ' ' + parsed_name.last

            # The last 'td' tag in the row has the salary, e.g. '$1,234,567'
            salary_text = row.find_all('td')[-1].text
            salary_info[name] = int(salary_text.replace('$', '').replace(',', ''))

    return salary_info

In [12]:
# Get salary cap for specified year
def get_salary_cap(year):
    """Return the salary cap of one season, scraped from RealGM.

    Every row of the first ``<tbody>`` on the page is read. The season key is
    the part after the last ``'-'`` in the row's third cell, and the cap is
    the row's fourth cell with ``'$'`` and ``','`` removed.

    Args:
        year: Season to look up, as an ``int`` matching the parsed season key.

    Returns:
        int: The salary cap of that season.

    Raises:
        ValueError: If the page does not answer with HTTP status 200.
        KeyError: If ``year`` is not listed in the table.
    """
    url = 'https://basketball.realgm.com/nba/info/salary_cap'
    response = requests.get(url, timeout=30)  # Never wait forever on a stalled connection
    if response.status_code != 200:  # 200 = success
        print(response.status_code)
        raise ValueError('Could not get response from URL!')

    page_soup = soup(response.text, 'lxml')

    # Fill dict[season] = salary_cap from every <tr inside the <tbody tag
    salary_cap_dict = {}
    for row in page_soup.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        season = int(cells[2].text.split('-')[-1])  # The third <td tag has the season year
        salary_cap = int(cells[3].text.replace('$', '').replace(',', ''))  # The fourth has the cap
        salary_cap_dict[season] = salary_cap

    return salary_cap_dict[year]

In [13]:
# Fill passed dataframe with salaries from passed dictionary
def combine_data(year, stats_df, salaries_dict, salary_cap):
    """Add salary columns to ``stats_df`` in place.

    Three columns are inserted:

    * ``'salary'`` (position 1): the player's salary looked up by the
      ``'name'`` column, or NaN when the name is not in ``salaries_dict``.
    * ``'salary cap'`` (position 2): ``salary_cap`` for every row.
    * ``'salary ratio'`` (last): ``salary / salary_cap``, or NaN when the
      salary is missing. The ratio normalizes player salaries across seasons,
      which is needed because the salary cap increases every year.

    Args:
        year: Unused; kept so existing callers keep working.
        stats_df: DataFrame with a ``'name'`` column; modified in place.
        salaries_dict: Player name mapped to salary.
        salary_cap: Salary cap of the season.

    Returns:
        None.
    """
    # Series.map() keeps the index of stats_df, so each salary lands on its
    # own row whatever the index labels are. Forcing float keeps the column
    # dtype the same whether or not some players are missing a salary.
    salaries = stats_df['name'].map(salaries_dict).astype(float)

    stats_df.insert(1, 'salary', salaries)
    stats_df.insert(2, 'salary cap', salary_cap)  # Fill with year's salary_cap
    stats_df.insert(len(stats_df.columns), 'salary ratio', salaries / salary_cap)
        

In [14]:
# Get all required data for a list of specified years. Return a df with all data
def get_data(years_list):
    """Build one DataFrame with stats and salaries for all requested years.

    For every year the stats table, the salaries and the salary cap are
    fetched and combined; the per-year tables are then stacked, each keeping
    its own row index.

    Args:
        years_list: Iterable of seasons to download.

    Returns:
        pandas.DataFrame: All years stacked, without any row that contains a
        NaN value. Those rows have either missing salary data from ESPN or a
        player who didn't have enough stats to calculate advanced stats.
    """
    yearly_frames = []  # One combined stats/salary DataFrame per year

    for year in years_list:
        stats_df = get_stats_data(year)
        salaries_dict = get_salary_data(year)
        salary_cap = get_salary_cap(year)
        combine_data(year, stats_df, salaries_dict, salary_cap)  # Adds the salary columns in place
        yearly_frames.append(stats_df)

    # DataFrame.append() no longer exists in pandas 2.x, so stack the frames
    # with pd.concat(). It rejects an empty list, hence the fallback.
    df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

    # Drop any rows that have NaN values
    return df.dropna()

In [15]:
# ENTER WHICH YEARS YOU WANT HERE. ONLY PART OF THIS NOTEBOOK THAT NEEDS TO BE TOUCHED!!!
# Earliest year is 2000 (1999-2000 season)
# Create df with data from the specified years (2000 through 2019 inclusive)
df = get_data(list(range(2000, 2020)))

In [16]:
# Pickle dataframe to use in other project file
with open('stats_df.pkl', 'wb') as pkl_out:
    pickle.dump(df, pkl_out)