In [1]:
# import dependencies
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# launch a visible (non-headless) Chrome session through the local chromedriver binary
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# root url shared by every draft combine page
base = 'https://stats.nba.com/draft/'

# season start years 2000 through 2019 (arange excludes the stop value)
seasons = np.arange(2000, 2019 + 1)

# page slugs that get appended to the base url
pages = [
    'combine-strength-agility',
    'combine-anthro',
]

In [4]:
# start both result tables as empty frames, one per combine page
nba_stats_strength, nba_stats_anthro = pd.DataFrame(), pd.DataFrame()

In [5]:
def _scrape_combine_table(browser, url, season, wait_seconds=3):
    """Load one combine page in the browser and return its stats table.

    Parameters
    ----------
    browser : splinter browser
        Open browser session used to visit ``url``.
    url : str
        Page to load.
    season : int
        Season start year, stored in a float ``season`` column on every row.
    wait_seconds : float, optional
        Pause after ``visit`` before the page html is read.

    Returns
    -------
    pandas.DataFrame
        First table found inside the ``nba-stat-table`` div, plus ``season``.

    Raises
    ------
    ValueError
        If the page has no ``nba-stat-table`` div or no table inside it.
    """
    browser.visit(url)

    # pause before reading the html so the page has time to finish loading
    time.sleep(wait_seconds)

    soup = BeautifulSoup(browser.html, 'html.parser')
    table = soup.find('div', class_='nba-stat-table')
    if table is None:
        raise ValueError(f'no nba-stat-table element found at {url}')

    # read_html takes a file-like object; passing literal html text is deprecated
    combine_df = pd.read_html(StringIO(str(table)))[0]

    # float keeps the dtype produced by the previous NaN-then-fillna approach
    combine_df['season'] = float(season)
    return combine_df


# per-page list of season frames, concatenated once after scraping
scraped = {page: [] for page in pages}

for page in pages:
    main = f'{base}{page}'

    for season in seasons:
        if season == 2019:
            # 2019 is read from the page's default view (no SeasonYear parameter)
            # NOTE(review): assumes the site still defaults to 2019-20 - confirm
            url = main
        else:
            url = f'{main}/?SeasonYear={season}-{str(season + 1)[2:]}'

        scraped[page].append(_scrape_combine_table(browser, url, season))

# every season, including 2019, is kept; each frame retains its own row index
nba_stats_strength = pd.concat(scraped['combine-strength-agility'])
nba_stats_anthro = pd.concat(scraped['combine-anthro'])

In [6]:
# relabel the scraped columns by position, then mark '-' cells as missing
nba_stats_strength.rename(
    columns={
        nba_stats_strength.columns[idx]: label
        for idx, label in enumerate([
            'player', 'position', 'lane_agility', 'shuttle_run',
            'three_quarter_sprint', 'standing_vertical', 'max_vertical',
            'bench_press', 'season',
        ])
    },
    inplace=True,
)
nba_stats_strength.replace(to_replace='-', value=np.nan, inplace=True)

nba_stats_strength.head()

Unnamed: 0,player,position,lane_agility,shuttle_run,three_quarter_sprint,standing_vertical,max_vertical,bench_press,season
0,Malik Allen,PF-C,11.83,,3.38,25.5,29.0,13.0,2000.0
1,Harold Arceneaux,SG-SF,13.8,,,,29.0,0.0,2000.0
2,Lamont Barnes,PF-C,12.3,,3.4,28.0,29.5,10.0,2000.0
3,Mario Bland,PF,13.04,,3.47,27.0,31.0,15.0,2000.0
4,Primoz Brezec,C,11.53,,3.55,26.0,29.5,,2000.0


In [7]:
def parse_ht(ht):
    """Convert a feet-and-inches value such as ``6' 7.5"`` to total inches.

    Parameters
    ----------
    ht : object
        Value to parse. It is converted with ``str`` first, so non-string
        input such as ``NaN`` is accepted.

    Returns
    -------
    float
        ``12 * feet + inches``, or ``np.nan`` when the text does not hold a
        numeric feet part followed by a numeric inches part.
    """
    try:
        # split on the feet mark, with or without a trailing space
        parts = re.split("' |'", str(ht))
        feet = float(parts[0])
        inches = float(parts[1].replace('"', ''))
    except (IndexError, ValueError):
        # IndexError: no feet mark present; ValueError: a non-numeric piece.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan
    return (12 * feet) + inches

In [8]:
# relabel the scraped columns by position
nba_stats_anthro.rename(
    columns={
        nba_stats_anthro.columns[idx]: label
        for idx, label in enumerate([
            'player', 'position', 'body_fat', 'hand_length', 'hand_width',
            'height_no_shoes', 'height_shoes', 'standing_reach', 'weight',
            'wingspan', 'season',
        ])
    },
    inplace=True,
)

# '-' and '-%' cells become missing values
nba_stats_anthro.replace(to_replace=['-', '-%'], value=np.nan, inplace=True)

# keep the text before the percent sign and store it as a float
nba_stats_anthro['body_fat'] = (
    nba_stats_anthro['body_fat'].str.split('%').str[0].astype(float)
)

# feet-and-inches text -> inches (height_no_shoes is parsed here and dropped below)
for length_col in ('height_no_shoes', 'height_shoes', 'standing_reach', 'wingspan'):
    nba_stats_anthro[length_col] = nba_stats_anthro[length_col].apply(parse_ht)

nba_stats_anthro.drop(columns=['height_no_shoes'], inplace=True)
nba_stats_anthro.head()

Unnamed: 0,player,position,body_fat,hand_length,hand_width,height_shoes,standing_reach,weight,wingspan,season
0,Malik Allen,PF-C,,,,,109.0,271.0,86.5,2000.0
1,Harold Arceneaux,SG-SF,,,,,103.0,219.0,80.5,2000.0
2,Lamont Barnes,PF-C,,,,,108.0,235.5,87.5,2000.0
3,Mario Bland,PF,,,,,103.0,287.0,84.0,2000.0
4,Primoz Brezec,C,,,,,110.0,243.0,86.0,2000.0


In [9]:
# keep only the player/position/season combinations present in both tables
combine_full = nba_stats_strength.merge(
    nba_stats_anthro,
    how='inner',
    on=['player', 'position', 'season'],
)

combine_full.to_csv('../data/combine_data.csv', index=False)