In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# url that we are scraping
url = "http://www.basketball-reference.com/draft/NBA_2014.html"

# this is the html from the given url
html = urlopen(url)

In [3]:
soup = BeautifulSoup(html)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [4]:
type(soup)

bs4.BeautifulSoup

In [5]:
column_headers = [th.getText() for th in 
                 soup.findAll('tr', limit=2)[1].findAll('th') if th.text !='Rk']

In [6]:
column_headers

['Pk',
 'Tm',
 'Player',
 'College',
 'Yrs',
 'G',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'FG%',
 '3P%',
 'FT%',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'WS',
 'WS/48',
 'BPM',
 'VORP']

In [7]:
url_template = "http://www.basketball-reference.com/draft/NBA_{year}.html"

In [8]:
# create an empty DataFrame
draft_df = pd.DataFrame()

In [9]:
for year in range(2013, 2018): #for each year
    url = url_template.format(year=year)
    
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html5lib')
    
    data_rows = soup.findAll('tr')[2:]
    player_data = [[td.getText() for td in data_rows[i].findAll('td')]
                  for i in range(len(data_rows))]
    
    year_df = pd.DataFrame(player_data, columns=column_headers)
    year_df.insert(0, 'Draft_Yr', year)
    
    draft_df = draft_df.append(year_df, ignore_index=True)

In [10]:
draft_df.head()

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST,WS,WS/48,BPM,VORP
0,2013,1,CLE,Anthony Bennett,"University of Nevada, Las Vegas",4,151,1905,658,472,...,0.261,0.67,12.6,4.4,3.1,0.5,0.5,0.013,-5.6,-1.7
1,2013,2,ORL,Victor Oladipo,Indiana University,6,369,12303,6419,1674,...,0.354,0.797,33.3,17.4,4.5,3.8,22.2,0.086,1.0,9.2
2,2013,3,WAS,Otto Porter,Georgetown University,6,345,9123,3623,1687,...,0.404,0.789,26.4,10.5,4.9,1.4,26.0,0.137,2.2,9.7
3,2013,4,CHA,Cody Zeller,Indiana University,6,315,7097,2496,1771,...,0.294,0.734,22.5,7.9,5.6,1.3,20.6,0.139,0.7,4.8
4,2013,5,PHO,Alex Len,University of Maryland,6,337,6704,2437,2183,...,0.25,0.707,19.9,7.2,6.5,0.8,12.3,0.088,-1.8,0.3


In [11]:
draft_df.tail()

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST,WS,WS/48,BPM,VORP
305,2017,56,BOS,Jabari Bird,University of California,1.0,13.0,115.0,39.0,19.0,...,0.429,0.462,8.8,3.0,1.5,0.6,0.2,0.098,-2.1,0.0
306,2017,57,BRK,Aleksandar Vezenkov,,,,,,,...,,,,,,,,,,
307,2017,58,NYK,Ognjen Jaramaz,,,,,,,...,,,,,,,,,,
308,2017,59,SAS,Jaron Blossomgame,Clemson University,,,,,,...,,,,,,,,,,
309,2017,60,ATL,Alpha Kaba,,,,,,,...,,,,,,,,,,


In [21]:
# Convert data to proper data types
draft_df = draft_df.convert_objects(convert_numeric=True)

# Get rid of the rows full of null values
draft_df = draft_df[draft_df.Player.notnull()]

# Replace NaNs with 0s
draft_df = draft_df.fillna(0)

# Rename Columns
draft_df.rename(columns={'WS/48':'WS_per_48'}, inplace=True)
# Change % symbol
draft_df.columns = draft_df.columns.str.replace('%', '_Perc')
# Add per_G to per game stats
draft_df.columns.values[15:19] = [draft_df.columns.values[15:19][col] + 
                                  "_per_G" for col in range(4)]

# Changing the Data Types to int
draft_df.loc[:,'Yrs':'AST'] = draft_df.loc[:,'Yrs':'AST'].astype(int)

# Delete the 'Rk' column
draft_df.drop('Rk', axis='columns', inplace=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


In [22]:
draft_df.dtypes

Draft_Yr             int64
Pk                   int32
Tm                  object
Player              object
College             object
Yrs                  int32
G                    int32
MP                   int32
PTS                  int32
TRB                  int32
AST                  int32
FG_Perc            float64
3P_Perc            float64
FT_Perc            float64
MP                   int32
PTS_per_G_per_G    float64
TRB_per_G_per_G    float64
AST_per_G_per_G    float64
WS_per_G_per_G     float64
WS_per_48          float64
BPM                float64
VORP               float64
dtype: object

In [23]:
draft_df['Pk'] = draft_df['Pk'].astype(int) # change Pk to int

In [24]:
draft_df.dtypes

Draft_Yr             int64
Pk                   int32
Tm                  object
Player              object
College             object
Yrs                  int32
G                    int32
MP                   int32
PTS                  int32
TRB                  int32
AST                  int32
FG_Perc            float64
3P_Perc            float64
FT_Perc            float64
MP                   int32
PTS_per_G_per_G    float64
TRB_per_G_per_G    float64
AST_per_G_per_G    float64
WS_per_G_per_G     float64
WS_per_48          float64
BPM                float64
VORP               float64
dtype: object

In [25]:
draft_df.isnull().sum() # No missing values in our DataFrame

Draft_Yr           0
Pk                 0
Tm                 0
Player             0
College            0
Yrs                0
G                  0
MP                 0
PTS                0
TRB                0
AST                0
FG_Perc            0
3P_Perc            0
FT_Perc            0
MP                 0
PTS_per_G_per_G    0
TRB_per_G_per_G    0
AST_per_G_per_G    0
WS_per_G_per_G     0
WS_per_48          0
BPM                0
VORP               0
dtype: int64

In [26]:
draft_df.head()

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MP.1,PTS_per_G_per_G,TRB_per_G_per_G,AST_per_G_per_G,WS_per_G_per_G,WS_per_48,BPM,VORP
0,2013,1,CLE,Anthony Bennett,"University of Nevada, Las Vegas",4,151,1905,658,472,...,0.261,0.67,1905,4.4,3.1,0.5,0.5,0.013,-5.6,-1.7
1,2013,2,ORL,Victor Oladipo,Indiana University,6,369,12303,6419,1674,...,0.354,0.797,12303,17.4,4.5,3.8,22.2,0.086,1.0,9.2
2,2013,3,WAS,Otto Porter,Georgetown University,6,345,9123,3623,1687,...,0.404,0.789,9123,10.5,4.9,1.4,26.0,0.137,2.2,9.7
3,2013,4,CHA,Cody Zeller,Indiana University,6,315,7097,2496,1771,...,0.294,0.734,7097,7.9,5.6,1.3,20.6,0.139,0.7,4.8
4,2013,5,PHO,Alex Len,University of Maryland,6,337,6704,2437,2183,...,0.25,0.707,6704,7.2,6.5,0.8,12.3,0.088,-1.8,0.3


In [18]:
draft_df.to_csv("draft_data_2013_to_2017.csv")