In [396]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [397]:
url = "http://www.basketball-reference.com/draft/NBA_2016.html"
html = urlopen(url)

In [398]:
soup = BeautifulSoup(html, "html5lib")
type(soup)

bs4.BeautifulSoup

In [399]:
# soup.findAll('tr', limit=2)

soup.findAll('tr', limit=4)[3].findAll('th') # Skip first 2--groups 3 columns for mobile. Third 'tr' is for browser.
column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')[1:]]
column_headers



['Pk',
 'Tm',
 'Player',
 'College',
 'Yrs',
 'G',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'FG%',
 '3P%',
 'FT%',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'WS',
 'WS/48',
 'BPM',
 'VORP']

In [400]:
data_rows = soup.findAll('tr')[2:]  # skip the first 2 header rows

type(data_rows)
# data_rows

list

In [401]:
player_data = [[td.getText() for td in data_rows[i].findAll('td')[:]]
                for i in range(len(data_rows))]
# player_data


In [402]:
df = pd.DataFrame(player_data, columns=column_headers)
df.head(3)

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP
0,1,PHI,Ben Simmons,Louisiana State University,,,,,,,...,,,,,,,,,,
1,2,LAL,Brandon Ingram,Duke University,1.0,79.0,2279.0,740.0,316.0,166.0,...,0.294,0.621,28.8,9.4,4.0,2.1,-0.3,-0.007,-3.8,-1.1
2,3,BOS,Jaylen Brown,University of California,1.0,78.0,1341.0,515.0,220.0,64.0,...,0.341,0.685,17.2,6.6,2.8,0.8,1.5,0.053,-4.0,-0.7


In [403]:
df[df['Pk'].isnull()] # Find none values

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP
30,,,,,,,,,,,...,,,,,,,,,,
31,,,,,,,,,,,...,,,,,,,,,,


In [404]:
df = df[df.Player.notnull()] # Assign a df subset to 'df' as a way of removing ^2 rows

In [405]:
df[df['Pk'].isnull()]


Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP


In [406]:
df.columns

Index(['Pk', 'Tm', 'Player', 'College', 'Yrs', 'G', 'MP', 'PTS', 'TRB', 'AST',
       'FG%', '3P%', 'FT%', 'MP', 'PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM',
       'VORP'],
      dtype='object')

In [407]:
# Renaming columns with dictionary {'key/oldName:value/newName'}
df.rename(columns={'WS/48':'WS_per_48'}, inplace=True)
# df.rename(columns={'career_AST.1':'AST_per_g'}, inplace=True)

df.columns
cols=pd.Series(df.columns)
for dup in df.columns.get_duplicates(): cols[df.columns.get_loc(dup)]=[dup+'.'+str(d_idx) if d_idx!=0 else dup for d_idx in range(df.columns.get_loc(dup).sum())]
df.columns=cols
df.columns

Index(['Pk', 'Tm', 'Player', 'College', 'Yrs', 'G', 'MP', 'PTS', 'TRB', 'AST',
       'FG%', '3P%', 'FT%', 'MP.1', 'PTS.1', 'TRB.1', 'AST.1', 'WS',
       'WS_per_48', 'BPM', 'VORP'],
      dtype='object')

In [408]:
# Use built in string method .replace()
df.columns = df.columns.str.replace('%', '_Perc')
df.columns
df.dtypes


Pk           object
Tm           object
Player       object
College      object
Yrs          object
G            object
MP           object
PTS          object
TRB          object
AST          object
FG_Perc      object
3P_Perc      object
FT_Perc      object
MP.1         object
PTS.1        object
TRB.1        object
AST.1        object
WS           object
WS_per_48    object
BPM          object
VORP         object
dtype: object

In [438]:
# df1 = df.loc[:,'MP':'PTS'].astype(float)
df1 = df.loc[:,'FG_Perc':'VORP'].apply(pd.to_numeric, errors='coerce')
df1 = df1[:].fillna(0)

df2 = df.loc[:,'Yrs':'AST'].apply(pd.to_numeric, errors='coerce')
df2 = df2[:].fillna(0)
df2 = df2.loc[:,'Yrs':'AST'].astype(int)
# df1 = df.loc[:,'FG_Perc':'VORP'].astype(float)
# df2 = df.loc[:,'FG_Perc':'VORP'].apply(pd.to_numeric, errors='coerce')


df3 = df.loc[:,'Pk':'College']

print(df1)
print(df2)
print(df3)

"""
originally str, .apply.to_numeric into floats, then fill NaNs with '0', before casting as int. 
"""


    FG_Perc  3P_Perc  FT_Perc  MP.1  PTS.1  TRB.1  AST.1   WS  WS_per_48  \
0     0.000    0.000    0.000   0.0    0.0    0.0    0.0  0.0      0.000   
1     0.402    0.294    0.621  28.8    9.4    4.0    2.1 -0.3     -0.007   
2     0.454    0.341    0.685  17.2    6.6    2.8    0.8  1.5      0.053   
3     0.354    0.277    0.364  13.3    3.4    2.4    0.5 -0.3     -0.029   
4     0.377    0.288    0.610  17.1    3.8    2.1    2.4  0.1      0.004   
5     0.426    0.391    0.842  23.0   10.6    3.3    1.5  1.3      0.032   
6     0.404    0.334    0.883  21.5    9.9    2.6    2.1  1.3      0.037   
7     0.449    0.321    0.624  21.3    9.2    4.2    0.7  1.8      0.050   
8     0.583    0.000    0.544  11.6    3.1    3.1    0.2  1.6      0.125   
9     0.459    0.378    0.653   9.9    4.0    2.0    0.4  1.3      0.113   
10    0.399    0.321    0.657  20.1    5.9    3.6    1.0  0.8      0.022   
11    0.400    0.324    0.787  16.6    5.7    2.7    0.9  1.1      0.054   
12    0.549 

In [444]:
df = pd.concat([df3, df2, df1], axis=1)
df.dtypes

Pk            object
Tm            object
Player        object
College       object
Yrs            int64
G              int64
MP             int64
PTS            int64
TRB            int64
AST            int64
FG_Perc      float64
3P_Perc      float64
FT_Perc      float64
MP.1         float64
PTS.1        float64
TRB.1        float64
AST.1        float64
WS           float64
WS_per_48    float64
BPM          float64
VORP         float64
dtype: object

In [392]:
# df.loc[:,'Yrs':'VORP'].apply(pd.to_numeric, errors='coerce')
# df.apply(pd.to_numeric, errors='coerce')

# df = df[:].fillna(0)
# print(df)

# df.loc[:,'Yrs':'VORP'].fillna(0)

# df.loc[:,'Yrs':'AST'] = df.loc[:,'Yrs':'AST'].astype(int)

df.loc[:,'Yrs':'AST'] = df.loc[:,'Yrs':'AST'].astype(int)
df.loc[:,'Yrs':'VORP'].apply(pd.to_numeric, errors='coerce')
df.loc[:,'Tm':'College'] = df.loc[:,'Tm':'College'].astype(str)

df.dtypes
# df = df.apply(pd.to_numeric, args=('coerce',))

ValueError: invalid literal for int() with base 10: ''

In [None]:
df.dtypes # Look at each column's datatype


# df.loc[:,'Yrs':'AST'] = df.loc[:,'Yrs':'AST'].astype(int)
# df.dtypes

In [234]:
df.loc[:,'Tm':'College'] = df.loc[:,'Tm':'College'].astype(str)
df.dtypes

Pk             int64
Tm            object
Player        object
College       object
Yrs            int64
G              int64
MP             int64
PTS            int64
TRB            int64
AST            int64
FG_Perc      float64
3P_Perc      float64
FT_Perc      float64
MP.1         float64
PTS.1        float64
TRB.1        float64
AST.1        float64
WS           float64
WS_per_48    float64
BPM          float64
VORP         float64
dtype: object

In [235]:
df

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,...,3P_Perc,FT_Perc,MP.1,PTS.1,TRB.1,AST.1,WS,WS_per_48,BPM,VORP
0,1,0.0,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,1,79,2279,740,316,166,...,0.294,0.621,28.8,9.4,4.0,2.1,-0.3,-0.007,-3.8,-1.1
2,3,0.0,0.0,0.0,1,78,1341,515,220,64,...,0.341,0.685,17.2,6.6,2.8,0.8,1.5,0.053,-4.0,-0.7
3,4,0.0,0.0,0.0,1,43,574,146,103,23,...,0.277,0.364,13.3,3.4,2.4,0.5,-0.3,-0.029,-4.3,-0.3
4,5,0.0,0.0,0.0,1,78,1333,293,166,188,...,0.288,0.61,17.1,3.8,2.1,2.4,0.1,0.004,-2.2,-0.1
5,6,0.0,0.0,0.0,1,82,1888,866,269,121,...,0.391,0.842,23.0,10.6,3.3,1.5,1.3,0.032,-2.7,-0.4
6,7,0.0,0.0,0.0,1,82,1764,811,214,170,...,0.334,0.883,21.5,9.9,2.6,2.1,1.3,0.037,-2.6,-0.3
7,8,0.0,0.0,0.0,1,82,1743,753,348,60,...,0.321,0.624,21.3,9.2,4.2,0.7,1.8,0.05,-1.6,0.2
8,9,0.0,0.0,0.0,1,54,626,165,165,12,...,0.0,0.544,11.6,3.1,3.1,0.2,1.6,0.125,-0.7,0.2
9,10,0.0,0.0,0.0,1,57,562,226,114,23,...,0.378,0.653,9.9,4.0,2.0,0.4,1.3,0.113,-1.5,0.1
