In [1]:
import string
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# URL template for the player index pages; the "{}" placeholder is filled
# with a single letter via str.format when the pages are scraped.
url = "http://www.basketball-reference.com/players/{}/"

In [3]:
def scrape_data(url):
    """Scrape one of Bref's player index pages into a DataFrame.

    The page is downloaded once and the same HTML is used both to collect
    the player links and to parse the player table, so each page costs a
    single HTTP request.

    Parameters
    ----------
    url : str
        Address of the player index page to scrape.

    Returns
    -------
    pandas.DataFrame
        The first table found on the page, with an extra "Link" column
        holding the href of every anchor matched by the "th a" selector
        (used later to extract the bref ids).

    Raises
    ------
    ValueError
        If no table is found on the page, or if the number of links does
        not match the number of table rows (raised by the column
        assignment).
    """
    # Fetch the page exactly once and make sure the connection is closed.
    with urlopen(url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        html = response.read().decode(charset, errors="replace")

    # Use bs4 to get the link for each player in order to extract their
    # bref ids
    soup = BeautifulSoup(html, "lxml")
    links = [anchor.attrs["href"] for anchor in soup.select("th a")]

    # Use pandas to easily scrape the table from the HTML already in hand;
    # wrapping it in StringIO passes it as a file-like object.
    df = pd.read_html(StringIO(html))[0]
    df["Link"] = links
    return df

In [4]:
# Letters used to build the index-page URLs: the lowercase alphabet with
# "x" left out.
abc = "".join(letter for letter in string.ascii_lowercase if letter != "x")

In [5]:
# Scrape one index page per letter, keeping the frames in alphabetical order
dfs = [
    scrape_data(url.format(letter))
    for letter in abc
]

In [6]:
# Sanity check: there should be one DataFrame per letter in `abc`
len(dfs)

25

In [7]:
# Stack the per-letter frames into one table with a fresh 0..n-1 index
df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [8]:
# Preview the first rows of the combined table
df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,College,Link
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,/players/a/abdelal01.html
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,/players/a/abdulza01.html
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",/players/a/abdulka01.html
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,/players/a/abdulma02.html
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,/players/a/abdulta01.html


In [9]:
# Persist the scraped table before any cleaning is applied
raw_csv_path = "raw_data/bref_players_06_06_17_raw.csv"
df.to_csv(raw_csv_path, index=False)

In [11]:
# Pull the bref id out of each link, e.g. "/players/a/abdelal01.html"
# -> "abdelal01". The pattern is a raw string so that "\." reaches the
# regex engine as an escaped dot instead of being an invalid Python
# string escape.
df["Bref_ID"] = df["Link"].str.extract(r"/.*/.*/(.*)\.", expand=False)

In [12]:
# Preview: confirm the Bref_ID column holds the id taken from each link
df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,College,Link,Bref_ID
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,/players/a/abdelal01.html,abdelal01
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,/players/a/abdulza01.html,abdulza01
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",/players/a/abdulka01.html,abdulka01
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,/players/a/abdulma02.html,abdulma02
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,/players/a/abdulta01.html,abdulta01


In [13]:
# Use an underscore so the column can be reached with attribute access
# (df.Birth_Date). Reassigning instead of inplace=True keeps the step an
# explicit, chainable transformation.
df = df.rename(columns={"Birth Date": "Birth_Date"})

In [14]:
# Preview: the "Birth Date" column should now be named "Birth_Date"
df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth_Date,College,Link,Bref_ID
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University,/players/a/abdelal01.html,abdelal01
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University,/players/a/abdulza01.html,abdulza01
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles",/players/a/abdulka01.html,abdulka01
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University,/players/a/abdulma02.html,abdulma02
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University,/players/a/abdulta01.html,abdulta01


In [15]:
# Convert the birth-date strings (e.g. "June 24, 1968") to datetime values
df["Birth_Date"] = pd.to_datetime(df["Birth_Date"])

In [16]:
# Preview: Birth_Date should now display as parsed dates
df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth_Date,College,Link,Bref_ID
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,1968-06-24,Duke University,/players/a/abdelal01.html,abdelal01
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,1946-04-07,Iowa State University,/players/a/abdulza01.html,abdulza01
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225.0,1947-04-16,"University of California, Los Angeles",/players/a/abdulka01.html,abdulka01
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,1969-03-09,Louisiana State University,/players/a/abdulma02.html,abdulma02
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,1974-11-03,San Jose State University,/players/a/abdulta01.html,abdulta01


In [19]:
# Get an indicator for Hall of Fame: names carrying a "*" are flagged 1,
# all others 0. The asterisk is matched literally (regex=False), which
# avoids the invalid "\*" escape sequence in a non-raw string.
df["HOF"] = df["Player"].str.contains("*", regex=False).astype(int)

In [20]:
# Now strip the * from the player names. str.strip takes a set of
# characters (not a regex), so pass just "*"; the previous "\*" also
# stripped backslashes and relied on an invalid string escape.
df["Player"] = df["Player"].str.strip("*")

In [30]:
# Split "feet-inches" strings such as "6-10" into two columns (0 and 1)
heights = df["Ht"].str.split("-", expand=True)

In [37]:
# Show the Ht entries for which a height in inches cannot be computed
df.loc[
    heights[0].astype(float).mul(12).add(heights[1].astype(float)).isnull(),
    "Ht",
]

2097    NaN
Name: Ht, dtype: object

In [38]:
# Inspect the row whose height is missing. The index label 2097 is copied
# from the previous cell's output and will change if the scraped data changes.
df.loc[2097]

Player                         George Karl
From                                  1974
To                                    1978
Pos                                    NaN
Ht                                     NaN
Wt                                     NaN
Birth_Date             1952-05-12 00:00:00
College       University of North Carolina
Link              /players/k/karlge01.html
Bref_ID                           karlge01
HOF                                      0
Name: 2097, dtype: object

In [40]:
# Total height in inches: feet * 12 + inches (NaN where Ht is missing)
ht_inches = heights[0].astype(float).mul(12).add(heights[1].astype(float))

In [41]:
# Attach the computed heights (in inches) to the table as a new column
df["Ht_Inches"] = ht_inches

In [42]:
# Preview: the HOF and Ht_Inches columns should now be present
df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth_Date,College,Link,Bref_ID,HOF,Ht_Inches
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,1968-06-24,Duke University,/players/a/abdelal01.html,abdelal01,0,82.0
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,1946-04-07,Iowa State University,/players/a/abdulza01.html,abdulza01,0,81.0
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,1947-04-16,"University of California, Los Angeles",/players/a/abdulka01.html,abdulka01,1,86.0
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,1969-03-09,Louisiana State University,/players/a/abdulma02.html,abdulma02,0,73.0
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,1974-11-03,San Jose State University,/players/a/abdulta01.html,abdulta01,0,78.0


In [43]:
# Summarize column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4460 entries, 0 to 4459
Data columns (total 12 columns):
Player        4460 non-null object
From          4460 non-null int64
To            4460 non-null int64
Pos           4459 non-null object
Ht            4459 non-null object
Wt            4454 non-null float64
Birth_Date    4429 non-null datetime64[ns]
College       4169 non-null object
Link          4460 non-null object
Bref_ID       4460 non-null object
HOF           4460 non-null int64
Ht_Inches     4459 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 418.2+ KB


In [45]:
# Write the cleaned table (with ids, HOF flag and height in inches) to disk
processed_csv_path = "processed_data/bref_player_info_and_id_06_06_17.csv"
df.to_csv(processed_csv_path, index=False)