In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import string

In [2]:
url = "http://www.sports-reference.com/cbb/players/{}-index.html"

In [3]:
def scrape_data(url):
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    players = soup.select("#content div p")
    data = [[player.find("a").attrs["href"], player.get_text()] 
            for player in players]
    df = pd.DataFrame(data, columns=["link", "text"])
    return df

In [4]:
abc = string.ascii_lowercase

In [5]:
dfs = [scrape_data(url.format(i)) for i in abc]

In [6]:
len(dfs)

26

In [7]:
df = pd.concat(dfs, ignore_index=True)

In [8]:
df.head()

Unnamed: 0,link,text
0,/cbb/players/menghe-anyam-1.html,Menghe a'Nyam (2007-2008) Canisius
1,/cbb/players/jordan-aaberg-1.html,Jordan Aaberg (2010-2014) North Dakota State
2,/cbb/players/karl-aaker-1.html,Karl Aaker (2002-2005) Portland
3,/cbb/players/steve-aaker-1.html,Steve Aaker (1971-1973) Colorado
4,/cbb/players/waine-aalto-1.html,Waine Aalto (1971-1971) Fresno State


In [9]:
df.tail()

Unnamed: 0,link,text
103503,/cbb/players/jack-zyla-1.html,Jack Zyla (1963-1965) New Hampshire
103504,/cbb/players/michal-zylinski-1.html,Michal Zylinski (2009-2009) Stony Brook
103505,/cbb/players/brock-zylstra-1.html,Brock Zylstra (2010-2013) Brigham Young
103506,/cbb/players/gene-zysda-1.html,Gene Zysda (1961-1961) Colorado
103507,/cbb/players/gene-zyzda-1.html,Gene Zyzda (1960-1962) Colorado


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103508 entries, 0 to 103507
Data columns (total 2 columns):
link    103508 non-null object
text    103508 non-null object
dtypes: object(2)
memory usage: 1.6+ MB


In [11]:
df[["Player", "Years", "College"]] = df.text.str.split("( \(.*\) )", expand=True)

In [12]:
df.head()

Unnamed: 0,link,text,Player,Years,College
0,/cbb/players/menghe-anyam-1.html,Menghe a'Nyam (2007-2008) Canisius,Menghe a'Nyam,(2007-2008),Canisius
1,/cbb/players/jordan-aaberg-1.html,Jordan Aaberg (2010-2014) North Dakota State,Jordan Aaberg,(2010-2014),North Dakota State
2,/cbb/players/karl-aaker-1.html,Karl Aaker (2002-2005) Portland,Karl Aaker,(2002-2005),Portland
3,/cbb/players/steve-aaker-1.html,Steve Aaker (1971-1973) Colorado,Steve Aaker,(1971-1973),Colorado
4,/cbb/players/waine-aalto-1.html,Waine Aalto (1971-1971) Fresno State,Waine Aalto,(1971-1971),Fresno State


In [13]:
df.Player.values[:10]

array(["Menghe a'Nyam", 'Jordan Aaberg', 'Karl Aaker', 'Steve Aaker',
       'Waine Aalto', 'Mike Aaman', 'Craig Aamot', 'Art Aaron',
       'Carlton Aaron', 'David Aaron'], dtype=object)

In [14]:
df.Years.values[:10]

array([' (2007-2008) ', ' (2010-2014) ', ' (2002-2005) ', ' (1971-1973) ',
       ' (1971-1971) ', ' (2013-2017) ', ' (1992-1993) ', ' (1981-1984) ',
       ' (2003-2005) ', ' (1986-1989) '], dtype=object)

In [16]:
df.College.values[:10]

array(['Canisius', 'North Dakota State', 'Portland', 'Colorado',
       'Fresno State', 'Rhode Island; Wagner', 'Marquette', 'Northwestern',
       'Missouri-Kansas City', 'Santa Clara'], dtype=object)

In [24]:
df[["link", "text"]].to_csv("raw_data/sref_cbb_player_info_07_06_17_raw.csv", index=False)

In [19]:
df.loc[:, "Years"] = df.Years.str.replace("(\(|\))", "").str.rstrip().str.lstrip()

In [20]:
df.head()

Unnamed: 0,link,text,Player,Years,College
0,/cbb/players/menghe-anyam-1.html,Menghe a'Nyam (2007-2008) Canisius,Menghe a'Nyam,2007-2008,Canisius
1,/cbb/players/jordan-aaberg-1.html,Jordan Aaberg (2010-2014) North Dakota State,Jordan Aaberg,2010-2014,North Dakota State
2,/cbb/players/karl-aaker-1.html,Karl Aaker (2002-2005) Portland,Karl Aaker,2002-2005,Portland
3,/cbb/players/steve-aaker-1.html,Steve Aaker (1971-1973) Colorado,Steve Aaker,1971-1973,Colorado
4,/cbb/players/waine-aalto-1.html,Waine Aalto (1971-1971) Fresno State,Waine Aalto,1971-1971,Fresno State


In [21]:
df["From"] = df.Years.str.split("-", expand=True)[0].astype(int)
df["To"] = df.Years.str.split("-", expand=True)[1].astype(int)

ValueError: invalid literal for int() with base 10: 'Gordo Castillo 2011'

In [22]:
df.Years.unique()

array(['2007-2008', '2010-2014', '2002-2005', '1971-1973', '1971-1971',
       '2013-2017', '1992-1993', '1981-1984', '2003-2005', '1986-1989',
       '2013-2014', '1994-1994', '1985-1985', '1990-1992', '2000-2000',
       '1987-1988', '2005-2008', '2015-2017', '2004-2007', '2014-2014',
       '1963-1963', '2007-2007', '2013-2013', '2014-2017', '1994-1995',
       '1955-1957', '1979-1982', '1962-1962', '1965-1965', '1995-1995',
       '1956-1958', '1958-1958', '2016-2016', '2006-2006', '1996-1998',
       '1961-1963', '2008-2011', '1987-1987', '1994-1997', '1987-1990',
       '1991-1992', '2012-2012', '1992-1992', '2006-2007', '2013-2015',
       '2014-2015', '2007-2010', '1967-1969', '1996-1999', '1997-1997',
       '1995-1996', '1989-1990', '1993-1993', '1993-1994', '2013-2016',
       '1998-2002', '2016-2017', '1995-1997', '2015-2016', '2002-2004',
       '2017-2017', '1996-1996', '1998-1998', '2010-2011', '1967-1967',
       '1959-1960', '1980-1980', '1978-1979', '1999-2001', '1995

In [23]:
df.loc[df.Years=="Gordo Castillo 2011-2011", :]

Unnamed: 0,link,text,Player,Years,College
15188,/cbb/players/aaron-gordo-castillo-1.html,Aaron (Gordo) Castillo (2011-2011) New Mexico ...,Aaron,Gordo Castillo 2011-2011,New Mexico State
