In [1]:
import requests
from pyquery import PyQuery as pq
import pandas as pd

In [2]:
url_template = "http://www.sports-reference.com/cbb/play-index/psl_finder.cgi"\
               "?request=1&match=single&year_min=&year_max=&conf_id=&school"\
               "_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y"\
               "&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&"\
               "pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=&c1stat=&c1comp=&c1"\
               "val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4"\
               "comp=&c4val=&order_by=pts&order_by_asc=&offset={}"

In [3]:
def get_cbb_link_and_data(idx, td):
    """Helps get the player link as you scrape the data""" 
    
    if idx == 1:
        return td.text_content(), td.getchildren()[0].attrib["href"]
    else:
        return td.text_content()

def create_pq(url):
    """Creates PyQuery object used for scraping"""
    
    response = requests.get(url)
    html = response.text.replace('<!--', '').replace('-->', '')
    return pq(html)

def get_table(url, table_id_selector):
    """Scrapes the data and returns it as a DataFrame"""
    
    pq_obj = create_pq(url)
    
    rows = pq_obj("{} > tbody > tr".format(table_id_selector))
    headers = pq_obj("{} > thead > tr > th".format(table_id_selector))
    
    # get data from each row and coll
    data = [[get_cbb_link_and_data(idx, td) 
             for idx, td in enumerate(row.iterchildren())] 
            for row in rows if row.attrib == ""]
    cols = [th.text_content() for th in headers[2:]]
    df = pd.DataFrame(data=data, columns=cols)
    
    return df

def scrape_data(offset):
    url = url_template.format(offset)
    try:
        return get_table(url, "#stats")
    except Exception as e:
        return [url, e]

In [4]:
dfs = [scrape_data(offset) for offset in range(0, 108601, 100)]

In [5]:
len(dfs)

1087

In [6]:
errors = [error for error in dfs if isinstance(error, list)]

In [7]:
len(errors)

1

In [8]:
dfs[0].head()

Unnamed: 0,Rk,Player,Class,Season,Pos,School,Conf,G,MP,FG,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,"(Jimmer Fredette, /cbb/players/jimmer-fredette...",SR,2010-11,G,Brigham Young,MWC,37,1323.0,346,...,282,24.0,103.0,127,160,49,1,131.0,49.0,1068
1,2,"(Glenn Robinson, /cbb/players/glenn-robinson-1...",JR,1993-94,F,Purdue,Big Ten,34,1166.0,368,...,270,,,344,66,56,31,139.0,88.0,1030
2,3,"(Stephen Curry, /cbb/players/stephen-curry-1.h...",JR,2008-09,G,Davidson,Southern,34,,312,...,251,,,151,189,86,8,126.0,,974
3,4,"(Kemba Walker, /cbb/players/kemba-walker-1.html)",JR,2010-11,G,Connecticut,Big East,41,1543.0,316,...,315,53.0,170.0,223,184,77,7,93.0,56.0,965
4,5,"(J.J. Redick, /cbb/players/jj-redick-1.html)",SR,2005-06,G,Duke,ACC,36,1336.0,302,...,256,,,71,95,52,2,,,964


In [9]:
actual_dfs = [df for df in dfs if isinstance(df, pd.DataFrame)]

In [10]:
len(actual_dfs)

1086

In [11]:
df = pd.concat(actual_dfs, ignore_index=True)

In [12]:
df.head()

Unnamed: 0,Rk,Player,Class,Season,Pos,School,Conf,G,MP,FG,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,"(Jimmer Fredette, /cbb/players/jimmer-fredette...",SR,2010-11,G,Brigham Young,MWC,37,1323.0,346,...,282,24.0,103.0,127,160,49,1,131.0,49.0,1068
1,2,"(Glenn Robinson, /cbb/players/glenn-robinson-1...",JR,1993-94,F,Purdue,Big Ten,34,1166.0,368,...,270,,,344,66,56,31,139.0,88.0,1030
2,3,"(Stephen Curry, /cbb/players/stephen-curry-1.h...",JR,2008-09,G,Davidson,Southern,34,,312,...,251,,,151,189,86,8,126.0,,974
3,4,"(Kemba Walker, /cbb/players/kemba-walker-1.html)",JR,2010-11,G,Connecticut,Big East,41,1543.0,316,...,315,53.0,170.0,223,184,77,7,93.0,56.0,965
4,5,"(J.J. Redick, /cbb/players/jj-redick-1.html)",SR,2005-06,G,Duke,ACC,36,1336.0,302,...,256,,,71,95,52,2,,,964


In [13]:
df.tail()

Unnamed: 0,Rk,Player,Class,Season,Pos,School,Conf,G,MP,FG,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
108525,108626,"(Milos Zivanovic, /cbb/players/milos-zivanovic...",SR,2005-06,C,Hawaii,WAC,1,,0.0,...,0.0,,,1,0,0,0,,,0
108526,108627,"(Paul Zorad, /cbb/players/paul-zorad-1.html)",SO,1995-96,F,Southern Methodist,SWC,3,,,...,,,,2,1,0,0,,,0
108527,108628,"(Paul Zorad, /cbb/players/paul-zorad-1.html)",JR,1996-97,F,Southern Methodist,WAC,3,,0.0,...,0.0,,,0,0,0,0,,,0
108528,108629,"(Stephen Zurich, /cbb/players/stephen-zurich-1...",JR,2014-15,F,Rutgers,Big Ten,8,29.0,0.0,...,2.0,0.0,2.0,2,1,0,1,2.0,4.0,0
108529,108630,"(Brian Zvonceck, /cbb/players/brian-zvonceck-1...",SR,1992-93,C,Baylor,SWC,3,,0.0,...,,,,1,0,0,0,,,0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108530 entries, 0 to 108529
Data columns (total 26 columns):
Rk        108530 non-null object
Player    108530 non-null object
Class     108530 non-null object
Season    108530 non-null object
Pos       108530 non-null object
School    108530 non-null object
Conf      108530 non-null object
G         108530 non-null object
MP        108530 non-null object
FG        108530 non-null object
FGA       108530 non-null object
2P        108530 non-null object
2PA       108530 non-null object
3P        108530 non-null object
3PA       108530 non-null object
FT        108530 non-null object
FTA       108530 non-null object
ORB       108530 non-null object
DRB       108530 non-null object
TRB       108530 non-null object
AST       108530 non-null object
STL       108530 non-null object
BLK       108530 non-null object
TOV       108530 non-null object
PF        108530 non-null object
PTS       108530 non-null object
dtypes: object(26)
memory usage

In [15]:
df.to_csv("raw_data/cbb_player_season_trad_stats_07_08_17_raw.csv",
          index=False)

In [16]:
errors

[['http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?request=1&match=single&year_min=&year_max=&conf_id=&school_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=pts&order_by_asc=&offset=105800',
  IndexError('list index out of range')]]

In [18]:
errors[0][0]

'http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?request=1&match=single&year_min=&year_max=&conf_id=&school_id=&class_is_fr=Y&class_is_so=Y&class_is_jr=Y&class_is_sr=Y&pos_is_g=Y&pos_is_gf=Y&pos_is_fg=Y&pos_is_f=Y&pos_is_fc=Y&pos_is_cf=Y&pos_is_c=Y&games_type=A&qual=&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=pts&order_by_asc=&offset=105800'