In [1]:
import os

import pandas as pd
import requests
from pyquery import PyQuery as pq

In [2]:
# Query URL for the sports-reference.com player-season finder, written with
# one query parameter per line. The trailing "offset={}" placeholder is filled
# in with str.format to select the page of results.
url_template = (
    "http://www.sports-reference.com/cbb/play-index/psl_finder.cgi?"
    + "&".join((
        "request=1",
        "match=single",
        "year_min=1996",
        "year_max=",
        "conf_id=",
        "school_id=",
        "class_is_fr=Y",
        "class_is_so=Y",
        "class_is_jr=Y",
        "class_is_sr=Y",
        "pos_is_g=Y",
        "pos_is_gf=Y",
        "pos_is_fg=Y",
        "pos_is_f=Y",
        "pos_is_fc=Y",
        "pos_is_cf=Y",
        "pos_is_c=Y",
        "games_type=A",
        "qual=",
        "c1stat=",
        "c1comp=",
        "c1val=",
        "c2stat=",
        "c2comp=",
        "c2val=",
        "c3stat=",
        "c3comp=",
        "c3val=",
        "c4stat=",
        "c4comp=",
        "c4val=",
        "order_by=ws",
        "order_by_asc=",
        "offset={}",
    ))
)

In [3]:
def get_cbb_link_and_data(idx, td):
    """Return a table cell's text, plus its link target for the cell at index 1.

    idx is the cell's position within its row and td is the cell element.
    For idx == 1 the result is a (text, href) tuple, where href is read from
    the cell's first child element. For every other position the result is
    just the cell's text.
    """
    cell_text = td.text_content()
    if idx != 1:
        return cell_text

    # The cell at index 1 is expected to wrap its text in a link element.
    first_child = td.getchildren()[0]
    return cell_text, first_child.attrib["href"]

def create_pq(url, timeout=30):
    """Creates PyQuery object used for scraping

    Fetches url and parses the response body with PyQuery.

    timeout is the number of seconds to wait for the server, passed straight
    to requests.get. Without it a stalled connection would block forever.

    Raises a requests.exceptions.RequestException subclass when the request
    fails, times out, or the server answers with a 4xx/5xx status.
    """

    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of parsing an error page as if it
    # were a page of results.
    response.raise_for_status()
    # Drop the HTML comment delimiters so that any markup wrapped in comments
    # is parsed as regular elements.
    html = response.text.replace('<!--', '').replace('-->', '')
    return pq(html)

def get_table(url, table_id_selector):
    """Scrapes the data and returns it as a DataFrame

    url is the page to fetch and table_id_selector is a CSS selector for the
    table (for example "#stats"). Each kept body row becomes one DataFrame
    row; each cell is converted by get_cbb_link_and_data, so the cell at
    index 1 holds a (text, href) tuple and the others hold plain text.
    """

    pq_obj = create_pq(url)

    rows = pq_obj("{} > tbody > tr".format(table_id_selector))
    headers = pq_obj("{} > thead > tr > th".format(table_id_selector))

    # Get data from each row and column. Only rows carrying no attributes are
    # kept; the length check states that directly rather than comparing the
    # attribute mapping to an empty string.
    # NOTE(review): presumably the attribute-bearing rows are header rows
    # repeated inside the body -- confirm against the page markup.
    data = [[get_cbb_link_and_data(idx, td)
             for idx, td in enumerate(row.iterchildren())]
            for row in rows if len(row.attrib) == 0]
    # NOTE(review): the first two matched header cells are skipped; presumably
    # they belong to a grouping row above the column names -- confirm.
    cols = [th.text_content() for th in headers[2:]]
    df = pd.DataFrame(data=data, columns=cols)

    return df

def scrape_data(offset):
    """Scrape the results page starting at offset.

    Returns the page's "#stats" table as a DataFrame. If anything goes wrong,
    returns a two-item list [url, exception] instead of raising, so a batch
    of pages can be scraped in one pass and the failures inspected afterwards.
    """
    page_url = url_template.format(offset)
    try:
        table = get_table(page_url, "#stats")
    except Exception as err:
        # Deliberately broad: record the failing URL and keep going.
        return [page_url, err]
    return table

In [4]:
# One request per results page: offsets 0, 100, ..., 96700 (968 pages).
# Each entry is either a DataFrame or a [url, exception] failure record.
dfs = list(map(scrape_data, range(0, 96701, 100)))

In [5]:
# One entry per requested offset, whether the scrape succeeded or failed.
len(dfs)

968

In [6]:
# scrape_data returns a [url, exception] list for a failed page; collect those.
errors = [result for result in dfs if isinstance(result, list)]

In [7]:
# Number of pages whose scrape raised an exception.
len(errors)

0

In [8]:
# Keep only the pages that came back as DataFrames.
actual_dfs = [result for result in dfs if isinstance(result, pd.DataFrame)]

In [9]:
# Number of pages that were scraped into a DataFrame.
len(actual_dfs)

968

In [10]:
# Combine only the successfully scraped pages. Concatenating `dfs` directly
# would raise as soon as one entry is a [url, exception] failure record.
df = pd.concat(actual_dfs, ignore_index=True)

In [11]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Rk,Player,Class,Season,Pos,School,Conf,G,MP,PER,...,USG%,PProd,ORtg,DRtg,OWS,DWS,WS,OBPM,DBPM,BPM
0,1,"(Kevin Love, /cbb/players/kevin-love-1.html)",FR,2007-08,C,UCLA,Pac-10,39,1156,,...,,,,,5.6,5.7,11.3,,,
1,2,"(Andrew Bogut, /cbb/players/andrew-bogut-1.html)",SO,2004-05,F,Utah,MWC,35,1224,,...,,,,,6.4,4.5,10.9,,,
2,3,"(Paul Millsap, /cbb/players/paul-millsap-1.html)",JR,2005-06,F,Louisiana Tech,WAC,33,1126,,...,,,,,4.1,6.7,10.8,,,
3,4,"(Michael Beasley, /cbb/players/michael-beasley...",FR,2007-08,F,Kansas State,Big 12,33,1041,,...,,,,,5.4,5.4,10.7,,,
4,5,"(Tim Duncan, /cbb/players/tim-duncan-1.html)",SR,1996-97,C,Wake Forest,ACC,31,1137,,...,,,,,5.5,4.9,10.4,,,


In [12]:
# Preview the last rows of the combined table.
df.tail()

Unnamed: 0,Rk,Player,Class,Season,Pos,School,Conf,G,MP,PER,...,USG%,PProd,ORtg,DRtg,OWS,DWS,WS,OBPM,DBPM,BPM
96784,96785,"(Will Hornsby, /cbb/players/will-hornsby-1.html)",FR,2012-13,G,Grambling,SWAC,22,440.0,1.4,...,20.6,93.0,60.9,122.0,-1.3,-0.3,-1.6,-9.9,-6.1,-15.9
96785,96786,"(Jermaine Holliway, /cbb/players/jermaine-holl...",SR,1999-00,F,Howard,MEAC,28,,,...,,,,,-1.3,-0.3,-1.7,,,
96786,96787,"(Darwayne Smith, /cbb/players/darwayne-smith-1...",FR,2005-06,F,Savannah State,Ind,25,,,...,,,,,-0.8,-1.0,-1.8,,,
96787,96788,"(Antoine Bronner, /cbb/players/antoine-bronner...",FR,2005-06,F,Savannah State,Ind,30,,,...,,,,,-1.2,-0.8,-2.0,,,
96788,96789,"(Kris Walden, /cbb/players/kris-walden-1.html)",FR,2011-12,G,Towson,CAA,32,1098.0,2.3,...,19.2,229.0,66.6,114.3,-2.4,0.0,-2.4,-6.2,-3.3,-9.5


In [14]:
# Summarize the column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96789 entries, 0 to 96788
Data columns (total 29 columns):
Rk        96789 non-null object
Player    96789 non-null object
Class     96789 non-null object
Season    96789 non-null object
Pos       96789 non-null object
School    96789 non-null object
Conf      96789 non-null object
G         96789 non-null object
MP        96789 non-null object
PER       96789 non-null object
TS%       96789 non-null object
eFG%      96789 non-null object
ORB%      96789 non-null object
DRB%      96789 non-null object
TRB%      96789 non-null object
AST%      96789 non-null object
STL%      96789 non-null object
BLK%      96789 non-null object
TOV%      96789 non-null object
USG%      96789 non-null object
PProd     96789 non-null object
ORtg      96789 non-null object
DRtg      96789 non-null object
OWS       96789 non-null object
DWS       96789 non-null object
WS        96789 non-null object
OBPM      96789 non-null object
DBPM      96789 non-null object

In [16]:
# Save the raw scrape. Create the output directory first so the write does
# not fail on a fresh checkout where raw_data/ does not exist yet.
os.makedirs("raw_data", exist_ok=True)
df.to_csv("raw_data/cbb_player_season_adv_stats_07_08_17_raw.csv",
          index=False)