In [1]:
import requests
import pandas as pd
from pyquery import PyQuery as pq

In [2]:
url_template = "http://www.draftexpress.com/stats/ncaa/{year}/all/efficiency/"\
               "pace/0/all/all/{pg}"

In [3]:
def create_pq(url):
    """Creates PyQuery object used for scraping"""
    response = requests.get(url)
    html = response.text
    return pq(html)

def get_last_pg(url):
    """Get the last page number to be scraped"""
    pq_obj = create_pq(url)
    last_pg_selector = ".disabled+ li a"
    last_pg = pq_obj(last_pg_selector)
    return last_pg[0].text_content()

def get_links(pq_obj, url_selector):
    """Gets the links associated with the given css selector"""
    urls = pq_obj(url_selector)
    links = [url.get("href") for url in urls]
    return links

def get_data(pq_obj, row_selector):
    """Get table data"""
    rows = pq_obj(row_selector)
    data = [[td.text_content() for td in row] for row in rows]
    return data

def create_df(url, cols):
    """Scrapes data from url and returns a DataFrame with given columns"""
    pq_obj = create_pq(url)
    
    # Extract the links for the players and teams
    player_selector = "#cmn_wrap > div.row.two-cols > div >"\
                      "div.row.inner-page.stats > div > table > tbody > tr > "\
                      "td.text.key > a"
    player_links = get_links(pq_obj, player_selector)
    team_selector = ".key~ .text+ td a"
    team_links = get_links(pq_obj, team_selector)
    
    # get the table data
    row_selector = "#cmn_wrap > div.row.two-cols > div > "\
                   "div.row.inner-page.stats > div > table > tbody > tr"
    data = get_data(pq_obj, row_selector)

    df = pd.DataFrame(data=data, columns=cols)
    df["Player_Link"] = player_links
    df["Team_Link"] = team_links
    
    return df

In [4]:
# type out cols just cuz the dx col headers are a mess
cols = ["box", "Player", "Team_Logo", "Team", "G", "MP", "PTS", "FGA", 
        "PTS_per_Play", "TS_Pct", "eFG_Pct", "FT_Rate", "Three_Pt_Rate",
         "AST", "AST_FGA_Ratio", "AST_TO_Ratio", "PPR", "STL", "BLK", "PF"]

In [5]:
len(cols)

20

In [6]:
last_pgs = {yr: int(get_last_pg(url_template.format(year=yr, pg=1)))
            for yr in range(2003,2018)}

In [7]:
last_pgs

{2003: 176,
 2004: 178,
 2005: 178,
 2006: 179,
 2007: 182,
 2008: 184,
 2009: 185,
 2010: 189,
 2011: 183,
 2012: 184,
 2013: 185,
 2014: 189,
 2015: 190,
 2016: 188,
 2017: 190}

In [8]:
dfs = []
errors = []

In [9]:
for yr in range(2003, 2018):
    for pg in range(1, last_pgs.get(yr)):
        url = url_template.format(year=yr, pg=pg)
        try:
            df = create_df(url, cols)
            df["Season"] = yr
            dfs.append(df)
        except Exception as e:
            errors.append([url, e])

In [10]:
len(dfs)

2745

In [11]:
len(errors)

0

In [12]:
df = pd.concat(dfs, ignore_index=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68625 entries, 0 to 68624
Data columns (total 23 columns):
box              68625 non-null object
Player           68625 non-null object
Team_Logo        68625 non-null object
Team             68625 non-null object
G                68625 non-null object
MP               68625 non-null object
PTS              68625 non-null object
FGA              68625 non-null object
PTS_per_Play     68625 non-null object
TS_Pct           68625 non-null object
eFG_Pct          68625 non-null object
FT_Rate          68625 non-null object
Three_Pt_Rate    68625 non-null object
AST              68625 non-null object
AST_FGA_Ratio    68625 non-null object
AST_TO_Ratio     68625 non-null object
PPR              68625 non-null object
STL              68625 non-null object
BLK              68625 non-null object
PF               68625 non-null object
Player_Link      68625 non-null object
Team_Link        68625 non-null object
Season           68625 non-null i

In [14]:
df.head()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FGA,PTS_per_Play,TS_Pct,...,AST,AST_FGA_Ratio,AST_TO_Ratio,PPR,STL,BLK,PF,Player_Link,Team_Link,Season
0,,Terrence Hill,\n \n ...,\n \n ...,1,40.0,6.0,12.0,0.36,0.23,...,1.0,0.08,0.33,-7.13,3.0,1.0,3.0,/profile/Terrence-Hill-18998/,/stats/ncaa/2003/Kennesaw%20St,2003
1,,Tommy Thompson,\n \n ...,\n \n ...,1,40.0,15.0,13.0,0.91,0.52,...,3.0,0.23,1.5,0.0,3.0,0.0,2.0,/profile/Tommy-Thompson-33117/,/stats/ncaa/2003/Kennesaw%20St,2003
2,,Luis Flores,\n \n ...,\n \n ...,30,38.9,24.6,16.9,1.01,0.6,...,2.9,0.17,0.89,-3.38,1.9,0.4,2.3,/profile/Luis-Flores-2430/,/stats/ncaa/2003/Manhattan,2003
3,,Rick Apodaca,\n \n ...,\n \n ...,15,38.8,18.1,15.6,0.8,0.5,...,4.27,0.27,1.05,-3.07,1.0,0.5,1.7,/profile/Rick-Apodaca-5607/,/stats/ncaa/2003/Hofstra,2003
4,,Michael Watson,\n \n ...,\n \n ...,29,38.8,25.5,22.6,0.87,0.51,...,3.76,0.17,1.03,-3.16,1.4,0.2,2.4,/profile/Michael-Watson-32035/,/stats/ncaa/2003/UMKC,2003


In [15]:
df.tail()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FGA,PTS_per_Play,TS_Pct,...,AST,AST_FGA_Ratio,AST_TO_Ratio,PPR,STL,BLK,PF,Player_Link,Team_Link,Season
68620,,Harrison Brown,\n \n ...,\n \n ...,1,1.0,2.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,/profile/Harrison-Brown-96214/,/stats/ncaa/2017/Rice,2017
68621,,Bernard Cherestal,\n \n ...,\n \n ...,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,/profile/Bernard-Cherestal-104596/,/stats/ncaa/2017/Southern%20Illinois,2017
68622,,Will Hollmann,\n \n ...,\n \n ...,3,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Will-Hollmann-98705/,/stats/ncaa/2017/Stetson,2017
68623,,Joe Schwartz,\n \n ...,\n \n ...,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Joe-Schwartz-88362/,/stats/ncaa/2017/Texas,2017
68624,,Kyle Nugent,\n \n ...,\n \n ...,2,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Kyle-Nugent-103960/,/stats/ncaa/2017/Texas%20A%26M,2017


In [17]:
df["DX_Player_ID"] = df.Player_Link.str.extract("/.*/(.*)/", expand=False)
df["DX_College_ID"] = df.Team_Link.str.extract("/.*/.*/.*/(.*)", expand=False)
df.head()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FGA,PTS_per_Play,TS_Pct,...,AST_TO_Ratio,PPR,STL,BLK,PF,Player_Link,Team_Link,Season,DX_Player_ID,DX_College_ID
0,,Terrence Hill,\n \n ...,\n \n ...,1,40.0,6.0,12.0,0.36,0.23,...,0.33,-7.13,3.0,1.0,3.0,/profile/Terrence-Hill-18998/,/stats/ncaa/2003/Kennesaw%20St,2003,Terrence-Hill-18998,Kennesaw%20St
1,,Tommy Thompson,\n \n ...,\n \n ...,1,40.0,15.0,13.0,0.91,0.52,...,1.5,0.0,3.0,0.0,2.0,/profile/Tommy-Thompson-33117/,/stats/ncaa/2003/Kennesaw%20St,2003,Tommy-Thompson-33117,Kennesaw%20St
2,,Luis Flores,\n \n ...,\n \n ...,30,38.9,24.6,16.9,1.01,0.6,...,0.89,-3.38,1.9,0.4,2.3,/profile/Luis-Flores-2430/,/stats/ncaa/2003/Manhattan,2003,Luis-Flores-2430,Manhattan
3,,Rick Apodaca,\n \n ...,\n \n ...,15,38.8,18.1,15.6,0.8,0.5,...,1.05,-3.07,1.0,0.5,1.7,/profile/Rick-Apodaca-5607/,/stats/ncaa/2003/Hofstra,2003,Rick-Apodaca-5607,Hofstra
4,,Michael Watson,\n \n ...,\n \n ...,29,38.8,25.5,22.6,0.87,0.51,...,1.03,-3.16,1.4,0.2,2.4,/profile/Michael-Watson-32035/,/stats/ncaa/2003/UMKC,2003,Michael-Watson-32035,UMKC


In [18]:
df.to_csv("raw_data/draft_express_player_efficiency_stats_07_10_17.csv", 
          index=False)