In [1]:
import requests
import pandas as pd
from pyquery import PyQuery as pq

In [2]:
url_template = "http://www.draftexpress.com/stats/ncaa/{year}/all/basic/pace/"\
               "0/all/all/{pg}"

In [3]:
def create_pq(url):
    """Creates PyQuery object used for scraping"""
    response = requests.get(url)
    html = response.text
    return pq(html)

def get_last_pg(url):
    """Get the last page number to be scraped"""
    pq_obj = create_pq(url)
    last_pg_selector = ".disabled+ li a"
    last_pg = pq_obj(last_pg_selector)
    return last_pg[0].text_content()

def get_links(pq_obj, url_selector):
    """Gets the links associated with the given css selector"""
    urls = pq_obj(url_selector)
    links = [url.get("href") for url in urls]
    return links

def get_data(pq_obj, row_selector):
    """Get table data"""
    rows = pq_obj(row_selector)
    data = [[td.text_content() for td in row] for row in rows]
    return data

def create_df(url, cols):
    """Scrapes data from url and returns a DataFrame with given columns"""
    pq_obj = create_pq(url)
    
    # Extract the links for the players and teams
    player_selector = "#cmn_wrap > div.row.two-cols > div >"\
                      "div.row.inner-page.stats > div > table > tbody > tr > "\
                      "td.text.key > a"
    player_links = get_links(pq_obj, player_selector)
    team_selector = ".key~ .text+ td a"
    team_links = get_links(pq_obj, team_selector)
    
    # get the table data
    row_selector = "#cmn_wrap > div.row.two-cols > div > "\
                   "div.row.inner-page.stats > div > table > tbody > tr"
    data = get_data(pq_obj, row_selector)

    df = pd.DataFrame(data=data, columns=cols)
    df["Player_Link"] = player_links
    df["Team_Link"] = team_links
    
    return df

In [4]:
# type out cols just cuz the dx col headers are a mess
cols = ["box", "Player", "Team_Logo", "Team", "G", "MP", "PTS", "FG_2P", 
        "FG_2PA",  "FG_2P_Pct", "FG_3P", "FGA_3P", "FG_3P_Pct", "FT", "FTA",
        "FT_Pct", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF"]

In [5]:
last_pgs = {yr: int(get_last_pg(url_template.format(year=yr, pg=1)))
            for yr in range(2003,2018)}

In [6]:
dfs = []
errors = []

In [7]:
for yr in range(2003, 2018):
    for pg in range(1, last_pgs.get(yr)):
        url = url_template.format(year=yr, pg=pg)
        try:
            df = create_df(url, cols)
            df["Season"] = yr
            dfs.append(df)
        except Exception as e:
            errors.append([url, e])

In [8]:
len(dfs)

2745

In [9]:
len(errors)

0

In [10]:
df = pd.concat(dfs, ignore_index=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68625 entries, 0 to 68624
Data columns (total 27 columns):
box            68625 non-null object
Player         68625 non-null object
Team_Logo      68625 non-null object
Team           68625 non-null object
G              68625 non-null object
MP             68625 non-null object
PTS            68625 non-null object
FG_2P          68625 non-null object
FG_2PA         68625 non-null object
FG_2P_Pct      68625 non-null object
FG_3P          68625 non-null object
FGA_3P         68625 non-null object
FG_3P_Pct      68625 non-null object
FT             68625 non-null object
FTA            68625 non-null object
FT_Pct         68625 non-null object
OREB           68625 non-null object
DREB           68625 non-null object
REB            68625 non-null object
AST            68625 non-null object
STL            68625 non-null object
BLK            68625 non-null object
TO             68625 non-null object
PF             68625 non-null object
Pla

In [12]:
df.head()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FG_2P,FG_2PA,FG_2P_Pct,...,DREB,REB,AST,STL,BLK,TO,PF,Player_Link,Team_Link,Season
0,,Ray Aguirre,\n \n ...,\n \n ...,1,1.0,73.0,36.5,36.5,100.0%,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Ray-Aguirre-32429/,/stats/ncaa/2003/TX%20A%26M%20Corpus%20Christi,2003
1,,Anwar Rohan,\n \n ...,\n \n ...,1,2.0,59.6,0.0,0.0,0.0%,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Anwar-Rohan-33434/,/stats/ncaa/2003/Delaware,2003
2,,Jeff Severinghaus,\n \n ...,\n \n ...,6,4.7,56.5,5.1,10.3,50.0%,...,10.3,15.4,5.1,0.0,5.1,0.0,2.6,/profile/Jeff-Severinghaus-31615/,/stats/ncaa/2003/Davidson,2003
3,,Daniel Sperry,\n \n ...,\n \n ...,3,1.0,54.4,27.2,27.2,100.0%,...,0.0,13.6,0.0,13.6,0.0,13.6,0.0,/profile/Daniel-Sperry-28373/,/stats/ncaa/2003/Furman,2003
4,,Daniel Daccarett,\n \n ...,\n \n ...,5,1.4,53.6,17.9,17.9,100.0%,...,6.0,11.9,0.0,0.0,0.0,11.9,17.9,/profile/Daniel-Daccarett-31828/,/stats/ncaa/2003/Boston%20U.,2003


In [13]:
df.tail()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FG_2P,FG_2PA,FG_2P_Pct,...,DREB,REB,AST,STL,BLK,TO,PF,Player_Link,Team_Link,Season
68620,,Isaiah Hobbs,\n \n ...,\n \n ...,2,2.0,0.0,0.0,0.0,0.0%,...,0.0,0.0,10.1,0.0,0.0,10.1,10.1,/profile/Isaiah-Hobbs-87586/,/stats/ncaa/2017/Texas,2017
68621,,Matt Kennedy,\n \n ...,\n \n ...,7,1.9,0.0,0.0,0.0,0.0%,...,0.0,3.3,0.0,0.0,0.0,0.0,0.0,/profile/Matt-Kennedy-103678/,/stats/ncaa/2017/Villanova,2017
68622,,Rendijs Feikners,\n \n ...,\n \n ...,4,1.3,0.0,0.0,0.0,0.0%,...,8.1,8.1,0.0,0.0,0.0,0.0,8.1,/profile/Rendijs-Feikners-62901/,/stats/ncaa/2017/Florida%20Gulf%20Coast,2017
68623,,Tyler Simmons,\n \n ...,\n \n ...,1,2.0,0.0,0.0,0.0,0.0%,...,0.0,0.0,19.3,0.0,0.0,0.0,0.0,/profile/Tyler-Simmons-87354/,/stats/ncaa/2017/Central%20Arkansas,2017
68624,,Nick Ayers,\n \n ...,\n \n ...,2,1.0,0.0,0.0,0.0,0.0%,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Nick-Ayers-105627/,/stats/ncaa/2017/Northern%20Kentucky,2017


In [44]:
df["DX_Player_ID"] = df.Player_Link.str.extract("/.*/(.*)/", expand=False)
df["DX_College_ID"] = df.Team_Link.str.extract("/.*/.*/.*/(.*)", expand=False)

In [45]:
df.head()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FG_2P,FG_2PA,FG_2P_Pct,...,AST,STL,BLK,TO,PF,Player_Link,Team_Link,Season,DX_Player_ID,DX_College_ID
0,,Ray Aguirre,\n \n ...,\n \n ...,1,1.0,73.0,36.5,36.5,100.0%,...,0.0,0.0,0.0,0.0,0.0,/profile/Ray-Aguirre-32429/,/stats/ncaa/2003/TX%20A%26M%20Corpus%20Christi,2003,Ray-Aguirre-32429,TX%20A%26M%20Corpus%20Christi
1,,Anwar Rohan,\n \n ...,\n \n ...,1,2.0,59.6,0.0,0.0,0.0%,...,0.0,0.0,0.0,0.0,0.0,/profile/Anwar-Rohan-33434/,/stats/ncaa/2003/Delaware,2003,Anwar-Rohan-33434,Delaware
2,,Jeff Severinghaus,\n \n ...,\n \n ...,6,4.7,56.5,5.1,10.3,50.0%,...,5.1,0.0,5.1,0.0,2.6,/profile/Jeff-Severinghaus-31615/,/stats/ncaa/2003/Davidson,2003,Jeff-Severinghaus-31615,Davidson
3,,Daniel Sperry,\n \n ...,\n \n ...,3,1.0,54.4,27.2,27.2,100.0%,...,0.0,13.6,0.0,13.6,0.0,/profile/Daniel-Sperry-28373/,/stats/ncaa/2003/Furman,2003,Daniel-Sperry-28373,Furman
4,,Daniel Daccarett,\n \n ...,\n \n ...,5,1.4,53.6,17.9,17.9,100.0%,...,0.0,0.0,0.0,11.9,17.9,/profile/Daniel-Daccarett-31828/,/stats/ncaa/2003/Boston%20U.,2003,Daniel-Daccarett-31828,Boston%20U.


In [46]:
df.to_csv("raw_data/draft_express_player_basic_stats_pace_adj_07_10_17.csv", 
          index=False)