In [1]:
import requests
import pandas as pd
from pyquery import PyQuery as pq

In [2]:
url_template = "http://www.draftexpress.com/stats/ncaa/{year}/all/basic/"\
               "standard/0/all/all/{pg}"

In [3]:
def create_pq(url):
    """Creates PyQuery object used for scraping"""
    response = requests.get(url)
    html = response.text
    return pq(html)

def get_last_pg(url):
    """Get the last page number to be scraped"""
    pq_obj = create_pq(url)
    last_pg_selector = ".disabled+ li a"
    last_pg = pq_obj(last_pg_selector)
    return last_pg[0].text_content()

def get_links(pq_obj, url_selector):
    """Gets the links associated with the given css selector"""
    urls = pq_obj(url_selector)
    links = [url.get("href") for url in urls]
    return links

def get_data(pq_obj, row_selector):
    """Get table data"""
    rows = pq_obj(row_selector)
    data = [[td.text_content() for td in row] for row in rows]
    return data

def create_df(url, cols):
    """Scrapes data from url and returns a DataFrame with given columns"""
    pq_obj = create_pq(url)
    
    # Extract the links for the players and teams
    player_selector = "#cmn_wrap > div.row.two-cols > div >"\
                      "div.row.inner-page.stats > div > table > tbody > tr > "\
                      "td.text.key > a"
    player_links = get_links(pq_obj, player_selector)
    team_selector = ".key~ .text+ td a"
    team_links = get_links(pq_obj, team_selector)
    
    # get the table data
    row_selector = "#cmn_wrap > div.row.two-cols > div > "\
                   "div.row.inner-page.stats > div > table > tbody > tr"
    data = get_data(pq_obj, row_selector)

    df = pd.DataFrame(data=data, columns=cols)
    df["Player_Link"] = player_links
    df["Team_Link"] = team_links
    
    return df

In [4]:
# type out cols just cuz the dx col headers are a mess
cols = ["box", "Player", "Team_Logo", "Team", "G", "MP", "PTS", "FG_2P", 
        "FG_2PA",  "FG_2P_Pct", "FG_3P", "FGA_3P", "FG_3P_Pct", "FT", "FTA",
        "FT_Pct", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF"]

In [5]:
last_pgs = {yr: int(get_last_pg(url_template.format(year=yr, pg=1)))
            for yr in range(2003,2018)}

In [6]:
dfs = []
errors = []

In [7]:
for yr in range(2003, 2018):
    for pg in range(1, last_pgs.get(yr)):
        url = url_template.format(year=yr, pg=pg)
        try:
            df = create_df(url, cols)
            df["Season"] = yr
            dfs.append(df)
        except Exception as e:
            errors.append([url, e])

In [8]:
len(dfs)

2745

In [9]:
len(errors)

0

In [10]:
df = pd.concat(dfs, ignore_index=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68625 entries, 0 to 68624
Data columns (total 27 columns):
box            68625 non-null object
Player         68625 non-null object
Team_Logo      68625 non-null object
Team           68625 non-null object
G              68625 non-null object
MP             68625 non-null object
PTS            68625 non-null object
FG_2P          68625 non-null object
FG_2PA         68625 non-null object
FG_2P_Pct      68625 non-null object
FG_3P          68625 non-null object
FGA_3P         68625 non-null object
FG_3P_Pct      68625 non-null object
FT             68625 non-null object
FTA            68625 non-null object
FT_Pct         68625 non-null object
OREB           68625 non-null object
DREB           68625 non-null object
REB            68625 non-null object
AST            68625 non-null object
STL            68625 non-null object
BLK            68625 non-null object
TO             68625 non-null object
PF             68625 non-null object
Pla

In [12]:
df.head()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FG_2P,FG_2PA,FG_2P_Pct,...,DREB,REB,AST,STL,BLK,TO,PF,Player_Link,Team_Link,Season
0,,Ruben Douglas,\n \n ...,\n \n ...,28,35.1,28.0,4.4,11.1,39.9%,...,4.8,6.6,2.1,1.3,0.3,3.9,2.7,/profile/Ruben-Douglas-5377/,/stats/ncaa/2003/New%20Mexico,2003
1,,Henry Domercant,\n \n ...,\n \n ...,29,33.7,27.9,5.8,12.1,47.7%,...,4.8,6.9,2.8,1.4,0.5,2.4,2.1,/profile/Henry-Domercant-5218/,/stats/ncaa/2003/EIU,2003
2,,Mike Helms,\n \n ...,\n \n ...,28,34.5,26.9,6.0,12.1,49.4%,...,2.7,4.0,2.0,1.4,0.3,3.8,3.1,/profile/Mike-Helms-30942/,/stats/ncaa/2003/Oakland,2003
3,,Shawn Ray,\n \n ...,\n \n ...,1,38.0,26.0,7.0,11.0,63.6%,...,4.0,6.0,4.0,1.0,2.0,1.0,2.0,/profile/Shawn-Ray-32826/,/stats/ncaa/2003/North%20Carolina%20Central,2003
4,,Michael Watson,\n \n ...,\n \n ...,29,38.8,25.5,4.4,11.0,40.4%,...,2.9,3.7,3.8,1.4,0.2,3.7,2.4,/profile/Michael-Watson-32035/,/stats/ncaa/2003/UMKC,2003


In [13]:
df.tail()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FG_2P,FG_2PA,FG_2P_Pct,...,DREB,REB,AST,STL,BLK,TO,PF,Player_Link,Team_Link,Season
68620,,Nikola Marijan,\n \n ...,\n \n ...,3,3.3,0.0,0.0,0.3,0.0%,...,0.3,0.3,0.0,0.0,0.0,0.7,1.3,/profile/Nikola-Marijan-46608/,/stats/ncaa/2017/South%20Alabama,2017
68621,,Michael Klebon,\n \n ...,\n \n ...,7,2.1,0.0,0.0,0.3,0.0%,...,0.3,0.4,0.0,0.0,0.0,0.3,0.0,/profile/Michael-Klebon-96329/,/stats/ncaa/2017/Saint%20Francis%20%28PA%29,2017
68622,,Abel Porter,\n \n ...,\n \n ...,2,2.5,0.0,0.0,0.0,0.0%,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,/profile/Abel-Porter-103688/,/stats/ncaa/2017/Utah%20St,2017
68623,,Sam Burkart,\n \n ...,\n \n ...,1,1.0,0.0,0.0,0.0,0.0%,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,/profile/Sam-Burkart-106317/,/stats/ncaa/2017/Milwaukee,2017
68624,,Latio Cosmos,\n \n ...,\n \n ...,3,4.0,0.0,0.0,0.0,0.0%,...,0.3,0.3,0.7,0.0,0.0,0.7,1.0,/profile/Latio-Cosmos-106462/,/stats/ncaa/2017/Seattle%20University,2017


In [14]:
df["DX_Player_ID"] = df.Player_Link.str.extract("/.*/(.*)/", expand=False)
df["DX_College_ID"] = df.Team_Link.str.extract("/.*/.*/.*/(.*)", expand=False)

In [15]:
df.head()

Unnamed: 0,box,Player,Team_Logo,Team,G,MP,PTS,FG_2P,FG_2PA,FG_2P_Pct,...,AST,STL,BLK,TO,PF,Player_Link,Team_Link,Season,DX_Player_ID,DX_College_ID
0,,Ruben Douglas,\n \n ...,\n \n ...,28,35.1,28.0,4.4,11.1,39.9%,...,2.1,1.3,0.3,3.9,2.7,/profile/Ruben-Douglas-5377/,/stats/ncaa/2003/New%20Mexico,2003,Ruben-Douglas-5377,New%20Mexico
1,,Henry Domercant,\n \n ...,\n \n ...,29,33.7,27.9,5.8,12.1,47.7%,...,2.8,1.4,0.5,2.4,2.1,/profile/Henry-Domercant-5218/,/stats/ncaa/2003/EIU,2003,Henry-Domercant-5218,EIU
2,,Mike Helms,\n \n ...,\n \n ...,28,34.5,26.9,6.0,12.1,49.4%,...,2.0,1.4,0.3,3.8,3.1,/profile/Mike-Helms-30942/,/stats/ncaa/2003/Oakland,2003,Mike-Helms-30942,Oakland
3,,Shawn Ray,\n \n ...,\n \n ...,1,38.0,26.0,7.0,11.0,63.6%,...,4.0,1.0,2.0,1.0,2.0,/profile/Shawn-Ray-32826/,/stats/ncaa/2003/North%20Carolina%20Central,2003,Shawn-Ray-32826,North%20Carolina%20Central
4,,Michael Watson,\n \n ...,\n \n ...,29,38.8,25.5,4.4,11.0,40.4%,...,3.8,1.4,0.2,3.7,2.4,/profile/Michael-Watson-32035/,/stats/ncaa/2003/UMKC,2003,Michael-Watson-32035,UMKC


In [16]:
df.to_csv("raw_data/draft_express_player_basic_stats_per_game_07_10_17.csv", 
          index=False)