In [1]:
import requests
import pandas as pd
from pyquery import PyQuery as pq

In [2]:
# Listing URL for the pre-draft measurements table with every filter set to
# "all"; the trailing "{}" placeholder is filled in with the page number.
url_template = (
    "http://www.draftexpress.com/nba-pre-draft-measurements/all/"
    "all/all/all/{}"
)

In [3]:
def create_pq(url, timeout=30):
    """Download ``url`` and wrap its HTML in a PyQuery object for scraping.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without it a stalled server
        would block the whole scrape indefinitely. Defaults to 30 seconds.

    Returns
    -------
    PyQuery
        Query object built from the response body text.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with a
        4xx/5xx status (``HTTPError``).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # parser, where it would look like a page that simply has no rows.
    response.raise_for_status()
    return pq(response.text)

def get_links(pq_obj, url_selector):
    """Collect the ``href`` value of each element matched by a CSS selector.

    ``pq_obj`` is called with ``url_selector`` and every element it yields is
    asked for its ``"href"`` via ``.get``. The results are returned as a list
    in match order.
    """
    hrefs = []
    for anchor in pq_obj(url_selector):
        hrefs.append(anchor.get("href"))
    return hrefs

def get_data(pq_obj, row_selector):
    """Return the text of every cell in the rows matched by ``row_selector``.

    The result is a list with one entry per matched row. Each entry is the
    list of ``text_content()`` values of that row's children, in order. The
    text is returned exactly as found, with no stripping or conversion.
    """
    table = []
    for row in pq_obj(row_selector):
        table.append([cell.text_content() for cell in row])
    return table

def create_df(url, cols):
    """Scrape the measurements table at ``url`` into a DataFrame.

    The page is fetched with ``create_pq``. The table cells become the frame
    body, labelled with ``cols``, and the ``href`` of each player's anchor is
    added as an extra ``Player_Link`` column.

    Parameters
    ----------
    url : str
        Page to scrape.
    cols : list of str
        Column labels for the scraped cells, one per table column.

    Returns
    -------
    pandas.DataFrame
        One row per table row, with columns ``cols`` plus ``Player_Link``.
    """
    page = create_pq(url)

    # Anchor inside the "key" cell of each table row: the player's profile link.
    player_selector = (
        "#cmn_wrap > div.row.two-cols > div >"
        "div.row.inner-page.stats > div > table > tbody > tr > "
        "td.text.key > a"
    )
    player_links = get_links(page, player_selector)

    # The table rows themselves; each row's cells become one record.
    row_selector = (
        "#cmn_wrap > div.row.two-cols > div > "
        "div.row.inner-page.stats > div > table > tbody > tr"
    )
    records = get_data(page, row_selector)

    measurements = pd.DataFrame(data=records, columns=cols)
    measurements["Player_Link"] = player_links
    return measurements

In [4]:
# Column names are typed out by hand because the headers on the DraftExpress
# page are too messy to reuse directly.
# NOTE(review): "Hang_Width" looks like a typo for "Hand_Width"; it is kept
# unchanged here because it ends up in the header of the exported CSV.
cols = [
    # identity
    "Player", "Year", "Draft_Pick",
    # height / length
    "Ht_No_Shoes", "Ht_Shoes", "Wingspan", "Standing_reach",
    # vertical jump
    "Max_Vert", "Max_Vert_Reach", "No_Step_Vert", "No_Step_Vert_Reach",
    # body
    "Weight", "Body_Fat", "Hand_Length", "Hang_Width",
    # drills
    "Bench", "Agility", "Sprint",
]

In [5]:
# Accumulators for the scrape: per-page DataFrames, and [url, exception]
# pairs for pages that failed.
dfs, errors = [], []

In [6]:
# The listing is scraped from page 1 through N_PAGES inclusive.
N_PAGES = 42

# Reset the accumulators here so that re-running this cell cannot append the
# same pages a second time and duplicate rows in the combined frame.
dfs = []
errors = []

for pg in range(1, N_PAGES + 1):
    url = url_template.format(pg)
    try:
        dfs.append(create_df(url, cols))
    except Exception as e:
        # Best-effort scrape: remember which page failed and why, then keep
        # going so one bad page does not abort the whole run.
        errors.append([url, e])

In [7]:
# One DataFrame is appended per page that scraped without raising, so a count
# of 42 means every page in range(1, 43) succeeded.
len(dfs)

42

In [8]:
# Number of pages whose scrape raised; each entry is a [url, exception] pair.
len(errors)

0

In [9]:
# Stack the per-page frames into one table; ignore_index=True discards the
# per-page row labels and renumbers the rows from 0.
df = pd.concat(dfs, ignore_index=True)

In [10]:
# Column summary. In the recorded output every column is dtype object: the
# scraped cell text has not been converted to numeric types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3378 entries, 0 to 3377
Data columns (total 19 columns):
Player                3378 non-null object
Year                  3378 non-null object
Draft_Pick            3378 non-null object
Ht_No_Shoes           3378 non-null object
Ht_Shoes              3378 non-null object
Wingspan              3378 non-null object
Standing_reach        3378 non-null object
Max_Vert              3378 non-null object
Max_Vert_Reach        3378 non-null object
No_Step_Vert          3378 non-null object
No_Step_Vert_Reach    3378 non-null object
Weight                3378 non-null object
Body_Fat              3378 non-null object
Hand_Length           3378 non-null object
Hang_Width            3378 non-null object
Bench                 3378 non-null object
Agility               3378 non-null object
Sprint                3378 non-null object
Player_Link           3378 non-null object
dtypes: object(19)
memory usage: 501.5+ KB


In [11]:
# Peek at the first rows. In the recorded output, Player still carries the
# newlines and tabs that surround the name in the page markup.
df.head()

Unnamed: 0,Player,Year,Draft_Pick,Ht_No_Shoes,Ht_Shoes,Wingspan,Standing_reach,Max_Vert,Max_Vert_Reach,No_Step_Vert,No_Step_Vert_Reach,Weight,Body_Fat,Hand_Length,Hang_Width,Bench,Agility,Sprint,Player_Link
0,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tShawn Bradley\n\...,1993,2,"7'5 ½""",-,"7'5""",-,-,-,-,-,248,-,-,-,-,-,-,/profile/Shawn-Bradley-1773/
1,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tMichael Fusek\n\...,2016,-,"7'3 ¾""","7'4 ¾""","7'5""","9'8""","31""","12'3""","25""","11'9""",222,-,-,-,-,12.94,3.54,/profile/Michael-Fusek-69117/
2,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tPavel Podkolzine...,2003,21,"7'3 ½""","7'5""","7'5 ¾""","9'8""","22.5""","11'6 ½""","19.5""","11'3 ½""",303,16.3,-,-,5,13.4,3.8,/profile/Pavel-Podkolzine-3963/
3,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tSamuel Deguara\n...,2011,-,"7'3 ¼""","7'4 ½""","7'5 ¾""","9'3 ¼""","24.2""","11'3 ½""","17.6""","10'9""",300,7.5,-,-,-,-,4.21,/profile/Samuel-Deguara-1400/
4,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tZydrunas Ilgausk...,1996,20,"7'3""",-,-,-,-,-,-,-,258,-,-,-,-,-,-,/profile/Zydrunas-Ilgauskas-2919/


In [12]:
# Peek at the last rows of the concatenated frame as well.
df.tail()

Unnamed: 0,Player,Year,Draft_Pick,Ht_No_Shoes,Ht_Shoes,Wingspan,Standing_reach,Max_Vert,Max_Vert_Reach,No_Step_Vert,No_Step_Vert_Reach,Weight,Body_Fat,Hand_Length,Hang_Width,Bench,Agility,Sprint,Player_Link
3373,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tDavid Holston\n\...,2009,-,"5'6""","5'7""","5'10 ¼""",-,-,-,-,-,150,-,-,-,-,-,-,/profile/David-Holston-23978/
3374,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tChris Lykes\n\t\...,2015,-,"5'6""",-,"5'9""",-,-,-,-,-,160,-,-,-,-,-,-,/profile/Chris-Lykes-84485/
3375,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tSaah Nimley\n\t\...,2015,-,"5'5 ¾""",-,"5'9""","7'3 ½""",-,-,"30.5""","9'10""",155,-,-,-,-,11.86,3.2,/profile/Saah-Nimley-40932/
3376,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tAquille Carr\n\t...,2013,-,"5'4 ½""","5'5 ½""","5'11 ½""","7'3""","36.5""","10'3 ½""","33""","10'0""",144,-,-,-,-,12.45,3.19,/profile/Aquille-Carr-72546/
3377,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tChase Adams\n\t\...,2014,-,"5'2""","5'3""","5'5 ½""","6'11""",-,-,-,-,111,-,-,-,-,-,-,/profile/Chase-Adams-85708/


In [13]:
# Use the text between the last two slashes of each profile link as the
# DraftExpress player id, e.g. "/profile/Shawn-Bradley-1773/" gives
# "Shawn-Bradley-1773".
df["DX_Player_ID"] = (
    df["Player_Link"].str.extract("/.*/(.*)/", expand=False)
)
df.head()

Unnamed: 0,Player,Year,Draft_Pick,Ht_No_Shoes,Ht_Shoes,Wingspan,Standing_reach,Max_Vert,Max_Vert_Reach,No_Step_Vert,No_Step_Vert_Reach,Weight,Body_Fat,Hand_Length,Hang_Width,Bench,Agility,Sprint,Player_Link,DX_Player_ID
0,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tShawn Bradley\n\...,1993,2,"7'5 ½""",-,"7'5""",-,-,-,-,-,248,-,-,-,-,-,-,/profile/Shawn-Bradley-1773/,Shawn-Bradley-1773
1,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tMichael Fusek\n\...,2016,-,"7'3 ¾""","7'4 ¾""","7'5""","9'8""","31""","12'3""","25""","11'9""",222,-,-,-,-,12.94,3.54,/profile/Michael-Fusek-69117/,Michael-Fusek-69117
2,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tPavel Podkolzine...,2003,21,"7'3 ½""","7'5""","7'5 ¾""","9'8""","22.5""","11'6 ½""","19.5""","11'3 ½""",303,16.3,-,-,5,13.4,3.8,/profile/Pavel-Podkolzine-3963/,Pavel-Podkolzine-3963
3,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tSamuel Deguara\n...,2011,-,"7'3 ¼""","7'4 ½""","7'5 ¾""","9'3 ¼""","24.2""","11'3 ½""","17.6""","10'9""",300,7.5,-,-,-,-,4.21,/profile/Samuel-Deguara-1400/,Samuel-Deguara-1400
4,\n\t\t\t\t\t\t\n\t\t\t\t\t\t\tZydrunas Ilgausk...,1996,20,"7'3""",-,-,-,-,-,-,-,258,-,-,-,-,-,-,/profile/Zydrunas-Ilgauskas-2919/,Zydrunas-Ilgauskas-2919


In [14]:
# Save the scraped table as-is; index=False leaves the row index out of the
# file.
output_path = "raw_data/draft_express_player_measurements_07_10_17.csv"
df.to_csv(output_path, index=False)