In [1]:
import requests
import pandas as pd
from pyquery import PyQuery as pq

In [2]:
url_template = "http://www.draftexpress.com/RSCI/{}/"

In [3]:
def create_pq(url):
    """Creates PyQuery object used for scraping"""
    response = requests.get(url)
    html = response.text
    return pq(html)

def get_links(pq_obj, url_selector):
    """Gets the links associated with the given css selector"""
    urls = pq_obj(url_selector)
    links = [url.get("href") for url in urls]
    return links

def get_data(pq_obj, row_selector):
    """Get table data"""
    rows = pq_obj(row_selector)
    # only select a subset of the coulmns, don't need each individual
    # scouting rank and the NCAA team column is fucked
    data = [[td.text_content() for td in row[:2]] +
            [td.text_content() for td in row[-3:]]
             for row in rows]
    return data

def create_df(url, cols):
    """Scrapes data from url and returns a DataFrame with given columns"""
    pq_obj = create_pq(url)
    
    # Extract the links for the players and teams
    player_selector = ".key:nth-child(2) a"
    player_links = get_links(pq_obj, player_selector)
    
    # get the table data
    row_selector = "tr"
    data = get_data(pq_obj, row_selector)

    # note data[1:], because first row contains table headers
    df = pd.DataFrame(data=data[1:], columns=cols)
    df["Player_Link"] = player_links
    
    return df

In [4]:
cols = ["RSCI_Rank", "Player", "Draft_Pick", "Draft_Year", "NBA_EWA"]

In [5]:
dfs = []
errors = []

In [6]:
for year in range(1998, 2017):
    url = url_template.format(year)
    try:
        df = create_df(url, cols)
        df["Rank_Year"] = year
        dfs.append(df)
    except Exception as e:
        errors.append([url, e])

In [7]:
len(dfs)

19

In [8]:
len(errors)

0

In [9]:
df = pd.concat(dfs, ignore_index=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3556 entries, 0 to 3555
Data columns (total 7 columns):
RSCI_Rank      3556 non-null object
Player         3556 non-null object
Draft_Pick     3556 non-null object
Draft_Year     3556 non-null object
NBA_EWA        3556 non-null object
Player_Link    3556 non-null object
Rank_Year      3556 non-null int64
dtypes: int64(1), object(6)
memory usage: 194.5+ KB


In [11]:
df.head()

Unnamed: 0,RSCI_Rank,Player,Draft_Pick,Draft_Year,NBA_EWA,Player_Link,Rank_Year
0,1,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tAl Harrington\...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t25\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t1998\n\t\t\t\t\t\t\...,2.65,/profile/Al-Harrington-2734/,1998
1,2,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tRashard Lewis\...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t32\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t1998\n\t\t\t\t\t\t\...,5.85,/profile/Rashard-Lewis-3314/,1998
2,3,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tKorleone Young...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t40\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t1998\n\t\t\t\t\t\t\...,0.15,/profile/Korleone-Young-5007/,1998
3,4,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tDan Gadzuric\n...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t33\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t2002\n\t\t\t\t\t\t\...,1.3,/profile/Dan-Gadzuric-2495/,1998
4,5,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tStromile Swift...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\...,\n\t\t\t\t\t\t\t\t\t\t\t\t2000\n\t\t\t\t\t\t\...,2.57,/profile/Stromile-Swift-4529/,1998


In [12]:
df.tail()

Unnamed: 0,RSCI_Rank,Player,Draft_Pick,Draft_Year,NBA_EWA,Player_Link,Rank_Year
3551,251,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tJavien William...,\n\t\t\t\t\t\t\t\t\t\t\t,\n\t\t\t\t\t\t\t\t\t\t\t,-,/profile/Javien-Williams-102571/,2016
3552,252,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tCordell Pemsl\...,\n\t\t\t\t\t\t\t\t\t\t\t,\n\t\t\t\t\t\t\t\t\t\t\t,-,/profile/Cordell-Pemsl-83160/,2016
3553,253,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tJoe Hampton\n\...,\n\t\t\t\t\t\t\t\t\t\t\t,\n\t\t\t\t\t\t\t\t\t\t\t,-,/profile/Joe-Hampton-83273/,2016
3554,254,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tMichael Hurt\n...,\n\t\t\t\t\t\t\t\t\t\t\t,\n\t\t\t\t\t\t\t\t\t\t\t,-,/profile/Michael-Hurt-84437/,2016
3555,255,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tJosh Parrish\n...,\n\t\t\t\t\t\t\t\t\t\t\t,\n\t\t\t\t\t\t\t\t\t\t\t,-,/profile/Josh-Parrish-82193/,2016


In [13]:
df["DX_Player_ID"] = df.Player_Link.str.extract("/.*/(.*)/", expand=False)
df.head()

Unnamed: 0,RSCI_Rank,Player,Draft_Pick,Draft_Year,NBA_EWA,Player_Link,Rank_Year,DX_Player_ID
0,1,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tAl Harrington\...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t25\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t1998\n\t\t\t\t\t\t\...,2.65,/profile/Al-Harrington-2734/,1998,Al-Harrington-2734
1,2,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tRashard Lewis\...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t32\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t1998\n\t\t\t\t\t\t\...,5.85,/profile/Rashard-Lewis-3314/,1998,Rashard-Lewis-3314
2,3,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tKorleone Young...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t40\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t1998\n\t\t\t\t\t\t\...,0.15,/profile/Korleone-Young-5007/,1998,Korleone-Young-5007
3,4,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tDan Gadzuric\n...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t33\n\t\t\t\t\t\t\t...,\n\t\t\t\t\t\t\t\t\t\t\t\t2002\n\t\t\t\t\t\t\...,1.3,/profile/Dan-Gadzuric-2495/,1998,Dan-Gadzuric-2495
4,5,\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t\tStromile Swift...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t2\n\t\t\t\t\t\t\t\...,\n\t\t\t\t\t\t\t\t\t\t\t\t2000\n\t\t\t\t\t\t\...,2.57,/profile/Stromile-Swift-4529/,1998,Stromile-Swift-4529


In [14]:
df.to_csv("raw_data/draft_express_player_RSCI_ranks_07_10_17.csv", 
          index=False)