In [1]:
import numpy as np
import pandas as pd
import lxml.html
import requests
from lxml.cssselect import CSSSelector

In [2]:
def scrape_css(css_selector, url, timeout=30):
    """Fetch a web page and return the elements matching a CSS selector.

    Parameters
    ----------
    css_selector : str
        CSS selector expression, compiled with ``lxml.cssselect.CSSSelector``.
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    list
        The lxml elements of the parsed page that the selector matches.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning an empty (or misleading) list of matches.
    response.raise_for_status()
    tree = lxml.html.fromstring(response.text)
    return CSSSelector(css_selector)(tree)

In [3]:
# Pro-Football-Reference draft page for the 2015 season; used as the worked
# example in the cells that follow.
url_2015 = "http://www.pro-football-reference.com/years/2015/draft.htm"

In [4]:
# One selector string made of three comma-separated alternatives inside the
# element with id "drafts": the ".tooltip" siblings that follow a
# ".show_partial_when_sorting" header cell, every <td>, and the
# ".show_partial_when_sorting" header cell itself.
css_selector = (
    "#drafts thead .show_partial_when_sorting~ .tooltip , "
    "#drafts td , "
    "#drafts thead .show_partial_when_sorting"
)

In [5]:
# Download the 2015 draft page and collect every element matched by css_selector.
elements = scrape_css(css_selector, url_2015)

In [6]:
# Number of elements the selector matched on the page.
len(elements)

7453

In [7]:
# Text of every matched <th> element; these become the column names.
# NOTE(review): the DataFrame preview further down shows `None` among the
# column names, so some header cells evidently yield no text here.
headers = [elem.text for elem in elements if elem.tag == "th"]

In [8]:
# Keep only the matched <td> elements; the <th> elements are left out.
tds = [elem for elem in elements if elem.tag == "td"]

In [9]:
# Accumulator for the text extracted from each <td>; filled by the loop in the
# next cell.
data = []

In [10]:
# Flatten every table cell into its display text, one entry per <td>.
# `data` is rebuilt from scratch here so this cell can be re-run on its own:
# a later cell rebinds `data` to a NumPy array, which has no `.append`.
data = []
for td in tds:
    # len(element) and element[0] replace the deprecated getchildren() calls.
    if len(td) == 0:
        # no child elements: the value is the cell's own text
        data.append(td.text)
    # if the child of the element has tag a, then get the text for the column
    elif td[0].tag == "a":
        data.append(td[0].text)
    # finally, there are tags left with child that is strong,
    # which is the td element that contains the player name
    else:
        data.append(td[0][0].text)

In [11]:
# Rows in the table: total data cells divided by the number of columns
# (whole-number division, any remainder is discarded).
num_rows = len(tds) // len(headers)

In [12]:
# Preview only: the flat list reshaped to num_rows x len(headers). The result
# is displayed, not stored.
np.array(data).reshape(num_rows, len(headers))

array([['1', '1', 'TAM', ..., None, 'Florida St.', 'College Stats'],
       ['1', '2', 'TEN', ..., None, 'Oregon', 'College Stats'],
       ['1', '3', 'JAX', ..., None, 'Florida', 'College Stats'],
       ..., 
       ['7', '254', 'SFO', ..., None, 'South Carolina', None],
       ['7', '255', 'IND', ..., None, 'Mars Hill', None],
       ['7', '256', 'ARI', ..., None, 'Louisville', 'College Stats']], dtype=object)

In [13]:
# Store the reshaped 2-D array (one row per table row, one column per header).
# NOTE(review): `data` changes type here, from a list to an ndarray; a distinct
# name would make the lineage between cells clearer.
data = np.array(data).reshape(num_rows, len(headers))

In [14]:
# Wrap the matrix in a DataFrame, using the scraped header texts as column names.
df = pd.DataFrame(data, columns=headers)

In [15]:
# Preview the first rows of the 2015 draft table.
df.head()

Unnamed: 0,Rnd,Pick,Tm,None,Pos,Age,To,AP1,PB,St,...,Yds,TD,Rec,Yds.1,TD.1,Tkl,Int,Sk,College/Univ,None.1
0,1,1,TAM,Jameis Winston,QB,21,2015.0,0,1,1,...,210.0,6.0,,,,,,,Florida St.,College Stats
1,1,2,TEN,Marcus Mariota,QB,21,2015.0,0,0,1,...,252.0,2.0,1.0,41.0,1.0,,,,Oregon,College Stats
2,1,3,JAX,Dante Fowler Jr.,OLB,21,,0,0,0,...,,,,,,,,,Florida,College Stats
3,1,4,OAK,Amari Cooper,WR,21,2015.0,0,1,1,...,-3.0,0.0,72.0,1070.0,6.0,,,,Alabama,College Stats
4,1,5,WAS,Brandon Scherff,T,23,2015.0,0,0,1,...,,,,,,,,,Iowa,College Stats


In [16]:
# Anchor elements inside the 29th cell of each row (this downloads the page again).
# NOTE(review): 29 equals the header count printed for 2015 further down, but
# the years before 1994 print 28, so this position is not valid for every
# draft page.
col_link_elems = scrape_css("td:nth-child(29) a", url_2015) 

In [17]:
# The href attribute of each of those anchors.
col_links = [elem.get("href") for elem in col_link_elems]

In [18]:
# Preview the first ten links.
col_links[:10]

['http://www.sports-reference.com/cfb/players/jameis-winston-1.html',
 'http://www.sports-reference.com/cfb/players/marcus-mariota-1.html',
 'http://www.sports-reference.com/cfb/players/dante-fowler-jr-1.html',
 'http://www.sports-reference.com/cfb/players/amari-cooper-1.html',
 'http://www.sports-reference.com/cfb/players/brandon-scherff-1.html',
 'http://www.sports-reference.com/cfb/players/leonard-williams-1.html',
 'http://www.sports-reference.com/cfb/players/kevin-white-6.html',
 'http://www.sports-reference.com/cfb/players/vic-beasley-1.html',
 'http://www.sports-reference.com/cfb/players/ereck-flowers-1.html',
 'http://www.sports-reference.com/cfb/players/todd-gurley-1.html']

In [19]:
# Anchors nested inside <strong> elements (another download of the same page),
# then the href attribute of each.
# The preview in the next cell shows these hrefs are site-relative paths.
player_link_elems = scrape_css("strong a", url_2015)
player_links = [elem.get("href") for elem in player_link_elems]

In [20]:
# Preview the first five player links.
player_links[:5]

['/players/W/WinsJa00.htm',
 '/players/M/MariMa01.htm',
 '/players/F/FowlDa00.htm',
 '/players/C/CoopAm00.htm',
 '/players/S/ScheBr00.htm']

In [21]:
def create_data_matrix(tds, num_rows, num_cols):
    """Turn a flat sequence of table cells into a 2-D array of cell text.

    For each cell the extracted value is:

    * the cell's own ``.text`` when it has no child elements;
    * the first child's ``.text`` when that child is an ``<a>``;
    * otherwise the ``.text`` of the first child's first child (the player
      name, which sits inside ``<strong><a>``), falling back to the first
      child's own ``.text`` when it has no children of its own.

    Parameters
    ----------
    tds : iterable of elements
        Cell elements in row-major order. Each must support ``len()``,
        indexing, ``.tag`` and ``.text`` (lxml and ElementTree elements do).
    num_rows : int
        Number of rows in the resulting matrix.
    num_cols : int
        Number of columns in the resulting matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_rows, num_cols)`` holding the extracted values
        (``None`` where an element has no text).

    Raises
    ------
    ValueError
        If the number of cells is not ``num_rows * num_cols``.
    """
    data = []
    for td in tds:
        # len(element) / element[0] replace the deprecated getchildren() calls.
        if len(td) == 0:
            data.append(td.text)
            continue
        node = td[0]
        # Descend one more level for wrappers such as <strong><a>name</a></strong>;
        # a wrapper without children keeps its own text instead of raising IndexError.
        if node.tag != "a" and len(node):
            node = node[0]
        data.append(node.text)
    data = np.array(data).reshape(num_rows, num_cols)
    return data

In [30]:
# Same conversion as the inline loop earlier, now through the helper function.
# NOTE(review): the recorded output below shows different rows than the 2015
# table displayed earlier, and the execution count jumps from 21 to 30, so
# `tds` had presumably been rebound when this cell last ran. Re-run from a
# fresh kernel to confirm.
data2 = create_data_matrix(tds, num_rows, len(headers))

In [31]:
# Display the matrix produced by create_data_matrix.
data2

array([['1', '1', 'RAM', ..., None, 'California', 'College Stats'],
       ['1', '2', 'PHI', ..., None, 'North Dakota St.', 'College Stats'],
       ['1', '3', 'SDG', ..., None, 'Ohio St.', 'College Stats'],
       ..., 
       ['7', '251', 'PHI', ..., None, 'Oregon', None],
       ['7', '252', 'CAR', ..., None, 'Montana St.', None],
       ['7', '253', 'TEN', ..., None, 'Southern Miss', None]], dtype=object)

In [52]:
# Remove the name `df` from the namespace.
# NOTE(review): raises NameError if `df` is not defined, e.g. when this cell is
# run twice.
del df

In [80]:
# Diagnostic pass over the draft pages for 1967 through 2016: for each year,
# download the page and print how many <th> header elements the selector
# matched. The code that would build the combined DataFrame is commented out
# below, so `drafts_df` is still empty after this cell.
#
# NOTE(review): `url`, `elements` and `headers` are rebound on every iteration,
# so after this cell they hold the last year's values, not the 2015 ones used
# by the earlier cells.

# create the url template to insert the year for each draft
url_template = "http://www.pro-football-reference.com/years/{year}/draft.htm"

drafts_df = pd.DataFrame()

# [url, exception] pairs for the years whose processing raised
errors_list = []

for year in range(1967, 2017):
    url = url_template.format(year=year)
    try:
        elements = scrape_css(css_selector, url)

        headers = [elem.text for elem in elements if elem.tag == "th"]
        
        # the recorded output shows 28 headers up to 1993 and 29 from 1994 on
        print(str(year) + ": " + str(len(headers)))
        
        # --- disabled pipeline (kept for reference) ---
        # NOTE(review): before re-enabling, note that both link scrapes below
        # request `url_2015` rather than this iteration's `url`, and that
        # "td:nth-child(29)" assumes 29 columns although earlier years report 28.
#         tds = [elem for elem in elements if elem.tag == "td"]

#         num_cols = len(headers)
#         num_rows = int(len(tds) / num_cols)

#         draft_data = create_data_matrix(tds, num_rows, num_cols)

#         col_link_elems = scrape_css("td:nth-child(29) a", url_2015)
#         col_links = [elem.get("href") for elem in col_link_elems]

#         player_link_elems = scrape_css("strong a", url_2015)
#         player_links = [elem.get("href") for elem in player_link_elems]

#         df = pd.DataFrame(draft_data, columns = headers)
        
#         print(df)

#         df["Player_Url_Ending"] = pd.Series(player_links)
#         df["College_Stats_Link"] = pd.Series(col_links)
#         df["Year"] = year
            
#         drafts_df = pd.concat([drafts_df, df], ignore_index=True)
    except Exception as e: # catch errors and store the links and the error it caused
        # best effort: one failing year must not stop the remaining years
        errors_list.append([url, e])

1967: 28
1968: 28
1969: 28
1970: 28
1971: 28
1972: 28
1973: 28
1974: 28
1975: 28
1976: 28
1977: 28
1978: 28
1979: 28
1980: 28
1981: 28
1982: 28
1983: 28
1984: 28
1985: 28
1986: 28
1987: 28
1988: 28
1989: 28
1990: 28
1991: 28
1992: 28
1993: 28
1994: 29
1995: 29
1996: 29
1997: 29
1998: 29
1999: 29
2000: 29
2001: 29
2002: 29
2003: 29
2004: 29
2005: 29
2006: 29
2007: 29
2008: 29
2009: 29
2010: 29
2011: 29
2012: 29
2013: 29
2014: 29
2015: 29
2016: 29


In [79]:
# Show the [url, exception] pairs collected by the per-year loop.
# NOTE(review): this cell's execution count (79) precedes the loop cell's (80),
# and the recorded "int + str" TypeErrors cannot come from the current loop,
# which wraps `year` in str(). The output is presumably from an earlier
# version of the loop.
errors_list

[['http://www.pro-football-reference.com/years/1967/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1968/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1969/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1970/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1971/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1972/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1973/draft.htm',
  TypeError("unsupported operand type(s) for +: 'int' and 'str'")],
 ['http://www.pro-football-reference.com/years/1974/draft.htm',
  TypeError(

In [88]:
# Alternative approach: let pandas parse the tables on the 2015 page directly.
# NOTE(review): the preview below mixes labels such as 'Misc' and 'Approx Val'
# with 'Rnd' and 'Pick', and the values do not line up with the column names.
# Presumably the table has more than one header row; TODO confirm and pass an
# explicit `header=` argument.
dfs = pd.read_html(url_2015, skiprows=0)

In [89]:
# Number of tables pandas parsed from the page.
len(dfs)

2

In [90]:
# Preview the first rows of the first parsed table.
dfs[0].head()

Unnamed: 0.1,Unnamed: 0,Misc,Unnamed: 2,Approx Val,Unnamed: 4,Passing,Rushing,Receiving,Unnamed: 8,Unnamed: 9,...,Yds.1,TD.1,Rec,Yds.2,TD.2,Tkl,Int.1,Sk,College/Univ,Unnamed: 40
0,1,1,TAM,Jameis Winston,QB,21,2015.0,0,1,1,...,,,,,,,,,,
1,1,2,TEN,Marcus Mariota,QB,21,2015.0,0,0,1,...,,,,,,,,,,
2,1,3,JAX,Dante Fowler Jr.,OLB,21,,0,0,0,...,,,,,,,,,,
3,1,4,OAK,Amari Cooper,WR,21,2015.0,0,1,1,...,,,,,,,,,,
4,1,5,WAS,Brandon Scherff,T,23,2015.0,0,0,1,...,,,,,,,,,,


In [87]:
# Column dtypes and non-null counts for the first parsed table.
# NOTE(review): this cell's execution count (87) is lower than that of the cell
# defining `dfs` (88), so the recorded output comes from an earlier run.
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 0 to 267
Data columns (total 41 columns):
Unnamed: 0      262 non-null object
Misc            268 non-null object
Unnamed: 2      262 non-null object
Approx Val      262 non-null object
Unnamed: 4      262 non-null object
Passing         252 non-null object
Rushing         216 non-null object
Receiving       268 non-null object
Unnamed: 8      262 non-null object
Unnamed: 9      262 non-null object
Unnamed: 10     210 non-null object
Unnamed: 11     204 non-null object
Rnd             210 non-null object
Pick            10 non-null object
Tm              10 non-null object
Unnamed: 15     10 non-null object
Pos             10 non-null object
Age             10 non-null object
To              37 non-null object
AP1             37 non-null object
PB              37 non-null object
St              63 non-null object
CarAV           63 non-null object
DrAV            63 non-null object
G               89 non-null object
Cmp    

## Issues with 