In [1]:
import pandas as pd
import pfr

In [2]:
# Combine result pages are addressed by draft year via this template.
url_template = "http://www.pro-football-reference.com/draft/{}-combine.htm"
# CSS selectors passed to the scraper: body rows and header cells of the #combine table.
row_css_selector = "#combine > tbody > tr"
col_css_selector = "#combine > thead > tr > th.poptip"

dfs = []     # one DataFrame per successfully scraped year
errors = []  # [url, exception] pairs for pages that failed

for year in range(2000, 2019):  # 2000 through 2018 inclusive
    url = url_template.format(year)
    try:
        # NOTE(review): the trailing 0 and 3 are positional arguments whose
        # meaning is defined inside pfr.get_combine_table — confirm there.
        year_df = pfr.get_combine_table(url, row_css_selector, col_css_selector, 0, 3)
        year_df["Year"] = year  # tag every row with the draft year it came from
        dfs.append(year_df)
        print('Scraped:', url)
    except Exception as exc:
        # Best effort: record the failure and move on to the next year.
        errors.append([url, exc])
        print('ERROR:', url)

Scraped: http://www.pro-football-reference.com/draft/2000-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2001-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2002-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2003-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2004-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2005-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2006-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2007-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2008-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2009-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2010-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2011-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2012-combine.htm
Scraped: http://www.pro-football-reference.com/draft/2013-combine.htm
Scraped: http://www.

In [3]:
# How many yearly tables were collected (one per successfully scraped year).
len(dfs)

19

In [4]:
# How many URLs failed to scrape.
len(errors)

0

In [5]:
# Display the recorded [url, exception] pairs, if any.
errors

[]

In [6]:
# Stack the per-year tables row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [7]:
# Peek at the first rows of the combined table.
df.head()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Year
0,"(John Abraham, AbraJo00)",OLB,South Carolina,,6-4,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,2000
1,"(Shaun Alexander, AlexSh00)",RB,Alabama,https://www.sports-reference.com/cfb/players/s...,6-0,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,2000
2,"(Darnell Alford, AlfoDa20)",OT,Boston College,,6-4,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,2000
3,"(Kyle Allamon, None)",TE,Texas Tech,,6-2,253,4.97,29.0,,104.0,7.29,4.49,,2000
4,"(Rashard Anderson, AndeRa21)",CB,Jackson State,,6-2,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,2000


In [8]:
# Peek at the last rows of the combined table.
df.tail()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Year
6213,"(Chris Worley, WorlCh00)",ILB,Ohio St.,https://www.sports-reference.com/cfb/players/c...,6-2,238,4.86,29.5,15.0,,,,,2018
6214,"(Isaiah Wynn, WynnIs00)",OG,Georgia,https://www.sports-reference.com/cfb/players/i...,6-3,313,,,,,,,New England Patriots / 1st / 23rd pick / 2018,2018
6215,"(Isaac Yiadom, YiadIs00)",CB,Boston College,https://www.sports-reference.com/cfb/players/i...,6-1,190,4.52,,8.0,120.0,,4.18,Denver Broncos / 3rd / 99th pick / 2018,2018
6216,"(Kenny Young, None)",ILB,UCLA,https://www.sports-reference.com/cfb/players/k...,6-1,236,4.6,36.0,23.0,117.0,7.38,4.48,,2018
6217,"(Trevon Young, YounTr00)",EDGE,Louisville,https://www.sports-reference.com/cfb/players/t...,6-4,258,4.78,33.0,25.0,114.0,6.99,4.4,Los Angeles Rams / 6th / 205th pick / 2018,2018


In [9]:
# Inspect column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6218 entries, 0 to 6217
Data columns (total 14 columns):
Player                 6218 non-null object
Pos                    6218 non-null object
School                 6218 non-null object
College                6218 non-null object
Ht                     6218 non-null object
Wt                     6218 non-null object
40yd                   6218 non-null object
Vertical               6218 non-null object
Bench                  6218 non-null object
Broad Jump             6218 non-null object
3Cone                  6218 non-null object
Shuttle                6218 non-null object
Drafted (tm/rnd/yr)    6218 non-null object
Year                   6218 non-null int64
dtypes: int64(1), object(13)
memory usage: 680.2+ KB


In [10]:
# Each Player cell holds a (name, pfr_id) pair; split it into two columns.
# Building one frame from the list of pairs avoids constructing a separate
# Series for every row, which is what .apply(pd.Series) does.
player_parts = pd.DataFrame(
    df["Player"].tolist(),
    index=df.index,
    columns=["Player", "Pfr_ID"],
)
df[["Player", "Pfr_ID"]] = player_parts

In [11]:
# Pull the final path segment (minus its extension) out of the college URL to
# use as the sports-reference ID; rows whose College value does not match get NaN.
# The pattern is a raw string because "\." in a plain literal is an invalid
# escape sequence and triggers a warning on modern Python.
df["Sref_Cfb_ID"] = df["College"].str.extract(r"/.*/.*/(.*)\.", expand=False)

In [12]:
# Re-check the schema now that the Pfr_ID and Sref_Cfb_ID columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6218 entries, 0 to 6217
Data columns (total 16 columns):
Player                 6218 non-null object
Pos                    6218 non-null object
School                 6218 non-null object
College                6218 non-null object
Ht                     6218 non-null object
Wt                     6218 non-null object
40yd                   6218 non-null object
Vertical               6218 non-null object
Bench                  6218 non-null object
Broad Jump             6218 non-null object
3Cone                  6218 non-null object
Shuttle                6218 non-null object
Drafted (tm/rnd/yr)    6218 non-null object
Year                   6218 non-null int64
Pfr_ID                 4895 non-null object
Sref_Cfb_ID            4841 non-null object
dtypes: int64(1), object(15)
memory usage: 777.3+ KB


In [13]:
# Confirm Player now holds only the name and the two ID columns are filled in.
df.head()

Unnamed: 0,Player,Pos,School,College,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Year,Pfr_ID,Sref_Cfb_ID
0,John Abraham,OLB,South Carolina,,6-4,252,4.55,,,,,,New York Jets / 1st / 13th pick / 2000,2000,AbraJo00,
1,Shaun Alexander,RB,Alabama,https://www.sports-reference.com/cfb/players/s...,6-0,218,4.58,,,,,,Seattle Seahawks / 1st / 19th pick / 2000,2000,AlexSh00,shaun-alexander-1
2,Darnell Alford,OT,Boston College,,6-4,334,5.56,25.0,23.0,94.0,8.48,4.98,Kansas City Chiefs / 6th / 188th pick / 2000,2000,AlfoDa20,
3,Kyle Allamon,TE,Texas Tech,,6-2,253,4.97,29.0,,104.0,7.29,4.49,,2000,,
4,Rashard Anderson,CB,Jackson State,,6-2,206,4.55,34.0,,123.0,7.18,4.15,Carolina Panthers / 1st / 23rd pick / 2000,2000,AndeRa21,


In [14]:
# Persist the combined table; the row index is omitted from the file.
output_path = "raw_data/pfr_combine_data.csv"
df.to_csv(output_path, index=False)