# Are 5-Star Recruits From the Southeast Better Than Those From Other Regions?

Much has been made of the SEC's dominance in college football in recent years, and much of that success can be attributed to the sheer volume of high-profile recruits the region produces.

In [45]:
import requests
from lxml import html
import time
import pandas as pd

In [2]:
# One URL per season, keyed by year: the 247Sports composite recruit rankings
# for 2000-2011 and the Wikipedia NFL draft article for 2004-2015.
top_recruit_lists = dict(
    (season, 'http://247sports.com/Season/%i-Football/CompositeRecruitRankings' % season)
    for season in range(2000, 2012)
)
nfl_draft_lists = dict(
    (draft_season, 'https://en.wikipedia.org/wiki/%i_NFL_draft' % draft_season)
    for draft_season in range(2004, 2016)
)

In [3]:
# NOTE(review): this User-agent presents the crawl as a search-engine bot;
# confirm that is acceptable under the target sites' terms before re-running.
headers = {'User-agent': 'bingbot'}
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
CRAWL_DELAY = 0.25    # seconds between requests - don't want to spam the server


def _download_pages(pages):
    """Replace every URL in ``pages`` with the ``requests.Response`` fetched from it.

    ``pages`` is a dict keyed by year and is updated in place, so later cells
    can keep reading ``pages[year].content``. Entries that already hold a
    response are skipped, which makes the cell safe to re-run (and lets a
    re-run resume after a failure part-way through).

    Raises:
        requests.HTTPError: if a page answers with a 4xx/5xx status, instead of
            letting an error page be parsed as if it were data.
    """
    for year in list(pages):
        target = pages[year]
        if isinstance(target, requests.Response):
            continue  # already downloaded by an earlier run of this cell
        response = requests.get(target, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        pages[year] = response
        time.sleep(CRAWL_DELAY)


# get 247 composite rankings for every year from 2000-2011
_download_pages(top_recruit_lists)

# get NFL Draft results for every year from 2004-2015
_download_pages(nfl_draft_lists)

In [161]:
# Flatten every season's ranking page into four parallel lists: entry k of
# each list describes the same recruit.
names = []
rank = []
hs_location = []
recruit_year = []

for year in top_recruit_lists:
    tree = html.fromstring(top_recruit_lists[year].content)

    # Evaluate each XPath once per page and keep the results side by side.
    page_names = tree.xpath('//a[@class="bold"]/text()')
    page_ranks = tree.xpath('//span[@class="primary"]/text()')
    page_locations = tree.xpath('//span[@class="meta"]/text()')

    # The lists are only meaningful if they stay aligned. A page on which one
    # field is missing for some recruit would otherwise shift every later row
    # without any error, so fail loudly and say which season is affected.
    if not (len(page_names) == len(page_ranks) == len(page_locations)):
        raise ValueError(
            'Recruit page for %i is misaligned: %i names, %i ranks, %i locations'
            % (year, len(page_names), len(page_ranks), len(page_locations))
        )

    names += page_names
    rank += page_ranks
    hs_location += page_locations
    recruit_year += [year] * len(page_names)

In [162]:
# Region assignments, grouped by region: each entry pairs a region name with
# the state abbreviations (plus DC) that this analysis counts towards it.
_states_by_region = (
    ("West", ('WA', 'OR', 'CA', 'ID', 'NV', 'AZ', 'UT', 'MT', 'WY', 'CO', 'NM')),
    ("Midwest", ('ND', 'SD', 'NE', 'MN', 'IA', 'WI', 'IL', 'IN', 'MI', 'OH', 'PA')),
    ("East", ('ME', 'NH', 'VT', 'MA', 'NY', 'CT', 'RI', 'NJ', 'DC', 'DE', 'MD', 'WV', 'VA', 'NC')),
    ("Southeast", ('MO', 'KY', 'TN', 'SC', 'GA', 'AL', 'MS', 'LA', 'FL', 'AR')),
    ("Southwest", ('KS', 'TX', 'OK')),
    ("Pacific", ('HI', 'AK')),
)

# Invert the grouping into the abbreviation -> region lookup.
region_map = {
    abbreviation: region_name
    for region_name, abbreviations in _states_by_region
    for abbreviation in abbreviations
}

# The state is whatever sits between the last ', ' and the first ')' that
# follows it (presumably the text looks like "School (City, ST)" - verify
# against the scraped pages).
state = [location.split(', ')[-1].split(')')[0] for location in hs_location]

# A state missing from region_map raises KeyError here.
region = [region_map[abbreviation] for abbreviation in state]

In [163]:
# Assemble the recruit table in a single step: one row per scraped recruit,
# labelled by player name, with the columns in a fixed order.
df = pd.DataFrame(
    {
        'rank': rank,
        'hs_location': hs_location,
        'recruit_year': recruit_year,
        'state': state,
        'region': region,
    },
    index=names,
    columns=['rank', 'hs_location', 'recruit_year', 'state', 'region'],
).drop('Adam Taliaferro') # mistakenly ranked number 1 by website

In [164]:
def _integer_pick_numbers(span_ids):
    """Return, in page order, the whole-number picks found in ``Pick_<n>`` ids.

    Ids that do not contain ``Pick_`` are ignored. So are ids whose suffix is
    not an integer: wikipedia labels forfeited picks as decimals between two
    picks (e.g. 30.5).
    """
    picks = []
    for span_id in span_ids:
        if 'Pick_' not in span_id:
            continue
        try:
            picks.append(int(span_id.split('_')[-1]))
        except ValueError:
            continue
    return picks


def _first_last(sort_key):
    """Turn a ``"Last, First"`` sort key into ``"First Last"``.

    Spaces are stripped from the part after the first comma. A key with no
    comma is returned unchanged, so one odd entry keeps its position in the
    list instead of aborting the whole cell with an IndexError.
    """
    parts = sort_key.split(',')
    if len(parts) < 2:
        return sort_key
    return parts[1].replace(' ', '') + ' ' + parts[0]


# Create the result columns up front so the following cells still work when
# not a single recruit is matched. Existing columns (cell re-run) are kept.
for _column in ('draft_pick', 'draft_year'):
    if _column not in df.columns:
        df[_column] = float('nan')

for year in nfl_draft_lists:
    tree = html.fromstring(nfl_draft_lists[year].content)

    pick_numbers = _integer_pick_numbers(tree.xpath("//span[@id]/@id"))

    # Only the first len(pick_numbers) sort keys are treated as draftees.
    # They are kept in a separate variable so the recruit `names` list from
    # the scraping cell is not overwritten.
    sort_keys = tree.xpath('//span[@class="sortkey"]/text() | //td/span[@style="display:none;"]/text()')
    drafted_names = [_first_last(key) for key in sort_keys[:len(pick_numbers)]]

    # A recruit counts as drafted this year only if the draft falls 3 to 6
    # years after his recruiting class.
    years_since_recruited = year - df['recruit_year'].values
    in_window = (years_since_recruited >= 3) & (years_since_recruited <= 6)

    # NOTE(review): draft_pick stores the 1-based position in the draftee
    # list, not the number parsed from the page; the parsed numbers are only
    # used for their count. Confirm this is intended.
    for position, player in enumerate(drafted_names, start=1):
        if player not in df.index:
            continue # pick wasn't in 247 top 50 composite
        # Select rows with a boolean mask rather than df.loc[player, ...]:
        # when two recruits share a name the label lookup returns a Series,
        # the year comparison becomes ambiguous, and the old bare `except`
        # silently left both recruits undrafted.
        eligible = in_window & (df.index == player)
        df.loc[eligible, 'draft_pick'] = position
        df.loc[eligible, 'draft_year'] = year

In [165]:
# The scraped rank values are converted to integers.
df['rank'] = df['rank'].astype('int')
# 1 when a draft pick was recorded for the recruit, 0 otherwise.
df['drafted'] = df['draft_pick'].notnull().astype('int')

In [177]:
# Fraction of Southeast recruits that were drafted.
southeast_recruits = df[df['region'] == 'Southeast']
southeast_recruits['drafted'].sum() / float(len(southeast_recruits))

0.46255506607929514

In [174]:
# Fraction of Midwest recruits that were drafted.
midwest_recruits = df[df['region'] == 'Midwest']
midwest_recruits['drafted'].sum() / float(len(midwest_recruits))

0.3229166666666667

In [178]:
# Save the recruit/draft table to disk as a pickle file.
recruit_pickle = 'recruit_data.pkl'
df.to_pickle(recruit_pickle)

In [180]:
# Load the recruit/draft table from the pickle file in the working directory.
saved_recruits = 'recruit_data.pkl'
df = pd.read_pickle(saved_recruits)