# NFL Comparisons
#### In this notebook, I explore using a simple euclidean distance between NFL combine and NCAA season stats to generate a list of 'best comparables' for all NFL players since the inputted year. Only QB, RB, WR, and TEs are used year because...they're more interesting.. and participate in more Combine events than say offensive linemen.

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import time

import math
from sklearn.preprocessing import normalize
from scipy.spatial import distance

from urllib2 import urlopen
from bs4 import BeautifulSoup

#### The getCombineStats function uses BeautifulSoup to scrape NFL combine metrics from pro-football-reference pages like https://www.pro-football-reference.com/draft/2017-combine.htm. The neat format on the website requires some cleaning and formatting once importing, which is done in the cleanDf function. 


In [2]:
def getCombineStats(year):
    allData = []
    collegeLinks = []
    url = 'https://www.pro-football-reference.com/draft/' + str(year) + '-combine.htm'
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    column_headers = [th.getText() for th in 
                  soup.findAll('tr', limit=2)[0].findAll('th')]
    column_headers = [x.encode('UTF8') for x in column_headers]
    table_rows = soup.select("#div_combine tr")[2:]
    for row in table_rows:
        indiv_list = []
        for th in row.find_all('th'):
            indiv_list.append(str(th.get_text()))
        for td in row.find_all('td'):
            indiv_list.append(str(td.get_text()))
        links = row.find_all('a',href=True)
        if (len(links) > 0) & ('sports-reference' in str(links)):
            for link in links:
                if 'sports-reference' in str(link):
                    indiv_list.append(link['href'])
        else:
            indiv_list.append('No Link')
        allData.append(indiv_list)
    column_headers.append('College Stats Link')
    data = pd.DataFrame(allData,columns=column_headers)
    data['Year'] = np.repeat(year,len(data))
    data = data[(data['Pos'] == 'QB') | (data['Pos'] == 'RB') | (data['Pos'] == 'WR') | (data['Pos'] == 'TE')]
    return data
    

In [3]:
def cleanDF(data):
    data.reset_index(inplace=True)
    data.drop('index',axis=1,inplace=True)
    data = data.replace('',np.nan)
    
    data['Wt'] = data['Wt'].astype(float)
    data['40yd'] = data['40yd'].astype(float)
    data['Vertical'] = data['Vertical'].astype(float)
    data['BenchReps'] = data['Bench'].astype(float)
    data['BroadJump'] = data['Broad Jump'].astype(float)
    data['3Cone'] = data['3Cone'].astype(float)
    data['Shuttle'] = data['Shuttle'].astype(float)
    
    Round = []
    for i in range(len(data)):
        if data['Drafted (tm/rnd/yr)'][i] > 0:
            Round.append(int(data['Drafted (tm/rnd/yr)'][i].split('/')[1][1]))
        else:
            Round.append(np.nan)
    data['Round'] = Round

    Team = []
    for i in range(len(data)):
        if data['Drafted (tm/rnd/yr)'][i] > 0:
            Team.append(data['Drafted (tm/rnd/yr)'][i].split('/')[0])
        else:
            Team.append('')
    data['Team'] = Team


    Pick = []
    for i in range(len(data)):
        if data['Drafted (tm/rnd/yr)'][i] > 0:
            Pick.append(int(data['Drafted (tm/rnd/yr)'][i].split('/')[2].split('pick')[0].strip()[:-2]))
        else:
            Pick.append(np.nan)
    data['Pick'] = Pick
    data['Team'] = data['Team'].apply(lambda row: row.strip())
    data.drop('Drafted (tm/rnd/yr)',inplace=True,axis = 1)
    
    def height(row):
        return int(row[0])*12 + int(row.split('-')[1])
    data['Height'] = data['Ht'].apply(lambda row:height(row))
    data.drop('Ht',axis=1,inplace=True)
    data.drop('College',axis=1,inplace=True)

    return data


#### The getCollegeStats function uses each season's college passing, rushing, and receiving 'leaderboards' (which contain all players) to piece together each player's total career stats year by year. This was significantly quicker than hitting each player's own stats page. 

In [4]:
def getCollegeStats(category,year):
    url = 'https://www.sports-reference.com/cfb/years/' + year + '-' + category + '.html' 
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    column_headers = [th.getText() for th in 
                      soup.findAll('tr', limit=2)[1].findAll('th')]
    column_headers = [x.encode('UTF8') for x in column_headers]
    column_headers.append('Link')
    table_rows = soup.select("#div_" + category + " tr")[2:]
    innerDF = pd.DataFrame()
    for row in table_rows:
        indiv_list = []
        for th in row.find_all('th'):
            indiv_list.append(str(th.get_text()))
        for td in row.find_all('td'):
            indiv_list.append(str(td.get_text()))
        links = row.find_all('a',href=True)
        if (len(links) > 0) & ('players' in str(links)):
            for link in links:
                if 'players' in str(link):
                    indiv_list.append(link['href'])
        else:
            indiv_list.append('No Link')
        df = []
        df.append(indiv_list)
        
        

        if category == 'rushing':
            innerDF = innerDF.append(df)
        elif category == 'passing':
            innerDF = innerDF.append(df)
        elif category == 'receiving':
            innerDF = innerDF.append(df)   
    if category == 'passing':
        column_headers = ['Rank','Player','School','Conf','G','Completions','Pass Att','Comp Pct','Pass Yds','Pass Y/A','Pass AY/A','Pass TD','Int','Rate','Rush Att','Rush Yds','Rush Avg','Rush TD','Link']
    elif category == 'rushing':
        column_headers = ['Rank','Player','School','Conf','G','Rush att','Rush Yds','Rush Avg','Rush TDs','Rec','Rec Yds','Rec Avg','Rec TDs','Plays','Yds','Avg','TD','Link']
    elif category == 'receiving':
        column_headers = ['Rank','Player','School','Conf','G','Rec','Rec Yds','Rec Avg','Rec TDs','Rush Att','Rush Yds','Rush Avg','Rush TDs','Plays','Yds','Avg','TD','Link']
        
    innerDF.columns = [column_headers]
    def removeChars(player):
        return re.sub("[^a-zA-Z -]+", "", player)
    innerDF['Player'] = innerDF['Player'].apply(lambda x:removeChars(x))
    
    innerDF = innerDF[innerDF['Player'] != 'Player']
    if (category == 'rushing') | (category == 'receiving'):
        innerDF.drop(['Plays','Yds','Avg','TD'],axis = 1, inplace = True)
        
    if category == 'rushing':
        innerDF[['Rush att','Rush Yds','Rush Avg','Rush TDs','Rec','Rec Yds','Rec Avg','Rec TDs']] = innerDF[['Rush att','Rush Yds','Rush Avg','Rush TDs','Rec','Rec Yds','Rec Avg','Rec TDs']].apply(pd.to_numeric)
    elif category == 'receiving':
        innerDF[['Rec','Rec Yds','Rec Avg','Rec TDs','Rush Att','Rush Yds','Rush Avg','Rush TDs']] = innerDF[['Rec','Rec Yds','Rec Avg','Rec TDs','Rush Att','Rush Yds','Rush Avg','Rush TDs']].apply(pd.to_numeric)
    else:
        innerDF[['Completions','Pass Att','Comp Pct','Pass Yds','Pass Y/A','Pass AY/A','Pass TD','Int','Rate','Rush Att','Rush Yds','Rush Avg','Rush TD']] = innerDF[['Completions','Pass Att','Comp Pct','Pass Yds','Pass Y/A','Pass AY/A','Pass TD','Int','Rate','Rush Att','Rush Yds','Rush Avg','Rush TD']].apply(pd.to_numeric)

    innerDF['Link'] = 'https://www.sports-reference.com' + innerDF['Link']
    innerDF.reset_index(inplace=True)
    innerDF.drop(['index','Rank'],inplace=True,axis=1)
    
    
    return innerDF

#### The meanStats function takes the mean of each player's seasons for seasons in which a player played at least 10 games. Other methods were considered, such as averaging the final season and best other season, or simply using the best season to show a player's upside. I decided to go with averaging all qualifying seasons to credit players who had more illustrious careers, though this may not necessarily indicate talent. This should be looked into further.

In [5]:
def meanStats(row):
    pos = row['Pos']
    link = row['College Stats Link']
    if ((pos == 'WR') | (pos == 'TE')):
        try:
            stats = receiving[(receiving['Link'] == link) & (receiving['G'] >= 10)].mean()
            stats['Link'] = link
            return stats
        except:
            pass
    elif pos == 'RB':
        try:
            stats = rushing[(receiving['Link'] == link) & (rushing['G'] >= 10)].mean()
            stats['Link'] = link
            return stats
        except:
            pass
    elif pos == 'QB':
        try: 
            stats = passing[(passing['Link'] == link) & (passing['G'] >= 10)].mean()
            stats['Link'] = link
            return stats
        except:
            pass


#### Finally, the getPlayerComps function actually calculates the Euclidean distance for the desired players versus all other players. First, the different features are normalized to equally weight them before calculating the distance.  

#### Then, the Euclidean distance is calculated. To summarize the concept, this is the sum of the square roots of the difference between the normalized values of two players' attributes. This gives us an idea of how close the two players' measurables are while equally weighting each category (which is a strong assumption on its own - maybe 40 time needs to be weighed much more than college YPC for a RB, etc. - to be explored). 


In [6]:
def getPlayerComps(player):
    playerRow = allYears[allYears['Player'] == str(player)]
    #return player
    if playerRow['Pos'].any() == 'QB':
        data = allYears.copy()
        #data = allYears[allYears['Pos'] == 'QB'].copy()
        cols = ['40yd','Wt','Height','Comp Pct','Completions','Int','Pass AY/A',
               'Pass Att','Pass TD','Pass Yds','Rush Att','Rush Avg','Rush TD','Rush Yds']
        df_num = data[cols]
        df_num = df_num.apply(pd.to_numeric, errors='ignore')
        df_num = (df_num - df_num.mean())/df_num.std()
        df_num.fillna(0,inplace=True)
        df_num['player'] = data['Player']
        player_num = df_num[df_num['player'] == str(player)]
        euc = df_num.drop(['player'],axis=1).apply(lambda row: distance.euclidean(row,player_num.drop(['player'],axis=1)),axis=1)
        df_num['distance'] = euc
        comps = pd.DataFrame(df_num[['player','distance']].sort_values('distance'))
        return comps.head(25)
    elif playerRow['Pos'].any() == 'RB':
        data = allYears.copy()
        #data = allYears[allYears['Pos'] == 'RB'].copy()
        cols = ['3Cone','40yd','Broad Jump', 'Bench','Wt','Height','Rec','Rec Avg',
                'Rec TDs','Rec Yds','Shuttle','Vertical',
               'Rush Att','Rush Avg','Rush TD','Rush Yds']
        df_num = data[cols]
        df_num = df_num.apply(pd.to_numeric, errors='ignore')
        df_num = (df_num - df_num.mean())/df_num.std()
        df_num.fillna(0,inplace=True)
        df_num['player'] = data['Player']
        player_num = df_num[df_num['player'] == str(player)]
        euc = df_num.drop(['player'],axis=1).apply(lambda row: distance.euclidean(row,player_num.drop(['player'],axis=1)),axis=1)
        df_num['distance'] = euc
        comps = pd.DataFrame(df_num[['player','distance']].sort_values('distance'))
        return comps.head(25)
    else:
        data = allYears.copy()
        #data = allYears[(allYears['Pos'] == 'WR') | (allYears['Pos'] == 'TE')].copy()
        cols = ['3Cone','40yd','Broad Jump','Wt','Height','Rec','Rec Avg',
                'Rec TDs','Rec Yds','Shuttle','Vertical',
               'Rush Att','Rush Avg','Rush TD','Rush Yds']
        df_num = data[cols]
        df_num = df_num.apply(pd.to_numeric, errors='ignore')
        df_num = (df_num - df_num.mean())/df_num.std()
        df_num.fillna(0,inplace=True)
        df_num['player'] = data['Player']
        player_num = df_num[df_num['player'] == str(player)]
        euc = df_num.drop(['player'],axis=1).apply(lambda row: distance.euclidean(row,player_num.drop(['player'],axis=1)),axis=1)
        df_num['distance'] = euc
        comps = pd.DataFrame(df_num[['player','distance']].sort_values('distance'))
        return comps.head(25)

#### Below, I run all of the functions and do some final cleanup. See below for some example outputs.

In [7]:
#run everything
allYears = pd.DataFrame()
rangeStart = 2010
rangeEnd = 2018
years = np.arange(rangeStart,rangeEnd + 1,1)
for year in years:
    df = getCombineStats(year)
    allYears = allYears.append(df)
allYears = cleanDF(allYears)

#college stats
types = ['rushing','passing','receiving']

rushing = pd.DataFrame()
passing = pd.DataFrame()
receiving = pd.DataFrame()

for cat in types:
    for year in (np.arange(rangeStart-3,rangeEnd + 1,1)[:-1]):
        if cat == 'rushing':
            #print(str(year) + cat)
            funcOutput = getCollegeStats(cat,str(year))
            funcOutput['Year'] = year
            rushing = pd.concat([rushing,funcOutput])
        elif cat == 'passing':
            #print(str(year) + cat)
            funcOutput = getCollegeStats(cat,str(year))
            funcOutput['Year'] = year
            passing = pd.concat([passing,funcOutput])
        elif cat == 'receiving':
            #print(str(year) + cat)
            funcOutput = getCollegeStats(cat,str(year))
            funcOutput['Year'] = year
            receiving = pd.concat([receiving,funcOutput])
            
def noLink(row):
    if row['College Stats Link'] == 'No Link':
        return 'No Link ' + row['Player']
    else:
        return row['College Stats Link']
    
def combineBench(row):
    if type(row['Bench']) != str:
        return row['BenchReps']
    else:
        return row['Bench']

def combineBroad(row):
    if type(row['Broad Jump']) != str:
        return row['BroadJump']
    else:
        return row['Broad Jump']   
    
def combineRush(row):
    if type(row['Rush TD']) != str:
        return row['Rush TDs']
    else:
        return row['Rush TD']   

allYears['College Stats Link'] = allYears.apply(lambda x:noLink(x),axis=1)

statLines = allYears.apply(lambda x:meanStats(x),axis=1)
allYears = allYears.merge(statLines,how='left',left_on='College Stats Link',right_on = 'Link')
allYears.drop(['Link','Player_y','School_y','Year_y'],inplace=True,axis=1)
allYears['Player'] = allYears['Player_x']
allYears.drop('Player_x',inplace=True,axis=1)
allYears['School'] = allYears['School_x']
allYears.drop('School_x',inplace=True,axis=1)
allYears['Year'] = allYears['Year_x']
allYears.drop('Year_x',inplace=True,axis=1)
allYears.drop('Rate',inplace=True,axis=1)
allYears['Bench'] = allYears.apply(lambda x:combineBench(x),axis=1)
allYears['Broad Jump'] = allYears.apply(lambda x:combineBroad(x),axis=1)
allYears['Rush TD'] = allYears.apply(lambda x:combineRush(x),axis=1)

allYears.drop('BroadJump',inplace=True,axis=1)
allYears.drop('BenchReps',inplace=True,axis=1)
allYears.drop('Rush TDs',inplace=True,axis=1)

    



  del sys.path[0]


In [20]:
getPlayerComps('Ezekiel Elliott').head(10)

Unnamed: 0,player,distance
687,Ezekiel Elliott,0.0
595,Todd Gurley,0.671088
101,Keiland Williams,0.753457
378,Eddie Lacy,0.801169
751,Jonathan Williams,0.843758
918,Derrius Guice,0.867836
116,Stephen Burton,0.88967
949,Rashaad Penny,0.943686
69,Brandon Minor,0.986593
704,Jordan Howard,0.992238


In [34]:
getPlayerComps('Rob Gronkowski').head(10)

Unnamed: 0,player,distance
40,Rob Gronkowski,0.0
375,Travis Kelce,0.93642
699,Tyler Higbee,1.380808
245,Coby Fleener,1.409135
816,Jordan Leggett,1.626438
916,Dallas Goedert,1.661898
583,AJ Derby,1.745002
348,Joseph Fauria,1.75124
31,Brody Eldridge,1.761087
852,Eric Saubert,1.785275


In [42]:
getPlayerComps("Eric Decker").head(10)

Unnamed: 0,player,distance
22,Eric Decker,0.0
324,Keenan Allen,1.134389
495,Cody Latimer,1.173336
248,Jeff Fuller,1.320621
0,Danario Alexander,1.364857
270,Alshon Jeffery,1.475716
858,JuJu Smith-Schuster,1.483542
128,Tandon Doss,1.496284
850,Seth Russell,1.505005
246,Michael Floyd,1.519546


In [68]:
getPlayerComps("Tevin Coleman").head(10)

Unnamed: 0,player,distance
573,Tevin Coleman,0.0
89,C.J. Spiller,0.940293
689,Josh Ferguson,1.000141
943,Sony Michel,1.002952
188,Brandon Saine,1.020902
194,Da'Rel Scott,1.060087
182,Bilal Powell,1.087378
250,Chris Givens,1.107374
664,Devontae Booker,1.113799
317,Fozzy Whittaker,1.139721
