In [8]:
from pathlib import Path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Import scraping tools
import requests
from bs4 import BeautifulSoup

#Import Plotly
import plotly.express as px
import plotly.graph_objs as go

#import sklearn
from sklearn import preprocessing

In [9]:
# Scrape the NFL combine results table for every year, one HTTP request per
# year, and collect the rows used to build the DataFrame in the next cell.
#
# Names produced for later cells:
#   header -- column titles read from the first table whose <thead> has rows
#   list_1 -- one list of N_COLUMNS stripped cell strings per player row

FIRST_YEAR = 1987
LAST_YEAR = 2021        # inclusive
N_COLUMNS = 13          # number of leading <td> cells kept from each row
REQUEST_TIMEOUT = 30    # seconds; without it a stalled connection blocks forever

header = []
list_1 = []

for year in range(FIRST_YEAR, LAST_YEAR + 1):
    # Get the URL
    url = f'https://nflcombineresults.com/nflcombinedata.php?year={year}&pos=&college='

    # Make the request; fail loudly on an HTTP error instead of parsing an error page
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    # Parse the page
    soup = BeautifulSoup(r.text, 'html.parser')

    # Pull the table data
    combine_table = soup.find('table', class_='sortable')
    if combine_table is None:
        raise RuntimeError(f"No 'sortable' table found for {year} at {url}")

    # Get the header once, from the first year that provides one.
    if not header:
        for title in combine_table.find_all('thead'):
            for row in title.find_all('tr'):
                # Look the cells up once per row rather than once per column.
                cells = row.find_all('td')
                header.extend(cells[i].text.strip() for i in range(N_COLUMNS))

    # Get player data. A row with fewer than N_COLUMNS cells still raises
    # IndexError rather than silently producing a ragged row.
    for player in combine_table.find_all('tbody'):
        for row in player.find_all('tr'):
            cells = row.find_all('td')
            list_1.append([cells[i].text.strip() for i in range(N_COLUMNS)])

    #Track progress
    print(year)

1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [10]:
# Create DataFrame from the scraped rows (list_1) and column titles (header):
combine_df = pd.DataFrame(list_1, columns = header)

# Every scraped cell is text. Convert the measurement columns to float and turn
# blank cells into NaN so that missing values are not counted in the analysis.
# Height and weight get the same blank handling, so one empty cell no longer
# makes astype(float) raise. Wonderlic is not converted because it is left out below.
NUMERIC_COLUMNS = [
    'Height (in)', 'Weight (lbs)', '40 Yard', 'Vert Leap (in)',
    'Broad Jump (in)', 'Shuttle', '3Cone', 'Bench Press',
]
for column in NUMERIC_COLUMNS:
    # Plain literal match on the empty string (no regex needed).
    combine_df[column] = combine_df[column].replace('', np.nan).astype(float)

# Calculate BMI and add it to the combine dataframe
# (703 converts lb/in^2 to the usual kg/m^2 BMI scale).
list_bmi = combine_df['Weight (lbs)'] / (combine_df['Height (in)'] ** 2) * 703

combine_df['BMI'] = list_bmi

# Column order before rearranging. Not used in this cell; kept in case a later
# cell reads it.
cols = combine_df.columns.values

# Rearrange columns. Wonderlic is left out: it is mostly NaN and less relevant
# for our purposes.
combine_df = combine_df[['Year', 'Name', 'College', 'POS', 'Height (in)', 'Weight (lbs)', 'BMI',
       '40 Yard', 'Bench Press', 'Vert Leap (in)',
       'Broad Jump (in)', 'Shuttle', '3Cone']]

# TODO: no bad-value filtering is performed yet; rows are kept as scraped.

combine_df.head()

Unnamed: 0,Year,Name,College,POS,Height (in),Weight (lbs),BMI,40 Yard,Bench Press,Vert Leap (in),Broad Jump (in),Shuttle,3Cone
0,1987,Mike Adams,Arizona State,CB,69.8,198.0,28.569962,4.42,13.0,32.0,118.0,4.6,
1,1987,John Adickes,Baylor,C,74.8,266.0,33.422102,4.97,25.0,26.5,103.0,4.6,
2,1987,Tommy Agee,Auburn,FB,71.8,217.0,29.591445,,15.0,,,,
3,1987,David Alexander,Tulsa (OK),C,75.0,279.0,34.8688,5.13,22.0,27.5,105.0,4.33,
4,1987,Lyneal Alston,Southern Mississippi,WR,72.1,202.0,27.317199,4.64,7.0,32.0,114.0,4.52,


In [15]:
# Save the cleaned combine data. Create the output folder first so the cell
# also works on a fresh checkout where resources/ does not exist yet.
output_path = Path("resources/combine_df.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
combine_df.to_csv(output_path, index=False)