In [1]:
#import dependencies
import pymongo
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from sqlalchemy import create_engine
from config import password, host
import requests
import pandas as pd

# Scraping NFL Salary Data using Pandas

In [2]:
#enable chrome browser
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

In [3]:
salary_url = 'https://www.pro-football-reference.com/players/salary.htm'
nfl_wiki_url = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Football_League/National_Football_League_team_abbreviations'

In [4]:
#read table from url
tables = pd.read_html(salary_url)
tables

[          Rk              Player Pos   Tm       Salary
 0        1.0        Kirk Cousins  QB  MIN  $27,500,000
 1        2.0      Marcus Mariota  QB  TEN  $20,922,000
 2        NaN      Jameis Winston  QB  TAM  $20,922,000
 3        4.0          Derek Carr  QB  OAK  $19,900,000
 4        5.0          Joe Flacco  QB  DEN  $18,500,000
 ...      ...                 ...  ..  ...          ...
 1909     NaN       Michael Dogbe  DL  ARI     $495,000
 1910     NaN  Ross Pierschbacher  OL  WAS     $495,000
 1911  1912.0        Alex Redmond   G  CIN     $493,236
 1912  1913.0         Holton Hill  CB  MIN     $435,882
 1913  1914.0      Tyrone Swoopes  TE  SEA     $378,000
 
 [1914 rows x 5 columns]]

In [5]:
#create dataframe based on 1st table
salary_df = tables[0]
salary_df.head()

Unnamed: 0,Rk,Player,Pos,Tm,Salary
0,1.0,Kirk Cousins,QB,MIN,"$27,500,000"
1,2.0,Marcus Mariota,QB,TEN,"$20,922,000"
2,,Jameis Winston,QB,TAM,"$20,922,000"
3,4.0,Derek Carr,QB,OAK,"$19,900,000"
4,5.0,Joe Flacco,QB,DEN,"$18,500,000"


## Cleaning the Salary DataFrame

In [6]:
#rename columns
salary_df = salary_df.rename(columns={'Rk': 'Salary_Rank', 'Pos':'Player_Position', 'Tm': 'Abrv' })

#'Rank' column currently contains blank values when salaries are tied
#this line of code will fill the blank rank value with the value directly above
salary_df['Salary_Rank'] = salary_df['Salary_Rank'].fillna(method='ffill')

#fill blank positions with 'NULL' text string
salary_df['Player_Position'] = salary_df['Player_Position'].fillna('NULL')

#Split out Player Name and store in a temporary dataframe
temp_df = salary_df['Player'].str.split(' ', n=1, expand=True)

#add first name and last name to original salary dataframe
salary_df['First_Name'] = temp_df[0]
salary_df['Last_Name'] = temp_df[1]

#remove '$' and ',' characters from Salary 
salary_df['Salary'] = salary_df['Salary'].str.replace('$','')
salary_df['Salary'] = salary_df['Salary'].str.replace(',','')

#convert salary datatype to int
salary_df['Salary'] = salary_df['Salary'].astype(int)

#change abbreviations to official abbreviations
salary_df = salary_df.replace({'TAM':'TB', 'OAK': 'LV', 'NOR': 'NO', 'SFO': 'SF', 
                                'GNB': 'GB', 'KAN': 'KC', 'NWE': 'NE'})

#create unique ID using player name and position
salary_df['uid'] = salary_df['Player'] + salary_df['Player_Position']

#rearrange columns to finish cleaning the salary dataframe
columns = ['Salary_Rank', 'uid','Player', 'First_Name', 'Last_Name', 'Player_Position', 'Abrv', 'Salary']
salary_df = salary_df[columns]

salary_df.head()

Unnamed: 0,Salary_Rank,uid,Player,First_Name,Last_Name,Player_Position,Abrv,Salary
0,1.0,Kirk CousinsQB,Kirk Cousins,Kirk,Cousins,QB,MIN,27500000
1,2.0,Marcus MariotaQB,Marcus Mariota,Marcus,Mariota,QB,TEN,20922000
2,2.0,Jameis WinstonQB,Jameis Winston,Jameis,Winston,QB,TB,20922000
3,4.0,Derek CarrQB,Derek Carr,Derek,Carr,QB,LV,19900000
4,5.0,Joe FlaccoQB,Joe Flacco,Joe,Flacco,QB,DEN,18500000


# Pull NFL Abbreviation Mapping from Wikipedia

In [7]:
tables = pd.read_html(nfl_wiki_url)

#create dataframe based on 1st table
nfl_mapping = tables[0]

#assign 1st row as headers
nfl_mapping.columns = nfl_mapping.iloc[0]
nfl_mapping = nfl_mapping[1:]

#rename abbreviation columns
nfl_mapping = nfl_mapping.rename(columns={'Abbreviation/Acronym':'Abrv', 'Franchise':'Team'})

nfl_mapping.head()

Unnamed: 0,Abrv,Team
1,ARI,Arizona Cardinals
2,ATL,Atlanta Falcons
3,BAL,Baltimore Ravens
4,BUF,Buffalo Bills
5,CAR,Carolina Panthers


## Merge Full NFL Name to Salary Table

In [8]:
#merge salary df with nfl mapping
salary_df = pd.merge(salary_df, nfl_mapping, on = 'Abrv', how='left')

In [9]:
#rearrange columns
columns = ['Salary_Rank', 'uid', 'Player', 'First_Name', 'Last_Name', 'Player_Position', 'Abrv', 'Team', 'Salary']
salary_df = salary_df[columns]

#sort df by rank
salary_df = salary_df.sort_values(by=['Salary_Rank'])

#make columns lowercase
salary_df.columns = salary_df.columns.str.lower()

salary_df.head()

Unnamed: 0,salary_rank,uid,player,first_name,last_name,player_position,abrv,team,salary
0,1.0,Kirk CousinsQB,Kirk Cousins,Kirk,Cousins,QB,MIN,Minnesota Vikings,27500000
1,2.0,Marcus MariotaQB,Marcus Mariota,Marcus,Mariota,QB,TEN,Tennessee Titans,20922000
2,2.0,Jameis WinstonQB,Jameis Winston,Jameis,Winston,QB,TB,Tampa Bay Buccaneers,20922000
3,4.0,Derek CarrQB,Derek Carr,Derek,Carr,QB,LV,Las Vegas Raiders,19900000
4,5.0,Joe FlaccoQB,Joe Flacco,Joe,Flacco,QB,DEN,Denver Broncos,18500000


In [None]:
#save csv just for reference
salary_df.to_csv('salaries.csv', index=False)
salary_df

# NFL Player Stats from 2019

In [10]:
# Assign URL
url = 'https://www.pro-football-reference.com/years/2019/scrimmage.htm'

In [11]:
# Read Table from URL
tables = pd.read_html(url, header=[1])
print(len(tables))

1


In [12]:
# Find Correct Table and Assign Variable
stats2019 = tables[0]

stats2019

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Yds,...,1D.1,Lng.1,Y/A,Y/G.1,A/G,Touch,Y/Tch,YScm,RRTD,Fmb
0,1,Christian McCaffrey*+,CAR,23,RB,16,16,142,116,1005,...,57,84,4.8,86.7,17.9,403,5.9,2392,19,1
1,2,Ezekiel Elliott*,DAL,24,RB,16,16,71,54,420,...,78,33,4.5,84.8,18.8,355,5.0,1777,14,3
2,3,Nick Chubb*,CLE,24,RB,16,16,49,36,278,...,62,88,5.0,93.4,18.6,334,5.3,1772,8,3
3,4,Derrick Henry*,TEN,25,RB,15,15,24,18,206,...,73,74,5.1,102.7,20.2,321,5.4,1746,18,5
4,5,Michael Thomas*+,NOR,26,WR,16,15,185,149,1725,...,0,-9,-9.0,-0.6,0.1,150,11.4,1716,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,552,Jason Sanders,MIA,24,K,16,0,1,1,1,...,0,,,,,1,1.0,1,1,0
570,553,Darrius Shepherd,GNB,24,,6,0,2,1,1,...,0,,,,,1,1.0,1,0,1
571,554,Eric Tomlinson,3TM,27,,8,3,1,1,1,...,0,,,,,1,1.0,1,0,0
572,555,Vita Vea,TAM,24,DL/dt,16,16,1,1,1,...,0,,,,,1,1.0,1,1,0


# Cleaning the NFL Player Stats Data

In [13]:
# Remove special characters from player names
stats2019['Player'] = stats2019['Player'].map(lambda x: x.rstrip('+*'))

In [14]:
# Fix Team Abreviations
stats2019 = stats2019.replace({'TAM':'TB', 'OAK': 'LV', 'NOR': 'NO', 'SFO': 'SF', 
                                'GNB': 'GB', 'KAN': 'KC', 'NWE': 'NE'})
stats2019

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Yds,...,1D.1,Lng.1,Y/A,Y/G.1,A/G,Touch,Y/Tch,YScm,RRTD,Fmb
0,1,Christian McCaffrey,CAR,23,RB,16,16,142,116,1005,...,57,84,4.8,86.7,17.9,403,5.9,2392,19,1
1,2,Ezekiel Elliott,DAL,24,RB,16,16,71,54,420,...,78,33,4.5,84.8,18.8,355,5.0,1777,14,3
2,3,Nick Chubb,CLE,24,RB,16,16,49,36,278,...,62,88,5.0,93.4,18.6,334,5.3,1772,8,3
3,4,Derrick Henry,TEN,25,RB,15,15,24,18,206,...,73,74,5.1,102.7,20.2,321,5.4,1746,18,5
4,5,Michael Thomas,NO,26,WR,16,15,185,149,1725,...,0,-9,-9.0,-0.6,0.1,150,11.4,1716,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,552,Jason Sanders,MIA,24,K,16,0,1,1,1,...,0,,,,,1,1.0,1,1,0
570,553,Darrius Shepherd,GB,24,,6,0,2,1,1,...,0,,,,,1,1.0,1,0,1
571,554,Eric Tomlinson,3TM,27,,8,3,1,1,1,...,0,,,,,1,1.0,1,0,0
572,555,Vita Vea,TB,24,DL/dt,16,16,1,1,1,...,0,,,,,1,1.0,1,1,0


In [15]:
# Pick Relevant Columns
stats2019 = stats2019[['Player', 'Tm', 'Age', 'Pos', 'G', 'Touch', 'Y/Tch', 'YScm', 'RRTD', 'Fmb', 'Rec']]

stats2019

Unnamed: 0,Player,Tm,Age,Pos,G,Touch,Y/Tch,YScm,RRTD,Fmb,Rec
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149
...,...,...,...,...,...,...,...,...,...,...,...
569,Jason Sanders,MIA,24,K,16,1,1.0,1,1,0,1
570,Darrius Shepherd,GB,24,,6,1,1.0,1,0,1,1
571,Eric Tomlinson,3TM,27,,8,1,1.0,1,0,0,1
572,Vita Vea,TB,24,DL/dt,16,1,1.0,1,1,0,1


In [16]:
# Fix Column Names
stats2019 = stats2019.rename(columns={'Tm':'Team', 'Pos': 'Position', 'G': 'Games', 'Y/Tch': 'Yards/Touch', 
                                'YScm': 'Total_Yards', 'RRTD': 'Touchdowns', 'Fmb': 'Fumbles', 'Rec':'Receptions', 'Touch': 'Touches'})

In [17]:
# Remove secondary column names rows
stats2019 = stats2019.loc[stats2019['Total_Yards'] != 'YScm']

stats2019

Unnamed: 0,Player,Team,Age,Position,Games,Touches,Yards/Touch,Total_Yards,Touchdowns,Fumbles,Receptions
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149
...,...,...,...,...,...,...,...,...,...,...,...
569,Jason Sanders,MIA,24,K,16,1,1.0,1,1,0,1
570,Darrius Shepherd,GB,24,,6,1,1.0,1,0,1,1
571,Eric Tomlinson,3TM,27,,8,1,1.0,1,0,0,1
572,Vita Vea,TB,24,DL/dt,16,1,1.0,1,1,0,1


In [18]:
# Convert Data Types to Integers and Float
num_cols = ['Age', 'Games', 'Touches', 'Total_Yards', 'Touchdowns', 'Fumbles', 'Receptions']

for col in num_cols:
    stats2019[f'{col}'] = stats2019[f'{col}'].astype(str).astype(int)

stats2019['Yards/Touch'] = stats2019['Yards/Touch'].astype(str).astype(float)

stats2019.dtypes


Player          object
Team            object
Age              int32
Position        object
Games            int32
Touches          int32
Yards/Touch    float64
Total_Yards      int32
Touchdowns       int32
Fumbles          int32
Receptions       int32
dtype: object

In [19]:
# Print Header of Final Database
stats2019.head()

Unnamed: 0,Player,Team,Age,Position,Games,Touches,Yards/Touch,Total_Yards,Touchdowns,Fumbles,Receptions
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149


# Extract 2019 NFL season standings using Pandas

In [20]:
# Pull seaonson standings table data using Pandas
url = 'http://www.playoffstatus.com/nfl/nflpostseasonprob.html'
tables = pd.read_html(url, header=[1])

In [21]:
# Store correct table in variable
nfl_2019_standings_df = tables[0]
nfl_2019_standings_df

Unnamed: 0,Team,Conference Conf,W,L,T,Super Bowl Winner,Super Bowl,Conference Championship Conf Game,Round 2,Round 1
0,Chiefs,AFC,12,4,0,100%,^,^,^,^
1,Forty-Niners49'ers,NFC,13,3,0,X,100%,^,^,^
2,Packers,NFC,13,3,0,X,X,100%,^,^
3,Titans,AFC,9,7,0,X,X,100%,^,^
4,Ravens,AFC,14,2,0,X,X,X,100%,^
5,Seahawks,NFC,11,5,0,X,X,X,100%,^
6,Texans,AFC,10,6,0,X,X,X,100%,^
7,Vikings,NFC,10,6,0,X,X,X,100%,^
8,Saints,NFC,13,3,0,X,X,X,X,100%
9,Patriots,AFC,12,4,0,X,X,X,X,100%


In [22]:
# Rename columns correct 49ers name
nfl_2019_standings_df = nfl_2019_standings_df[['Team','Conference Conf','W','L','T']]
nfl_2019_standings_df = nfl_2019_standings_df.rename(columns={"Conference Conf": "Conference", "W": "Wins", "L":"Losses","T":"Ties"})
nfl_2019_standings_df.at[1,'Team'] = '49ers'
nfl_2019_standings_df

Unnamed: 0,Team,Conference,Wins,Losses,Ties
0,Chiefs,AFC,12,4,0
1,49ers,NFC,13,3,0
2,Packers,NFC,13,3,0
3,Titans,AFC,9,7,0
4,Ravens,AFC,14,2,0
5,Seahawks,NFC,11,5,0
6,Texans,AFC,10,6,0
7,Vikings,NFC,10,6,0
8,Saints,NFC,13,3,0
9,Patriots,AFC,12,4,0


In [23]:
# Pull NFL team abbreviation data using pandas
url = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Football_League/National_Football_League_team_abbreviations'
tables = pd.read_html(url)

In [24]:
# Clean up abbreviation table 
nfl_abbrev_df = tables[0]
nfl_abbrev_df = nfl_abbrev_df.drop(0)
nfl_abbrev_df = nfl_abbrev_df.rename(columns={0: "Abbreviation", 1: "Team"})
nfl_abbrev_df

Unnamed: 0,Abbreviation,Team
1,ARI,Arizona Cardinals
2,ATL,Atlanta Falcons
3,BAL,Baltimore Ravens
4,BUF,Buffalo Bills
5,CAR,Carolina Panthers
6,CHI,Chicago Bears
7,CIN,Cincinnati Bengals
8,CLE,Cleveland Browns
9,DAL,Dallas Cowboys
10,DEN,Denver Broncos


In [25]:
# Split and get last part of team name
nfl_abbrev_df["Team"] = nfl_abbrev_df["Team"].str.split().str[-1]
nfl_abbrev_df.head()

Unnamed: 0,Abbreviation,Team
1,ARI,Cardinals
2,ATL,Falcons
3,BAL,Ravens
4,BUF,Bills
5,CAR,Panthers


In [26]:
# Merge season standings table and abbreviation table on team name
nfl_2019_standings_df = pd.merge(nfl_2019_standings_df, nfl_abbrev_df, how='inner', on='Team')
nfl_2019_standings_df.head()

Unnamed: 0,Team,Conference,Wins,Losses,Ties,Abbreviation
0,Chiefs,AFC,12,4,0,KC
1,49ers,NFC,13,3,0,SF
2,Packers,NFC,13,3,0,GB
3,Titans,AFC,9,7,0,TEN
4,Ravens,AFC,14,2,0,BAL


# Connect and Load Data to Postgres Database

Player Salary Data

In [None]:
#create postgres connection string
rds_connection_string = f"postgres:{password}@localhost:{host}/NFL"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
#view engine table names
engine.table_names()

In [None]:
#load salary df to salaries table
salary_df.to_sql(name='salaries', con=engine, if_exists='append', index=False)

In [None]:
#preview data loaded from table
pd.read_sql_query('select * from salaries limit 10', con=engine).head()

Player Stats Data

In [28]:
# Connect to Postgres and Start Engine
connection_string = f"postgres:{password}@localhost:5432/NFL"
engine = create_engine(f'postgresql://{connection_string}')

In [29]:
# Print Table Names in Database
engine.table_names()

['salaries', 'nfl_2019_standings', 'player_stats']

In [30]:
# Fill Table
stats2019.to_sql(name='player_stats', con=engine, if_exists='replace', index=False)

In [31]:
# Print Table
pd.read_sql_query('select * from player_stats limit 10', con=engine)

Unnamed: 0,Player,Team,Age,Position,Games,Touches,Yards/Touch,Total_Yards,Touchdowns,Fumbles,Receptions
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149
5,Leonard Fournette,JAX,24,RB,15,341,4.9,1674,3,1,76
6,Dalvin Cook,MIN,24,RB,14,303,5.5,1654,13,4,53
7,Aaron Jones,GB,25,RB,16,285,5.5,1558,19,3,49
8,Austin Ekeler,LAC,24,RB,16,224,6.9,1550,11,3,92
9,Chris Carson,SEA,25,RB,15,315,4.7,1496,9,7,37


In [32]:
new_customer_data_df.to_sql(name='nfl_2019_standings', con=engine, if_exists='append', index=False)

NameError: name 'new_customer_data_df' is not defined