In [2]:
#import dependencies
from urllib.parse import quote_plus

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from sqlalchemy import create_engine, inspect

from config import password, host

## Scraping NFL Salary Data using Pandas

In [3]:
#start a headless Chrome session driven by the local chromedriver binary
#NOTE(review): none of the cells shown here use `browser` or call browser.quit() - confirm it is still needed
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', headless=True, **executable_path)

In [4]:
#page holding the player salary table
salary_url = "https://www.pro-football-reference.com/players/salary.htm"
#wiki page holding the team abbreviation -> franchise name table
nfl_wiki_url = "https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Football_League/National_Football_League_team_abbreviations"

In [5]:
#parse every HTML table found on the salary page into a list of dataframes
tables = pd.read_html(io=salary_url)
tables

[          Rk              Player Pos   Tm       Salary
 0        1.0        Kirk Cousins  QB  MIN  $27,500,000
 1        2.0      Marcus Mariota  QB  TEN  $20,922,000
 2        NaN      Jameis Winston  QB  TAM  $20,922,000
 3        4.0          Derek Carr  QB  OAK  $19,900,000
 4        5.0          Joe Flacco  QB  DEN  $18,500,000
 ...      ...                 ...  ..  ...          ...
 1909     NaN       Michael Dogbe  DL  ARI     $495,000
 1910     NaN  Ross Pierschbacher  OL  WAS     $495,000
 1911  1912.0        Alex Redmond   G  CIN     $493,236
 1912  1913.0         Holton Hill  CB  MIN     $435,882
 1913  1914.0      Tyrone Swoopes  TE  SEA     $378,000
 
 [1914 rows x 5 columns]]

In [6]:
#the salary listing is the first table parsed from the page
salary_df = tables[0]
salary_df.head(n=5)

Unnamed: 0,Rk,Player,Pos,Tm,Salary
0,1.0,Kirk Cousins,QB,MIN,"$27,500,000"
1,2.0,Marcus Mariota,QB,TEN,"$20,922,000"
2,,Jameis Winston,QB,TAM,"$20,922,000"
3,4.0,Derek Carr,QB,OAK,"$19,900,000"
4,5.0,Joe Flacco,QB,DEN,"$18,500,000"


## Cleaning the Salary DataFrame

In [7]:
#rename columns
salary_df = salary_df.rename(columns={'Rk': 'Salary_Rank', 'Pos': 'Player_Position', 'Tm': 'Abrv'})

#'Rank' column currently contains blank values when salaries are tied
#forward-fill so a tied player takes the rank of the row directly above
#(.ffill() is used because fillna(method='ffill') is deprecated in pandas 2.1+)
salary_df['Salary_Rank'] = salary_df['Salary_Rank'].ffill()

#fill blank positions with 'NULL' text string
salary_df['Player_Position'] = salary_df['Player_Position'].fillna('NULL')

#split the player name on the first space only and store in a temporary dataframe:
#column 0 is the first name, column 1 is everything after it
temp_df = salary_df['Player'].str.split(' ', n=1, expand=True)

#add first name and last name to original salary dataframe
salary_df['First_Name'] = temp_df[0]
salary_df['Last_Name'] = temp_df[1]

#remove '$' and ',' characters from Salary, then convert the datatype to int
#regex=False forces a literal match: '$' is a regex anchor and the default of
#the `regex` argument differs between pandas versions
salary_df['Salary'] = (
    salary_df['Salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

#change abbreviations to official abbreviations
#only the 'Abrv' column is remapped, so an identical value in any other column is left untouched
official_abrv = {'TAM': 'TB', 'OAK': 'LV', 'NOR': 'NO', 'SFO': 'SF',
                 'GNB': 'GB', 'KAN': 'KC', 'NWE': 'NE'}
salary_df['Abrv'] = salary_df['Abrv'].replace(official_abrv)

#create unique ID using player name and position
salary_df['uid'] = salary_df['Player'] + salary_df['Player_Position']

#rearrange columns to finish cleaning the salary dataframe
columns = ['Salary_Rank', 'uid', 'Player', 'First_Name', 'Last_Name', 'Player_Position', 'Abrv', 'Salary']
salary_df = salary_df[columns]

salary_df.head()

Unnamed: 0,Salary_Rank,uid,Player,First_Name,Last_Name,Player_Position,Abrv,Salary
0,1.0,Kirk CousinsQB,Kirk Cousins,Kirk,Cousins,QB,MIN,27500000
1,2.0,Marcus MariotaQB,Marcus Mariota,Marcus,Mariota,QB,TEN,20922000
2,2.0,Jameis WinstonQB,Jameis Winston,Jameis,Winston,QB,TB,20922000
3,4.0,Derek CarrQB,Derek Carr,Derek,Carr,QB,LV,19900000
4,5.0,Joe FlaccoQB,Joe Flacco,Joe,Flacco,QB,DEN,18500000


## Pull NFL Abbreviation Mapping from Wikipedia

In [8]:

#parse every table on the wiki page and keep the first one
tables = pd.read_html(nfl_wiki_url)
nfl_mapping = tables[0]

#promote the first row to column headers
nfl_mapping.columns = nfl_mapping.iloc[0]

#drop that header row from the body and rename the abbreviation / franchise columns
nfl_mapping = nfl_mapping.iloc[1:].rename(
    columns={'Abbreviation/Acronym':'Abrv', 'Franchise':'Team'}
)

nfl_mapping.head()

Unnamed: 0,Abrv,Team
1,ARI,Arizona Cardinals
2,ATL,Atlanta Falcons
3,BAL,Baltimore Ravens
4,BUF,Buffalo Bills
5,CAR,Carolina Panthers


## Merge Full NFL Name to Salary Table

In [9]:
#left-join the mapping onto the salary rows using the shared 'Abrv' column
salary_df = salary_df.merge(nfl_mapping, how='left', on='Abrv')

In [11]:
#column order for the finished table, with 'Team' placed next to its abbreviation
columns = ['Salary_Rank', 'uid', 'Player', 'First_Name', 'Last_Name', 'Player_Position', 'Abrv', 'Team', 'Salary']

#select the columns in that order and sort the rows by rank in one chain
salary_df = salary_df.loc[:, columns].sort_values(by='Salary_Rank')

#make columns lowercase
salary_df.columns = salary_df.columns.str.lower()

salary_df.head()

Unnamed: 0,salary_rank,uid,player,first_name,last_name,player_position,abrv,team,salary
0,1.0,Kirk CousinsQB,Kirk Cousins,Kirk,Cousins,QB,MIN,Minnesota Vikings,27500000
1,2.0,Marcus MariotaQB,Marcus Mariota,Marcus,Mariota,QB,TEN,Tennessee Titans,20922000
2,2.0,Jameis WinstonQB,Jameis Winston,Jameis,Winston,QB,TB,Tampa Bay Buccaneers,20922000
3,4.0,Derek CarrQB,Derek Carr,Derek,Carr,QB,LV,Las Vegas Raiders,19900000
4,5.0,Joe FlaccoQB,Joe Flacco,Joe,Flacco,QB,DEN,Denver Broncos,18500000


In [12]:
#write a reference copy to salaries.csv without the dataframe index
salary_df.to_csv(path_or_buf='salaries.csv', index=False)
salary_df

Unnamed: 0,salary_rank,uid,player,first_name,last_name,player_position,abrv,team,salary
0,1.0,Kirk CousinsQB,Kirk Cousins,Kirk,Cousins,QB,MIN,Minnesota Vikings,27500000
1,2.0,Marcus MariotaQB,Marcus Mariota,Marcus,Mariota,QB,TEN,Tennessee Titans,20922000
2,2.0,Jameis WinstonQB,Jameis Winston,Jameis,Winston,QB,TB,Tampa Bay Buccaneers,20922000
3,4.0,Derek CarrQB,Derek Carr,Derek,Carr,QB,LV,Las Vegas Raiders,19900000
4,5.0,Joe FlaccoQB,Joe Flacco,Joe,Flacco,QB,DEN,Denver Broncos,18500000
...,...,...,...,...,...,...,...,...,...
1909,1584.0,Michael DogbeDL,Michael Dogbe,Michael,Dogbe,DL,ARI,Arizona Cardinals,495000
1901,1584.0,Austin BryantEDGE,Austin Bryant,Austin,Bryant,EDGE,DET,Detroit Lions,495000
1911,1912.0,Alex RedmondG,Alex Redmond,Alex,Redmond,G,CIN,Cincinnati Bengals,493236
1912,1913.0,Holton HillCB,Holton Hill,Holton,Hill,CB,MIN,Minnesota Vikings,435882


## Connect and Load Data to Postgres Database

In [13]:
#create postgres connection string
#quote_plus escapes characters such as '@', ':' or '/' in the password that would
#otherwise be parsed as URL delimiters and corrupt the connection string
#NOTE(review): `host` from config is interpolated in the port position after a
#hard-coded 'localhost' - confirm it actually holds the port number
rds_connection_string = f"postgres:{quote_plus(str(password))}@localhost:{host}/NFL"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [14]:
#view the table names visible through the engine
#Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
#the inspector returns the same list of names
inspect(engine).get_table_names()

['salaries']

In [15]:
#append the dataframe rows to the 'salaries' table, leaving the dataframe index out
salary_df.to_sql(
    name='salaries',
    con=engine,
    index=False,
    if_exists='append',
)

In [16]:
#read a few rows back from the salaries table and show the first rows
preview_query = 'select * from salaries limit 10'
loaded_preview = pd.read_sql_query(preview_query, con=engine)
loaded_preview.head()

Unnamed: 0,salary_rank,uid,player,first_name,last_name,player_position,abrv,team,salary
0,1,Kirk CousinsQB,Kirk Cousins,Kirk,Cousins,QB,MIN,Minnesota Vikings,27500000
1,2,Marcus MariotaQB,Marcus Mariota,Marcus,Mariota,QB,TEN,Tennessee Titans,20922000
2,2,Jameis WinstonQB,Jameis Winston,Jameis,Winston,QB,TB,Tampa Bay Buccaneers,20922000
3,4,Derek CarrQB,Derek Carr,Derek,Carr,QB,LV,Las Vegas Raiders,19900000
4,5,Joe FlaccoQB,Joe Flacco,Joe,Flacco,QB,DEN,Denver Broncos,18500000
