In [6]:
#import dependencies
from urllib.parse import quote

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import password, host

ModuleNotFoundError: No module named 'config'

# Scraping NFL Salary Data using Pandas

In [None]:
#start a headless Chrome session driven by the chromedriver binary named below
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser(
    'chrome',
    headless=True,
    **executable_path,
)

In [None]:
#page holding the player salary table
salary_url = (
    'https://www.pro-football-reference.com'
    '/players/salary.htm'
)

#wikipedia page holding the team abbreviation table
nfl_wiki_url = (
    'https://en.wikipedia.org/wiki/'
    'Wikipedia:WikiProject_National_Football_League/'
    'National_Football_League_team_abbreviations'
)

In [None]:
#parse every html table found at the salary url into a list of dataframes
tables = pd.read_html(io=salary_url)
tables

In [None]:
#keep the first table returned by read_html as the salary dataframe
salary_df = tables[0]
#preview the first rows
salary_df.head()

## Cleaning the Salary DataFrame

In [None]:
#rename the scraped headers to the column names used for the rest of the notebook
salary_df = salary_df.rename(columns={'Rk': 'Salary_Rank', 'Pos':'Player_Position', 'Tm': 'Abrv' })

#'Rank' column currently contains blank values when salaries are tied
#forward-fill so each blank rank takes the value directly above it
#(.ffill() replaces the deprecated fillna(method='ffill') spelling)
salary_df['Salary_Rank'] = salary_df['Salary_Rank'].ffill()

#fill blank positions with 'NULL' text string
salary_df['Player_Position'] = salary_df['Player_Position'].fillna('NULL')

#split Player on the first space only (n=1), so everything after the first name
#stays together in the second column of the temporary dataframe
temp_df = salary_df['Player'].str.split(' ', n=1, expand=True)

#add first name and last name to original salary dataframe
salary_df['First_Name'] = temp_df[0]
salary_df['Last_Name'] = temp_df[1]

#remove '$' and ',' characters from Salary, then convert to int
#regex=False makes both patterns literal text on every pandas version, so '$' can
#never be interpreted as the regex end-of-string anchor
salary_df['Salary'] = (
    salary_df['Salary']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

#change abbreviations to official abbreviations
#only the 'Abrv' column is rewritten; a whole-frame replace would also change any
#other cell that happened to equal one of these codes
official_abbreviations = {'TAM':'TB', 'OAK': 'LV', 'NOR': 'NO', 'SFO': 'SF',
                          'GNB': 'GB', 'KAN': 'KC', 'NWE': 'NE'}
salary_df['Abrv'] = salary_df['Abrv'].replace(official_abbreviations)

#create unique ID using player name and position
salary_df['uid'] = salary_df['Player'] + salary_df['Player_Position']

#rearrange columns to finish cleaning the salary dataframe
columns = ['Salary_Rank', 'uid','Player', 'First_Name', 'Last_Name', 'Player_Position', 'Abrv', 'Salary']
salary_df = salary_df[columns]

salary_df.head()

# Pull NFL Abbreviation Mapping from Wikipedia

In [5]:
#parse the html tables on the abbreviation page
tables = pd.read_html(io=nfl_wiki_url)

#work with the first table on the page
nfl_mapping = tables[0]

#promote row 0 to the column headers, then drop that row from the data
nfl_mapping.columns = nfl_mapping.iloc[0]
nfl_mapping = nfl_mapping.iloc[1:]

#rename the two headers to 'Abrv' and 'Team'
abbreviation_headers = {'Abbreviation/Acronym':'Abrv', 'Franchise':'Team'}
nfl_mapping = nfl_mapping.rename(columns=abbreviation_headers)

nfl_mapping.head()

NameError: name 'nfl_wiki_url' is not defined

## Merge Full NFL Name to Salary Table

In [None]:
#left join on 'Abrv': every salary row is kept and gains the mapping columns where the code matches
salary_df = salary_df.merge(nfl_mapping, how='left', on='Abrv')

In [None]:
#final column order for the salary dataframe
columns = ['Salary_Rank', 'uid', 'Player', 'First_Name', 'Last_Name', 'Player_Position', 'Abrv', 'Team', 'Salary']

#select the columns in that order and sort the rows by rank
salary_df = salary_df[columns].sort_values(by=['Salary_Rank'])

#lowercase every column name
salary_df.columns = salary_df.columns.str.lower()

salary_df.head()

In [None]:
#write a reference copy of the salary data to disk, without the row index
salary_csv_path = 'salaries.csv'
salary_df.to_csv(salary_csv_path, index=False)
salary_df

# NFL Player Stats from 2019

In [None]:
# URL of the 2019 Scrimmage Page
url = (
    'https://www.pro-football-reference.com'
    '/years/2019/scrimmage.htm'
)

In [None]:
# Parse the Tables at the URL, Using Row 1 (Zero-Based) as the Column Header
tables = pd.read_html(url, header=[1])

# Report How Many Tables Were Found
table_count = len(tables)
print(table_count)

In [None]:
# Keep the First Parsed Table as the 2019 Stats DataFrame
stats2019 = tables[0]

# Display the DataFrame
stats2019

# Cleaning the NFL Player Stats Data

In [None]:
# Remove trailing '+' and '*' characters from player names
# The vectorised .str accessor passes missing values through as NaN, whereas calling
# x.rstrip inside a lambda raises AttributeError on any non-string entry
stats2019['Player'] = stats2019['Player'].str.rstrip('+*')

In [None]:
# Fix Team Abbreviations
# Only the 'Tm' column is rewritten; a whole-frame replace would also change any
# other cell that happened to equal one of these codes
team_code_fixes = {'TAM':'TB', 'OAK': 'LV', 'NOR': 'NO', 'SFO': 'SF',
                   'GNB': 'GB', 'KAN': 'KC', 'NWE': 'NE'}
stats2019 = stats2019.assign(Tm=stats2019['Tm'].replace(team_code_fixes))
stats2019

In [None]:
# Keep Only the Columns Listed Below, in This Order
relevant_cols = [
    'Player', 'Tm', 'Age', 'Pos', 'G',
    'Touch', 'Y/Tch', 'YScm', 'RRTD', 'Fmb', 'Rec',
]
stats2019 = stats2019[relevant_cols]

stats2019

In [None]:
# Map the Scraped Short Headers to Descriptive Column Names
readable_names = {
    'Tm': 'Team',
    'Pos': 'Position',
    'G': 'Games',
    'Touch': 'Touches',
    'Y/Tch': 'Yards/Touch',
    'YScm': 'Total_Yards',
    'RRTD': 'Touchdowns',
    'Fmb': 'Fumbles',
    'Rec': 'Receptions',
}
stats2019 = stats2019.rename(columns=readable_names)

In [None]:
# Drop Rows Whose Total_Yards Cell Holds the Header Text 'YScm'
is_data_row = stats2019['Total_Yards'].ne('YScm')
stats2019 = stats2019[is_data_row]

stats2019

In [None]:
# Convert Data Types to Integers and Float
num_cols = ['Age', 'Games', 'Touches', 'Total_Yards', 'Touchdowns', 'Fumbles', 'Receptions']

# Build One Column -> dtype Mapping and Cast in a Single astype Call
# astype returns a new frame, so nothing is assigned into the filtered slice produced
# by the earlier steps (per-column assignment there can trigger SettingWithCopyWarning)
dtype_map = {col: int for col in num_cols}
dtype_map['Yards/Touch'] = float
stats2019 = stats2019.astype(dtype_map)

stats2019.dtypes


In [None]:
# Show the First Rows of the Cleaned Stats DataFrame
stats2019.head()

# Extract 2019 NFL season standings using Pandas

In [23]:
# Pull season standings table data using pandas
# Row 1 (zero-based) of each table is used as the column header
url = 'http://www.playoffstatus.com/nfl/nflpostseasonprob.html'
tables = pd.read_html(io=url, header=[1])

In [24]:
# Keep the first parsed table as the standings dataframe and display it
nfl_2019_standings_df = tables[0]
nfl_2019_standings_df

Unnamed: 0,Team,Conference Conf,W,L,T,Super Bowl Winner,Super Bowl,Conference Championship Conf Game,Round 2,Round 1
0,Chiefs,AFC,12,4,0,100%,^,^,^,^
1,Forty-Niners49'ers,NFC,13,3,0,X,100%,^,^,^
2,Packers,NFC,13,3,0,X,X,100%,^,^
3,Titans,AFC,9,7,0,X,X,100%,^,^
4,Ravens,AFC,14,2,0,X,X,X,100%,^
5,Seahawks,NFC,11,5,0,X,X,X,100%,^
6,Texans,AFC,10,6,0,X,X,X,100%,^
7,Vikings,NFC,10,6,0,X,X,X,100%,^
8,Saints,NFC,13,3,0,X,X,X,X,100%
9,Patriots,AFC,12,4,0,X,X,X,X,100%


In [25]:
# Keep the team, conference and win/loss/tie columns and give them readable names
nfl_2019_standings_df = nfl_2019_standings_df[['Team','Conference Conf','W','L','T']]
nfl_2019_standings_df = nfl_2019_standings_df.rename(columns={"Conference Conf": "Conference", "W": "Wins", "L":"Losses","T":"Ties"})

# Correct the 49ers name by matching the scraped text instead of a fixed row label:
# the row's position depends on the standings order on the page, and writing through
# .at on a column subset can trigger SettingWithCopyWarning
nfl_2019_standings_df = nfl_2019_standings_df.assign(
    Team=nfl_2019_standings_df['Team'].replace({"Forty-Niners49'ers": '49ers'})
)
nfl_2019_standings_df.head()

Unnamed: 0,Team,Conference,Wins,Losses,Ties
0,Chiefs,AFC,12,4,0
1,49ers,NFC,13,3,0
2,Packers,NFC,13,3,0
3,Titans,AFC,9,7,0
4,Ravens,AFC,14,2,0


In [26]:
# Parse the html tables on the Wikipedia team abbreviation page
url = (
    'https://en.wikipedia.org/wiki/'
    'Wikipedia:WikiProject_National_Football_League/'
    'National_Football_League_team_abbreviations'
)
tables = pd.read_html(io=url)

In [27]:
# Take the first table, drop the row labelled 0 and name the two columns
nfl_abbrev_df = (
    tables[0]
    .drop(index=0)
    .rename(columns={0: "Abbreviation", 1: "Team"})
)
nfl_abbrev_df.head()

Unnamed: 0,Abbreviation,Team
1,ARI,Arizona Cardinals
2,ATL,Atlanta Falcons
3,BAL,Baltimore Ravens
4,BUF,Buffalo Bills
5,CAR,Carolina Panthers


In [28]:
# Split each team name on whitespace and keep only its final word
team_words = nfl_abbrev_df["Team"].str.split()
nfl_abbrev_df["Team"] = team_words.str.get(-1)
nfl_abbrev_df.head()

Unnamed: 0,Abbreviation,Team
1,ARI,Cardinals
2,ATL,Falcons
3,BAL,Ravens
4,BUF,Bills
5,CAR,Panthers


In [29]:
# Inner join on 'Team': only teams present in both tables are kept
nfl_2019_standings_df = nfl_2019_standings_df.merge(nfl_abbrev_df, on='Team', how='inner')

# Lowercase every column name
nfl_2019_standings_df.columns = nfl_2019_standings_df.columns.str.lower()

In [32]:
# Preview the merged standings dataframe
nfl_2019_standings_df.head()

Unnamed: 0,team,conference,wins,losses,ties,abbreviation
0,Chiefs,AFC,12,4,0,KC
1,49ers,NFC,13,3,0,SF
2,Packers,NFC,13,3,0,GB
3,Titans,AFC,9,7,0,TEN
4,Ravens,AFC,14,2,0,BAL


# Connect and Load Data to Postgres Database

## Player Salary Data

In [None]:
#create postgres connection string
#the password is percent-encoded so characters such as '@', ':' or '/' cannot break URL parsing
#NOTE(review): `host` from config is interpolated where the port belongs (after 'localhost:');
#confirm config.host really holds the port number - the stats section hard-codes 5432 there
rds_connection_string = f"postgres:{quote(str(password), safe='')}@localhost:{host}/NFL"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
#view engine table names
#Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector is the supported API
inspect(engine).get_table_names()

In [None]:
#append the salary rows to the 'salaries' table, leaving the dataframe index out
salary_df.to_sql(
    name='salaries',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
#read back up to ten rows from the salaries table and show the first five
salaries_preview = pd.read_sql_query('select * from salaries limit 10', con=engine)
salaries_preview.head()

## Player Stats Data

In [None]:
# Connect to Postgres and Start Engine
# The password is percent-encoded so characters such as '@', ':' or '/' cannot break URL parsing
connection_string = f"postgres:{quote(str(password), safe='')}@localhost:5432/NFL"
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Print Table Names in Database
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector is the supported API
inspect(engine).get_table_names()

In [None]:
# Write the Stats to the 'player_stats' Table, Replacing Any Existing Table of That Name
stats2019.to_sql(
    name='player_stats',
    con=engine,
    index=False,
    if_exists='replace',
)

In [None]:
# Read Back up to Ten Rows from the player_stats Table
player_stats_preview = pd.read_sql_query('select * from player_stats limit 10', con=engine)
player_stats_preview

In [None]:
# Append the standings rows to the 'nfl_2019_standings' table, leaving the dataframe index out
nfl_2019_standings_df.to_sql(
    name='nfl_2019_standings',
    con=engine,
    index=False,
    if_exists='append',
)