In [1]:
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import pswd

# Scrape NFL Player Stats from 2019

In [2]:
# Pro Football Reference page for the 2019 scrimmage stats
url = (
    'https://www.pro-football-reference.com'
    '/years/2019/scrimmage.htm'
)

In [3]:
# Parse every HTML table found at `url` into a list of DataFrames.
# header=[1] builds the column labels from row index 1 of the table header
# (presumably skipping a grouped heading row above it -- confirm against the page).
tables = pd.read_html(io=url, header=[1])
# Show how many tables were parsed so the right one can be picked below
print(len(tables))

1


In [4]:
# Find Correct Table and Assign Variable
# Only one table was parsed (see the count printed above), so take the first entry.
# Work on a copy: later cells assign into stats2019, and without the copy those
# assignments would also modify the raw frame still held in `tables`.
stats2019 = tables[0].copy()

stats2019

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Yds,...,1D.1,Lng.1,Y/A,Y/G.1,A/G,Touch,Y/Tch,YScm,RRTD,Fmb
0,1,Christian McCaffrey*+,CAR,23,RB,16,16,142,116,1005,...,57,84,4.8,86.7,17.9,403,5.9,2392,19,1
1,2,Ezekiel Elliott*,DAL,24,RB,16,16,71,54,420,...,78,33,4.5,84.8,18.8,355,5.0,1777,14,3
2,3,Nick Chubb*,CLE,24,RB,16,16,49,36,278,...,62,88,5.0,93.4,18.6,334,5.3,1772,8,3
3,4,Derrick Henry*,TEN,25,RB,15,15,24,18,206,...,73,74,5.1,102.7,20.2,321,5.4,1746,18,5
4,5,Michael Thomas*+,NOR,26,WR,16,15,185,149,1725,...,0,-9,-9.0,-0.6,0.1,150,11.4,1716,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,552,Jason Sanders,MIA,24,K,16,0,1,1,1,...,0,,,,,1,1.0,1,1,0
570,553,Darrius Shepherd,GNB,24,,6,0,2,1,1,...,0,,,,,1,1.0,1,0,1
571,554,Eric Tomlinson,3TM,27,,8,3,1,1,1,...,0,,,,,1,1.0,1,0,0
572,555,Vita Vea,TAM,24,DL/dt,16,16,1,1,1,...,0,,,,,1,1.0,1,1,0


# Cleaning the Data

In [5]:
# Remove special characters from player names
# Strip the trailing '*' / '+' markers with the vectorized string accessor; unlike a
# per-element lambda it leaves a missing name as NaN instead of raising AttributeError.
stats2019['Player'] = stats2019['Player'].str.rstrip('+*')

In [6]:
# Fix Team Abbreviations
# Map the site's three-letter team codes to two-letter codes.
# The nested {column: {old: new}} form limits the replacement to the 'Tm' column;
# a flat mapping would rewrite matching values in every column of the frame.
# NOTE(review): 'OAK' is mapped to 'LV' even though this is 2019 data -- presumably to
# line up with the team codes used by the rest of the project; confirm.
team_codes = {'TAM': 'TB', 'OAK': 'LV', 'NOR': 'NO', 'SFO': 'SF',
              'GNB': 'GB', 'KAN': 'KC', 'NWE': 'NE'}
stats2019 = stats2019.replace({'Tm': team_codes})
stats2019

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Yds,...,1D.1,Lng.1,Y/A,Y/G.1,A/G,Touch,Y/Tch,YScm,RRTD,Fmb
0,1,Christian McCaffrey,CAR,23,RB,16,16,142,116,1005,...,57,84,4.8,86.7,17.9,403,5.9,2392,19,1
1,2,Ezekiel Elliott,DAL,24,RB,16,16,71,54,420,...,78,33,4.5,84.8,18.8,355,5.0,1777,14,3
2,3,Nick Chubb,CLE,24,RB,16,16,49,36,278,...,62,88,5.0,93.4,18.6,334,5.3,1772,8,3
3,4,Derrick Henry,TEN,25,RB,15,15,24,18,206,...,73,74,5.1,102.7,20.2,321,5.4,1746,18,5
4,5,Michael Thomas,NO,26,WR,16,15,185,149,1725,...,0,-9,-9.0,-0.6,0.1,150,11.4,1716,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,552,Jason Sanders,MIA,24,K,16,0,1,1,1,...,0,,,,,1,1.0,1,1,0
570,553,Darrius Shepherd,GB,24,,6,0,2,1,1,...,0,,,,,1,1.0,1,0,1
571,554,Eric Tomlinson,3TM,27,,8,3,1,1,1,...,0,,,,,1,1.0,1,0,0
572,555,Vita Vea,TB,24,DL/dt,16,16,1,1,1,...,0,,,,,1,1.0,1,1,0


In [7]:
# Keep only the player details and summary stats that are loaded into the database
relevant_columns = [
    'Player', 'Tm', 'Age', 'Pos', 'G',
    'Touch', 'Y/Tch', 'YScm', 'RRTD', 'Fmb', 'Rec',
]
stats2019 = stats2019.loc[:, relevant_columns]

stats2019

Unnamed: 0,Player,Tm,Age,Pos,G,Touch,Y/Tch,YScm,RRTD,Fmb,Rec
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149
...,...,...,...,...,...,...,...,...,...,...,...
569,Jason Sanders,MIA,24,K,16,1,1.0,1,1,0,1
570,Darrius Shepherd,GB,24,,6,1,1.0,1,0,1,1
571,Eric Tomlinson,3TM,27,,8,1,1.0,1,0,0,1
572,Vita Vea,TB,24,DL/dt,16,1,1.0,1,1,0,1


In [8]:
# Swap the site's terse column headers for descriptive names
readable_names = {
    'Tm': 'Team',
    'Pos': 'Position',
    'G': 'Games',
    'Touch': 'Touches',
    'Y/Tch': 'Yards/Touch',
    'YScm': 'Total_Yards',
    'RRTD': 'Touchdowns',
    'Fmb': 'Fumbles',
    'Rec': 'Receptions',
}
stats2019 = stats2019.rename(columns=readable_names)

In [9]:
# Drop the repeated header rows embedded in the table body. They are recognisable
# because their 'Total_Yards' cell still holds the original header text 'YScm'.
is_data_row = stats2019['Total_Yards'].ne('YScm')
stats2019 = stats2019[is_data_row]

stats2019

Unnamed: 0,Player,Team,Age,Position,Games,Touches,Yards/Touch,Total_Yards,Touchdowns,Fumbles,Receptions
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149
...,...,...,...,...,...,...,...,...,...,...,...
569,Jason Sanders,MIA,24,K,16,1,1.0,1,1,0,1
570,Darrius Shepherd,GB,24,,6,1,1.0,1,0,1,1
571,Eric Tomlinson,3TM,27,,8,1,1.0,1,0,0,1
572,Vita Vea,TB,24,DL/dt,16,1,1.0,1,1,0,1


In [10]:
# Convert Data Types to Integers and Float
# DataFrame.astype with a {column: dtype} mapping converts everything in one call and
# returns a new frame. Nothing is assigned column-by-column into the filtered frame
# from the previous step, which is what triggers pandas' SettingWithCopyWarning.
num_cols = ['Age', 'Games', 'Touches', 'Total_Yards', 'Touchdowns', 'Fumbles', 'Receptions']

# Counting stats become integers; the per-touch average is the only float column
target_dtypes = {col: int for col in num_cols}
target_dtypes['Yards/Touch'] = float
stats2019 = stats2019.astype(target_dtypes)

stats2019.dtypes


Player          object
Team            object
Age              int64
Position        object
Games            int64
Touches          int64
Yards/Touch    float64
Total_Yards      int64
Touchdowns       int64
Fumbles          int64
Receptions       int64
dtype: object

# Connect to PostgreSQL and Load the Data into a Table

In [11]:
# Connect to Postgres and Start Engine
# Percent-encode the password before embedding it in the URL: characters such as
# '@', ':' or '/' would otherwise be read as URL delimiters and break the connection.
connection_string = f"postgres:{quote_plus(pswd)}@localhost:5432/NFL_Draft"
engine = create_engine(f'postgresql://{connection_string}')

In [12]:
# Print Table Names in Database
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list tables and returns the same list of names.
inspect(engine).get_table_names()

['player_stats']

In [13]:
# Write the cleaned frame to the player_stats table. if_exists='replace' overwrites
# whatever the table held before, and index=False keeps the DataFrame index out of it.
stats2019.to_sql(
    name='player_stats',
    con=engine,
    index=False,
    if_exists='replace',
)

In [14]:
# Read the first ten rows back from the database to confirm the load worked
preview_query = 'select * from player_stats limit 10'
pd.read_sql_query(preview_query, con=engine)

Unnamed: 0,Player,Team,Age,Position,Games,Touches,Yards/Touch,Total_Yards,Touchdowns,Fumbles,Receptions
0,Christian McCaffrey,CAR,23,RB,16,403,5.9,2392,19,1,116
1,Ezekiel Elliott,DAL,24,RB,16,355,5.0,1777,14,3,54
2,Nick Chubb,CLE,24,RB,16,334,5.3,1772,8,3,36
3,Derrick Henry,TEN,25,RB,15,321,5.4,1746,18,5,18
4,Michael Thomas,NO,26,WR,16,150,11.4,1716,9,1,149
5,Leonard Fournette,JAX,24,RB,15,341,4.9,1674,3,1,76
6,Dalvin Cook,MIN,24,RB,14,303,5.5,1654,13,4,53
7,Aaron Jones,GB,25,RB,16,285,5.5,1558,19,3,49
8,Austin Ekeler,LAC,24,RB,16,224,6.9,1550,11,3,92
9,Chris Carson,SEA,25,RB,15,315,4.7,1496,9,7,37
