This script is run each week to update the database with the prior week's data before predictions are made for the upcoming week.

In [1]:
import asyncio
import os
import time
from datetime import datetime, timedelta
from io import StringIO

import pandas as pd
import psycopg2
import torch
import torch.nn as nn
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
from sklearn.preprocessing import MinMaxScaler

In [47]:
# Run configuration: the season to scrape and the size (in days) of the look-back window
season = 2023
filter_days = 104

# Take a single timestamp so both window bounds derive from the same instant
# (two separate datetime.today() calls could straddle midnight and disagree).
today = datetime.today()

# Both bounds are "YYYYMMDD" followed by a literal "0" (9 characters in total).
# NOTE(review): the trailing "0" presumably mirrors the id prefix used in box-score
# file names - confirm.
current_date = today.strftime("%Y%m%d") + "0"
start_date = (today - timedelta(days=filter_days)).strftime("%Y%m%d") + "0"

In [3]:
# NOTE(review): hard-coded override of the current_date computed in the configuration
# cell; it pins the upper bound passed to scrape_game to this fixed value. Presumably
# left over from a one-off run - confirm, and remove for regular weekly runs.
current_date = '202312160'

In [4]:
# Directory named data with 2 subdirectories: standings and scores
# Local storage layout: a top-level "data" folder with one subfolder for the saved
# season schedule pages ("standings") and one for the saved box score pages ("scores")
DATA_DIR = "data"
STANDINGS_DIR, SCORES_DIR = (os.path.join(DATA_DIR, sub) for sub in ("standings", "scores"))

In [5]:
# Snapshot of the schedule pages currently saved on disk
standings_files = [name for name in os.listdir(STANDINGS_DIR) if ".htm" in name]

# Box scores on disk whose file name starts with a date inside the look-back window.
# start_date is 9 characters long (date plus a trailing "0") while f[:8] is only the
# 8-digit date, so compare against the first 8 characters of start_date; otherwise
# games played exactly on the start date are dropped, because a string sorts before
# any longer string that it is a prefix of.
window_start = start_date[:8]
box_scores = [
    os.path.join(SCORES_DIR, f)
    for f in os.listdir(SCORES_DIR)
    if f.endswith(".htm") and f[:8] >= window_start
]

In [6]:
async def get_html(url, selector, sleep = 5, retries = 3):
    """Fetch the inner HTML of one element from a page loaded in a Playwright Firefox browser.

    Args:
        url: Address of the page to load.
        selector: CSS selector (typically an ID) locating the element whose inner HTML is wanted.
        sleep: Base delay in seconds; attempt number ``i`` waits ``sleep * i`` before loading.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or None when every attempt timed out
        (or when ``retries`` is less than 1).

    Only Playwright timeouts are retried; any other exception propagates to the caller.
    """
    html = None
    for attempt in range(1, retries + 1):
        # Wait a little longer on every attempt to avoid sending too many requests and
        # getting banned. asyncio.sleep yields control to the event loop, whereas
        # time.sleep would block it (and everything else running on it) for the whole wait.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.firefox.launch() # Can also use p.chromium.launch() if issues persist
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Always release the browser, even when loading the page fails
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [7]:
async def scrape_season(season):
    """Download the weekly schedule page for one season and save it under STANDINGS_DIR.

    Args:
        season: Season year, used both in the URL and in the saved file name.

    Raises:
        RuntimeError: If the schedule element could not be retrieved.
    """
    url = f"https://www.pro-football-reference.com/years/{season}/games.htm"
    html = await get_html(url, "#all_games")

    # get_html returns None once its retries are exhausted. Stop here with a clear
    # message: opening the file with "w+" would first truncate any previously saved
    # page and f.write(None) would then raise a TypeError.
    if html is None:
        raise RuntimeError(f"Could not retrieve the schedule page for season {season}: {url}")

    save_path = os.path.join(STANDINGS_DIR, f"{season}-games.htm") # Designate the filename and path

    with open(save_path, "w+") as f:
        f.write(html)

In [8]:
# Download and save the schedule page for the configured season
await scrape_season(season)

2023 NFL Weekly League Schedule | Pro-Football-Reference.com


In [9]:
async def scrape_game(standings_file, start_date, current_date):
    """Download the box scores linked from a saved schedule page that fall inside a date window.

    Args:
        standings_file: Path to a saved schedule page.
        start_date: Inclusive lower bound; only its first 8 characters (YYYYMMDD) are used.
        current_date: Inclusive upper bound; only its first 8 characters (YYYYMMDD) are used.

    Box scores already present in SCORES_DIR are skipped, and so are pages that
    get_html could not retrieve.
    """
    with open(standings_file, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html)
    hrefs = [a.get("href") for a in soup.find_all("a")] # href of every anchor tag (None when absent)

    # Keep non-empty links that contain both "boxscore" and ".htm"
    box_scores = [h for h in hrefs if h and "boxscore" in h and ".htm" in h]

    # h[11:19] is an 8-character slice, while the bounds passed in may carry a trailing
    # "0" (9 characters). Trim the bounds to 8 characters so the comparison is
    # like-for-like; otherwise a game played exactly on the start date is excluded,
    # because a string sorts before any longer string that it is a prefix of.
    # NOTE(review): the slice assumes hrefs shaped like "/boxscores/YYYYMMDD....htm" - confirm.
    first_day, last_day = start_date[:8], current_date[:8]
    box_scores = [h for h in box_scores if first_day <= h[11:19] <= last_day]
    box_scores = [f"https://pro-football-reference.com{h}" for h in box_scores]

    for url in box_scores:
        save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
        if os.path.exists(save_path):
            continue # Already downloaded

        html = await get_html(url, "#content")
        if not html:
            continue # Nothing retrieved; leave the game for a later run
        with open(save_path, "w+") as f:
            f.write(html)

In [10]:
# Loop to iterate through standings directory and scrape each individual box score page
for f in standings_files:
    filepath = os.path.join(STANDINGS_DIR, f)
    await scrape_game(filepath, start_date, current_date)

New games have been scraped. Now parse each box score and compile the results into a dataframe.

In [11]:
def parse_html(box_score):
    """Read a saved box score page and return it as a BeautifulSoup tree without header rows.

    Args:
        box_score: Path to the saved HTML file.

    Returns:
        The parsed soup, with every ``tr.over_header`` and ``tr.thead`` row removed.
    """
    with open(box_score) as handle:
        markup = handle.read()

    soup = BeautifulSoup(markup)

    # Remove the table rows carrying the "over_header" and "thead" classes from the tree
    # (presumably so each table is later read with a single flat header row - confirm)
    for row_selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(row_selector):
            row.decompose()

    return soup

In [12]:
def get_game_info(soup):
    """Build a per-team frame holding the final score and headline team statistics.

    Args:
        soup: Parsed box score page containing tables with ids "scoring" and "team_stats".

    Returns:
        DataFrame with columns Tm, Score, FrDwns, TotYds, TO, Pen, Pen_Yds, ThdDwns
        and ThdDwnAtt, one row per team column found in the team stats table.
    """
    # Passing literal HTML to pd.read_html is deprecated; hand it a file-like object instead
    markup = str(soup)

    # Get final score: taken from the last two columns of the scoring table's last row
    score = pd.read_html(StringIO(markup), attrs={"id": "scoring"})[0]
    score_df = pd.DataFrame(columns=["Tm", "Score"])
    score_df["Score"] = score.iloc[-1, -2:].tolist()

    # Get team stats:
    team_stats = pd.read_html(StringIO(markup), attrs={"id": "team_stats"})[0]

    column_names = ["Tm", "FrDwns", "TotYds", "TO", "Pen", "Pen_Yds", "ThdDwns", "ThdDwnAtt"]
    stats_df = pd.DataFrame(columns=column_names)

    # Team labels are the headers of every column after the first one.
    # NOTE(review): the statistics below are addressed by row position (0, 5, 7, 8, 9);
    # confirm these positions against the layout of the team stats table.
    stats_df["Tm"] = team_stats.columns[1:].tolist()
    stats_df["FrDwns"] = team_stats.iloc[0, -2:].tolist()
    stats_df["TotYds"] = team_stats.iloc[5, -2:].tolist()
    stats_df["TO"] = team_stats.iloc[7, -2:].tolist()

    # Penalty and third down cells are "a-b" strings; split them into two columns each
    pen_values = team_stats.iloc[8, -2:].apply(lambda x: x.split('-'))
    stats_df["Pen"] = [val[0] for val in pen_values]
    stats_df["Pen_Yds"] = [val[1] for val in pen_values]

    thddwn_values = team_stats.iloc[9, -2:].apply(lambda x: x.split('-'))
    stats_df["ThdDwns"] = [val[0] for val in thddwn_values]
    stats_df["ThdDwnAtt"] = [val[1] for val in thddwn_values]

    # Label the scores with the same team order, then join scores and stats on the team
    score_df["Tm"] = stats_df["Tm"]

    game_info = score_df.merge(stats_df, on="Tm")
    return game_info

In [13]:
def get_passing(soup):
    """Aggregate passing and advanced passing statistics per team.

    Args:
        soup: Parsed box score page containing tables with ids "player_offense" and
            "passing_advanced".

    Returns:
        DataFrame with one row per team: Tm, the summed passing columns prefixed with
        "Pass_", YAC and Drops, the pressure columns prefixed with "QB_", and QB_Rate
        holding the attempt-weighted passer rating of the team.
    """
    # Passing literal HTML to pd.read_html is deprecated; hand it a file-like object instead
    markup = str(soup)
    passing_stats = pd.read_html(StringIO(markup), attrs={"id": "player_offense"})[0]
    advanced_stats = pd.read_html(StringIO(markup), attrs={"id": "passing_advanced"})[0]

    # Clean data: remove unneeded columns
    columns_to_keep = ["Tm", "Cmp", "Att", "Yds", "TD", "Int", "Rate"]
    passing_stats = passing_stats[columns_to_keep]

    def team_rating(frame, team):
        """Average the Rate of one team's rows, weighted by pass attempts, rounded to 1 decimal."""
        rows = frame.loc[frame["Tm"] == team]
        return round((rows["Rate"] * rows["Att"]).sum() / rows["Att"].sum(), 1)

    # The two teams are read from the first and last rows of the table; their ratings
    # must be computed before the per-player rows are summed below
    team1 = passing_stats.iloc[0]["Tm"]
    team2 = passing_stats.iloc[-1]["Tm"]
    team1_qbr = team_rating(passing_stats, team1)
    team2_qbr = team_rating(passing_stats, team2)

    # Concatenate: Add up values per team
    passing_stats = passing_stats.groupby("Tm").sum().reset_index()

    # Clean advanced stats
    columns_to_keep = ["Tm", "YAC", "Drops", "BadTh", "Sk", "Hrry", "Hits", "Prss"]
    advanced_stats = advanced_stats[columns_to_keep]

    # Concatenate: Add up values per team
    advanced_stats = advanced_stats.groupby("Tm").sum().reset_index()

    # Join both data frames on the team
    passing_stats = passing_stats.merge(advanced_stats, on="Tm")

    # Rename specific columns with "Pass_" prefix
    columns_to_rename = ["Cmp", "Att", "Yds", "TD", "Int"]
    passing_stats = passing_stats.rename(columns=lambda x: "Pass_" + x if x in columns_to_rename else x)

    # Rename specific columns with "QB_" prefix
    columns_to_rename = ["Rate", "BadTh", "Sk", "Hrry", "Hits", "Prss"]
    passing_stats = passing_stats.rename(columns=lambda x: "QB_" + x if x in columns_to_rename else x)

    # Overwrite the summed Rate (now QB_Rate) with the attempt-weighted team rating
    passing_stats.loc[passing_stats["Tm"] == team1, "QB_Rate"] = team1_qbr
    passing_stats.loc[passing_stats["Tm"] == team2, "QB_Rate"] = team2_qbr

    return passing_stats

In [14]:
def get_rushing(soup):
    """Aggregate rushing statistics per team.

    Args:
        soup: Parsed box score page containing a table with id "rushing_advanced".

    Returns:
        DataFrame with one row per team and columns Tm, Rush_Att, Rush_Yds, Rush_TD,
        Rush_YAC and Rush_YPC (yards per carry, rounded to one decimal).
    """
    # Passing literal HTML to pd.read_html is deprecated; hand it a file-like object instead
    rushing_stats = pd.read_html(StringIO(str(soup)), attrs={"id": "rushing_advanced"})[0]

    # Remove unneeded columns
    columns_to_keep = ["Tm", "Att", "Yds", "TD", "YAC"]
    rushing_stats = rushing_stats[columns_to_keep]

    # Concatenate: Add up values per team
    rushing_stats = rushing_stats.groupby("Tm").sum().reset_index()

    # Calculate yards per carry (YPC)
    rushing_stats["YPC"] = (rushing_stats["Yds"] / rushing_stats["Att"]).round(1)

    # Rename column labels with "Rush_" prefix (the team column keeps its name)
    rushing_stats = rushing_stats.rename(columns=lambda x: "Rush_" + x if x != "Tm" else x)

    return rushing_stats

In [15]:
def get_kicking(soup):
    """Aggregate kicking statistics per team.

    Args:
        soup: Parsed box score page containing a table with id "kicking".

    Returns:
        DataFrame with one row per team and the summed columns XPM, XPA, FGM and FGA.
    """
    # Passing literal HTML to pd.read_html is deprecated; hand it a file-like object instead
    kicking_stats = pd.read_html(StringIO(str(soup)), attrs={"id": "kicking"})[0]

    columns_to_keep = ["Tm", "XPM", "XPA", "FGM", "FGA"]
    kicking_stats = kicking_stats[columns_to_keep]

    # Add up values per team
    kicking_stats = kicking_stats.groupby("Tm").sum().reset_index()

    return kicking_stats

In [16]:
# Determine which season a particular game was played in
def read_season_info(soup):
    """Return the season string taken from the third link of the "#bottom_nav" element.

    The href of that link is split on "/" and the third piece is returned unchanged.
    """
    bottom_nav = soup.select("#bottom_nav")[0]
    link_targets = [anchor["href"] for anchor in bottom_nav.find_all('a')]
    path_pieces = link_targets[2].split("/")
    return path_pieces[2]

In [17]:
def get_dataframe(soup, box_score):
    """Combine every per-team table of one game into a single frame with opponent columns.

    Args:
        soup: Parsed box score page.
        box_score: Path of the box score file; the first 8 characters of its base name
            are read as the game date (YYYYMMDD).

    Returns:
        DataFrame with one row per team: the team's own statistics, the other row's
        statistics suffixed with "_opp", plus the Home, Won, Season and Date columns.
    """
    # Build every per-team table first, then join them one after another on the team
    parts = [get_game_info(soup), get_passing(soup), get_rushing(soup), get_kicking(soup)]
    dataframe = parts[0]
    for part in parts[1:]:
        dataframe = dataframe.merge(part, on="Tm")

    # Reverse the row order so each row lines up with the other team's statistics;
    # including opponent stats in the same row is intended for the machine learning model
    game_opp = dataframe.iloc[::-1].reset_index(drop=True).add_suffix("_opp")
    full_game = pd.concat([dataframe, game_opp], axis=1)

    # Add additional metadata:
    full_game["Home"] = [0,1] # Define home and away team
    full_game["Won"] = full_game["Score"] > full_game["Score_opp"] # Determine who won the game
    full_game["Season"] = read_season_info(soup) # Determines which season this game happened

    # Game date comes from the file name and is converted to a datetime
    full_game["Date"] = os.path.basename(box_score)[:8]
    full_game["Date"] = pd.to_datetime(full_game["Date"], format="%Y%m%d")

    # TODO (from the original notes): add info; convert all columns to lowercase
    return full_game

In [18]:
# List the saved box scores again before parsing: the earlier box_scores list was built
# before the scraping step, so pages downloaded during this run would be missing from it.
# The 9-character start_date is trimmed to 8 characters so that it is compared
# like-for-like with the 8-digit date at the start of each file name.
box_scores = sorted(
    os.path.join(SCORES_DIR, name)
    for name in os.listdir(SCORES_DIR)
    if name.endswith(".htm") and name[:8] >= start_date[:8]
)

# Loop to parse each HTML file into its per-game dataframe
games = []
for box_score in box_scores:
    soup = parse_html(box_score)
    full_game = get_dataframe(soup, box_score)
    games.append(full_game)

print(f"Boxscores: {len(games)} / {len(box_scores)}")

Boxscores: 212 / 212


In [19]:
# Stack the per-game frames, sort each game by date and reset the indices.
# The "TO" column is renamed because TO is a keyword in SQL.
games_dataframe = (
    pd.concat(games, ignore_index=True)
    .sort_values('Date')
    .reset_index(drop=True)
    .rename(columns={'TO': 'T_O'})
)

# Replace "OAK" with "LVR" in the "Tm" and "Tm_opp" columns
for team_column in ('Tm', 'Tm_opp'):
    games_dataframe[team_column] = games_dataframe[team_column].replace('OAK', 'LVR')

In [38]:
# Establish a connection to the PostgreSQL database.
# The password is read from the PGPASSWORD environment variable instead of being
# written into the notebook, where it would leak to anyone the file is shared with.
# NOTE(review): when the variable is unset, None is passed as the password - confirm
# that the deployment supplies the credential some other way (e.g. a password file).
try:
    connection = psycopg2.connect(
        host = "localhost",
        dbname = "nfl",
        user = "postgres",
        password = os.environ.get("PGPASSWORD"),
        port = 5432
    )
    print("Connected to PostgreSQL database.")
except Exception as error:
    print("Error connecting to PostgreSQL database: ", error)
    # Re-raise: carrying on would only fail below with a NameError on `connection`
    raise

# Create a cursor object to interact with the database
cursor = connection.cursor()

Connected to PostgreSQL database.


In [39]:
# Fetch the last gameid from the database
cursor.execute("SELECT MAX(gameid) FROM nfl_data")
last_gameid = cursor.fetchone()[0]

# MAX() is NULL (None in Python) when the table has no rows; start numbering from 1
# in that case instead of failing on `None + 1`
if last_gameid is None:
    last_gameid = 0

# Give each dataframe row the next consecutive gameid.
# NOTE(review): re-running this cell after the insert below assigns new ids to the
# same rows - run it once per batch.
games_dataframe['gameid'] = range(last_gameid + 1, last_gameid + 1 + len(games_dataframe))

In [43]:
# Insert data into the table.
# The statement lists every dataframe column and uses one %s placeholder per column,
# so the values themselves are passed to the driver as parameters.
column_names = games_dataframe.columns.tolist()
insert_query = """
    INSERT INTO {} ({}) VALUES ({});
""".format(
    'nfl_data',
    ', '.join(column_names),  # Include all columns in the column list
    ', '.join(['%s'] * len(column_names))  # One placeholder per column
)

# Convert DataFrame to a list of tuples
data_values = [tuple(row) for row in games_dataframe.values]

# Execute the insert for every row and commit once at the end. If any row fails,
# roll back so no partial batch is committed and the connection stays usable,
# then let the error surface.
try:
    cursor.executemany(insert_query, data_values)
    connection.commit()
except Exception:
    connection.rollback()
    raise

In [None]:
# TODO: update the results table

In [56]:
# Release the database resources: close the cursor first, then the connection
# it was created from
cursor.close()
connection.close()