This notebook prepares the database for the upcoming season. It scrapes the new season's schedule from Pro-Football-Reference and inserts two rows per game (one from each team's perspective) into the `predictions` table.

In [1]:
import asyncio
import os
import time

import psycopg2 # PostgreSQL database adapter
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [2]:
# Season to load: used to build the schedule URL and stored in the Season column of every inserted row.
upcoming_season = 2023

In [3]:
# Scraped schedule pages are saved under data/standings.
# NOTE(review): earlier notes mention a second "scores" subdirectory; this notebook does not use it.
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")

In [4]:
async def get_html(url, selector, sleep = 5, retries = 3):
    """Load ``url`` in a headless browser and return the inner HTML of ``selector``.

    Args:
        url: Page to open.
        selector: CSS selector (typically an ID) locating a unique element in the page.
        sleep: Base delay in seconds; attempt ``i`` waits ``sleep * i`` before running.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` if every attempt timed out.
    """
    html = None
    for i in range(1, retries+1):
        # Wait longer before each attempt to avoid sending too many requests and getting banned.
        # asyncio.sleep yields to the event loop; time.sleep would block it for the whole wait.
        await asyncio.sleep(sleep * i)
        # Logic to handle errors when web scraping
        try:
            async with async_playwright() as p:
                browser = await p.firefox.launch() # Can also use p.chromium.launch() if issues persist
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(await page.title())
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser process whether or not the attempt succeeded
                    await browser.close()
        except PlaywrightTimeout:
            print(f"Timeout error on {url}")
            continue
        else:
            break
    return html

In [5]:
async def scrape_season(season):
    """Download the schedule table for ``season`` and cache it on disk.

    Args:
        season: Season year used in the schedule URL and in the saved filename.

    Returns:
        Path of the saved ``<season>-games.htm`` file inside ``STANDINGS_DIR``.

    Raises:
        RuntimeError: If the page could not be retrieved (``get_html`` returned ``None``).
    """
    save_path = os.path.join(STANDINGS_DIR, f"{season}-games.htm") # Designate the filename and path
    # Check the cache before touching the network so an already-scraped season costs no request
    if os.path.exists(save_path):
        return save_path

    url = f"https://www.pro-football-reference.com/years/{season}/games.htm"
    html = await get_html(url, "#all_games")
    if html is None:
        # Fail before opening the file: an empty file would be treated as a valid cache on the next run
        raise RuntimeError(f"Could not retrieve schedule HTML from {url}")

    os.makedirs(STANDINGS_DIR, exist_ok=True)
    with open(save_path, "w+") as f:
        f.write(html)

    return save_path

In [6]:
# Scrape (or reuse the cached copy of) the upcoming season's schedule; filepath is the saved HTML file.
# Top-level await is valid here because the notebook cell runs inside an event loop.
filepath = await scrape_season(upcoming_season)

2023 NFL Weekly League Schedule | Pro-Football-Reference.com


In [6]:
# Lookup table: lower-cased full team name -> abbreviation stored in the database
team_abbreviations = {
    "detroit lions": "DET",
    "kansas city chiefs": "KC",
    "atlanta falcons": "ATL",
    "carolina panthers": "CAR",
    "cleveland browns": "CLE",
    "cincinnati bengals": "CIN",
    "jacksonville jaguars": "JAX",
    "indianapolis colts": "IND",
    "washington commanders": "WAS",
    "arizona cardinals": "ARI",
    "baltimore ravens": "BAL",
    "houston texans": "HOU",
    "tampa bay buccaneers": "TB",
    "minnesota vikings": "MIN",
    "new orleans saints": "NO",
    "tennessee titans": "TEN",
    "san francisco 49ers": "SF",
    "pittsburgh steelers": "PIT",
    "green bay packers": "GB",
    "chicago bears": "CHI",
    "las vegas raiders": "LV",
    "denver broncos": "DEN",
    "miami dolphins": "MIA",
    "los angeles chargers": "LAC",
    "philadelphia eagles": "PHI",
    "new england patriots": "NE",
    "los angeles rams": "LAR",
    "seattle seahawks": "SEA",
    "dallas cowboys": "DAL",
    "new york giants": "NYG",
    "new york jets": "NYJ",
    "buffalo bills": "BUF",
}


def team_abbreviate(team_name):
    """Return the abbreviation for ``team_name`` (case-insensitive).

    Names missing from ``team_abbreviations`` are returned lower-cased
    rather than raising.
    """
    key = team_name.lower()
    try:
        return team_abbreviations[key]
    except KeyError:
        return key

In [7]:
# Connection settings. The password is deliberately not stored in the notebook:
# export PGPASSWORD before starting Jupyter. When it is unset no password is
# passed, so libpq's own lookup (e.g. a ~/.pgpass file) applies instead.
connect_kwargs = {
    "host": "localhost",
    "dbname": "nfl",
    "user": "postgres",
    "port": 5432,
}
db_password = os.environ.get("PGPASSWORD")
if db_password is not None:
    connect_kwargs["password"] = db_password

try:
    # Establish a connection to the PostgreSQL database
    connection = psycopg2.connect(**connect_kwargs)
    print("Connected to PostgreSQL database.")
except psycopg2.Error as error:
    print("Error connecting to PostgreSQL database: ", error)
    # Re-raise so the run stops here instead of failing later with a NameError on `connection`
    raise

Connected to PostgreSQL database.


In [8]:
# Create a cursor object to interact with the database.
# The same cursor is used for the CREATE TABLE statement and for the schedule inserts.
cursor = connection.cursor()

In [9]:
# Schema of the predictions table. This notebook only fills Tm, Home, Season,
# Date, Week and Tm_opp; the remaining columns are left NULL by the inserts here.
# NOTE(review): the table has no primary key or unique constraint, so re-running
# the insert cell adds duplicate rows for the same games.
predictions_table_query = '''
    CREATE TABLE IF NOT EXISTS predictions (
    gameid INT,
    Tm VARCHAR(10),
    Home FLOAT,
    Season FLOAT,
    Date DATE,
    Week INT,
    Score FLOAT,
    P_score FLOAT,
    Tm_opp VARCHAR(10),
    Score_opp FLOAT,
    P_score_opp FLOAT,
    Win BOOLEAN,
    Pred_Win BOOLEAN,
    Spread FLOAT,
    Pred_spread FLOAT,
    Spread_win BOOLEAN
);
'''

# IF NOT EXISTS makes this statement a no-op when the table is already present.
cursor.execute(predictions_table_query)
connection.commit()

In [10]:
# Parameterized insert for one schedule row (one team's view of one game).
# Values are bound through the %s placeholders by the driver, not by string formatting.
insert_query = '''
    INSERT INTO predictions (
        Tm, Home, Season, Date, Week, Tm_opp
    ) VALUES (%s, %s, %s, %s, %s, %s);
'''

In [11]:
def _parse_schedule_row(row):
    """Extract one game from a schedule ``<tr>`` element.

    Returns:
        ``(week, date, team, home, opponent)`` where ``week`` is an int,
        ``date`` is the raw text of the game_date cell, ``team``/``opponent``
        are abbreviated names and ``home`` is 1 or 0 from ``team``'s
        perspective; or ``None`` when the row is not a complete game row
        (no numeric week label, or one of the required cells is missing).
    """
    week_cell = row.find("th", {"data-stat": "week_num", "class": "right"})
    if week_cell is None:
        return None
    week_text = week_cell.text.strip()
    if not week_text.isdigit():
        # Non-numeric labels cannot be stored in the integer Week column; skip instead of crashing
        return None

    # Index the row's cells by their data-stat attribute (the last cell wins on duplicates)
    cells = {cell.get("data-stat", ""): cell.text for cell in row.find_all("td")}
    required = ("game_date", "winner", "game_location", "loser")
    if any(key not in cells for key in required):
        # Never fall back to values left over from a previous row
        return None

    # "@" in the location cell means the team in the "winner" cell is the away side
    home = 0 if cells["game_location"] == "@" else 1
    team = team_abbreviate(cells["winner"])
    opponent = team_abbreviate(cells["loser"])
    return int(week_text), cells["game_date"], team, home, opponent


with open(filepath, 'r') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')

# Every table row; rows that are not complete games are filtered out by the parser
rows = soup.find_all("tr")

# Two records per game: one from each team's perspective, with the Home flag inverted
records = []
season_value = float(upcoming_season)
for row in rows:
    game = _parse_schedule_row(row)
    if game is None:
        continue
    week, date, team, home, opponent = game
    records.append((team, float(home), season_value, date, week, opponent))
    records.append((opponent, float(not home), season_value, date, week, team))

try:
    cursor.executemany(insert_query, records)
    connection.commit()
except psycopg2.Error:
    # Leave the connection usable and the table untouched if any insert fails
    connection.rollback()
    raise
print("Data inserted.")

Data inserted.


In [12]:
# Release database resources: close the cursor first, then the connection it belongs to.
cursor.close()
connection.close()