In [1]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Directory that holds the saved box-score pages
SCORE_DIR = "data/scores"

# Full path of every ".htm" file in SCORE_DIR.
# os.listdir returns entries in arbitrary order, so the paths are sorted to
# make the processing order identical from run to run.
box_scores = sorted(
    os.path.join(SCORE_DIR, name)
    for name in os.listdir(SCORE_DIR)
    if name.endswith(".htm")
)

In [3]:
def parse_html(box_score):
    """Read one saved box-score page and return it as a cleaned BeautifulSoup tree.

    Parameters
    ----------
    box_score : str
        Path of the HTML file to read.

    Returns
    -------
    BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed.
    """
    # Decode explicitly; without an encoding open() falls back to the
    # platform's locale encoding.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm.
    with open(box_score, encoding="utf-8") as f:
        html = f.read()

    # NOTE(review): no parser is named here, so BeautifulSoup picks whichever
    # one is installed; consider pinning one so results match across machines.
    soup = BeautifulSoup(html)

    # Remove the unwanted table rows from the tree, one selector at a time
    # (same order as before: "over_header" rows first, then "thead" rows).
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()

    return soup

In [4]:
# The helper functions below use pd.read_html to load an HTML table as a DataFrame:
# - str(soup) turns the BeautifulSoup instance back into an HTML string
# - the attrs keyword selects the table whose HTML id matches (e.g. {"id": "scoring"})
# - pd.read_html always returns a list of DataFrames -- we only want the first element

In [5]:
def get_game_info(soup):
    """Build one row per team with the final score and selected team stats.

    Reads the tables with ids ``"scoring"`` and ``"team_stats"`` from the page.
    Team values are taken from the last two columns of each table, and stats
    are picked by fixed row position in ``team_stats``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page (anything whose ``str()`` is the page HTML).

    Returns
    -------
    pandas.DataFrame
        Columns ``Tm, Score, FrDwns, TotYds, TO, Pen, Pen_Yds, ThdDwns,
        ThdDwnAtt``; rows follow the column order of the ``team_stats`` table.
    """
    html = str(soup)  # serialize once; both tables come from the same markup

    # Wrap the markup in StringIO: recent pandas deprecates passing literal
    # HTML text straight to read_html.
    score = pd.read_html(StringIO(html), attrs={"id": "scoring"})[0]
    team_stats = pd.read_html(StringIO(html), attrs={"id": "team_stats"})[0]

    # Rows 8 and 9 hold "a-b" strings; split each into its two parts.
    penalties = [cell.split("-") for cell in team_stats.iloc[8, -2:]]
    third_downs = [cell.split("-") for cell in team_stats.iloc[9, -2:]]

    return pd.DataFrame({
        # Team labels are the table's column headers after the first column
        "Tm": team_stats.columns[1:].tolist(),
        # Final score: last row of the scoring table
        "Score": score.iloc[-1, -2:].tolist(),
        "FrDwns": team_stats.iloc[0, -2:].tolist(),
        "TotYds": team_stats.iloc[5, -2:].tolist(),
        "TO": team_stats.iloc[7, -2:].tolist(),
        "Pen": [parts[0] for parts in penalties],
        "Pen_Yds": [parts[1] for parts in penalties],
        "ThdDwns": [parts[0] for parts in third_downs],
        "ThdDwnAtt": [parts[1] for parts in third_downs],
    })

In [6]:
def get_passing(soup):
    """Aggregate passing and advanced-passing numbers to one row per team.

    Reads the tables with ids ``"player_offense"`` and ``"passing_advanced"``,
    sums the kept columns per ``Tm`` and replaces the summed ``Rate`` with an
    attempt-weighted rating: ``sum(Rate * Att) / sum(Att)`` rounded to one
    decimal. Every team present in the table gets its weighted rating.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page (anything whose ``str()`` is the page HTML).

    Returns
    -------
    pandas.DataFrame
        Columns ``Tm, Pass_Cmp, Pass_Att, Pass_Yds, Pass_TD, Pass_Int,
        QB_Rate, YAC, Drops, QB_BadTh, QB_Sk, QB_Hrry, QB_Hits, QB_Prss``.
    """
    html = str(soup)  # serialize once; both tables come from the same markup

    # Wrap the markup in StringIO: recent pandas deprecates passing literal
    # HTML text straight to read_html.
    passing_stats = pd.read_html(StringIO(html), attrs={"id": "player_offense"})[0]
    advanced_stats = pd.read_html(StringIO(html), attrs={"id": "passing_advanced"})[0]

    # Keep only the columns that are aggregated below
    passing_stats = passing_stats[["Tm", "Cmp", "Att", "Yds", "TD", "Int", "Rate"]]
    advanced_stats = advanced_stats[["Tm", "YAC", "Drops", "BadTh", "Sk", "Hrry", "Hits", "Prss"]]

    # Attempt-weighted rating per team. Rows with a missing Rate contribute
    # nothing to the numerator because sum() skips NaN.
    weighted_rate = (passing_stats["Rate"] * passing_stats["Att"]).groupby(passing_stats["Tm"]).sum()
    attempts = passing_stats.groupby("Tm")["Att"].sum()
    team_rate = (weighted_rate / attempts).round(1)

    # Per-team totals for both tables, joined on the team label
    passing_totals = passing_stats.groupby("Tm").sum().reset_index()
    advanced_totals = advanced_stats.groupby("Tm").sum().reset_index()
    combined = passing_totals.merge(advanced_totals, on="Tm")

    # Prefix the passing totals with "Pass_" and the rating/pressure columns with "QB_"
    new_names = {name: "Pass_" + name for name in ("Cmp", "Att", "Yds", "TD", "Int")}
    new_names.update({name: "QB_" + name for name in ("Rate", "BadTh", "Sk", "Hrry", "Hits", "Prss")})
    combined = combined.rename(columns=new_names)

    # A sum of ratings is meaningless; overwrite it with the weighted rating
    combined["QB_Rate"] = combined["Tm"].map(team_rate)

    return combined

In [7]:
def get_rushing(soup):
    """Aggregate the ``"rushing_advanced"`` table to one row per team.

    Sums ``Att, Yds, TD, YAC`` per ``Tm`` and adds ``YPC`` (summed yards
    divided by summed attempts, rounded to one decimal).

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page (anything whose ``str()`` is the page HTML).

    Returns
    -------
    pandas.DataFrame
        Columns ``Tm, Rush_Att, Rush_Yds, Rush_TD, Rush_YAC, Rush_YPC``.
    """
    # Wrap the markup in StringIO: recent pandas deprecates passing literal
    # HTML text straight to read_html.
    rushing_stats = pd.read_html(StringIO(str(soup)), attrs={"id": "rushing_advanced"})[0]

    # Per-team totals; "Tm" becomes the index until reset_index below
    team_totals = rushing_stats[["Tm", "Att", "Yds", "TD", "YAC"]].groupby("Tm").sum()

    # Yards per carry from the team totals
    team_totals["YPC"] = (team_totals["Yds"] / team_totals["Att"]).round(1)

    # Prefix every stat column with "Rush_"; "Tm" is the index here, so it
    # keeps its name when it is turned back into a column.
    return team_totals.add_prefix("Rush_").reset_index()

In [8]:
def get_kicking(soup):
    """Aggregate the ``"kicking"`` table to one row per team.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page (anything whose ``str()`` is the page HTML).

    Returns
    -------
    pandas.DataFrame
        Columns ``Tm, XPM, XPA, FGM, FGA`` with the per-team sums.
    """
    # Wrap the markup in StringIO: recent pandas deprecates passing literal
    # HTML text straight to read_html.
    kicking_stats = pd.read_html(StringIO(str(soup)), attrs={"id": "kicking"})[0]

    kept = kicking_stats[["Tm", "XPM", "XPA", "FGM", "FGA"]]

    # Missing values are skipped by sum(), so rows without kicking numbers add 0
    return kept.groupby("Tm").sum().reset_index()

In [9]:
# Works out which season a game belongs to from the page's bottom navigation
def read_season_info(soup):
    """Return the season string taken from the third link inside ``#bottom_nav``.

    The link's ``href`` is split on ``"/"`` and the piece at position 2 is
    returned unchanged (as a string).
    """
    bottom_nav = soup.select("#bottom_nav")[0]

    # Collect the target of every anchor in the navigation block
    link_targets = []
    for anchor in bottom_nav.find_all('a'):
        link_targets.append(anchor["href"])

    href_pieces = link_targets[2].split("/")
    return href_pieces[2]

In [10]:
def get_dataframe(soup, box_score):
    """Assemble the two-row frame (one row per team) for a single game.

    Each row carries the team's own columns plus the other team's columns
    with an ``_opp`` suffix, followed by ``Home``, ``Won``, ``Season`` and
    ``Date``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page.
    box_score : str
        Path of the source file; the first 8 characters of its base name are
        parsed as the game date (``%Y%m%d``).

    Returns
    -------
    pandas.DataFrame
        Two rows, one per team.

    Raises
    ------
    ValueError
        If the merged per-team tables do not contain exactly two rows.
    """
    game_info = get_game_info(soup)
    passing = get_passing(soup)
    rushing = get_rushing(soup)
    kicking = get_kicking(soup)

    # Inner joins: a team missing from any one table silently drops out here
    dataframe = game_info.merge(passing, on="Tm").merge(rushing, on="Tm").merge(kicking, on="Tm")

    # Fail early and name the file instead of hitting an opaque length
    # mismatch when the "Home" column is assigned below.
    if len(dataframe) != 2:
        raise ValueError(
            f"Expected 2 team rows for {box_score}, got {len(dataframe)}"
        )

    # Reverse the two rows so each team lines up with its opponent's stats
    # (kept in the same row for the machine learning model).
    game_opp = dataframe.iloc[::-1].reset_index(drop=True).add_suffix("_opp")
    full_game = pd.concat([dataframe, game_opp], axis=1)

    # Additional metadata
    # First row gets 0, second row gets 1.
    # NOTE(review): assumes rows are ordered away team, then home team -- confirm.
    full_game["Home"] = [0, 1]
    full_game["Won"] = full_game["Score"] > full_game["Score_opp"]  # ties give False for both rows
    full_game["Season"] = read_season_info(soup)

    full_game["Date"] = os.path.basename(box_score)[:8]  # date prefix of the file name
    full_game["Date"] = pd.to_datetime(full_game["Date"], format="%Y%m%d")

    # TODO: convert all column names to lowercase (noted in the original, not implemented)
    return full_game

In [11]:
# Parse every box-score file and collect one per-game frame in `games`
games = []
for parsed_count, box_score in enumerate(box_scores, start=1):
    soup = parse_html(box_score)
    full_game = get_dataframe(soup, box_score)
    games.append(full_game)

    # Report progress after each batch of 100 box scores
    if parsed_count % 100 == 0:
        print(f"{parsed_count} / {len(box_scores)}")

100 / 1372
200 / 1372
300 / 1372
400 / 1372
500 / 1372
600 / 1372
700 / 1372
800 / 1372
900 / 1372
1000 / 1372
1100 / 1372
1200 / 1372
1300 / 1372


In [12]:
# Stack all games, order them by date and renumber the rows.
# "TO" is renamed to "T_O" because TO is a keyword in SQL.
games_dataframe = (
    pd.concat(games, ignore_index=True)
    .sort_values('Date')
    .reset_index(drop=True)
    .rename(columns={'TO': 'T_O'})
)

# Rewrite the "OAK" label as "LVR" in both the team and the opponent column
for team_col in ('Tm', 'Tm_opp'):
    games_dataframe[team_col] = games_dataframe[team_col].replace('OAK', 'LVR')

In [20]:
# Preview the combined frame; this bare expression is the cell's displayed output
games_dataframe

Unnamed: 0,Tm,Score,FrDwns,TotYds,T_O,Pen,Pen_Yds,ThdDwns,ThdDwnAtt,Pass_Cmp,...,Rush_YAC_opp,Rush_YPC_opp,XPM_opp,XPA_opp,FGM_opp,FGA_opp,Home,Won,Season,Date
0,PHI,18,18,232,2,11,101,8,16,20,...,44,4.1,0.0,1.0,2.0,2.0,1,True,2018,2018-09-06
1,ATL,12,16,299,1,15,135,4,15,21,...,74,4.2,1.0,1.0,1.0,1.0,0,False,2018,2018-09-06
2,LAC,28,33,541,2,7,45,3,11,34,...,54,3.9,5.0,5.0,1.0,1.0,1,False,2018,2018-09-09
3,NWE,27,23,389,3,6,36,4,14,26,...,63,4.9,2.0,2.0,2.0,2.0,1,True,2018,2018-09-09
4,HOU,20,21,325,2,6,44,2,11,17,...,47,3.9,3.0,3.0,2.0,2.0,0,False,2018,2018-09-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2739,CIN,20,18,309,2,9,71,6,14,26,...,10,2.1,2.0,2.0,3.0,3.0,0,False,2022,2023-01-29
2740,SFO,7,11,164,3,11,81,2,8,11,...,57,3.4,4.0,4.0,1.0,1.0,0,False,2022,2023-01-29
2741,PHI,31,25,269,0,4,34,5,15,15,...,60,3.4,1.0,1.0,0.0,0.0,1,True,2022,2023-01-29
2742,PHI,35,25,417,1,6,33,11,18,27,...,59,6.1,5.0,5.0,1.0,2.0,1,False,2022,2023-02-12


In [21]:
# Save without the positional row index: writing it adds a nameless first
# column that comes back as "Unnamed: 0" when the file is read again.
# NOTE(review): readers that load this file with index_col=0 must drop that argument.
games_dataframe.to_csv("nfl_games.csv", index=False)