In [2]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

In [4]:
# Directory that holds the saved box-score pages
SCORE_DIR = "data/scores"

# Full path of every .html file in SCORE_DIR.
# os.listdir returns entries in arbitrary order, so the paths are sorted to make
# the processing order (and anything derived from the first file) reproducible.
box_scores = sorted(
    os.path.join(SCORE_DIR, name)
    for name in os.listdir(SCORE_DIR)
    if name.endswith(".html")
)

In [5]:
def parse_html(box_score):
    """Load a saved box-score page and remove the extra header rows from it.

    Parameters
    ----------
    box_score : str
        Path to the box-score HTML file.

    Returns
    -------
    BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed.
    """
    # Explicit encoding so reading does not depend on the platform's default codec
    with open(box_score, encoding="utf-8") as f:
        html = f.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), so results can differ between machines
    soup = BeautifulSoup(html, "html.parser")

    # Drop the table rows carrying these classes so they are gone from the HTML
    # before any table is read from it
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [6]:
def read_season_info(soup):
    """Return the season label for the game shown on a parsed box-score page.

    The label is read from the second link inside the ``#bottom_nav_container``
    element: the last path component of that link's href, up to (not including)
    its first underscore.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page.

    Returns
    -------
    str
        The season label.
    """
    bottom_nav = soup.select("#bottom_nav_container")[0]

    links = []
    for anchor in bottom_nav.find_all('a'):
        links.append(anchor["href"])

    # Only the second link is used; its file name starts with the season
    file_name = os.path.basename(links[1])
    season, *_ = file_name.split("_")
    return season

In [7]:
def read_line_score(soup):
    """Extract each team's name and final score from the line-score table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page; ``str(soup)`` must yield its HTML.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``team`` (first column of the table) and ``total``
        (last column of the table), one row per table row.
    """
    # Wrap the HTML text in StringIO: passing a literal HTML string to
    # pd.read_html is deprecated in recent pandas versions.
    # read_html returns a list of matching tables; take the first table whose
    # id attribute is "line_score".
    line_score = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})[0]

    # Columns is an Index; go through a list to rename the first and last entries
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    # Keep only team and total, discarding the columns in between
    return line_score[["team", "total"]]

In [8]:
def read_stat(soup, team, stat):
    """Read one team's box-score table as an all-numeric DataFrame.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed box-score page; ``str(soup)`` must yield its HTML.
    team : str
        Team identifier as it appears in the table id.
    stat : str
        Which table to read; the callers in this notebook pass ``"basic"``
        or ``"advanced"``.

    Returns
    -------
    pandas.DataFrame
        The table whose id is ``box-{team}-game-{stat}``, indexed by its first
        column, with every value converted to a number and anything that is
        not numeric (e.g. a "DNP" string) replaced by NaN.
    """
    table_id = f"box-{team}-game-{stat}"
    # Wrap the HTML text in StringIO: passing a literal HTML string to
    # pd.read_html is deprecated in recent pandas versions.
    # read_html returns a list of matching tables; take the first one.
    dataframe = pd.read_html(StringIO(str(soup)), attrs={"id": table_id}, index_col=0)[0]
    # errors="coerce" turns unparseable entries into NaN instead of raising
    dataframe = dataframe.apply(pd.to_numeric, errors="coerce")
    return dataframe

In [10]:
# One two-row frame per box score is collected here (one row per team)
games = []
# Columns kept for every team summary; fixed by the first summary that is built
base_cols = None

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = line_score["team"].tolist()

    summaries = []
    for team in teams:
        basic = read_stat(soup, team, "basic")
        advanced = read_stat(soup, team, "advanced")

        # The last row of each table is taken as the team totals
        totals = pd.concat([basic.iloc[-1], advanced.iloc[-1]])
        totals.index = totals.index.str.lower()

        # Column-wise maximum over every row except that last one
        maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pd.concat([totals, maxes])

        if base_cols is None:
            # Unique labels minus anything mentioning "bpm" or "_max";
            # relax the filter here to keep the per-player maxima
            unique_labels = summary.index.drop_duplicates(keep="first")
            base_cols = [
                label for label in unique_labels
                if not ("bpm" in label or "_max" in label)
            ]

        summaries.append(summary[base_cols])

    # Teams become rows, then the team name and final score are attached
    summary = pd.concat(summaries, axis=1).T
    game = pd.concat([summary, line_score], axis=1)

    # First row is flagged 0 and second row 1 as the home indicator
    # (presumably the line score lists the visiting team first -- confirm)
    game["home"] = [0, 1]

    # Reverse the two rows so each team's row sits next to its opponent's stats;
    # reset_index() also adds an "index" column, which gets the suffix as well
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns = game_opp.columns + "_opp"

    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)

    # The first eight characters of the file name are read as a YYYYMMDD date
    date_text = os.path.basename(box_score)[:8]
    full_game["date"] = date_text
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

    # A row is a win when its own total exceeds the opponent's total
    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)

    # Progress report after every 200 processed box scores
    processed = len(games)
    if processed % 200 == 0:
        print(f"{processed} / {len(box_scores)}")

200 / 10206
400 / 10206
600 / 10206
800 / 10206
1000 / 10206
1200 / 10206
1400 / 10206
1600 / 10206
1800 / 10206
2000 / 10206
2200 / 10206
2400 / 10206
2600 / 10206
2800 / 10206
3000 / 10206
3200 / 10206
3400 / 10206
3600 / 10206
3800 / 10206
4000 / 10206
4200 / 10206
4400 / 10206
4600 / 10206
4800 / 10206
5000 / 10206
5200 / 10206
5400 / 10206
5600 / 10206
5800 / 10206
6000 / 10206
6200 / 10206
6400 / 10206
6600 / 10206
6800 / 10206
7000 / 10206
7200 / 10206
7400 / 10206
7600 / 10206
7800 / 10206
8000 / 10206
8200 / 10206
8400 / 10206
8600 / 10206
8800 / 10206
9000 / 10206
9200 / 10206
9400 / 10206
9600 / 10206
9800 / 10206
10000 / 10206
10200 / 10206


In [28]:
# Stack the per-game frames into one table and order the rows by game date
games_dataframe = pd.concat(games, ignore_index=True).sort_values('date')

# Columns that are not needed downstream, removed in a single call
unwanted_columns = [
    '+/-', 'usg%',
    '+/-_opp', 'usg%_opp',
    'index_opp',
    'pts', 'pts_opp',
    'mp_opp',
]
games_dataframe = games_dataframe.drop(columns=unwanted_columns)

# 'mp' occurs more than once: keep its first occurrence and drop the repeats,
# leaving every column not named in duplicate_columns untouched
duplicate_columns = ['mp']
is_repeat = games_dataframe.columns.duplicated(keep='first')
is_listed = games_dataframe.columns.isin(duplicate_columns)
games_dataframe = games_dataframe.loc[:, ~(is_repeat & is_listed)]

# Prefix the three-point columns with "fg_"
# (SQL column names cannot start with a digit)
columns_to_rename = ["3p", "3pa", "3p%", "3par", "3p_opp", "3pa_opp", "3p%_opp", "3par_opp"]
games_dataframe = games_dataframe.rename(
    columns={name: "fg_" + name for name in columns_to_rename}
)

# Spell out "%" as "_percent" in every column name
games_dataframe.columns = games_dataframe.columns.str.replace("%", "_percent", regex=False)

# Renumber the rows 0..n-1 after the sort
games_dataframe = games_dataframe.reset_index(drop=True)

In [29]:
# Show the cleaned frame as this cell's output so the result can be inspected
games_dataframe

Unnamed: 0,mp,fg,fga,fg_percent,fg_3p,fg_3pa,fg_3p_percent,ft,fta,ft_percent,...,blk_percent_opp,tov_percent_opp,ortg_opp,drtg_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,15.4,11.8,97.5,95.5,CHI,97,1,2016,2015-10-27,False
1,240.0,35.0,83.0,0.422,6.0,18.0,0.333,19.0,27.0,0.704,...,10.8,15.9,110.9,94.9,GSW,111,1,2016,2015-10-27,False
2,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,4.5,15.9,94.9,110.9,NOP,95,0,2016,2015-10-27,True
3,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,10.3,9.0,95.5,97.5,CLE,95,0,2016,2015-10-27,True
4,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,5.5,12.3,111.2,98.6,DET,106,0,2016,2015-10-27,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20407,240.0,34.0,92.0,0.370,11.0,35.0,0.314,15.0,19.0,0.789,...,8.8,12.4,120.4,103.8,DEN,109,0,2023,2023-06-07,False
20408,240.0,35.0,78.0,0.449,8.0,25.0,0.320,17.0,20.0,0.850,...,13.2,6.4,121.9,107.2,DEN,108,0,2023,2023-06-09,False
20409,240.0,39.0,79.0,0.494,14.0,28.0,0.500,16.0,21.0,0.762,...,5.9,13.9,107.2,121.9,MIA,95,1,2023,2023-06-09,True
20410,240.0,38.0,84.0,0.452,5.0,28.0,0.179,13.0,23.0,0.565,...,12.5,7.2,92.6,97.8,MIA,89,0,2023,2023-06-12,True


In [30]:
# Write the cleaned frame to nba_games.csv in the working directory.
# The row index is written too (index=False is not passed), so the file's first
# column is the unnamed 0..n-1 index; read it back with index_col=0.
games_dataframe.to_csv("nba_games.csv")