In [None]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Directory that holds the scraped box-score HTML files (relative path).
BOX_SCORE_DIR = "../data/raw/scraped_html/scores"

In [None]:
# Bare names of every entry in the box-score directory (not yet filtered or joined to the directory).
box_scores = os.listdir(BOX_SCORE_DIR)

In [None]:
# Number of entries currently held in box_scores.
len(box_scores)

In [None]:
# Build the full path to every box-score HTML file.
# The directory is listed again here (instead of rebinding `box_scores` from
# its own previous value) so this cell can be re-run safely: it never joins
# the directory onto paths that were already joined.
box_scores = [
    os.path.join(BOX_SCORE_DIR, f)
    for f in os.listdir(BOX_SCORE_DIR)
    if f.endswith(".html")
]

In [None]:
def parse_html(box_score):
    """Load one saved box-score page and return it as a BeautifulSoup tree.

    The file is read as UTF-8. If strict decoding fails, the error is printed
    and the file is read again with undecodable bytes replaced. Every
    ``tr.over_header`` and ``tr.thead`` row is removed from the tree before it
    is returned.

    Args:
        box_score: Path to the HTML file.

    Returns:
        The parsed soup with those header rows removed.
    """
    try:
        with open(box_score, encoding='utf-8') as strict_file:
            html = strict_file.read()
    except UnicodeDecodeError as e:
        # Report the problem file, then fall back to a lenient read.
        print(f"Error decoding file {box_score}: {e}")
        with open(box_score, encoding='utf-8', errors='replace') as lenient_file:
            html = lenient_file.read()

    # NOTE(review): no parser is named here, so BeautifulSoup chooses one from
    # whatever is installed; consider pinning it for reproducible parsing.
    soup = BeautifulSoup(html)

    # Remove the extra header rows (presumably so the tables later read with
    # pandas end up with a single header row -- confirm against the pages).
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [None]:
def read_line_score(soup):
    """Return each team's name and final score from the line-score table.

    Reads the table whose id is ``line_score``, renames its first column to
    ``team`` and its last column to ``total``, and keeps only those two.

    Args:
        soup: Parsed page (anything whose ``str()`` is the page's HTML).

    Returns:
        DataFrame with the columns ``team`` and ``total``, one row per table row.
    """
    # Wrap the markup in StringIO: passing a literal HTML string straight to
    # read_html is deprecated in recent pandas versions.
    line_score = pd.read_html(StringIO(str(soup)), attrs={'id': 'line_score'})[0]

    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    # Only care about the team and total points
    line_score = line_score[["team", "total"]]

    return line_score

In [None]:
def convert_to_seconds(time_str):
    """Convert a playing time written as ``MM:SS`` into a number of seconds.

    A value with exactly two colon-separated integer parts yields
    ``minutes * 60 + seconds``; any other number of integer parts yields only
    the first part times 60. Values that cannot be split or parsed as integers
    (e.g. the player didn't play) are returned unchanged.
    """
    try:
        parts = [int(piece) for piece in time_str.split(':')]
    except (ValueError, AttributeError):
        # Not a string, or not made of integers: hand the value back as-is.
        return time_str

    seconds = parts[0] * 60
    if len(parts) == 2:
        seconds += parts[1]
    return seconds

In [None]:
def read_stats(soup, team, stat):
    """Read one team's box-score table and return it as a numeric DataFrame.

    The table is looked up by the id ``box-{team}-game-{stat}`` and its first
    column becomes the index. A seconds-played column ``SP`` is derived from
    ``MP`` before every column is coerced to numeric, so any value that is not
    a number (including the original ``MM:SS`` strings in ``MP``) becomes NaN.

    Args:
        soup: Parsed page (anything whose ``str()`` is the page's HTML).
        team: Team identifier used in the table id.
        stat: Table kind used in the table id (the notebook passes "basic"
            and "advanced").

    Returns:
        DataFrame of numeric stats with the extra ``SP`` column.
    """
    # Wrap the markup in StringIO: passing a literal HTML string straight to
    # read_html is deprecated in recent pandas versions.
    df = pd.read_html(
        StringIO(str(soup)),
        attrs={"id": f"box-{team}-game-{stat}"},
        index_col=0,
    )[0]
    # Create a "seconds played (SP)" column from MP
    df['SP'] = df['MP'].apply(convert_to_seconds)
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [None]:
def read_season_info(soup):
    """Return the season label taken from the page's bottom navigation.

    Collects the ``href`` of every link inside ``#bottom_nav_container``,
    takes the second one, and returns the part of its file name that comes
    before the first underscore (as a string).
    """
    nav_container = soup.select("#bottom_nav_container")[0]

    links = []
    for anchor in nav_container.find_all('a'):
        links.append(anchor["href"])

    # Presumably the second link points at the season's games page -- confirm
    # against the scraped pages.
    file_name = os.path.basename(links[1])
    return file_name.split("_")[0]

In [None]:
# Parse every box score into one per-team DataFrame of player stats.
# The loop variables (basic, advanced, total_stats, ...) are left at module
# level on purpose: the cells below inspect their values from the last game.
base_cols = None  # this contains which stats we want to pull for each player

all_player_data = []

box_scores_parsed = 0

for box_score in box_scores:
    print(box_score)  # prints the current file
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["team"])
    box_scores_parsed += 1

    # Season and date belong to the game, not to a team, so work them out once
    # per file instead of once per team.
    season = read_season_info(soup)
    game_date = os.path.basename(box_score)[:8]  # leading YYYYMMDD of the file name

    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        advanced_cols_to_append = advanced.columns.difference(basic.columns)  # this is so we don't merge duplicate columns!
        total_stats = pd.merge(basic, advanced[advanced_cols_to_append], left_index=True, right_index=True)

        # Make sure each player is accounted for...
        assert len(basic.index) == len(advanced.index) == len(total_stats.index)

        # Remove the MP column
        total_stats = total_stats.drop("MP", axis=1)

        # Move SP column to the front
        SP_col = total_stats.pop("SP")
        total_stats.insert(0, "SP", SP_col)

        # Remove players who didn't play this game (their SP could not be parsed)
        total_stats = total_stats.dropna(subset=["SP"])

        # Convert column names into lowercase
        total_stats.columns = total_stats.columns.str.lower()

        # Set the base_cols (taken from the first team table that is parsed)
        if base_cols is None:
            base_cols = list(total_stats.columns.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]   # drop bpm as this is not present in all box_scores

        # Include only the base_cols. The explicit copy makes this an
        # independent frame, so the column assignments below do not raise
        # SettingWithCopyWarning.
        total_stats = total_stats[base_cols].copy()

        # Add team, opposing team, and whether home/away
        home = team == teams[1]  # home team is listed second (teams[1])
        total_stats["home"] = 1 if home else 0
        total_stats["team"] = team
        team_opp = [t for t in teams if t != team][0]
        total_stats["team_opp"] = team_opp  # get opposing team

        # Add which season this game happened
        total_stats["season"] = season

        # Add the date of the game
        total_stats["date"] = game_date
        total_stats["date"] = pd.to_datetime(total_stats["date"], format="%Y%m%d")

        # The team won if its final score is higher than the opponent's
        team_total = line_score.loc[line_score["team"] == team, "total"].iloc[0]
        opp_total = line_score.loc[line_score["team"] == team_opp, "total"].iloc[0]
        total_stats["won"] = team_total > opp_total

        # Rename the 'Starters' index to "player_name"
        total_stats.index.name = "player_name"

        # Remove the last row (which is "Team Totals")
        total_stats = total_stats[:-1]

        all_player_data.append(total_stats)

    if box_scores_parsed % 100 == 0:
        print(f"{box_scores_parsed} / {len(box_scores)}")

In [None]:
# Basic stats table left over from the last team processed by the loop above.
basic

In [None]:
# Advanced stats table left over from the last team processed by the loop above.
advanced

In [None]:
# Final per-player frame for the last team processed by the loop above.
total_stats

In [None]:
# Column names (and how many there are) in the last team's final frame.
print(total_stats.columns)
print(len(total_stats.columns))

In [None]:
# Stat columns kept for every game (and how many there are).
print(base_cols)
print(len(base_cols))

In [None]:
# Merge all player data into single df.
# ignore_index=False keeps each frame's own index (the player names) rather
# than renumbering the rows.
all_player_data_df = pd.concat(all_player_data, ignore_index=False)

In [None]:
# Display the combined frame of every player's stats.
all_player_data_df

In [None]:
# Column names (and how many there are) in the combined frame.
print(all_player_data_df.columns)
print(len(all_player_data_df.columns))

In [None]:
# Directory the combined CSV is written into (relative path).
output_csv_path = "../data/raw"

# Write the combined frame, index included, to all_player_stats.csv.
all_player_data_df.to_csv(f"{output_csv_path}/all_player_stats.csv")