In [1]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Directory (relative path) holding the scraped box-score HTML files.
BOX_SCORE_DIR = "../data/raw/scraped_html/scores"

In [3]:
# Bare entry names (not full paths) of everything inside BOX_SCORE_DIR.
box_scores = os.listdir(BOX_SCORE_DIR)

In [4]:
# Number of directory entries found.
len(box_scores)

11022

In [5]:
# Get path to each box score html file.
# - sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
#   the run reproducible.
# - basename(): this cell rebinds `box_scores`, so running it again receives
#   already-joined paths; stripping to the file name first avoids prefixing the
#   directory twice.
box_scores = sorted(
    os.path.join(BOX_SCORE_DIR, os.path.basename(f))
    for f in box_scores
    if f.endswith(".html")
)

In [6]:
def parse_html(box_score):
    """Load one box-score HTML file and return its parsed tree without header rows.

    Parameters
    ----------
    box_score : str
        Path to the HTML file.

    Returns
    -------
    BeautifulSoup
        Parsed document from which every ``tr.over_header`` and ``tr.thead``
        row has been removed.

    Notes
    -----
    The file is read as UTF-8. If that fails, the error is printed and the file
    is read again with undecodable bytes replaced instead of raising.
    """
    try:
        with open(box_score, encoding='utf-8') as handle:
            markup = handle.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {box_score}: {e}")
        with open(box_score, encoding='utf-8', errors='replace') as handle:
            markup = handle.read()

    soup = BeautifulSoup(markup)
    # Strip the extra header rows, one selector at a time.
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [7]:
def read_line_score(soup):
    """Extract each team's name and final total from the line-score table.

    Parameters
    ----------
    soup : object
        Parsed page; ``str(soup)`` must yield HTML containing a table with
        ``id="line_score"``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``team`` (the table's first column) and ``total`` (the
        table's last column), one row per table row.
    """
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated since pandas 2.1.
    line_score = pd.read_html(StringIO(str(soup)), attrs={'id': 'line_score'})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    # Only care about the team and total points
    return line_score[["team", "total"]]

In [8]:
def convert_to_seconds(time_str):
    """Convert a playing time written as ``MM:SS`` into seconds.

    The string is split on ``:`` and each piece is parsed as an int. With
    exactly two pieces the result is ``minutes * 60 + seconds``; with any other
    number of pieces only the first piece is used (``first * 60``).

    If the value is not a string, or a piece is not an integer (e.g. the player
    didn't play), the input is returned unchanged.
    """
    try:
        pieces = [int(piece) for piece in time_str.split(':')]
    except (ValueError, AttributeError):
        return time_str  # if the player didn't play...

    seconds = pieces[0] * 60
    if len(pieces) == 2:
        seconds += pieces[1]
    return seconds

In [9]:
def read_stats(soup, team, stat):
    """Read one team's box-score table and return it as an all-numeric frame.

    Parameters
    ----------
    soup : object
        Parsed page; ``str(soup)`` must yield HTML containing the table.
    team : str
        Team identifier used in the table id.
    stat : str
        Table kind used in the table id (the notebook passes ``"basic"`` and
        ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        The table with id ``box-{team}-game-{stat}``, indexed by its first
        column, plus an ``SP`` column derived from ``MP``. Every column is
        passed through ``pd.to_numeric(errors="coerce")``, so any value that
        is not numeric (including ``MM:SS`` strings in ``MP``) becomes NaN.
    """
    # Wrap the markup in StringIO: passing a literal HTML string to read_html
    # is deprecated since pandas 2.1.
    df = pd.read_html(
        StringIO(str(soup)),
        attrs={"id": f"box-{team}-game-{stat}"},
        index_col=0,
    )[0]
    # Create a "seconds played (SP)" column from MP
    df['SP'] = df['MP'].apply(convert_to_seconds)
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [10]:
def read_season_info(soup):
    """Return the season label taken from the page's bottom navigation.

    The label is the text before the first ``_`` in the file-name part of the
    second link found inside ``#bottom_nav_container``.
    """
    nav_container = soup.select("#bottom_nav_container")[0]
    links = nav_container.find_all('a')
    hrefs = [link["href"] for link in links]
    target_name = os.path.basename(hrefs[1])
    return target_name.split("_")[0]

In [11]:
base_cols = None  # stat columns kept for every team table; fixed by the first table processed

all_player_data = []  # one DataFrame per (game, team); concatenated in a later cell

box_scores_parsed = 0  # number of files handled so far (stays 0 if box_scores is empty)

for box_scores_parsed, box_score in enumerate(box_scores, start=1):
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["team"])

    # Season and date are the same for both teams, so look them up once per game.
    season = read_season_info(soup)
    game_date = os.path.basename(box_score)[:8]  # file name starts with the date as YYYYMMDD

    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        advanced_cols_to_append = advanced.columns.difference(basic.columns)  # this is so we don't merge duplicate columns!
        total_stats = pd.merge(basic, advanced[advanced_cols_to_append], left_index=True, right_index=True)

        # Remove the MP column
        total_stats = total_stats.drop("MP", axis=1)

        # Move SP column to the front
        SP_col = total_stats.pop("SP")
        total_stats.insert(0, "SP", SP_col)

        # Remove players who didn't play this game (SP is NaN). A boolean mask
        # filters each row on its own value, instead of dropping every row that
        # happens to share an index label with a non-player.
        total_stats = total_stats[total_stats["SP"].notna()]

        # Convert column names into lowercase
        total_stats.columns = total_stats.columns.str.lower()

        # Set the base_cols
        if base_cols is None:
            base_cols = list(total_stats.columns.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]   # drop bpm as this is not present in all box_scores

        # Include only the base_cols. copy() yields an independent frame so the
        # column assignments below cannot raise SettingWithCopyWarning.
        total_stats = total_stats[base_cols].copy()

        # Add team, opposing team, and whether home/away
        home = team == teams[1]  # home team is listed second (teams[1])
        total_stats["home"] = 1 if home else 0
        total_stats["team"] = team
        team_opp = [t for t in teams if t != team][0]
        total_stats["team_opp"] = team_opp  # get opposing team

        # Add which season this game happened
        total_stats["season"] = season

        # Add the date of the game
        total_stats["date"] = game_date
        total_stats["date"] = pd.to_datetime(total_stats["date"], format="%Y%m%d")

        # The team won if its line-score total is strictly greater than the opponent's
        team_total = line_score.loc[line_score["team"] == team, "total"].iloc[0]
        opp_total = line_score.loc[line_score["team"] == team_opp, "total"].iloc[0]
        total_stats["won"] = team_total > opp_total

        # Rename the 'Starters' index to "player_name"
        total_stats.index.name = "player_name"

        # Remove the last row (which is "Team Totals")
        total_stats = total_stats.iloc[:-1]

        all_player_data.append(total_stats)

    if box_scores_parsed % 100 == 0:
        print(f"{box_scores_parsed} / {len(box_scores)}")

100 / 11022
200 / 11022
300 / 11022
400 / 11022
500 / 11022
600 / 11022
700 / 11022
800 / 11022
900 / 11022
1000 / 11022
1100 / 11022
Error decoding file ../data/raw/scraped_html/scores\201604010ATL.html: 'utf-8' codec can't decode byte 0x97 in position 6440: invalid start byte
Error decoding file ../data/raw/scraped_html/scores\201604010CHO.html: 'utf-8' codec can't decode byte 0x97 in position 6443: invalid start byte
Error decoding file ../data/raw/scraped_html/scores\201604010DET.html: 'utf-8' codec can't decode byte 0x97 in position 6439: invalid start byte
1200 / 11022
1300 / 11022
1400 / 11022
1500 / 11022
1600 / 11022
1700 / 11022
1800 / 11022
1900 / 11022
2000 / 11022
2100 / 11022
2200 / 11022
2300 / 11022
2400 / 11022
2500 / 11022
2600 / 11022
2700 / 11022
2800 / 11022
2900 / 11022
3000 / 11022
3100 / 11022
3200 / 11022
3300 / 11022
3400 / 11022
3500 / 11022
3600 / 11022
3700 / 11022
3800 / 11022
3900 / 11022
4000 / 11022
4100 / 11022
4200 / 11022
4300 / 11022
4400 / 11022
45

In [12]:
# Basic box-score table left over from the last team handled by the parsing loop.
basic

Unnamed: 0_level_0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,SP
Starters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Keyonte George,,11.0,22.0,0.5,9.0,16.0,0.563,2.0,3.0,0.667,...,2.0,2.0,6.0,3.0,0.0,2.0,6.0,33.0,7.0,2433.0
Lauri Markkanen,,6.0,23.0,0.261,4.0,11.0,0.364,4.0,4.0,1.0,...,9.0,14.0,3.0,0.0,0.0,1.0,1.0,20.0,8.0,2037.0
Collin Sexton,,10.0,18.0,0.556,3.0,8.0,0.375,12.0,13.0,0.923,...,2.0,3.0,9.0,1.0,1.0,3.0,2.0,35.0,15.0,2032.0
John Collins,,8.0,14.0,0.571,1.0,3.0,0.333,1.0,1.0,1.0,...,9.0,13.0,2.0,1.0,1.0,1.0,4.0,18.0,0.0,1835.0
Walker Kessler,,1.0,2.0,0.5,0.0,1.0,0.0,2.0,2.0,1.0,...,3.0,6.0,2.0,1.0,1.0,1.0,3.0,4.0,1.0,1519.0
Jordan Clarkson,,3.0,8.0,0.375,1.0,5.0,0.2,6.0,7.0,0.857,...,0.0,2.0,9.0,0.0,0.0,3.0,2.0,13.0,-12.0,1999.0
Kris Dunn,,2.0,3.0,0.667,2.0,3.0,0.667,2.0,2.0,1.0,...,4.0,5.0,1.0,2.0,0.0,3.0,2.0,8.0,-13.0,1257.0
Taylor Hendricks,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,...,2.0,3.0,0.0,2.0,0.0,2.0,0.0,3.0,-12.0,959.0
Talen Horton-Tucker,,1.0,2.0,0.5,1.0,1.0,1.0,0.0,0.0,,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,-9.0,329.0
Kira Lewis Jr.,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Advanced box-score table left over from the last team handled by the parsing loop.
advanced

Unnamed: 0_level_0,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,SP
Starters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Keyonte George,,0.708,0.705,0.727,0.136,0.0,5.4,2.7,23.7,3.5,0.0,7.9,24.4,142.0,136.0,5.5,2433.0
Lauri Markkanen,,0.404,0.348,0.478,0.174,15.7,28.9,22.2,12.3,0.0,0.0,3.9,29.6,112.0,138.0,-9.9,2037.0
Collin Sexton,,0.738,0.639,0.444,0.722,3.1,6.4,4.8,44.3,1.4,2.8,11.2,30.8,154.0,139.0,10.5,2032.0
John Collins,,0.623,0.607,0.214,0.071,14.0,32.1,22.9,10.3,1.5,3.1,6.5,19.7,137.0,132.0,-1.0,1835.0
Walker Kessler,,0.694,0.5,0.5,1.0,12.6,12.9,12.8,9.2,1.9,3.8,25.8,6.0,151.0,136.0,-3.2,1519.0
Jordan Clarkson,,0.587,0.438,0.625,0.875,6.4,0.0,3.2,33.5,0.0,0.0,21.3,16.5,134.0,146.0,-4.4,1999.0
Kris Dunn,,1.031,1.0,1.0,0.667,5.1,20.8,12.9,6.0,4.5,0.0,43.6,12.8,108.0,130.0,-1.2,1257.0
Taylor Hendricks,,1.5,1.5,1.0,0.0,6.7,13.7,10.1,0.0,5.9,0.0,66.7,7.3,64.0,129.0,0.0,959.0
Talen Horton-Tucker,,0.75,0.75,0.5,0.0,0.0,0.0,0.0,25.6,0.0,0.0,0.0,14.2,175.0,145.0,-2.8,329.0
Kira Lewis Jr.,,,,,,,,,,,,,,,,,


In [14]:
# Final per-team frame left over from the last iteration of the parsing loop.
total_stats

Unnamed: 0_level_0,sp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,trb%,ts%,usg%,efg%,home,team,team_opp,season,date,won
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Keyonte George,2433.0,11.0,22.0,0.5,9.0,16.0,0.563,2.0,3.0,0.667,...,2.7,0.708,24.4,0.705,1,UTA,GSW,2024,2024-02-15,False
Lauri Markkanen,2037.0,6.0,23.0,0.261,4.0,11.0,0.364,4.0,4.0,1.0,...,22.2,0.404,29.6,0.348,1,UTA,GSW,2024,2024-02-15,False
Collin Sexton,2032.0,10.0,18.0,0.556,3.0,8.0,0.375,12.0,13.0,0.923,...,4.8,0.738,30.8,0.639,1,UTA,GSW,2024,2024-02-15,False
John Collins,1835.0,8.0,14.0,0.571,1.0,3.0,0.333,1.0,1.0,1.0,...,22.9,0.623,19.7,0.607,1,UTA,GSW,2024,2024-02-15,False
Walker Kessler,1519.0,1.0,2.0,0.5,0.0,1.0,0.0,2.0,2.0,1.0,...,12.8,0.694,6.0,0.5,1,UTA,GSW,2024,2024-02-15,False
Jordan Clarkson,1999.0,3.0,8.0,0.375,1.0,5.0,0.2,6.0,7.0,0.857,...,3.2,0.587,16.5,0.438,1,UTA,GSW,2024,2024-02-15,False
Kris Dunn,1257.0,2.0,3.0,0.667,2.0,3.0,0.667,2.0,2.0,1.0,...,12.9,1.031,12.8,1.0,1,UTA,GSW,2024,2024-02-15,False
Taylor Hendricks,959.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,...,10.1,1.5,7.3,1.5,1,UTA,GSW,2024,2024-02-15,False
Talen Horton-Tucker,329.0,1.0,2.0,0.5,1.0,1.0,1.0,0.0,0.0,,...,0.0,0.75,14.2,0.75,1,UTA,GSW,2024,2024-02-15,False


In [15]:
# Column labels of the last per-team frame, followed by how many there are.
print(total_stats.columns, len(total_stats.columns), sep="\n")

Index(['sp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', '+/-', '3par',
       'ast%', 'blk%', 'drb%', 'drtg', 'ftr', 'orb%', 'ortg', 'stl%', 'tov%',
       'trb%', 'ts%', 'usg%', 'efg%', 'home', 'team', 'team_opp', 'season',
       'date', 'won'],
      dtype='object')
40


In [16]:
# Stat columns selected for every team table, followed by how many there are.
print(base_cols, len(base_cols), sep="\n")

['sp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', '+/-', '3par', 'ast%', 'blk%', 'drb%', 'drtg', 'ftr', 'orb%', 'ortg', 'stl%', 'tov%', 'trb%', 'ts%', 'usg%', 'efg%']
34


In [17]:
# Stack every per-team frame into one DataFrame; the player-name index is kept
# (pd.concat does not renumber the index by default).
all_player_data_df = pd.concat(all_player_data)

In [18]:
# Preview the combined frame of all per-team player stats.
all_player_data_df

Unnamed: 0_level_0,sp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,trb%,ts%,usg%,efg%,home,team,team_opp,season,date,won
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Andre Drummond,2229.0,6.0,16.0,0.375,0.0,0.0,,6.0,10.0,0.600,...,24.8,0.441,23.6,0.375,0,DET,ATL,2016,2015-10-27,True
Marcus Morris,2225.0,6.0,19.0,0.316,1.0,4.0,0.250,5.0,6.0,0.833,...,13.1,0.416,22.9,0.342,0,DET,ATL,2016,2015-10-27,True
Kentavious Caldwell-Pope,2223.0,7.0,14.0,0.500,4.0,7.0,0.571,3.0,3.0,1.000,...,5.2,0.685,18.3,0.643,0,DET,ATL,2016,2015-10-27,True
Ersan İlyasova,2066.0,6.0,12.0,0.500,3.0,6.0,0.500,1.0,2.0,0.500,...,9.9,0.621,18.1,0.625,0,DET,ATL,2016,2015-10-27,True
Reggie Jackson,1927.0,4.0,10.0,0.400,2.0,4.0,0.500,5.0,5.0,1.000,...,12.1,0.615,17.3,0.500,0,DET,ATL,2016,2015-10-27,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Walker Kessler,1519.0,1.0,2.0,0.500,0.0,1.0,0.000,2.0,2.0,1.000,...,12.8,0.694,6.0,0.500,1,UTA,GSW,2024,2024-02-15,False
Jordan Clarkson,1999.0,3.0,8.0,0.375,1.0,5.0,0.200,6.0,7.0,0.857,...,3.2,0.587,16.5,0.438,1,UTA,GSW,2024,2024-02-15,False
Kris Dunn,1257.0,2.0,3.0,0.667,2.0,3.0,0.667,2.0,2.0,1.000,...,12.9,1.031,12.8,1.000,1,UTA,GSW,2024,2024-02-15,False
Taylor Hendricks,959.0,1.0,1.0,1.000,1.0,1.0,1.000,0.0,0.0,,...,10.1,1.500,7.3,1.500,1,UTA,GSW,2024,2024-02-15,False


In [19]:
# Column labels of the combined frame, followed by how many there are.
print(all_player_data_df.columns, len(all_player_data_df.columns), sep="\n")

Index(['sp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', '+/-', '3par',
       'ast%', 'blk%', 'drb%', 'drtg', 'ftr', 'orb%', 'ortg', 'stl%', 'tov%',
       'trb%', 'ts%', 'usg%', 'efg%', 'home', 'team', 'team_opp', 'season',
       'date', 'won'],
      dtype='object')
40


In [20]:
# Write the combined player stats (index included) to a CSV file in the raw data directory.
output_csv_path = "../data/raw"
all_player_stats_csv = f"{output_csv_path}/all_player_stats.csv"

all_player_data_df.to_csv(all_player_stats_csv)