In [1]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Directory holding the scraped box-score pages (path is relative to the notebook's working directory)
BOX_SCORE_DIR = "../data/raw/scraped_html/scores"

In [3]:
# Bare entry names (no directory prefix) of everything inside BOX_SCORE_DIR
box_scores = os.listdir(BOX_SCORE_DIR)

In [4]:
# Number of directory entries found, counted before the list is narrowed to .html files
len(box_scores)

11022

In [5]:
# Get path to each box score html file.
# The list is built straight from the directory listing rather than from the
# previous value of `box_scores`, so re-running this cell cannot prefix
# BOX_SCORE_DIR a second time. It is sorted because os.listdir returns entries
# in arbitrary order and a later cell relies on `box_scores[0]`.
box_scores = sorted(
    os.path.join(BOX_SCORE_DIR, f)
    for f in os.listdir(BOX_SCORE_DIR)
    if f.endswith(".html")
)

In [6]:
def parse_html(box_score):
    """Load one scraped box-score page and remove its extra table header rows.

    Parameters
    ----------
    box_score : str
        Path to the HTML file to read.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed page with every ``tr.over_header`` and ``tr.thead`` row
        removed from the tree.
    """
    # Player names in these pages contain non-ASCII characters, so decode
    # explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes the scraper saved the pages as UTF-8 -- confirm.
    with open(box_score, encoding="utf-8") as f:
        html = f.read()

    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one so results do not vary between environments.
    soup = BeautifulSoup(html)

    # Strip the header rows in two passes, one per selector. Plain loops are
    # used because decompose() is called only for its side effect.
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [7]:
def read_line_score(soup):
    """Read the line-score table and keep each row's team and final total.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed box-score page. It is serialised with ``str(soup)`` and the
        table whose ``id`` attribute is ``line_score`` is read from it.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``team`` (the table's first column) and ``total`` (the
        table's last column), one row per row of the table.
    """
    # read_html expects a path, URL or file-like object; passing raw markup as
    # a string is deprecated, so wrap the serialised page in a StringIO buffer.
    line_score = pd.read_html(StringIO(str(soup)), attrs={"id": "line_score"})[0]

    # Rename the first and last columns, whatever their scraped headers were
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols

    # Only care about the team and total points
    return line_score[["team", "total"]]

In [8]:
def convert_to_seconds(time_str):
    """Turn a minutes-played string such as ``"MM:SS"`` into a number of seconds.

    The string is split on ``:`` and every piece is parsed as an integer.
    With exactly two pieces the result is ``minutes * 60 + seconds``; with any
    other number of pieces only the first one is used, as whole minutes.

    If the value cannot be split (it is not a string) or a piece is not an
    integer (e.g. a player who did not play), the input is returned unchanged.
    """
    try:
        pieces = [int(piece) for piece in time_str.split(':')]
    except (ValueError, AttributeError):
        return time_str  # if the player didn't play...

    seconds = pieces[0] * 60
    if len(pieces) == 2:
        seconds += pieces[1]
    return seconds

In [9]:
def read_stats(soup, team, stat):
    """Read one team's box-score table and return it as an all-numeric frame.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed box-score page; serialised with ``str(soup)`` before reading.
    team : str
        Team identifier used in the table id.
    stat : str
        Table flavour used in the table id (the notebook passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        The table with id ``box-{team}-game-{stat}``, indexed by its first
        column, plus an ``SP`` (seconds played) column derived from ``MP``.
        Every column is passed through ``pd.to_numeric(errors="coerce")``, so
        any value that is not numeric (including ``MM:SS`` strings in ``MP``)
        becomes NaN.
    """
    # Read in the box score stats. The markup is wrapped in StringIO because
    # passing a literal HTML string to read_html is deprecated.
    df = pd.read_html(StringIO(str(soup)), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    # Create a "seconds played (SP)" column from MP
    df['SP'] = df['MP'].apply(convert_to_seconds)
    # Coerce everything to numbers; unparseable cells turn into NaN
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [10]:
def read_season_info(soup):
    """Return the season label found in the page's bottom navigation links.

    The first element matching ``#bottom_nav_container`` is located, the
    ``href`` of every ``<a>`` inside it is collected, and the season is taken
    as the text before the first underscore in the file name of the second
    link.
    """
    bottom_nav = soup.select("#bottom_nav_container")[0]
    links = [anchor["href"] for anchor in bottom_nav.find_all('a')]
    # Second link's file name, e.g. "<season>_<rest>"; keep the leading part
    link_file_name = os.path.basename(links[1])
    season, _, _ = link_file_name.partition("_")
    return season

In [11]:
base_cols = None  # stat columns kept for every team; fixed by the first team processed

all_player_data = []  # one per-team DataFrame of player rows is appended per loop pass

# Only a single game is processed here: the first box score in the list
box_score = box_scores[0]
soup = parse_html(box_score)
line_score = read_line_score(soup)
teams = list(line_score["team"])

# Game-level values are the same for both teams, so compute them once
season = read_season_info(soup)
# The file name starts with the game date as YYYYMMDD
game_date = pd.to_datetime(os.path.basename(box_score)[:8], format="%Y%m%d")

for team in teams:
    basic = read_stats(soup, team, "basic")
    advanced = read_stats(soup, team, "advanced")
    advanced_cols_to_append = advanced.columns.difference(basic.columns)  # this is so we don't merge duplicate columns!
    total_stats = pd.merge(basic, advanced[advanced_cols_to_append], left_index=True, right_index=True)

    # Make sure each player is accounted for...
    assert len(basic.index) == len(advanced.index) == len(total_stats.index)

    # Remove the last row (which is "Team Totals"). This runs before any row
    # filtering so the positional slice can never cut a real player instead.
    total_stats = total_stats.iloc[:-1]

    # Drop the MP column (SP replaces it) and move SP to the front
    stat_cols = [c for c in total_stats.columns if c not in ("MP", "SP")]
    total_stats = total_stats[["SP"] + stat_cols]

    # Remove players who didn't play this game. A boolean mask is used instead
    # of dropping by index label, which would remove every row sharing a name.
    total_stats = total_stats[total_stats["SP"].notna()]

    # Convert column names into lowercase
    total_stats = total_stats.rename(columns=str.lower)

    # Set the base_cols
    if base_cols is None:
        base_cols = list(total_stats.columns.drop_duplicates(keep="first"))
        base_cols = [b for b in base_cols if "bpm" not in b]   # drop bpm as this is not present in all box_scores

    # Team, opposing team, home/away and result for this side of the game
    home = team == teams[1]  # home team is listed second (teams[1])
    team_opp = [t for t in teams if t != team][0]
    team_total = line_score.loc[line_score["team"] == team, "total"].iloc[0]
    opp_total = line_score.loc[line_score["team"] == team_opp, "total"].iloc[0]

    # Include only the base_cols, then attach the game-level columns.
    # assign() returns a new frame, so nothing is written onto a column slice.
    total_stats = (
        total_stats[base_cols]
        .assign(
            home=1 if home else 0,
            team=team,
            team_opp=team_opp,
            season=season,
            date=game_date,
            won=team_total > opp_total,
        )
        # Rename the 'Starters' index to "player_name"
        .rename_axis("player_name")
    )

    all_player_data.append(total_stats)

In [12]:
# Basic box-score table left over from the final loop pass (the second-listed team)
basic

Unnamed: 0_level_0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,SP
Starters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Paul Millsap,,7.0,15.0,0.467,2.0,6.0,0.333,3.0,4.0,0.75,...,7.0,8.0,3.0,0.0,0.0,2.0,4.0,19.0,-22.0,2160.0
Jeff Teague,,7.0,16.0,0.438,1.0,3.0,0.333,3.0,4.0,0.75,...,2.0,2.0,4.0,0.0,0.0,5.0,1.0,18.0,-23.0,1912.0
Al Horford,,6.0,11.0,0.545,1.0,3.0,0.333,2.0,3.0,0.667,...,3.0,4.0,4.0,2.0,3.0,1.0,1.0,15.0,-5.0,1814.0
Kyle Korver,,3.0,9.0,0.333,1.0,5.0,0.2,0.0,0.0,,...,2.0,2.0,1.0,1.0,0.0,1.0,4.0,7.0,-9.0,1722.0
Kent Bazemore,,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,,...,7.0,7.0,1.0,0.0,0.0,4.0,3.0,0.0,-17.0,1250.0
Dennis SchrÃ¶der,,8.0,14.0,0.571,2.0,5.0,0.4,2.0,2.0,1.0,...,2.0,3.0,4.0,2.0,0.0,1.0,1.0,20.0,4.0,1522.0
Thabo Sefolosha,,1.0,3.0,0.333,0.0,1.0,0.0,0.0,0.0,,...,6.0,7.0,3.0,4.0,0.0,0.0,1.0,2.0,-1.0,1136.0
Lamar Patterson,,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,1.0,1.0,2.0,0.0,0.0,0.0,4.0,5.0,10.0,1098.0
Tiago Splitter,,2.0,5.0,0.4,0.0,0.0,,0.0,0.0,,...,1.0,4.0,0.0,0.0,1.0,0.0,4.0,4.0,-3.0,978.0
Mike Muscala,,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,...,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,5.0,479.0


In [13]:
# Advanced box-score table left over from the final loop pass (the second-listed team)
advanced

Unnamed: 0_level_0,MP,TS%,eFG%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,BPM,SP
Starters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Paul Millsap,,0.567,0.533,0.4,0.267,3.1,16.7,10.8,14.5,0.0,0.0,10.7,24.1,107.0,113.0,-0.5,2160.0
Jeff Teague,,0.507,0.469,0.188,0.25,0.0,5.4,3.0,22.8,0.0,0.0,22.0,33.1,84.0,121.0,-8.4,1912.0
Al Horford,,0.609,0.591,0.273,0.273,3.7,8.5,6.4,23.1,3.3,7.1,7.5,20.4,122.0,108.0,11.7,1814.0
Kyle Korver,,0.389,0.389,0.556,0.0,0.0,6.0,3.4,5.2,1.8,0.0,10.0,16.1,73.0,116.0,-9.0,1722.0
Kent Bazemore,,0.0,0.0,0.333,0.0,0.0,28.8,16.3,6.2,0.0,0.0,57.1,15.6,8.0,104.0,-21.2,1250.0
Dennis SchrÃ¶der,,0.672,0.643,0.357,0.143,4.4,6.8,5.7,34.6,4.0,0.0,6.3,29.0,130.0,111.0,14.6,1522.0
Thabo Sefolosha,,0.333,0.333,0.333,0.0,5.9,27.2,17.9,22.1,10.6,0.0,0.0,7.3,114.0,81.0,12.6,1136.0
Lamar Patterson,,1.33,1.5,1.0,2.0,0.0,4.7,2.6,15.3,0.0,0.0,0.0,4.8,258.0,121.0,2.1,1098.0
Tiago Splitter,,0.4,0.4,0.0,0.0,20.5,5.3,11.9,0.0,0.0,4.4,0.0,14.2,106.0,118.0,-7.4,978.0
Mike Muscala,,0.0,0.0,0.0,0.0,0.0,10.7,6.1,0.0,0.0,0.0,50.0,11.6,0.0,116.0,-22.2,479.0


In [14]:
# Finished per-player frame for the last team processed by the loop
total_stats

Unnamed: 0_level_0,sp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,trb%,ts%,usg%,efg%,home,team,team_opp,season,date,won
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Paul Millsap,2160.0,7.0,15.0,0.467,2.0,6.0,0.333,3.0,4.0,0.75,...,10.8,0.567,24.1,0.533,1,ATL,DET,2016,2015-10-27,False
Jeff Teague,1912.0,7.0,16.0,0.438,1.0,3.0,0.333,3.0,4.0,0.75,...,3.0,0.507,33.1,0.469,1,ATL,DET,2016,2015-10-27,False
Al Horford,1814.0,6.0,11.0,0.545,1.0,3.0,0.333,2.0,3.0,0.667,...,6.4,0.609,20.4,0.591,1,ATL,DET,2016,2015-10-27,False
Kyle Korver,1722.0,3.0,9.0,0.333,1.0,5.0,0.2,0.0,0.0,,...,3.4,0.389,16.1,0.389,1,ATL,DET,2016,2015-10-27,False
Kent Bazemore,1250.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,,...,16.3,0.0,15.6,0.0,1,ATL,DET,2016,2015-10-27,False
Dennis SchrÃ¶der,1522.0,8.0,14.0,0.571,2.0,5.0,0.4,2.0,2.0,1.0,...,5.7,0.672,29.0,0.643,1,ATL,DET,2016,2015-10-27,False
Thabo Sefolosha,1136.0,1.0,3.0,0.333,0.0,1.0,0.0,0.0,0.0,,...,17.9,0.333,7.3,0.333,1,ATL,DET,2016,2015-10-27,False
Lamar Patterson,1098.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,2.6,1.33,4.8,1.5,1,ATL,DET,2016,2015-10-27,False
Tiago Splitter,978.0,2.0,5.0,0.4,0.0,0.0,,0.0,0.0,,...,11.9,0.4,14.2,0.4,1,ATL,DET,2016,2015-10-27,False
Mike Muscala,479.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,,...,6.1,0.0,11.6,0.0,1,ATL,DET,2016,2015-10-27,False


In [15]:
# Column labels of the last team's frame, then how many there are (one per line)
print(total_stats.columns, len(total_stats.columns), sep="\n")

Index(['sp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', '+/-', '3par',
       'ast%', 'blk%', 'drb%', 'drtg', 'ftr', 'orb%', 'ortg', 'stl%', 'tov%',
       'trb%', 'ts%', 'usg%', 'efg%', 'home', 'team', 'team_opp', 'season',
       'date', 'won'],
      dtype='object')
40


In [16]:
# Stat columns shared by every team frame, then how many there are (one per line)
print(base_cols, len(base_cols), sep="\n")

['sp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', '+/-', '3par', 'ast%', 'blk%', 'drb%', 'drtg', 'ftr', 'orb%', 'ortg', 'stl%', 'tov%', 'trb%', 'ts%', 'usg%', 'efg%']
34


In [17]:
# Stack the per-team frames row-wise into a single df; ignore_index=False keeps
# each frame's existing index (the player names) instead of renumbering rows
all_player_data_df = pd.concat(objs=all_player_data, axis=0, ignore_index=False)

In [18]:
# Combined frame: the rows of every per-team frame collected in all_player_data
all_player_data_df

Unnamed: 0_level_0,sp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,trb%,ts%,usg%,efg%,home,team,team_opp,season,date,won
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Andre Drummond,2229.0,6.0,16.0,0.375,0.0,0.0,,6.0,10.0,0.6,...,24.8,0.441,23.6,0.375,0,DET,ATL,2016,2015-10-27,True
Marcus Morris,2225.0,6.0,19.0,0.316,1.0,4.0,0.25,5.0,6.0,0.833,...,13.1,0.416,22.9,0.342,0,DET,ATL,2016,2015-10-27,True
Kentavious Caldwell-Pope,2223.0,7.0,14.0,0.5,4.0,7.0,0.571,3.0,3.0,1.0,...,5.2,0.685,18.3,0.643,0,DET,ATL,2016,2015-10-27,True
Ersan Ä°lyasova,2066.0,6.0,12.0,0.5,3.0,6.0,0.5,1.0,2.0,0.5,...,9.9,0.621,18.1,0.625,0,DET,ATL,2016,2015-10-27,True
Reggie Jackson,1927.0,4.0,10.0,0.4,2.0,4.0,0.5,5.0,5.0,1.0,...,12.1,0.615,17.3,0.5,0,DET,ATL,2016,2015-10-27,True
Stanley Johnson,1469.0,3.0,10.0,0.3,1.0,3.0,0.333,0.0,0.0,,...,7.9,0.35,17.6,0.35,0,DET,ATL,2016,2015-10-27,True
Steve Blake,953.0,1.0,6.0,0.167,1.0,5.0,0.2,0.0,0.0,,...,0.0,0.25,22.2,0.25,0,DET,ATL,2016,2015-10-27,True
Jodie Meeks,657.0,1.0,4.0,0.25,0.0,0.0,,0.0,0.0,,...,8.9,0.25,17.9,0.25,0,DET,ATL,2016,2015-10-27,True
Aron Baynes,651.0,3.0,5.0,0.6,0.0,0.0,,0.0,0.0,,...,22.3,0.6,21.7,0.6,0,DET,ATL,2016,2015-10-27,True
Paul Millsap,2160.0,7.0,15.0,0.467,2.0,6.0,0.333,3.0,4.0,0.75,...,10.8,0.567,24.1,0.533,1,ATL,DET,2016,2015-10-27,False


In [None]:
# Column labels of the combined frame, then how many there are (one per line)
print(all_player_data_df.columns, len(all_player_data_df.columns), sep="\n")

In [None]:
# Directory the combined CSV is written to (relative to the notebook's working directory)
output_csv_path = "../data/raw"

# Write the combined frame, index included, to all_player_stats.csv.
# NOTE(review): the processing cell above only parses box_scores[0], so this
# file currently holds a single game's players despite its name.
all_player_data_df.to_csv(f"{output_csv_path}/all_player_stats.csv")