# PART 1: DATA PROCUREMENT

This notebook uses web scraping (Selenium driving a browser, with pandas parsing the rendered tables) to retrieve player statistics and lineup data from the NBA's stats pages on nba.com. The data is then filtered, compiled, and written to CSV files (a master player-stats file and a lineup-stats file) for the next stage of the project pipeline.

---

In [1]:
# API Client
from nba_api.stats.endpoints import boxscoretraditionalv2 as BST
from nba_api.stats.static import players

# Web Scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait

# Data Management
import numpy as np
import pandas as pd

# Utils
import difflib
from functools import reduce
from time import time, sleep

## 1A: Data Collection

**Objective:** Scrape data from various API-endpoints and store in local dataframes.

---

#### Define end-point URLs of interest:

In [2]:
# Page URLs handed to the scraper, grouped by stat family. Most request season
# totals (PerMode=Totals) for the 2021-22 regular season.

# Standard stats
player_bio_url = r'https://www.nba.com/players'
trad_stats_url = r'https://www.nba.com/stats/players/traditional/?sort=MIN&dir=-1&Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
adv_stats_url = r'https://www.nba.com/stats/players/advanced/?sort=GP&dir=-1&Season=2021-22&SeasonType=Regular%20Season'
# NOTE(review): misc_stats_url is defined here but is not passed to the scraper in any visible cell.
misc_stats_url = r'https://www.nba.com/stats/players/misc/?sort=GP&dir=-1&Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
scoring_stats_url = r'https://www.nba.com/stats/players/scoring/?sort=GP&dir=-1&Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
usg_stats_url = r'https://www.nba.com/stats/players/usage/?sort=USG_PCT&dir=-1&Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
opp_stats_url = r'https://www.nba.com/stats/players/opponent/?sort=GP&dir=-1&Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
def_stats_url = r'https://www.nba.com/stats/players/defense/?sort=DEF_WS&dir=-1&Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'

# Offensive Play-Type stats
# NOTE(review): unlike the URLs above, these carry no season parameter, so the page
# presumably falls back to its default season - confirm they resolve to 2021-22.
trsn_o_stats_url = r'https://www.nba.com/stats/players/transition/?SeasonType=Regular%20Season&PerMode=Totals'
iso_o_stats_url = r'https://www.nba.com/stats/players/isolation/?SeasonType=Regular%20Season&PerMode=Totals'
pnrbh_o_stats_url = r'https://www.nba.com/stats/players/ball-handler/#!?SeasonType=Regular%20Season&PerMode=Totals'
pnrrm_o_stats_url = r'https://www.nba.com/stats/players/roll-man/?SeasonType=Regular%20Season&PerMode=Totals'
postup_o_stats_url = r'https://www.nba.com/stats/players/playtype-post-up/?SeasonType=Regular%20Season&PerMode=Totals'
spotup_o_stats_url = r'https://www.nba.com/stats/players/spot-up/?SeasonType=Regular%20Season&PerMode=Totals'
handoff_o_stats_url = r'https://www.nba.com/stats/players/hand-off/?SeasonType=Regular%20Season&PerMode=Totals'
cuts_o_stats_url = r'https://www.nba.com/stats/players/cut/?SeasonType=Regular%20Season&PerMode=Totals'
offscrn_o_stats_url = r'https://www.nba.com/stats/players/off-screen/?SeasonType=Regular%20Season&PerMode=Totals'
putbk_o_stats_url = r'https://www.nba.com/stats/players/putbacks/?SeasonType=Regular%20Season&PerMode=Totals'

# Defensive Play-Type stats
# Same pages as the offensive play types, switched with TypeGrouping=defensive
# (the same missing-season caveat applies to all but the last URL).
iso_d_stats_url = r'https://www.nba.com/stats/players/isolation/?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
pnrbh_d_stats_url = r'https://www.nba.com/stats/players/ball-handler/#!?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
pnrrm_d_stats_url = r'https://www.nba.com/stats/players/roll-man/?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
postup_d_stats_url = r'https://www.nba.com/stats/players/playtype-post-up/?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
spotup_d_stats_url = r'https://www.nba.com/stats/players/spot-up/?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
handoff_d_stats_url = r'https://www.nba.com/stats/players/hand-off/?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
offscrn_d_stats_url = r'https://www.nba.com/stats/players/off-screen/?SeasonType=Regular%20Season&PerMode=Totals&TypeGrouping=defensive'
eff_d_stats_url = r'https://www.nba.com/stats/players/opponent-shooting/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&DistanceRange=By%20Zone'

# Tracking stats
drives_stats_url = r'https://www.nba.com/stats/players/drives/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
cns_stats_url = r'https://www.nba.com/stats/players/shots-general/?Season=2021-22&SeasonType=Regular%20Season&sort=FG3M&dir=1&PerMode=Totals&GeneralRange=Catch%20and%20Shoot'
pass_stats_url = r'https://www.nba.com/stats/players/passing/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
touches_stats_url = r'https://www.nba.com/stats/players/touches/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
pullup_stats_url = r'https://www.nba.com/stats/players/shots-general/?Season=2021-22&SeasonType=Regular%20Season&sort=FG3M&dir=1&PerMode=Totals&GeneralRange=Pullups'
reb_stats_url = r'https://www.nba.com/stats/players/rebounding/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
mvmt_stats_url = r'https://www.nba.com/stats/players/speed-distance/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
hustle_stats_url = r'https://www.nba.com/stats/players/hustle/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
boxouts_stats_url = r'https://www.nba.com/stats/players/box-outs/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'

# Dribble / Touch stats
# NOTE(review): dr_0_url and tch_0_2_url pass no DribbleRange / TouchTimeRange, so they
# presumably rely on the page's default range - confirm it is 0 dribbles / 0-2 seconds.
dr_0_url = r'https://www.nba.com/stats/players/shots-dribbles/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
dr_1_url = r'https://www.nba.com/stats/players/shots-dribbles/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&DribbleRange=1%20Dribble'
dr_2_url = r'https://www.nba.com/stats/players/shots-dribbles/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&DribbleRange=2%20Dribbles'
dr_3_6_url = r'https://www.nba.com/stats/players/shots-dribbles/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&DribbleRange=3-6%20Dribbles'
dr_7plus_url = r'https://www.nba.com/stats/players/shots-dribbles/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&DribbleRange=7%2B%20Dribbles'
tch_0_2_url = r'https://www.nba.com/stats/players/shots-touch-time/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'
tch_2_6_url = r'https://www.nba.com/stats/players/shots-touch-time/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&TouchTimeRange=Touch%202-6%20Seconds'
# No raw-string prefix on the next literal; it contains no backslashes, so the value is unaffected.
tch_6plus_url = 'https://www.nba.com/stats/players/shots-touch-time/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&TouchTimeRange=Touch%206%2B%20Seconds'

# Shooting stats
eff_zone_stats_url = r'https://www.nba.com/stats/players/shooting/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals&DistanceRange=By%20Zone'
# NOTE(review): eff_playtype_stats_url is only referenced by a commented-out scrape call.
eff_playtype_stats_url = r'https://www.nba.com/stats/players/shooting-efficiency/?Season=2021-22&SeasonType=Regular%20Season&PerMode=Totals'

# Lineup stats
lineup_stats_url = r'https://www.nba.com/stats/lineups/advanced/?Season=2021-22&SeasonType=Regular%20Season'

#### Establish driver & setup a standardized scraper:

In [3]:
# Start the Firefox session that every scrape below shares through the global `driver`.
# NOTE(review): the geckodriver path is an absolute, machine-specific location, and the
# `executable_path` keyword is presumably only accepted by older Selenium releases -
# confirm against the installed version before re-running on another machine.
driver = webdriver.Firefox(executable_path='/Users/rc/Applications/geckodriver')
sleep(5)  # ensure enough time for driver setup

In [4]:
def nba_api_scraper(url, std_fmt=True, cols=None, xp=None, skip_rows=1200):
    """Load a stats page in the shared browser and return its first HTML table.

    The page is opened with the module-level `driver`, the page-size <select>
    located by `xp` is switched to its first option ('All'), and the rendered
    page source is parsed with `pd.read_html`.

    Parameters
    ----------
    url : str
        Page to scrape.
    std_fmt : bool, default True
        True  -> keep the parsed table and drop every column whose label contains 'RANK'.
        False -> drop the first `skip_rows` rows and overwrite the header with `cols`.
    cols : list of str, optional
        Replacement column names; required when `std_fmt` is False.
    xp : str, optional
        XPath of the page-size <select>; defaults to the stats-table layout.
    skip_rows : int, default 1200
        Leading rows discarded on the non-standard path (1200 was the value
        previously hard-coded here).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If `std_fmt` is False and `cols` is not supplied.
    selenium.common.exceptions.TimeoutException
        If the <select> does not appear within 30 seconds.
    """

    # Fail before the slow page load rather than after it.
    if not std_fmt and cols is None:
        raise ValueError("cols must be provided when std_fmt is False")

    # Use driver to retrieve url
    driver.get(url)

    # Define x-path of interest for selection
    if xp is None:
        xp = r'/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select'

    # Explicit wait so the page has rendered before it is touched. The wait hands
    # back the located element, so it is reused instead of being looked up a
    # second time through the legacy find_element_by_xpath helper.
    page_size_select = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, xp))
    )

    # Select 'All' rows split amongst multiple pages
    Select(page_size_select).select_by_index(0)  # first elem is 'All'

    # Both formats start from the first table on the rendered page
    table = pd.read_html(driver.page_source)[0]
    if std_fmt:
        table = table[[col for col in table.columns if 'RANK' not in col]]  # remove error columns
    else:
        table = table.iloc[skip_rows:].reset_index(drop=True)  # removing null-rows resulting from pg selection
        table.columns = cols

    return table

#### Collect data from each end-point URL:

In [5]:
### Standard stats
# Each call drives the shared browser to one page and returns that page's table.
# The player-bio page is not a stats-table page, so it gets its own XPath for the
# page-size <select>; the remaining calls use the scraper's default XPath.
# NOTE(review): misc_stats_url is defined with the other URLs but is not scraped here.
player_bio_df = nba_api_scraper(player_bio_url, xp='/html/body/div[1]/div[2]/div[3]/section/div/div[2]/div[1]/div[7]/div/div[3]/div/label/div/select')
trad_stats_df = nba_api_scraper(trad_stats_url)
adv_stats_df = nba_api_scraper(adv_stats_url)
scoring_stats_df = nba_api_scraper(scoring_stats_url)
usg_stats_df = nba_api_scraper(usg_stats_url)
opp_stats_df = nba_api_scraper(opp_stats_url)
def_stats_df = nba_api_scraper(def_stats_url)

In [6]:
### Offensive Play-Type stats
# One frame per offensive play type, all scraped with the scraper's default settings.
trsn_o_stats_df = nba_api_scraper(trsn_o_stats_url)
iso_o_stats_df = nba_api_scraper(iso_o_stats_url)
pnrbh_o_stats_df = nba_api_scraper(pnrbh_o_stats_url)
pnrrm_o_stats_df = nba_api_scraper(pnrrm_o_stats_url)
postup_o_stats_df = nba_api_scraper(postup_o_stats_url)
spotup_o_stats_df = nba_api_scraper(spotup_o_stats_url)
handoff_o_stats_df = nba_api_scraper(handoff_o_stats_url)
cuts_o_stats_df = nba_api_scraper(cuts_o_stats_url)
offscrn_o_stats_df = nba_api_scraper(offscrn_o_stats_url)
putbk_o_stats_df = nba_api_scraper(putbk_o_stats_url)

In [7]:
### Defensive Play-Type stats
# One frame per defended play type, scraped with the scraper's default settings.
iso_d_stats_df = nba_api_scraper(iso_d_stats_url)
pnrbh_d_stats_df = nba_api_scraper(pnrbh_d_stats_url)
pnrrm_d_stats_df = nba_api_scraper(pnrrm_d_stats_url)
postup_d_stats_df = nba_api_scraper(postup_d_stats_url)
spotup_d_stats_df = nba_api_scraper(spotup_d_stats_url)
handoff_d_stats_df = nba_api_scraper(handoff_d_stats_url)
offscrn_d_stats_df = nba_api_scraper(offscrn_d_stats_url)
# The opponent-shooting page is scraped with std_fmt=False, so the scraper replaces the
# parsed header with these 24 flat names: player info, then FGM / FGA / FG% per zone.
eff_d_cols = ['PLAYER', 'TEAM', 'Age', 'opp_RA_FGM', 'opp_RA_FGA', 'opp_RA_FG%', 'opp_PT_nonRA_FGM', 'opp_PT_nonRA_FGA', 'opp_PT_nonRA_FG%', 'opp_MR_FGM', 'opp_MR_FGA', 'opp_MR_FG%', 'opp_LC_FGM', 'opp_LC_FGA', 'opp_LC_FG%', 'opp_RC_FGM', 'opp_RC_FGA', 'opp_RC_FG%', 'opp_Corner3_FGM', 'opp_Corner3_FGA', 'opp_Corner3_FG%', 'opp_ATB3_FGM', 'opp_ATB3_FGA', 'opp_ATB3_FG%']
eff_d_stats_df = nba_api_scraper(eff_d_stats_url, False, eff_d_cols)

In [8]:
### Tracking stats
# Player-tracking pages (drives, catch-and-shoot, passing, touches, pull-ups,
# rebounding, speed/distance, hustle, box-outs), all scraped with default settings.
drives_stats_df = nba_api_scraper(drives_stats_url)
cns_stats_df = nba_api_scraper(cns_stats_url)
pass_stats_df = nba_api_scraper(pass_stats_url)
touches_stats_df = nba_api_scraper(touches_stats_url)
pullup_stats_df = nba_api_scraper(pullup_stats_url)
reb_stats_df = nba_api_scraper(reb_stats_url)
mvmt_stats_df = nba_api_scraper(mvmt_stats_url)
hustle_stats_df = nba_api_scraper(hustle_stats_url)
boxouts_stats_df = nba_api_scraper(boxouts_stats_url)

In [9]:
### Dribble stats
# One frame per dribble-count bucket of the shots-dribbles page.
dr_0_df = nba_api_scraper(dr_0_url)
dr_1_df = nba_api_scraper(dr_1_url)
dr_2_df = nba_api_scraper(dr_2_url)
dr_3_6_df = nba_api_scraper(dr_3_6_url)
dr_7plus_df = nba_api_scraper(dr_7plus_url)

In [10]:
### Touch-time stats
# One frame per touch-time bucket of the shots-touch-time page.
tch_0_2_df = nba_api_scraper(tch_0_2_url)
tch_2_6_df = nba_api_scraper(tch_2_6_url)
tch_6plus_df = nba_api_scraper(tch_6plus_url)

In [11]:
### Shooting stats
# The by-zone shooting page is scraped with std_fmt=False, so the scraper replaces the
# parsed header with these 24 flat names: player info, then FGM / FGA / FG% per zone.
eff_zone_cols = ['PLAYER', 'TEAM', 'Age', 'RA_FGM', 'RA_FGA', 'RA_FG%', 'PT_nonRA_FGM', 'PT_nonRA_FGA', 'PT_nonRA_FG%', 'MR_FGM', 'MR_FGA', 'MR_FG%', 'LC_FGM', 'LC_FGA', 'LC_FG%', 'RC_FGM', 'RC_FGA', 'RC_FG%', 'Corner3_FGM', 'Corner3_FGA', 'Corner3_FG%', 'ATB3_FGM', 'ATB3_FGA', 'ATB3_FG%']
eff_o_stats_df = nba_api_scraper(eff_zone_stats_url, False, eff_zone_cols)
# Disabled: the shooting-efficiency page is not scraped, so no eff_playtype_stats_df exists downstream.
# eff_playtype_stats_df = nba_api_scraper(eff_playtype_stats_url)

In [12]:
### Lineup stats
# Lineup-level (not player-level) table; it is written to its own CSV at the end of the notebook.
lineup_stats_df = nba_api_scraper(lineup_stats_url)

In [13]:
# End the browser session after data retrieval. quit() is used rather than close():
# close() only closes the current window, whereas quit() also ends the WebDriver
# session and its driver process, so nothing is left running after the notebook.
driver.quit()

## 1B: Data Condensation

**Objective:** Perform initial standardization & filtering steps to prepare datasets for compilation.

In this section, each dataset will be filtered for only columns of interest, based on NBA's stats glossary: https://www.nba.com/stats/help/glossary/. Through the use of some helper functions, minor standardization steps are taken (i.e., player & attribute naming convention) to prepare for data compilation in the next section.

---

#### Define helper functions for basic standardization procedures across dataframes:

In [14]:
def name_formatter(name_str):
    """Converts player naming convention from 'Last, First' to 'First Last' format.

    The split happens at the last comma, so a suffix that is part of the surname
    stays with it ('Nance Jr., Larry' -> 'Larry Nance Jr.').

    Values that cannot be reordered are returned unchanged instead of raising:
    non-string cells (e.g. None / NaN) and names that contain no comma. The
    latter also makes it safe to apply the formatter to an already-formatted column.
    """

    if not isinstance(name_str, str):
        return name_str

    last, comma, first = name_str.rpartition(',')
    if not comma:
        return name_str  # nothing to reorder

    return first.strip() + ' ' + last.strip()

def name_matcher(df1, col1, df2, col2):
    """Maps each value of df1[col1] onto its closest spelling in df2[col2].

    Returns a list aligned with df1[col1]: the best match found by
    difflib.get_close_matches (cutoff 0.9), or '' when nothing is close enough.
    Neither dataframe is modified.
    """

    reference_names = df2[col2].tolist()

    def _best_match(candidate):
        # get_close_matches returns a (possibly empty) list holding at most one hit
        hits = difflib.get_close_matches(candidate, reference_names, n=1, cutoff=0.9)
        return hits[0] if hits else ''

    return [_best_match(candidate) for candidate in df1[col1].tolist()]

def play_style_cleaner(df, prefix):
    """Sums FGM / FGA per player and returns a three-column frame.

    Rows sharing a 'Player' value (e.g. one row per team after a mid-season trade)
    are collapsed into one. The result has columns 'PLAYER', prefix+'FGM' and
    prefix+'FGA'; every other input column is discarded.
    """

    made_col = prefix + 'FGM'
    attempted_col = prefix + 'FGA'

    totals = df.groupby('Player', as_index=False).agg({'FGM': 'sum', 'FGA': 'sum'})
    totals = totals.rename(columns={'Player': 'PLAYER', 'FGM': made_col, 'FGA': attempted_col})

    return totals[['PLAYER', made_col, attempted_col]].reset_index(drop=True)

#### Standardize and condense the traditional and advanced statistical data:

In [15]:
# Align the bio headers with the stats tables, then keep only name, team, position and height.
player_bio_df = (
    player_bio_df
    .rename(columns={'Player': 'PLAYER', 'Team': 'TEAM', 'Position': 'POS', 'Height': 'H'})
    .loc[:, ['PLAYER', 'TEAM', 'POS', 'H']]
)

In [16]:
# Keep only the box-score columns of interest
trad_stats_df = trad_stats_df.loc[:, ['PLAYER', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'TOV', 'STL', 'BLK']]

# Replace malformed names in the player-bio dataframe with the spelling used by the
# traditional-stats dataframe; names without a close match come back as ''.
player_bio_df = player_bio_df.assign(
    PLAYER=name_matcher(player_bio_df, 'PLAYER', trad_stats_df, 'PLAYER')
)

# Drop bio-page players with no match, i.e. no stats (and hence no name record) this season
player_bio_df = player_bio_df.loc[player_bio_df['PLAYER'].ne('')].reset_index(drop=True)

In [17]:
# Only the possessions count is taken from the advanced table
adv_stats_df = adv_stats_df.loc[:, ['PLAYER', 'POSS']]

In [18]:
# Usage shares; the percentage stats are in relation to the team's total
usg_stats_df = (
    usg_stats_df
    .rename(columns={'Player': 'PLAYER'})
    .loc[:, ['PLAYER', 'USG%', '%FGA', '%3PA', '%FTA', '%REB', '%AST', '%BLKA', '%PFD', '%PTS']]
)

#### Standardize and condense scoring-efficiency-related statistics:

In [19]:
# Keep attempts and makes per shooting zone (the FG% columns are dropped). No rows are
# trimmed in this cell: the previous self-assignment was a no-op.
eff_o_stats_df = eff_o_stats_df.loc[:, ['PLAYER', 'RA_FGA', 'RA_FGM', 'PT_nonRA_FGA', 'PT_nonRA_FGM', 'MR_FGA', 'MR_FGM', 'LC_FGA', 'LC_FGM', 'RC_FGA', 'RC_FGM', 'Corner3_FGA', 'Corner3_FGM', 'ATB3_FGA', 'ATB3_FGM']]

In [20]:
# Eliminate the two-level header by keeping the second level of each label. The guard
# makes the cell safe to re-run: once the header is flat, indexing [1] would pick the
# second *character* of every name and break the selection below.
if all(isinstance(col, tuple) for col in cns_stats_df.columns):
    cns_stats_df.columns = [col[1] for col in cns_stats_df.columns]

# Prefix the shot counts so they stay distinguishable after the master merge
cns_stats_df = cns_stats_df.rename(columns={'Player': 'PLAYER', '2FGA': 'cns_2FGA', '2FGM': 'cns_2FGM', '3PA': 'cns_3PA', '3PM': 'cns_3PM'})
cns_stats_df = cns_stats_df[['PLAYER', 'cns_2FGA', 'cns_2FGM', 'cns_3PA', 'cns_3PM']]

In [21]:
# Eliminate the two-level header by keeping the second level of each label. The guard
# makes the cell safe to re-run: once the header is flat, indexing [1] would pick the
# second *character* of every name and break the selection below.
if all(isinstance(col, tuple) for col in pullup_stats_df.columns):
    pullup_stats_df.columns = [col[1] for col in pullup_stats_df.columns]

# Prefix the shot counts so they stay distinguishable after the master merge
pullup_stats_df = pullup_stats_df.rename(columns={'Player': 'PLAYER', '2FGA': 'pullup_2FGA', '2FGM': 'pullup_2FGM', '3PA': 'pullup_3PA', '3PM': 'pullup_3PM'})
pullup_stats_df = pullup_stats_df[['PLAYER', 'pullup_2FGA', 'pullup_2FGM', 'pullup_3PA', 'pullup_3PM']]

#### Standardize and condense offensive-style data:

In [22]:
# Normalise the player header and the non-breaking space in the mid-range label,
# then keep the shot-mix / assisted-share columns.
scoring_stats_df = (
    scoring_stats_df
    .rename(columns={'Player':'PLAYER', '%PTS2PT\xa0MR': '%PTS2PT MR'})
    .loc[:, ['PLAYER', '%FGA2PT', '%FGA3PT', '%PTS2PT', '%PTS2PT MR', '%PTS3PT', '%PTSFBPs', '%PTSFT', '%PTSOffTO', '%PTSPITP', '2FGM%AST', '2FGM%UAST', '3FGM%AST', '3FGM%UAST']]
)

In [23]:
# Collapse every offensive play-type table to per-player FGM / FGA totals, tagging the
# columns with the play type. Frames are processed in the order listed.
(
    trsn_o_stats_df, iso_o_stats_df, pnrbh_o_stats_df, pnrrm_o_stats_df, postup_o_stats_df,
    spotup_o_stats_df, handoff_o_stats_df, cuts_o_stats_df, offscrn_o_stats_df, putbk_o_stats_df,
) = [
    play_style_cleaner(frame, tag)
    for frame, tag in (
        (trsn_o_stats_df, 'trsn_'),
        (iso_o_stats_df, 'iso_'),
        (pnrbh_o_stats_df, 'pnrbh_'),
        (pnrrm_o_stats_df, 'pnrrm_'),
        (postup_o_stats_df, 'postup_'),
        (spotup_o_stats_df, 'spotup_'),
        (handoff_o_stats_df, 'handoff_'),
        (cuts_o_stats_df, 'cuts_'),
        (offscrn_o_stats_df, 'offscrn_'),
        (putbk_o_stats_df, 'putbk_'),
    )
]

In [24]:
# Prefix the generic stat names with 'drives_' so they cannot collide with the
# box-score columns in the master merge, then keep the drive columns of interest.
drives_stats_df = (
    drives_stats_df
    .rename(columns={'Player': 'PLAYER', 'FGA': 'drives_FGA', 'FGM': 'drives_FGM', 'PTS': 'drives_PTS', 'PASS%': 'drives_PASS%', 'AST%': 'drives_AST%'})
    .loc[:, ['PLAYER', 'DRIVES', 'drives_FGA', 'drives_FGM', 'drives_PTS', 'drives_PASS%', 'drives_AST%']]
)

In [25]:
# Replace the non-breaking spaces in the scraped headers, then keep the passing columns.
pass_stats_df = (
    pass_stats_df
    .rename(columns={'Player': 'PLAYER', 'AST\xa0PTSCreated': 'AST PTSCreated', 'AST\xa0ToPass%\xa0Adj': 'AST ToPass% Adj'})
    .loc[:, ['PLAYER', 'PassesMade', 'PassesReceived', 'SecondaryAST', 'PotentialAST', 'AST PTSCreated', 'ASTAdj', 'AST ToPass% Adj']]
)

In [26]:
### Dribble stats

def _condense_dribble_split(frame, tag):
    """Flatten one dribble-range table and keep its 2-pt / 3-pt attempts and makes.

    `tag` (e.g. 'dr_0_') is prepended to the four shot columns. The input frame
    is not modified. Calling this on an already condensed frame returns the same
    columns, so the cell can be re-run safely.
    """
    frame = frame.copy()
    # Eliminate the two-level header; skipped when the header is already flat.
    if all(isinstance(col, tuple) for col in frame.columns):
        frame.columns = [col[1] for col in frame.columns]
    shot_cols = ['2FGA', '2FGM', '3PA', '3PM']
    renames = {'Player': 'PLAYER'}
    renames.update({col: tag + col for col in shot_cols})
    return frame.rename(columns=renames)[['PLAYER'] + [tag + col for col in shot_cols]]


dr_0_df = _condense_dribble_split(dr_0_df, 'dr_0_')
dr_1_df = _condense_dribble_split(dr_1_df, 'dr_1_')
dr_2_df = _condense_dribble_split(dr_2_df, 'dr_2_')
dr_3_6_df = _condense_dribble_split(dr_3_6_df, 'dr_3_6_')
dr_7plus_df = _condense_dribble_split(dr_7plus_df, 'dr_7plus_')

In [27]:
### Touch-time stats

def _condense_touch_split(frame, tag):
    """Flatten one touch-time table and keep its 2-pt / 3-pt attempts and makes.

    `tag` (e.g. 'tch_0_2_') is prepended to the four shot columns. The input frame
    is not modified. Calling this on an already condensed frame returns the same
    columns, so the cell can be re-run safely.
    """
    frame = frame.copy()
    # Eliminate the two-level header; skipped when the header is already flat.
    if all(isinstance(col, tuple) for col in frame.columns):
        frame.columns = [col[1] for col in frame.columns]
    shot_cols = ['2FGA', '2FGM', '3PA', '3PM']
    renames = {'Player': 'PLAYER'}
    renames.update({col: tag + col for col in shot_cols})
    return frame.rename(columns=renames)[['PLAYER'] + [tag + col for col in shot_cols]]


tch_0_2_df = _condense_touch_split(tch_0_2_df, 'tch_0_2_')
tch_2_6_df = _condense_touch_split(tch_2_6_df, 'tch_2_6_')
tch_6plus_df = _condense_touch_split(tch_6plus_df, 'tch_6plus_')

#### Standardize and condense tracking-related data:

In [28]:
# In each table below, the scraped headers contain non-breaking spaces (\xa0); they are
# swapped for plain spaces before the columns of interest are selected.

# Touches: volume, time / dribbles per touch, and points per touch by location
touches_stats_df = (
    touches_stats_df
    .rename(columns={
        'Player': 'PLAYER',
        'Avg\xa0Sec\xa0PerTouch': 'Avg Sec PerTouch',
        'Avg\xa0Drib\xa0PerTouch': 'Avg Drib PerTouch',
        'PTS\xa0PerTouch': 'PTS PerTouch',
        'PTS\xa0PerElbow\xa0Touch': 'PTS PerElbow Touch',
        'PTS\xa0PerPost\xa0Touch': 'PTS PerPost Touch',
        'PTS\xa0PerPaint\xa0Touch': 'PTS PerPaint Touch',
    })
    .loc[:, ['PLAYER', 'TOUCHES', 'Avg Sec PerTouch', 'Avg Drib PerTouch', 'PTS PerTouch', 'ElbowTouches',
             'PostUps', 'PaintTouches', 'PTS PerElbow Touch', 'PTS PerPost Touch', 'PTS PerPaint Touch']]
)

# Rebounding: contested share and average rebound distance
reb_stats_df = (
    reb_stats_df
    .rename(columns={'Player': 'PLAYER', 'AVG\xa0REBDistance': 'AVG REBDistance'})
    .loc[:, ['PLAYER', 'ContestedREB%',  'AVG REBDistance']]
)

# Movement: miles covered on offense and on defense
mvmt_stats_df = (
    mvmt_stats_df
    .rename(columns={'Player': 'PLAYER', 'Dist.\xa0Miles\xa0Off': 'Dist. Miles Off', 'Dist.\xa0Miles\xa0Def': 'Dist. Miles Def'})
    .loc[:, ['PLAYER', 'Dist. Miles Off', 'Dist. Miles Def']]
)

# Box-outs: total only
boxouts_stats_df = (
    boxouts_stats_df
    .rename(columns={'Player': 'PLAYER', 'Box\xa0Outs': 'Box Outs'})
    .loc[:, ['PLAYER', 'Box Outs']]
)

#### Standardize and condense defensive statistics:

In [29]:
# The opponent table keys players as 'Vs Player' in 'Last, First' form: rename the
# column, reformat the names, then sum each opponent counting stat per player so that
# players with several rows end up with a single record.
opp_stats_df = (
    opp_stats_df
    .rename(columns={'Vs Player': 'PLAYER'})
    .assign(PLAYER=lambda frame: frame['PLAYER'].map(name_formatter))
    .groupby('PLAYER', as_index=False)
    .agg({
        'OppFGA': 'sum', 'OppFGM': 'sum', 'Opp3PA': 'sum', 'Opp3PM': 'sum', 'OppFTA': 'sum', 'OppTOV': 'sum', 'OppPF': 'sum'
    })
    .loc[:, ['PLAYER', 'OppFGA', 'OppFGM', 'Opp3PA', 'Opp3PM', 'OppFTA', 'OppTOV', 'OppPF']]
)

In [30]:
# Shorten the opponent-points headers (which contain non-breaking spaces) and keep
# them alongside defensive win shares.
def_stats_df = (
    def_stats_df
    .rename(columns={'Player': 'PLAYER', 'OPP\xa0PTSFB': 'OppPTSFB', 'OPP\xa0PTSPAINT': 'OppPTSPAINT'})
    .loc[:, ['PLAYER', 'OppPTSFB', 'OppPTSPAINT', 'DEFWS']]
)

In [31]:
# Keep opponent attempts and makes per shooting zone (the FG% columns are dropped)
eff_d_stats_df = eff_d_stats_df.loc[:, ['PLAYER', 'opp_RA_FGA', 'opp_RA_FGM', 'opp_PT_nonRA_FGA', 'opp_PT_nonRA_FGM', 'opp_MR_FGA', 'opp_MR_FGM', 'opp_LC_FGA', 'opp_LC_FGM', 'opp_RC_FGA', 'opp_RC_FGM', 'opp_Corner3_FGA', 'opp_Corner3_FGM', 'opp_ATB3_FGA', 'opp_ATB3_FGM']]

In [32]:
# Replace the non-breaking spaces in the scraped headers, then keep the hustle counts
# (the renamed TEAM column is not part of the selection).
hustle_stats_df = (
    hustle_stats_df
    .rename(columns={'Player': 'PLAYER', 'Loose\xa0BallsRecovered': 'Loose BallsRecovered', 'Contested2PT\xa0Shots': 'Contested2PT Shots', 'Contested3PT\xa0Shots': 'Contested3PT Shots', 'Team': 'TEAM'})
    .loc[:, ['PLAYER', 'ScreenAssists', 'Deflections', 'Loose BallsRecovered', 'ChargesDrawn', 'Contested2PT Shots', 'Contested3PT Shots']]
)

In [33]:
# Collapse every defended play-type table to per-player FGM / FGA totals, tagging the
# columns with 'opp_' plus the play type. Frames are processed in the order listed.
(
    iso_d_stats_df, pnrbh_d_stats_df, pnrrm_d_stats_df, postup_d_stats_df,
    spotup_d_stats_df, handoff_d_stats_df, offscrn_d_stats_df,
) = [
    play_style_cleaner(frame, tag)
    for frame, tag in (
        (iso_d_stats_df, 'opp_iso_'),
        (pnrbh_d_stats_df, 'opp_pnrbh_'),
        (pnrrm_d_stats_df, 'opp_pnrrm_'),
        (postup_d_stats_df, 'opp_postup_'),
        (spotup_d_stats_df, 'opp_spotup_'),
        (handoff_d_stats_df, 'opp_handoff_'),
        (offscrn_d_stats_df, 'opp_offscrn_'),
    )
]

#### Standardize and condense lineup statistics:

In [34]:
# Placeholder: the self-assignment is a no-op, so the lineup table is carried forward
# exactly as scraped.
lineup_stats_df = lineup_stats_df  # no initial changes

## 1C: Data Compilation

**Objective:** Combine statistics for each player and output player & lineup data as CSV files.

---

In [35]:
# Every per-player frame that goes into the master table. Order matters: the first
# entry seeds the merge (so its columns come first) and the duplicate check reports
# frames by their position in this list.
dfs_list = [
    # bio, traditional, advanced, usage
    player_bio_df, trad_stats_df, adv_stats_df, usg_stats_df,  
    # scoring efficiency
    eff_o_stats_df, cns_stats_df, pullup_stats_df,   
    # offensive style
    scoring_stats_df, trsn_o_stats_df, iso_o_stats_df, pnrbh_o_stats_df, pnrrm_o_stats_df, postup_o_stats_df,  
    spotup_o_stats_df, handoff_o_stats_df, cuts_o_stats_df, offscrn_o_stats_df, putbk_o_stats_df,
    drives_stats_df, pass_stats_df, dr_0_df, dr_1_df, dr_2_df, dr_3_6_df, dr_7plus_df, tch_0_2_df, tch_2_6_df, tch_6plus_df,
    # tracking
    touches_stats_df, reb_stats_df, mvmt_stats_df, boxouts_stats_df, 
    # defense
    opp_stats_df, def_stats_df, eff_d_stats_df, hustle_stats_df,   
    iso_d_stats_df, pnrbh_d_stats_df, pnrrm_d_stats_df, postup_d_stats_df, spotup_d_stats_df, handoff_d_stats_df, offscrn_d_stats_df
]

In [36]:
# Make PLAYER a clean, unique merge key in every frame.
#
# The previous version counted duplicated names and then dropped that many rows from
# the *top* of the frame. That only works when the duplicates are exactly the leading
# rows: it leaves one null-name row behind (the first occurrence is never counted as
# a duplicate) and removes unrelated players whenever the duplicates sit elsewhere.
# Rows are now removed by what they are instead of where they are:
#   * rows with a null PLAYER are dropped;
#   * for repeated names the last occurrence is kept, matching the original intent of
#     discarding the copies that appear first.
# The position of every frame that needed cleaning is printed, as before.
for i, frame in enumerate(dfs_list):
    unusable = frame['PLAYER'].isna() | frame['PLAYER'].duplicated(keep='last')
    if unusable.any():
        print(i)
        dfs_list[i] = frame.loc[~unusable].reset_index(drop=True)  # take care of dups

4
34


Above, each dataframe is checked for duplicate player records, and the list position of every dataframe that needed cleaning is printed. Some scraping sessions leave nulled or repeated player records at the beginning of a dataframe; scattered duplicates are not expected, because the earlier condensation steps already aggregate players who appear more than once. The cell above therefore removes these spurious leading records automatically, without the need for manual checking.

#### Merge all dataframes to construct a master copy:

In [37]:
# Fold the list left to right into one table, outer-joining on PLAYER so a player
# missing from any single table is kept (with NaN for that table's columns).
master_df = reduce(
    lambda merged, next_df: merged.merge(next_df, how='outer', on='PLAYER'),
    dfs_list,
)

In [38]:
# Bare expression: the notebook renders the merged table as the cell output for a visual check.
master_df

Unnamed: 0,PLAYER,TEAM,POS,H,GP,MIN,PTS,FGM,FGA,FG%,...,opp_pnrrm_FGM,opp_pnrrm_FGA,opp_postup_FGM,opp_postup_FGA,opp_spotup_FGM,opp_spotup_FGA,opp_handoff_FGM,opp_handoff_FGA,opp_offscrn_FGM,opp_offscrn_FGA
0,Precious Achiuwa,TOR,F,6-8,71,1686,647,259,585,44.3,...,13.0,32.0,12.0,29.0,59.0,159.0,,,4.0,10.0
1,Steven Adams,MEM,C,6-11,74,1953,522,207,380,54.5,...,31.0,75.0,16.0,28.0,54.0,159.0,3.0,11.0,7.0,13.0
2,Bam Adebayo,MIA,C-F,6-9,55,1792,1044,398,715,55.7,...,7.0,19.0,15.0,31.0,56.0,137.0,4.0,8.0,3.0,13.0
3,Santi Aldama,MEM,F-C,6-11,30,319,104,43,109,39.4,...,,,,,7.0,19.0,,,,
4,LaMarcus Aldridge,BKN,C-F,6-11,47,1050,607,252,458,55.0,...,8.0,26.0,10.0,24.0,33.0,81.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,Thaddeus Young,TOR,F,6-8,50,801,303,133,261,51.0,...,,,,,,,,,,
598,Trae Young,ATL,G,6-1,74,2582,2092,689,1501,45.9,...,11.0,30.0,17.0,35.0,82.0,203.0,29.0,64.0,23.0,47.0
599,Omer Yurtseven,MIA,C,6-11,55,671,292,127,233,54.5,...,7.0,18.0,8.0,17.0,24.0,51.0,,,,
600,Cody Zeller,,F-C,6-11,27,355,140,51,90,56.7,...,,,4.0,9.0,9.0,25.0,,,,


#### Rearrange columns for ease of use in next stages:

In [41]:
# Final column layout of the master table, grouped by theme. Columns that are not
# listed here are dropped from the output.
master_df = master_df.loc[:, [

    # Bio / Util info
    'PLAYER', 'H', 'POS', 'TEAM', 'GP', 'MIN',

    # Usage stats
    'POSS', 'USG%', '%FGA', '%3PA', '%FTA', '%REB', '%AST', '%BLKA', '%PFD', '%PTS',

    # Passing info
    'AST', 'PassesMade', 'PassesReceived', 'SecondaryAST', 'PotentialAST', 'ScreenAssists', 'AST PTSCreated', 'ASTAdj', 'AST ToPass% Adj', 'TOV',

    # Base scoring stats
    'PTS', 'FGA', 'FGM', 'FG%',

    # 2-Pt scoring vs. player's overall scoring
    '%FGA2PT', '%PTS2PT', '%PTSPITP', '%PTS2PT MR',
    # 2-Pt scoring zones
    'RA_FGA', 'RA_FGM', 'PT_nonRA_FGA', 'PT_nonRA_FGM', 'MR_FGA', 'MR_FGM',
    # 2-Pt scoring styles
    'cns_2FGA', 'cns_2FGM', 'pullup_2FGA', 'pullup_2FGM',

    # Base 3-Pt scoring stats
    '3PA', '3PM', '3P%',
    # 3-Pt scoring vs. player's overall scoring
    '%FGA3PT', '%PTS3PT',
    # 3-Pt scoring zones
    'Corner3_FGA', 'Corner3_FGM', 'ATB3_FGA', 'ATB3_FGM',
    # 3-Pt scoring styles
    'cns_3PA', 'cns_3PM', 'pullup_3PA', 'pullup_3PM',

    # Base FT stats
    'FTA', 'FTM', 'FT%',
    # FT scoring vs. player's overall scoring
    '%PTSFT',

    # Scoring dependency
    '2FGM%AST', '2FGM%UAST', '3FGM%AST', '3FGM%UAST',

    # Dribbles vs. scoring/efficiency
    'dr_0_2FGA', 'dr_0_2FGM', 'dr_0_3PA', 'dr_0_3PM', 'dr_1_2FGA', 'dr_1_2FGM', 'dr_1_3PA', 'dr_1_3PM', 'dr_2_2FGA', 'dr_2_2FGM',
    'dr_2_3PA', 'dr_2_3PM', 'dr_3_6_2FGA', 'dr_3_6_2FGM', 'dr_3_6_3PA', 'dr_3_6_3PM', 'dr_7plus_2FGA', 'dr_7plus_2FGM', 'dr_7plus_3PA', 'dr_7plus_3PM',

    # Touch-time vs. scoring/efficiency
    'tch_0_2_2FGA', 'tch_0_2_2FGM', 'tch_0_2_3PA', 'tch_0_2_3PM', 'tch_2_6_2FGA', 'tch_2_6_2FGM',
    'tch_2_6_3PA', 'tch_2_6_3PM', 'tch_6plus_2FGA', 'tch_6plus_2FGM', 'tch_6plus_3PA', 'tch_6plus_3PM',

    # Offensive play-style
    'trsn_FGM', 'trsn_FGA', 'iso_FGM', 'iso_FGA', 'pnrbh_FGM', 'pnrbh_FGA', 'pnrrm_FGM', 'pnrrm_FGA', 'postup_FGM', 'postup_FGA',
    'spotup_FGM', 'spotup_FGA', 'handoff_FGM', 'handoff_FGA', 'cuts_FGM', 'cuts_FGA', 'offscrn_FGM', 'offscrn_FGA', 'putbk_FGM', 'putbk_FGA',
    'DRIVES', 'drives_FGA', 'drives_FGM', 'drives_PTS', 'drives_PASS%', 'drives_AST%',

    # Player activity
    'TOUCHES', 'Avg Sec PerTouch', 'Avg Drib PerTouch', 'PTS PerTouch', 'ElbowTouches', 'PostUps', 'PaintTouches', 'PTS PerElbow Touch', 'PTS PerPost Touch', 'PTS PerPaint Touch',

    # Player movement
    'Dist. Miles Off', 'Dist. Miles Def',

    # Rebound-related stats
    'REB', 'ContestedREB%', 'AVG REBDistance', 'Box Outs',

    # Adv defensive metric
    'DEFWS',
    # Defensive activity
    'STL', 'BLK', 'Deflections', 'Loose BallsRecovered', 'ChargesDrawn', 'Contested2PT Shots', 'Contested3PT Shots', 'OppTOV', 'OppPF', 'OppFTA',

    # Opponent 2-Pt efficiency
    'OppFGA', 'OppFGM', 'OppPTSFB', 'OppPTSPAINT', 'opp_RA_FGA', 'opp_RA_FGM', 'opp_PT_nonRA_FGA', 'opp_PT_nonRA_FGM', 'opp_MR_FGA', 'opp_MR_FGM',
    # Opponent 3-Pt efficiency
    'Opp3PA', 'Opp3PM', 'opp_Corner3_FGA', 'opp_Corner3_FGM', 'opp_ATB3_FGA', 'opp_ATB3_FGM',

    # Opponent play-style vs. efficiency
    'opp_iso_FGM', 'opp_iso_FGA', 'opp_pnrbh_FGM', 'opp_pnrbh_FGA', 'opp_pnrrm_FGM', 'opp_pnrrm_FGA', 'opp_postup_FGM', 'opp_postup_FGA',
    'opp_spotup_FGM', 'opp_spotup_FGA', 'opp_handoff_FGM', 'opp_handoff_FGA', 'opp_offscrn_FGM', 'opp_offscrn_FGA',

]]

#### Output compiled data into CSVs:

In [42]:
# Write the reordered per-player table next to the notebook; the row index is left out.
master_df.to_csv('./raw_comprehensive_stats.csv', sep=',', index=False)

In [43]:
# Write the lineup table exactly as scraped; the row index is left out.
lineup_stats_df.to_csv('./raw_lineup_stats.csv', sep=',', index=False)