# Clutch Data Augmentation Pipeline
API reference: https://api.pbpstats.com/docs#/

Adds clutch (high- and very-high-leverage) player totals from the PBP Stats API to the MasterClutch dataset.

### Imports and Functions

In [7]:
import requests
import pandas as pd

## TODO: speed up this process with PySpark later
# import pyspark
# from pyspark.sql import SQLContext
# from pyspark.sql.functions import *

# from pyspark import SparkConf, SparkContext

# sc = pyspark.SparkContext(appName="clutch_dataset")
# sqlContext = SQLContext(sc)

In [8]:
def find_player_list(stat_df, nba_id_df):
    """Return the named players that appear in a stats table.

    ``stat_df["pidSzn"]`` is split on "_" into ``idPlayer`` and ``Season``,
    ``nba_id_df["idPlayer"]`` is cast to ``str`` so both key columns compare
    as strings, and the two frames are inner-joined on ``idPlayer``.

    Side effects: both arguments are modified in place. ``stat_df`` gains
    (or has overwritten) the ``idPlayer`` and ``Season`` columns, and
    ``nba_id_df["idPlayer"]`` becomes a string column.

    Returns:
        DataFrame with the columns ``pidSzn``, ``idPlayer``, ``Season`` and
        ``namePlayer``; rows without a match in ``nba_id_df`` are dropped.
    """
    id_season_parts = stat_df["pidSzn"].str.split("_", expand=True)
    stat_df[["idPlayer", "Season"]] = id_season_parts
    nba_id_df["idPlayer"] = nba_id_df["idPlayer"].astype(str)

    joined = pd.merge(stat_df, nba_id_df, how="inner", on="idPlayer")
    return joined[["pidSzn", "idPlayer", "Season", "namePlayer"]]

# Pull using PBP API
def get_pbp_clutch_data(seasons_list, timeout=30):
    """Download per-player clutch totals from the PBP Stats API.

    One request is made per distinct season against the ``get-totals/nba``
    endpoint, restricted to regular-season player totals at "High" and
    "VeryHigh" leverage.

    Args:
        seasons_list: Iterable of season identifiers (e.g. "2021-22").
            Duplicates are requested only once.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with the rows of every season stacked on top of each
        other and a ``Season`` column identifying the source season. The
        per-season row index is kept as returned (not reset). An empty
        DataFrame is returned when ``seasons_list`` is empty.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the response has no ``multi_row_table_data`` entry.
    """
    url = "https://api.pbpstats.com/get-totals/nba"
    # NOTE(review): the value contains a space after the comma -- confirm
    # the API treats "High, VeryHigh" as two leverage levels.
    clutch_leverage = "High, VeryHigh"

    season_frames = []
    # dict.fromkeys de-duplicates while keeping the caller's order.
    for szn in dict.fromkeys(seasons_list):
        params = {
            "Season": szn,
            "SeasonType": "Regular Season",
            "Type": "Player",
            "Leverage": clutch_leverage,
        }
        response = requests.get(url, params=params, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        season_pdf = pd.DataFrame(response.json()["multi_row_table_data"])
        season_pdf["Season"] = szn
        season_frames.append(season_pdf)

    if not season_frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()

    return pd.concat(season_frames)

#Preprocess dataset
def pbp_clutch_data_preprocessing(clutch_data, null_threshold):
    """Drop the columns that are mostly missing.

    Args:
        clutch_data: DataFrame to clean; it is not modified.
        null_threshold: Largest tolerated fraction of missing values per
            column. Columns strictly above this fraction are removed.

    Returns:
        A new DataFrame without the overly sparse columns.
    """
    # Mean of the boolean null mask == fraction of missing values per column.
    null_share = clutch_data.isna().mean()
    too_sparse = list(null_share.index[null_share > null_threshold])
    return clutch_data.drop(columns=too_sparse)

#Merge dataset into master_data
def merge_into_master(augment_df, master_df, player_list_df):
    """Attach the augmentation columns to the master dataset.

    The augmentation data is keyed by player name and season, while the
    master data is keyed by ``pidSzn``. ``player_list_df`` bridges the two:
    it is used to look up the ``pidSzn`` of every (name, season) pair.

    Args:
        augment_df: New data with at least the columns ``Name`` and
            ``Season``.
        master_df: Master dataset with a ``pidSzn`` column.
        player_list_df: Lookup with the columns ``pidSzn``, ``idPlayer``,
            ``Season`` and ``namePlayer``.

    Returns:
        ``master_df`` left-joined with the augmentation columns: every
        master row is kept, and rows without augmentation data get NaN in
        the new columns.
    """
    # Map pidSzn onto the augmentation rows. Use the lookup that was passed
    # in rather than a notebook-level global.
    pidSzn_augment_df = pd.merge(
        augment_df,
        player_list_df,
        left_on=["Name", "Season"],
        right_on=["namePlayer", "Season"],
        how="inner",
    )
    # These keys are only needed for the lookup; dropping them keeps them
    # out of the join with the master frame.
    pidSzn_augment_df = pidSzn_augment_df.drop(columns=["Name", "Season", "idPlayer"])

    # Merge using pidSzn
    augmented_master_df = pd.merge(
        master_df,
        pidSzn_augment_df,
        on="pidSzn",
        how="left",
    )

    return augmented_master_df


### Debug

In [9]:
# Load the master clutch dataset (first CSV column is used as the index)
# and the table of NBA player ids.
master_clutch_df = pd.read_csv("./MasterData/MasterClutch.csv", index_col=0)
nba_id_df = pd.read_csv("./RawData/stats/nba_ids.csv")  

In [10]:
#Get list of players in master dataset
# NOTE: find_player_list also modifies its inputs in place: master_clutch_df
# gets idPlayer/Season columns and nba_id_df["idPlayer"] is cast to str.
clutch_player_list_df = find_player_list(master_clutch_df, nba_id_df)

In [None]:
#Sample pull: totals for a single entity (EntityId) in one season
url = "https://api.pbpstats.com/get-totals/nba"
params = {
    "Season": "2021-22",
    "SeasonType": "Regular Season",
    "Type": "Player",
    "EntityId": "201142",
    "Leverage": "High, VeryHigh",  #Low Leverage Removed
}
# A timeout keeps the cell from hanging forever if the API does not answer.
response = requests.get(url, params=params, timeout=30)
# Surface HTTP errors before trying to decode the body as JSON.
response.raise_for_status()
response_json = response.json()

In [11]:
#Pull PBP data once for every distinct season in the player list
seasons_list = clutch_player_list_df["Season"].unique().tolist()
clutch_data = get_pbp_clutch_data(seasons_list)

In [12]:
#Clean up PBP data: drop columns with more than 30% missing values
clutch_df = pbp_clutch_data_preprocessing(clutch_data, null_threshold = 0.30)

In [13]:
#Create augmented dataset, then keep only the modelling features
feature_cols = [
    # identifiers and columns taken from the master frame
    'pidSzn', 'MIN', 'GP', 'Season', 'PIE', 'POSS', 'USG_PCT', 'idPlayer',
    # possession and scoring totals
    'PlusMinus', 'OffPoss', 'DefPoss', 'PenaltyOffPoss', 'PenaltyDefPoss',
    'SecondChanceOffPoss', 'TotalPoss', 'AtRimFGA', 'FG2M', 'FG2A', 'Points',
    'OpponentPoints', 'DefTwoPtRebounds', 'DefRebounds', 'Rebounds', 'Fouls',
    'FoulsDrawn', 'FirstChancePoints', 'PenaltyOffPossExcludingTakeFouls',
    # ratings, percentages and shot profile
    'OnOffRtg', 'OnDefRtg', 'Fg2Pct', 'EfgPct', 'TsPct', 'Usage',
    'DefTwoPtReboundPct', 'DefFGReboundPct', 'AtRimFrequency',
    'AtRimFG3AFrequency', 'ShotQualityAvg', 'Avg2ptShotDistance',
    'PenaltyOffPossPct',
]
merged_master_df = merge_into_master(clutch_df, master_clutch_df, clutch_player_list_df)
augmented_master_clutch_df = merged_master_df.loc[:, feature_cols]

In [14]:
# Preview the first rows of the augmented dataset
augmented_master_clutch_df.head()

Unnamed: 0,pidSzn,MIN,GP,Season,PIE,POSS,USG_PCT,idPlayer,PlusMinus,OffPoss,...,EfgPct,TsPct,Usage,DefTwoPtReboundPct,DefFGReboundPct,AtRimFrequency,AtRimFG3AFrequency,ShotQualityAvg,Avg2ptShotDistance,PenaltyOffPossPct
0,201985_2014-15,0.8,4,2014-15,-0.1,6,0.571,201985,10.0,17.0,...,0.357143,0.357143,38.095238,,,,0.428571,0.45275,17.7,0.058824
1,201166_2014-15,2.7,29,2014-15,0.118,169,0.268,201166,6.0,123.0,...,0.62069,0.636364,29.496403,0.113208,0.126761,0.413793,0.758621,0.477767,4.073684,0.357724
2,203932_2014-15,1.7,16,2014-15,0.197,53,0.105,203932,18.0,55.0,...,0.772727,0.818182,15.492958,0.153846,0.142857,0.454545,0.636364,0.543814,5.066667,0.218182
3,203940_2014-15,1.3,9,2014-15,-0.075,20,0.12,203940,6.0,27.0,...,0.2,0.2,15.151515,,,,0.2,0.43881,16.825,0.185185
4,201143_2014-15,2.6,33,2014-15,0.175,206,0.212,201143,24.0,127.0,...,0.769231,0.767857,21.014493,0.211538,0.25974,0.384615,0.384615,0.496688,10.173077,0.228346


In [15]:
# (rows, columns) of the augmented dataset
augmented_master_clutch_df.shape

(2672, 40)

### Main

In [17]:
def main(
    master_path="./MasterData/MasterClutch.csv",
    nba_ids_path="./RawData/stats/nba_ids.csv",
    output_path="./MasterData/Augmented_MasterClutch.csv",
    null_threshold=0.30,
):
    """Run the full clutch augmentation pipeline and write the result.

    Steps: load the master clutch data and the NBA id table, work out which
    players/seasons are present, download the matching PBP clutch totals,
    drop sparse columns, merge everything onto the master data and write the
    selected feature columns to CSV.

    Args:
        master_path: CSV with the master clutch dataset (first column is
            read as the index).
        nba_ids_path: CSV mapping ``idPlayer`` to ``namePlayer``.
        output_path: Destination CSV for the augmented dataset.
        null_threshold: Columns of the downloaded data with a larger
            fraction of missing values are dropped before merging.

    Raises:
        KeyError: If one of the expected feature columns is absent after
            the merge (for example because preprocessing dropped it).
    """
    master_clutch_df = pd.read_csv(master_path, index_col=0)
    nba_id_df = pd.read_csv(nba_ids_path)

    #Find players in the master dataset
    clutch_player_list_df = find_player_list(master_clutch_df, nba_id_df)

    #Pull PBP data for every season present in the master dataset
    seasons_list = list(clutch_player_list_df["Season"].unique())
    clutch_data = get_pbp_clutch_data(seasons_list)

    #Clean up PBP data
    clutch_df = pbp_clutch_data_preprocessing(clutch_data, null_threshold=null_threshold)

    #Create augmented dataset
    augmented_master_clutch_df = merge_into_master(clutch_df, master_clutch_df, clutch_player_list_df)
    # 'Season' and 'idPlayer' are dropped from the augmentation side inside
    # merge_into_master, so they have to come from the master frame (either
    # already in the CSV or added in place by find_player_list).
    feature_cols = ['pidSzn', 'MIN', 'GP', 'Season', 'PIE', 'POSS', 'USG_PCT', 'idPlayer',
                    'PlusMinus', 'OffPoss', 'DefPoss', 'PenaltyOffPoss', 'PenaltyDefPoss',
                    'SecondChanceOffPoss', 'TotalPoss', 'AtRimFGA', 'FG2M', 'FG2A', 'Points',
                    'OpponentPoints', 'DefTwoPtRebounds', 'DefRebounds', 'Rebounds', 'Fouls',
                    'FoulsDrawn', 'FirstChancePoints', 'PenaltyOffPossExcludingTakeFouls',
                    'OnOffRtg', 'OnDefRtg', 'Fg2Pct', 'EfgPct', 'TsPct', 'Usage', 'DefTwoPtReboundPct',
                    'DefFGReboundPct', 'AtRimFrequency', 'AtRimFG3AFrequency', 'ShotQualityAvg',
                    'Avg2ptShotDistance', 'PenaltyOffPossPct']
    augmented_master_clutch_df = augmented_master_clutch_df[feature_cols]

    #Write to master data folder
    augmented_master_clutch_df.to_csv(output_path)


# Run the pipeline only when executed directly (script or notebook cell),
# not when this code is imported from elsewhere.
if __name__ == "__main__":
    main()