In [None]:
import numpy as np
import pandas as pd
from nba_api.stats.endpoints import playercareerstats, playergamelogs, gamerotation
from nba_api.stats.static import players
from collections import defaultdict

In [None]:
# Run-mode switch: True processes only the hand-picked sample players and writes
# to data/sample_aggregate/; False processes every name from the salary table and
# writes to data/aggregate/.
SAMPLE = False

In [None]:
# Salary table; the first CSV column becomes the index.
# Later cells group this frame by its "position" column and read its "name" column.
salary = pd.read_csv("data/avg_player_salary.csv", index_col=0)
salary

In [None]:
# Static player records used for name -> id lookups; each record is read via its
# "full_name" and "id" keys.
# NOTE(review): this loads the *active* player list only, so names of players who
# are no longer active presumably will not resolve — confirm that is intended for
# a 2013-2023 window.
all_players = players.get_active_players()
def get_id_by_name(name):
    """Look up a player's id by exact full name.

    Scans ``all_players`` in order and returns the ``"id"`` of the first record
    whose ``"full_name"`` equals ``name`` (exact, case-sensitive comparison).
    Returns ``None`` when no record matches.
    """
    matching_ids = (record["id"] for record in all_players if record["full_name"] == name)
    return next(matching_ids, None)

In [None]:
# Two-digit end-year suffixes "14" .. "23", zero-padded to width 2.
suffixes = np.array([f"{end_year % 100:02d}" for end_year in range(2014, 2024)])
# Season labels in "YYYY-YY" form: "2013-14" through "2022-23".
seasons = pd.Series(
    [f"{start_year}-{suffix}" for start_year, suffix in zip(range(2013, 2023), suffixes)]
)
# NOTE(review): this holds 11 years (2013..2023) while `seasons` holds 10 labels —
# confirm the extra year is intentional.
season_year = np.arange(2013, 2024)
seasons.values, len(seasons)

In [None]:
# Identifying columns of a game-log row (who, which team, which game).
basics = ["PLAYER_NAME", "PLAYER_ID", "TEAM_NAME", "TEAM_ABBREVIATION", "GAME_DATE", "MATCHUP"]
# Feature columns of the metric: three *_PCT columns plus the per-minute (*_MIN) rates.
stats = ["FG3_PCT", "FG_PCT", "FT_PCT", "PTS_MIN", "AST_MIN", "REB_MIN", "STL_MIN", "BLK_MIN", "TOV_MIN", "PF_MIN", "PFD_MIN"]

In [None]:
def apply_metric(row):
    """Compute the composite score for a single game-log row.

    Adds eight per-minute entries (``<STAT>_MIN`` = ``row[<STAT>] / row["MIN"]``)
    to ``row`` itself, then returns the signed sum of the three percentage
    columns and those per-minute rates.

    Parameters
    ----------
    row : label-indexed row (e.g. a ``pandas.Series``) containing ``MIN``, the raw
        counts ``REB, AST, PTS, TOV, STL, BLK, PF, PFD`` and ``FG3_PCT, FG_PCT, FT_PCT``.

    Returns
    -------
    The dot product of the feature values with their +1 / -1 weights.

    Notes
    -----
    ``row["MIN"]`` is used as a divisor, so rows with zero minutes should be
    removed before calling this.
    """
    # Derive the per-minute rates; the order only affects the layout of the mutated row.
    for counter in ("REB", "AST", "PTS", "TOV", "STL", "BLK", "PF", "PFD"):
        row[f"{counter}_MIN"] = row[counter] / row["MIN"]

    # +1 rewards a feature, -1 penalises it (turnovers and the two foul columns).
    # NOTE(review): PFD_MIN is penalised together with PF_MIN — confirm that is the
    # intended sign for that column.
    signed_weights = {
        "FG3_PCT": 1,
        "FG_PCT": 1,
        "FT_PCT": 1,
        "PTS_MIN": 1,
        "AST_MIN": 1,
        "REB_MIN": 1,
        "STL_MIN": 1,
        "BLK_MIN": 1,
        "TOV_MIN": -1,
        "PF_MIN": -1,
        "PFD_MIN": -1,
    }
    feature_names = list(signed_weights)
    return np.dot(row[feature_names], list(signed_weights.values()))


In [None]:
# Season-wide game-log frames keyed by season label. The request below is not
# filtered by player, so one download per season serves every player lookup.
_SEASON_GAME_LOGS = {}


def _get_season_game_logs(season):
    """Return the full game-log frame for ``season``, downloading it on first use only."""
    if season not in _SEASON_GAME_LOGS:
        games = playergamelogs.PlayerGameLogs(season_nullable=season)
        _SEASON_GAME_LOGS[season] = games.get_data_frames()[0]
    return _SEASON_GAME_LOGS[season]


def get_player_season_data(player_id, season):
    """Return one player's game-log rows for one season, in reversed row order.

    Parameters
    ----------
    player_id : value compared against the ``PLAYER_ID`` column.
    season : season label passed to the endpoint as ``season_nullable``
        (e.g. ``"2013-14"``).

    Returns
    -------
    pandas.DataFrame
        The rows whose ``PLAYER_ID`` equals ``player_id``, with the row order of
        the endpoint's frame reversed. Empty when nothing matches.

    Notes
    -----
    The season-wide frame is cached for the lifetime of the kernel (re-running
    this cell clears the cache), so repeated calls for the same season do not hit
    the API again. The boolean ``.loc`` selection returns new data, so callers
    never receive the cached frame itself.
    """
    season_logs = _get_season_game_logs(season)
    return season_logs.loc[season_logs.PLAYER_ID == player_id, :].iloc[::-1]

def get_player_data(player_id):
    """Stack a player's game logs across every label in ``seasons``.

    Calls ``get_player_season_data`` once per season, in the order ``seasons``
    yields them, and concatenates the results row-wise with a fresh 0..n-1 index.
    """
    per_season_frames = [
        get_player_season_data(player_id, season_label) for season_label in seasons
    ]
    return pd.concat(per_season_frames, axis=0, ignore_index=True)

In [None]:
# Lookup from position to the player names listed under it in the salary table.
# value_counts() yields one row per distinct combination of the remaining columns
# within each position; only "position" and "name" are kept, and "position"
# becomes the index.
position_by_name = (
    salary.groupby("position")
    .value_counts()
    .reset_index()
    .loc[:, ["position", "name"]]
    .set_index("position")
)


In [None]:
# Display the position -> player-name lookup for a visual sanity check.
position_by_name

In [None]:
# Hand-picked players per position; the sample run uses these instead of the full
# salary-table roster to keep the run short.
# Names are resolved by exact string comparison against the static player list,
# so each spelling here must agree with that list character for character.
most_popular_players = {
    "PG": ["Stephen Curry", "Damian Lillard", "Chris Paul", "Kyrie Irving", "Russell Westbrook"],
    "SG": ["James Harden", "Bradley Beal", "Donovan Mitchell", "Zach LaVine", "Devin Booker"],
    "SF": ["LeBron James", "Kevin Durant", "Kawhi Leonard", "Jayson Tatum", "Jimmy Butler"],
    "PF": ["Giannis Antetokounmpo", "Anthony Davis", "Julius Randle", "Domantas Sabonis", "Zion Williamson"],
    "C": ["Nikola Jokic", "Joel Embiid", "Rudy Gobert", "Karl-Anthony Towns", "Bam Adebayo"]
}

In [None]:
# Positions are processed (and appear as keys in the result) in this order.
_POSITIONS = ("C", "PF", "SF", "SG", "PG")


def _collect_position_frames(names_by_position):
    """Fetch and stack game logs for the named players of each position.

    Parameters
    ----------
    names_by_position : dict mapping a position label to a list of player full names.

    Returns
    -------
    dict
        Position label -> one DataFrame holding the concatenated game logs of
        every resolvable player of that position (fresh 0..n-1 index). A position
        for which no name resolves maps to an empty DataFrame.
    """
    frames_by_position = {}
    for pos, names in names_by_position.items():
        frames = []
        for name in names:
            player_id = get_id_by_name(name)
            if player_id is None:
                # An unresolved name can never match a PLAYER_ID value, so fetching
                # it would spend API requests and contribute zero rows.
                print(f"SKIPPED (no id found): {name}")
                continue
            frames.append(get_player_data(player_id))
            print(name)
        print()
        print(f"REACHED CONCAT STAGE, {pos}")
        # pd.concat raises on an empty list, hence the explicit fallback.
        frames_by_position[pos] = (
            pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
        )
    return frames_by_position


def get_sample_aggregate_by_position():
    """Build per-position game-log frames from the ``most_popular_players`` sample.

    Returns a dict keyed by position ("C", "PF", "SF", "SG", "PG").
    """
    # Small sample to make the run faster.
    return _collect_position_frames(
        {pos: most_popular_players[pos] for pos in _POSITIONS}
    )


def get_aggregate_by_position():
    """Build per-position game-log frames from every name in ``position_by_name``.

    Returns a dict keyed by position ("C", "PF", "SF", "SG", "PG").

    Raises
    ------
    KeyError
        If one of the positions is missing from ``position_by_name``; this is
        raised before any data is fetched.
    """
    # The list indexer [pos] makes .loc return a Series even when a position has
    # a single row (a scalar label would return a bare string with no .tolist()).
    return _collect_position_frames(
        {pos: position_by_name.loc[[pos], "name"].tolist() for pos in _POSITIONS}
    )

In [None]:
def compute_metric_and_save(aggr, sample):
    """Score every game-log row and write one CSV per position.

    For each position in ``aggr`` this drops rows with ``MIN == 0``, adds a
    ``METRIC`` column computed row by row with ``apply_metric``, stores the
    resulting frame back into ``aggr[pos]``, and writes the identifying columns
    plus ``METRIC`` to disk.

    Parameters
    ----------
    aggr : dict mapping a position label to a game-log DataFrame. Mutated in
        place: each value is replaced by its filtered, scored frame.
    sample : truthy to write ``data/sample_aggregate/{pos}_sample_aggregate.csv``,
        falsy to write ``data/aggregate/{pos}_aggregate.csv``.

    Returns
    -------
    None
    """
    import os  # local import: only needed to make sure the output folder exists

    basics = ["PLAYER_NAME", "PLAYER_ID", "TEAM_NAME", "TEAM_ABBREVIATION", "GAME_DATE", "MATCHUP"]
    cols = basics + ["METRIC"]

    # The two run modes differ only in where the files go and how they are named.
    if sample:
        out_dir, file_suffix = "data/sample_aggregate", "sample_aggregate"
    else:
        out_dir, file_suffix = "data/aggregate", "aggregate"
    # Create the folder up front so the fetched data is not lost to a missing directory.
    os.makedirs(out_dir, exist_ok=True)

    for pos in list(aggr.keys()):
        print(pos)
        # Zero-minute rows would divide by zero in the per-minute rates.
        # .copy() gives an independent frame so adding METRIC below is an
        # unambiguous write rather than an assignment on a slice.
        played = aggr[pos].loc[aggr[pos].MIN != 0, :].copy()
        aggr[pos] = played
        played["METRIC"] = played.apply(apply_metric, axis=1)
        played.loc[:, cols].to_csv(f"{out_dir}/{pos}_{file_suffix}.csv")

In [None]:
# Fetch the game logs for the selected mode, then score and save them.
if SAMPLE:
    sample_aggregate = get_sample_aggregate_by_position()
    # Bind `aggregate` in sample mode too, so later cells that inspect
    # `aggregate` work after a fresh Restart & Run All in either mode.
    aggregate = sample_aggregate
else:
    aggregate = get_aggregate_by_position()
compute_metric_and_save(aggregate, SAMPLE)

In [None]:
# Positions present in the fetched aggregate (one key per position).
aggregate.keys()

In [None]:
#! check if save is done correctly
# Read back the SG file written by the current run mode; the sample and full
# runs save to different folders and file names.
check_path = (
    "data/sample_aggregate/SG_sample_aggregate.csv"
    if SAMPLE
    else "data/aggregate/SG_aggregate.csv"
)
df = pd.read_csv(check_path, index_col=0)
df.loc[df.PLAYER_NAME == "James Harden", :]