In [2]:
import numpy as np
import pandas as pd
import time
import sqlite3
import copy
import warnings
from scipy import stats
from ordered_set import OrderedSet
from nba_api.stats.endpoints import boxscoretraditionalv2, leaguegamefinder, leaguestandings
from nba_api.stats.static import teams
from nba_api.stats.library.http import NBAStatsHTTP

warnings.filterwarnings("ignore")

In [3]:
# Create list of teams
nba_teams = teams.get_teams()
team_list = []
for team in nba_teams:
    team_list.append(team['abbreviation'])

In [4]:
# Create dataframe with all boxscores
full_game_list = pd.DataFrame()
for team in nba_teams:
    time.sleep(1)
    team_id = team['id']
    gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id, season_type_nullable='Regular Season')
    games = gamefinder.get_data_frames()[0]
    full_game_list = pd.concat([full_game_list, games])


In [5]:
full_game_list[full_game_list['GAME_ID']=='0022201216']

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
5,22022,1610612737,ATL,Atlanta Hawks,22201216,2023-04-09,ATL @ BOS,L,241,114,...,0.789,16.0,34.0,50.0,27,11.0,5,9,12,-6.0
5,22022,1610612738,BOS,Boston Celtics,22201216,2023-04-09,BOS vs. ATL,W,240,120,...,0.786,12.0,35.0,47.0,33,5.0,3,15,13,6.0


In [6]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        Parameters
        ----------
        df : Input DataFrame.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            - 'home' : Keep rows where TEAM_A is the home team.
            - 'away' : Keep rows where TEAM_A is the away team.
            - 'winner' : Keep rows where TEAM_A is the losing team.
            - 'loser' : Keep rows where TEAM_A is the winning team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.
                
        Returns
        -------
        result : DataFrame
    '''
    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=['_A', '_B'],
                      on=['SEASON_ID', 'GAME_ID', 'GAME_DATE'])
    # Filter out any row that is joined to itself.
    result = joined[joined.TEAM_ID_A != joined.TEAM_ID_B]
    # Take action based on the keep_method flag.
    if keep_method is None:
        # Return all the rows.
        pass
    elif keep_method.lower() == 'home':
        # Keep rows where TEAM_A is the home team.
        result = result[result.MATCHUP_A.str.contains(' vs. ')]
    elif keep_method.lower() == 'away':
        # Keep rows where TEAM_A is the away team.
        result = result[result.MATCHUP_A.str.contains(' @ ')]
    elif keep_method.lower() == 'winner':
        result = result[result.WL_A == 'W']
    elif keep_method.lower() == 'loser':
        result = result[result.WL_A == 'L']
    else:
        raise ValueError(f'Invalid keep_method: {keep_method}')
    return result

In [7]:
full_game_list_final = combine_team_games(full_game_list)
full_game_list_final.head()

Unnamed: 0,SEASON_ID,TEAM_ID_A,TEAM_ABBREVIATION_A,TEAM_NAME_A,GAME_ID,GAME_DATE,MATCHUP_A,WL_A,MIN_A,PTS_A,...,FT_PCT_B,OREB_B,DREB_B,REB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PLUS_MINUS_B
2,22023,1610612742,DAL,Dallas Mavericks,1522300074,2023-07-16,DAL vs. ATL,W,200,101,...,0.773,13.0,28.0,41.0,16,5.0,6,17,18,-23.0
5,22023,1610612737,ATL,Atlanta Hawks,1522300046,2023-07-13,ATL vs. PHI,W,199,99,...,0.735,1.0,26.0,27.0,15,4.0,9,5,11,-1.4
9,22023,1610612737,ATL,Atlanta Hawks,1522300038,2023-07-12,ATL vs. MIN,W,201,99,...,0.688,9.0,21.0,30.0,24,8.0,10,15,23,-10.8
14,22023,1610612743,DEN,Denver Nuggets,1522300023,2023-07-09,DEN vs. ATL,L,200,93,...,0.529,11.0,26.0,37.0,24,4.0,8,12,20,5.4
17,22023,1610612737,ATL,Atlanta Hawks,1522300007,2023-07-07,ATL vs. SAC,L,200,76,...,0.727,12.0,22.0,34.0,13,13.0,5,14,17,6.4


In [8]:
# Create connection to SQLITE db and write dataframe to file
# conn = sqlite3.connect('my_database.db')
# full_game_list_final.to_sql('games', conn, if_exists='replace', index=False)
# conn.commit()
# conn.close()

In [9]:
# Query from SQLITE table
# conn = sqlite3.connect('my_database.db')
# df_from_db = pd.read_sql_query("SELECT DISTINCT SEASON_ID FROM games", conn)
# conn.close()
# df_from_db.head(10)

In [10]:
def update_ranks(curr_team_stats):
    tmp = {}
    ranks = {}
    for key, value in curr_team_stats.items():
        tmp[key] = {'WIN PCT': value['WIN PCT']}

    # Create a sorted list of win percentages and then assign a ranking to each team
    tmp_sorted = sorted(tmp.items(), key=lambda x: x[1]['WIN PCT'], reverse=True)

    for i, (team, win_pct) in enumerate(tmp_sorted):
        ranks[team] = i+1
        tmp[team]['RANK'] = ranks[team]
    return tmp 

# Initialize master dataframe
final_stats_df = pd.DataFrame()

# Create season game-log dataframe (Starting from 2015 because prior to 2015, NBA game dynamics were less impacted by 3 point shots)
for season in ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']:
    games_season = full_game_list_final[((full_game_list_final.SEASON_ID.str[-4:]==season) & (full_game_list_final.GAME_ID.str[:2]=='00'))]
    games_season.sort_values('GAME_DATE', ascending=True, inplace=True)
    game_dates = sorted(list(set(games_season.GAME_DATE)))

    # Create dictionary for team stats, which will be updated daily with Wins, Losses, Win %, RANK, and other box score stats
    curr_team_stats = {}
    for team in team_list:
        curr_team_stats[team] = {'W': 0, 'L': 0, 'WIN PCT': 0, 'RANK': 0, 
        '3PM': 0, '3PA': 0, '3PT PCT': 0, '2PM': 0, '2PA': 0, '2PT PCT': 0, 'PTS': 0, 'TO': 0, 'POSS': 0, 'PP100P': 0, 'EFG': 0, 'ORB': 0, 'ORB PCT': 0, 'DRB': 0, 'DRB PCT': 0, 'OPP_MISSED_SHOTS': 0, 
        'OPP_3PM': 0, 'OPP_3PA': 0, 'OPP_3PT PCT': 0, 'OPP_2PM': 0, 'OPP_2PA': 0, 'OPP_2PT PCT': 0, 'OPP_PTS': 0, 'OPP_TO': 0, 'OPP_POSS': 0, 'OPP_PP100P': 0, 'OPP_EFG': 0}

    # At the end of each day we take a snapshot of the cumulative team stats and use that to help us predict the next day's games
    daily_team_stats = {}
    for date in game_dates:
        daily_team_stats[date] = {}
        for team in team_list:
            daily_team_stats[date][team] = {'W': 0, 'L': 0, 'WIN PCT': 0, 'RANK': 0,
            '3PM': 0, '3PA': 0, '3PT PCT': 0, '2PM': 0, '2PA': 0, '2PT PCT': 0, 'PTS': 0, 'TO': 0, 'POSS': 0, 'PP100P': 0, 'EFG': 0, 'ORB': 0, 'ORB PCT': 0, 'DRB': 0, 'DRB PCT': 0, 'OPP_MISSED_SHOTS': 0, 
            'OPP_3PM': 0, 'OPP_3PA': 0, 'OPP_3PT PCT': 0, 'OPP_2PM': 0, 'OPP_2PA': 0, 'OPP_2PT PCT': 0, 'OPP_PTS': 0, 'OPP_TO': 0, 'OPP_POSS': 0, 'OPP_PP100P': 0, 'OPP_EFG': 0}

    # Add helper columns containing the next game dates to tell us when to update the standings and rankings
    games_season['GAME_DATE_LEAD'] = games_season['GAME_DATE'].shift(-1)

    # Update current standings and rankings
    for index, game in games_season.iterrows():
        if game.WL_A == 'W':
            curr_team_stats[game.TEAM_ABBREVIATION_A]['W'] += 1
            curr_team_stats[game.TEAM_ABBREVIATION_B]['L'] += 1
        else:
            curr_team_stats[game.TEAM_ABBREVIATION_A]['L'] += 1
            curr_team_stats[game.TEAM_ABBREVIATION_B]['W'] += 1        
        

        # Update stats - TEAM A [Team A offensive performance]
        curr_team_stats[game.TEAM_ABBREVIATION_A]['WIN PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['W'] / (curr_team_stats[game.TEAM_ABBREVIATION_A]['W'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['L'])
        curr_team_stats[game.TEAM_ABBREVIATION_A]['3PM'] += int(game.FG3M_A)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['3PA'] += int(game.FG3A_A)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['3PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['3PM']/curr_team_stats[game.TEAM_ABBREVIATION_A]['3PA']
        curr_team_stats[game.TEAM_ABBREVIATION_A]['2PM'] += (int(game.FGM_A) - int(game.FG3M_A))
        curr_team_stats[game.TEAM_ABBREVIATION_A]['2PA'] += (int(game.FGA_A) - int(game.FG3A_A))
        curr_team_stats[game.TEAM_ABBREVIATION_A]['2PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['2PM']/curr_team_stats[game.TEAM_ABBREVIATION_A]['2PA']
        curr_team_stats[game.TEAM_ABBREVIATION_A]['PTS'] += int(game.PTS_A)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['TO'] += int(game.TOV_A)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['POSS'] += (int(game.FGA_A) - int(game.OREB_A) + int(game.TOV_A) + int(game.FTA_A)/2)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['PP100P'] = 100 * (curr_team_stats[game.TEAM_ABBREVIATION_A]['PTS'] / curr_team_stats[game.TEAM_ABBREVIATION_A]['POSS'])
        curr_team_stats[game.TEAM_ABBREVIATION_A]['EFG'] = (1.5 * curr_team_stats[game.TEAM_ABBREVIATION_A]['3PM'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['2PM']) / (curr_team_stats[game.TEAM_ABBREVIATION_A]['3PA'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['2PA'])
        curr_team_stats[game.TEAM_ABBREVIATION_A]['ORB'] += int(game.OREB_A)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['ORB PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['ORB'] / ((curr_team_stats[game.TEAM_ABBREVIATION_A]['2PA'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['3PA']) - 
        (curr_team_stats[game.TEAM_ABBREVIATION_A]['2PM'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['3PM']))
        curr_team_stats[game.TEAM_ABBREVIATION_A]['DRB'] += int(game.DREB_A)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_MISSED_SHOTS'] += (int(game.FGA_B) - int(game.FGM_B))
        curr_team_stats[game.TEAM_ABBREVIATION_A]['DRB PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['DRB'] / curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_MISSED_SHOTS']
        
        # Update stats - TEAM A OPPONENTS [Team A defensive performance]
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PM'] += int(game.FG3M_B)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PA'] += int(game.FG3A_B)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PM']/curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PA']
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PM'] += (int(game.FGM_B) - int(game.FG3M_B))
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PA'] += (int(game.FGA_B) - int(game.FG3A_B))
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PM']/curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PA']
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_PTS'] += int(game.PTS_B)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_TO'] += int(game.TOV_B)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_POSS'] += (int(game.FGA_B) - int(game.OREB_B) + int(game.TOV_B) + int(game.FTA_B)/2)
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_PP100P'] = 100 * (curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_PTS'] / curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_POSS'])
        curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_EFG'] = (1.5 * curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PM'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PM']) / (curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_3PA'] + curr_team_stats[game.TEAM_ABBREVIATION_A]['OPP_2PA'])

        # Update stats - TEAM B [Team B offensive performance]
        curr_team_stats[game.TEAM_ABBREVIATION_B]['WIN PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['W'] / (curr_team_stats[game.TEAM_ABBREVIATION_B]['W'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['L'])
        curr_team_stats[game.TEAM_ABBREVIATION_B]['3PM'] += int(game.FG3M_B)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['3PA'] += int(game.FG3A_B)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['3PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['3PM']/curr_team_stats[game.TEAM_ABBREVIATION_B]['3PA']
        curr_team_stats[game.TEAM_ABBREVIATION_B]['2PM'] += (int(game.FGM_B) - int(game.FG3M_B))
        curr_team_stats[game.TEAM_ABBREVIATION_B]['2PA'] += (int(game.FGA_B) - int(game.FG3A_B))
        curr_team_stats[game.TEAM_ABBREVIATION_B]['2PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['2PM']/curr_team_stats[game.TEAM_ABBREVIATION_B]['2PA']
        curr_team_stats[game.TEAM_ABBREVIATION_B]['PTS'] += int(game.PTS_B)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['TO'] += int(game.TOV_B)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['POSS'] += (int(game.FGA_B) - int(game.OREB_B) + int(game.TOV_B) + int(game.FTA_B)/2)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['PP100P'] = 100 * (curr_team_stats[game.TEAM_ABBREVIATION_B]['PTS'] / curr_team_stats[game.TEAM_ABBREVIATION_B]['POSS'])
        curr_team_stats[game.TEAM_ABBREVIATION_B]['EFG'] = (1.5 * curr_team_stats[game.TEAM_ABBREVIATION_B]['3PM'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['2PM']) / (curr_team_stats[game.TEAM_ABBREVIATION_B]['3PA'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['2PA'])
        curr_team_stats[game.TEAM_ABBREVIATION_B]['ORB'] += int(game.OREB_B)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['ORB PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['ORB'] / ((curr_team_stats[game.TEAM_ABBREVIATION_B]['2PA'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['3PA']) - 
        (curr_team_stats[game.TEAM_ABBREVIATION_B]['2PM'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['3PM']))
        curr_team_stats[game.TEAM_ABBREVIATION_B]['DRB'] += int(game.DREB_B)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_MISSED_SHOTS'] += (int(game.FGA_B) - int(game.FGM_B))
        curr_team_stats[game.TEAM_ABBREVIATION_B]['DRB PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['DRB'] / curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_MISSED_SHOTS']

        # Update stats - TEAM B OPPONENTS [Team B defensive performance]
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PM'] += int(game.FG3M_A)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PA'] += int(game.FG3A_A)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PM']/curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PA']
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PM'] += (int(game.FGM_A) - int(game.FG3M_A))
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PA'] += (int(game.FGA_A) - int(game.FG3A_A))
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PT PCT'] = curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PM']/curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PA']
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_PTS'] += int(game.PTS_A)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_TO'] += int(game.TOV_A)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_POSS'] += (int(game.FGA_A) - int(game.OREB_A) + int(game.TOV_A) + int(game.FTA_A)/2)
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_PP100P'] = 100 * (curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_PTS'] / curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_POSS'])
        curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_EFG'] = (1.5 * curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PM'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PM']) / (curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_3PA'] + curr_team_stats[game.TEAM_ABBREVIATION_B]['OPP_2PA'])


        # Update rankings/stats at the end of the day and store a snapshot
        if game.GAME_DATE != game.GAME_DATE_LEAD or game.GAME_DATE_LEAD is None:
            ranks = update_ranks(curr_team_stats)
            for team in team_list:
                curr_team_stats[team]['RANK'] = ranks[team]['RANK']

            # Store standings/ranking snapshot
            team_stats_snapshot = copy.deepcopy(curr_team_stats)
            daily_team_stats[game.GAME_DATE] = team_stats_snapshot

    # Instantiate a dictionary to track opponent rankings
    opponent_ranks = {}
    for team in team_list:
        opponent_ranks[team] = {'RANKS': [], 'W_L': []}

    # At the end of each day we take a snapshot of the opponent rankings and use that to help us predict the next day's games
    daily_opponent_ranks = {}
    for date in game_dates:
        daily_opponent_ranks[date] = {}
        for team in team_list:
            daily_opponent_ranks[date][team] = {'RANKS': [], 'W_L': [], 'GP': 0}
            

    # Iterate through each game in our dataframe to build out each team's opponent rank list (to calculate an average) using the daily_standings dictionary

    for index, game in games_season.iterrows():
        # Team A
        opp_rank = daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['RANK']
        opponent_ranks[game.TEAM_ABBREVIATION_A]['RANKS'].append(opp_rank)
        opponent_ranks[game.TEAM_ABBREVIATION_A]['W_L'].append(1 if game.WL_A == 'W' else 0)
        opponent_ranks[game.TEAM_ABBREVIATION_A]['GP'] = len(opponent_ranks[game.TEAM_ABBREVIATION_A]['W_L'])

        # Team B
        opp_rank = daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['RANK']
        opponent_ranks[game.TEAM_ABBREVIATION_B]['RANKS'].append(opp_rank)
        opponent_ranks[game.TEAM_ABBREVIATION_B]['W_L'].append(1 if game.WL_B == 'W' else 0)
        opponent_ranks[game.TEAM_ABBREVIATION_B]['GP'] = len(opponent_ranks[game.TEAM_ABBREVIATION_B]['W_L'])

        # Store daily snapshot of opponent rankings
        if game.GAME_DATE != game.GAME_DATE_LEAD or game.GAME_DATE_LEAD is None:
            # Store ranking snapshot
            rankings_snapshot = copy.deepcopy(opponent_ranks)
            daily_opponent_ranks[game.GAME_DATE] = rankings_snapshot
        
    # Add daily stats for each team in each game

    # Team A
    team_a_gp = []
    team_a_sos = []
    team_a_sos_last10 = []
    team_a_win_pct = []
    team_a_win_pct_last10 = []
    team_a_3pt_pct = []
    team_a_2pt_pct = []
    team_a_pp100p = []
    team_a_efg = []
    team_a_orb_pct = []
    team_a_drb_pct = []
    team_a_opp_3pt_pct = []
    team_a_opp_2pt_pct = []
    team_a_opp_pp100p = []
    team_a_opp_efg = []


    # Team B
    team_b_gp = []
    team_b_sos = []
    team_b_sos_last10 = []
    team_b_win_pct = []
    team_b_win_pct_last10 = []
    team_b_3pt_pct = []
    team_b_2pt_pct = []
    team_b_pp100p = []
    team_b_efg = []
    team_b_orb_pct = []
    team_b_drb_pct = []
    team_b_opp_3pt_pct = []
    team_b_opp_2pt_pct = []
    team_b_opp_pp100p = []
    team_b_opp_efg = []


    for index, game in games_season.iterrows():
        # Team A
        team_a_gp.append(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['GP'])
        team_a_sos.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['RANKS']))
        team_a_sos_last10.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['RANKS'][-10:]))
        team_a_win_pct.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['W_L']))
        team_a_win_pct_last10.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['W_L'][-10:]))

        team_a_3pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['3PT PCT'])
        team_a_2pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['2PT PCT'])
        team_a_pp100p.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['PP100P'])
        team_a_efg.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['EFG'])
        team_a_orb_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['ORB PCT'])
        team_a_drb_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['DRB PCT'])
        team_a_opp_3pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['OPP_3PT PCT'])
        team_a_opp_2pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['OPP_2PT PCT'])
        team_a_opp_pp100p.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['OPP_PP100P'])
        team_a_opp_efg.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_A]['OPP_EFG'])


        # Team B
        team_b_gp.append(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['GP'])
        team_b_sos.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['RANKS']))
        team_b_sos_last10.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['RANKS'][-10:]))
        team_b_win_pct.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['W_L']))
        team_b_win_pct_last10.append(np.mean(daily_opponent_ranks[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['W_L'][-10:]))

        team_b_3pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['3PT PCT'])
        team_b_2pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['2PT PCT'])
        team_b_pp100p.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['PP100P'])
        team_b_efg.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['EFG'])
        team_b_orb_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['ORB PCT'])
        team_b_drb_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['DRB PCT'])
        team_b_opp_3pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['OPP_3PT PCT'])
        team_b_opp_2pt_pct.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['OPP_2PT PCT'])
        team_b_opp_pp100p.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['OPP_PP100P'])
        team_b_opp_efg.append(daily_team_stats[game.GAME_DATE][game.TEAM_ABBREVIATION_B]['OPP_EFG'])


    games_season['TEAM_A_GP'] = team_a_gp
    games_season['TEAM_B_GP'] = team_b_gp

    # Team A
    games_season['TEAM_A_SOS'] = team_a_sos
    games_season['TEAM_A_SOS_LAST10'] = team_a_sos_last10
    games_season['TEAM_A_WIN_PCT'] = team_a_win_pct
    games_season['TEAM_A_WIN_PCT_LAST10'] = team_a_win_pct_last10
    games_season['TEAM_A_3PT_PCT'] = team_a_3pt_pct
    games_season['TEAM_A_2PT_PCT'] = team_a_2pt_pct
    games_season['TEAM_A_PP100P'] = team_a_pp100p
    games_season['TEAM_A_EFG'] = team_a_efg
    games_season['TEAM_A_ORB_PCT'] = team_a_orb_pct
    games_season['TEAM_A_DRB_PCT'] = team_a_drb_pct
    games_season['TEAM_A_OPP_3PT_PCT'] = team_a_opp_3pt_pct
    games_season['TEAM_A_OPP_2PT_PCT'] = team_a_opp_2pt_pct
    games_season['TEAM_A_OPP_PP100P'] = team_a_opp_pp100p
    games_season['TEAM_A_OPP_EFG'] = team_a_opp_efg

    # Team B
    games_season['TEAM_B_SOS'] = team_b_sos
    games_season['TEAM_B_SOS_LAST10'] = team_b_sos_last10
    games_season['TEAM_B_WIN_PCT'] = team_b_win_pct
    games_season['TEAM_B_WIN_PCT_LAST10'] = team_b_win_pct_last10
    games_season['TEAM_B_3PT_PCT'] = team_b_3pt_pct
    games_season['TEAM_B_2PT_PCT'] = team_b_2pt_pct
    games_season['TEAM_B_PP100P'] = team_b_pp100p
    games_season['TEAM_B_EFG'] = team_b_efg
    games_season['TEAM_B_ORB_PCT'] = team_b_orb_pct
    games_season['TEAM_B_DRB_PCT'] = team_b_drb_pct
    games_season['TEAM_B_OPP_3PT_PCT'] = team_b_opp_3pt_pct
    games_season['TEAM_B_OPP_2PT_PCT'] = team_b_opp_2pt_pct
    games_season['TEAM_B_OPP_PP100P'] = team_b_opp_pp100p
    games_season['TEAM_B_OPP_EFG'] = team_b_opp_efg


    initial_columns = ['SEASON_ID', 'TEAM_ABBREVIATION_A', 'TEAM_ABBREVIATION_B', 'GAME_ID', 'GAME_DATE']
    additional_columns =  list(games_season.columns[-30:])
    additional_columns.extend(['PLUS_MINUS_A'])
    final_columns = initial_columns + additional_columns
    final_df = games_season[final_columns]

    # Concatenate team stats data into master DF with multiple seasons
    final_stats_df = pd.concat([final_stats_df, final_df])

In [11]:
final_stats_df.head()

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION_A,TEAM_ABBREVIATION_B,GAME_ID,GAME_DATE,TEAM_A_GP,TEAM_B_GP,TEAM_A_SOS,TEAM_A_SOS_LAST10,TEAM_A_WIN_PCT,...,TEAM_B_2PT_PCT,TEAM_B_PP100P,TEAM_B_EFG,TEAM_B_ORB_PCT,TEAM_B_DRB_PCT,TEAM_B_OPP_3PT_PCT,TEAM_B_OPP_2PT_PCT,TEAM_B_OPP_PP100P,TEAM_B_OPP_EFG,PLUS_MINUS_A
39967,22015,GSW,NOP,21500003,2015-10-27,1.0,1.0,7.0,7.0,1.0,...,0.446154,89.201878,0.457831,0.166667,0.520833,0.3,0.484848,104.716981,0.473958,16.0
2696,22015,ATL,DET,21500001,2015-10-27,1.0,1.0,3.0,3.0,0.0,...,0.373134,104.950495,0.447917,0.389831,0.610169,0.296296,0.527273,96.410256,0.5,-12.0
28098,22015,CHI,CLE,21500002,2015-10-27,1.0,1.0,6.0,6.0,1.0,...,0.446154,93.596059,0.452128,0.196429,0.696429,0.368421,0.441176,92.822967,0.465517,2.0
28094,22015,MEM,CLE,21500011,2015-10-28,1.0,2.0,15.0,15.0,0.0,...,0.475,100.0,0.505618,0.232323,0.818182,0.257143,0.425373,85.432099,0.41716,-30.0
68638,22015,HOU,DEN,21500010,2015-10-28,1.0,1.0,4.0,4.0,0.0,...,0.519231,105.0,0.588608,0.230769,1.025641,0.228571,0.423077,84.158416,0.390805,-20.0


In [12]:
final_stats_df.tail()

Unnamed: 0,SEASON_ID,TEAM_ABBREVIATION_A,TEAM_ABBREVIATION_B,GAME_ID,GAME_DATE,TEAM_A_GP,TEAM_B_GP,TEAM_A_SOS,TEAM_A_SOS_LAST10,TEAM_A_WIN_PCT,...,TEAM_B_2PT_PCT,TEAM_B_PP100P,TEAM_B_EFG,TEAM_B_ORB_PCT,TEAM_B_DRB_PCT,TEAM_B_OPP_3PT_PCT,TEAM_B_OPP_2PT_PCT,TEAM_B_OPP_PP100P,TEAM_B_OPP_EFG,PLUS_MINUS_A
25665,22022,CLE,CHA,22201218,2023-04-09,82.0,82.0,16.45122,19.3,0.621951,...,0.527825,105.796849,0.516053,0.223684,0.69225,0.357041,0.549902,111.659792,0.544446,-11.0
142762,22022,NYK,IND,22201220,2023-04-09,82.0,82.0,15.621951,18.4,0.573171,...,0.54044,110.333256,0.544588,0.213279,0.67426,0.37285,0.551198,114.626726,0.554213,-5.0
77373,22022,POR,GSW,22201230,2023-04-09,82.0,82.0,16.060976,14.2,0.402439,...,0.564495,112.836235,0.570743,0.224125,0.706016,0.363636,0.53638,110.561151,0.539943,-56.0
106245,22022,LAL,UTA,22201228,2023-04-09,82.0,82.0,14.902439,18.3,0.52439,...,0.560478,111.673355,0.547454,0.249485,0.704133,0.361484,0.540448,112.831575,0.541118,11.0
22,22022,BOS,ATL,22201216,2023-04-09,82.0,82.0,16.304878,16.1,0.695122,...,0.547643,113.023743,0.541194,0.234934,0.705684,0.356259,0.561935,112.803493,0.551703,6.0


In [13]:
# Save final dataframe
final_stats_df.to_csv('../../generated_datasets/final_stats_df.csv', index=False)