In [181]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [30]:
base_nba_per_game = pd.read_csv("bases_nba_stats/base_nba_per_game_17_18_arrumada.csv", sep=";")

In [31]:
base_nba_per_game["fl_home"] = np.where(base_nba_per_game["GAME"].str[6:9] == base_nba_per_game["TEAM_ABBREVIATION"], 1, 0)

In [32]:
home_games = base_nba_per_game[base_nba_per_game["fl_home"] == 1].set_index("GAME")
away_games = base_nba_per_game[base_nba_per_game["fl_home"] == 0].set_index("GAME")

In [33]:
print("Home", home_games.shape)
print("Away", away_games.shape)

Home (1230, 112)
Away (1230, 112)


In [34]:
all_games = home_games.join(away_games, how="inner", lsuffix="_home", rsuffix="_away")
all_games.drop(["GAME_ID_away", "GAME_DATE_away", "GAME_PLACE_away",
                "MIN_home", "MIN_away", 'PTS_hustle_home', 'PTS_hustle_away',
                "fl_home_away", "fl_home_home"], axis=1, inplace=True)

In [35]:
all_games[["TEAM_ABBREVIATION_away", "PTS_away", "TEAM_ABBREVIATION_home", "PTS_home"]].head()

Unnamed: 0_level_0,TEAM_ABBREVIATION_away,PTS_away,TEAM_ABBREVIATION_home,PTS_home
GAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BOS @ CLE 2017-10-17,BOS,99,CLE,102
HOU @ GSW 2017-10-17,HOU,122,GSW,121
CHA @ DET 2017-10-18,CHA,90,DET,102
BKN @ IND 2017-10-18,BKN,131,IND,140
MIA @ ORL 2017-10-18,MIA,109,ORL,116


In [36]:
def get_season(date):
    ano = date.year
    if(date.month >= 10):
        return ano + 1
    return ano

def is_playoff(date):
    # Playoffs 2016
    if date >= datetime(2016, 4, 16) and date < datetime(2016, 6, 30):
        return 1
    
    # Playoffs 2017
    elif date >= datetime(2017, 4, 15) and date < datetime(2017, 6, 30):
        return 1
    
     # Playoffs 2018
    elif date >= datetime(2018, 4, 14) and date < datetime(2018, 6, 30):
        return 1
    
    return 0

In [52]:
all_games["DATE"] = [datetime.strptime(str(x), '%d/%m/%Y') for x in all_games.GAME_DATE_home]
all_games["SEASON"] = [get_season(x) for x in all_games.DATE]
all_games["fl_playoff"] = [is_playoff(x) for x in all_games.DATE]
all_games['fl_home_win'] = np.where(all_games['PTS_home'] > all_games['PTS_away'], 1, 0)
all_games = all_games.sort_values('DATE')

In [39]:
all_games = all_games.rename(columns={'TEAM_ABBREVIATION_home': "team_home",
                                     'TEAM_ABBREVIATION_away': "team_away"})

In [54]:
all_games["team_home_game_num"] = all_games.groupby(['team_home']).cumcount() + 1
all_games["team_away_game_num"] = all_games.groupby(['team_away']).cumcount() + 1

In [55]:
all_games.to_csv("./bases_nba_stats/all_games_nba_17_18.csv")

### Cria features de Número de Jogos

In [303]:
all_games["NUM_GAMES_LAST_2_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=2))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_home"] == x.team_home) |
                                                     (all_games["team_away"] == x.team_home)), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

all_games["NUM_GAMES_LAST_4_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=4))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_home"] == x.team_home) |
                                                     (all_games["team_away"] == x.team_home)), 1, 0).sum() 
                                           
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_LAST_6_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=6))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_home"] == x.team_home) |
                                                     (all_games["team_away"] == x.team_home)), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

all_games["NUM_GAMES_LAST_10_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=10))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_home"] == x.team_home) |
                                                     (all_games["team_away"] == x.team_home)), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

all_games["NUM_GAMES_AWAY_LAST_2_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=2))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_home), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_AWAY_LAST_4_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=4))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_home), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_AWAY_LAST_6_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=6))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_home), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_AWAY_LAST_10_DAYS_home"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=10))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_home), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

In [305]:
all_games["NUM_GAMES_LAST_2_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=2))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_away"] == x.team_away) |
                                                     (all_games["team_home"] == x.team_away)), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

all_games["NUM_GAMES_LAST_4_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=4))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_away"] == x.team_away) |
                                                     (all_games["team_home"] == x.team_away)), 1, 0).sum() 
                                           
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_LAST_6_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=6))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_away"] == x.team_away) |
                                                     (all_games["team_home"] == x.team_away)), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

all_games["NUM_GAMES_LAST_10_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=10))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    ((all_games["team_away"] == x.team_away) |
                                                     (all_games["team_home"] == x.team_away)), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

all_games["NUM_GAMES_AWAY_LAST_2_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=2))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_away), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_AWAY_LAST_4_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=4))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_away), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_AWAY_LAST_6_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=6))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_away), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]
all_games["NUM_GAMES_AWAY_LAST_10_DAYS_away"] = [np.where((all_games["DATE"] >= (x.DATE - timedelta(days=10))) & 
                                                    (all_games["DATE"] < x.DATE) &
                                                    (all_games["team_away"] == x.team_away), 1, 0).sum() 
                                           for _, x in all_games.iterrows()]

### Últimos 5 Jogos

In [306]:
home_columns = [x for x in all_games.columns if x.endswith("_home") and x not in ['GAME_ID_home', 'TEAM_CITY_home', 'GAME_DATE_home', 'GAME_PLACE_home', 'TEAM_NICKNAME_home']]
away_columns = [x for x in all_games.columns if x.endswith("_away") and x not in ['TEAM_CITY_away', 'TEAM_NICKNAME_away']]

In [307]:
extra_columns = ["WIN_HOME", "WIN_AWAY", "WIN", "WIN_HOME_PCT", "WIN_AWAY_PCT", "WIN_PCT", 
                 "NUM_GAMES_HOME", "NUM_GAMES_AWAY", "DAYS_DIFF_TOTAL", "DAYS_DIFF_NEXT_GAME", 
                 "DAYS_DIFF_NEXT_GAMES_STD"]

home_columns += [x + "_home" for x in extra_columns]
away_columns += [x + "_away" for x in extra_columns]

In [311]:
def get_last_games(df, data, team_name, n = 5, filter="all", data_ref = None, verbose=False):
    if(filter == "all"):
        last_games = df[(df["DATE"] < data) & 
                        ((df["team_home"] == team_name) | (df["team_away"] == team_name))].tail(n)
        
    elif(filter == "home"):
        last_games = df[(df["DATE"] < data) & (df["team_home"] == team_name)].tail(n)
    elif(filter == "away"):
        last_games = df[(df["DATE"] < data) & (df["team_away"] == team_name)].tail(n)
    
    # Cria variávies de Win %
    # Visão home
    last_games["WIN_HOME"] = np.where((last_games["team_home"] == team_name) & (last_games["fl_home_win"] == 1) , 1, 0) * n
    last_games["NUM_GAMES_HOME"] = np.where(last_games["team_home"] == team_name, 1, 0).sum()
    last_games["WIN_HOME_PCT"] = (last_games["WIN_HOME"] / last_games["NUM_GAMES_HOME"]) 
    
    # Visão Away
    last_games["WIN_AWAY"] = np.where((last_games["team_away"] == team_name) & (last_games["fl_home_win"] == 0), 1, 0) * n
    last_games["NUM_GAMES_AWAY"] = np.where(last_games["team_away"] == team_name, 1, 0).sum()
    last_games["WIN_AWAY_PCT"] = (last_games["WIN_AWAY"] / last_games["NUM_GAMES_AWAY"]) 
    
    # Visão geral
    last_games["WIN_PCT"] = np.where(((last_games["team_home"] == team_name) & (last_games["fl_home_win"] == 1)) | 
                                            ((last_games["team_away"] == team_name) & (last_games["fl_home_win"] == 0)), 1, 0)
    last_games["WIN"] = last_games["WIN_PCT"] * n
    
    # Cria variáveis de data
    if(data_ref is None):
        data_ref = np.max(last_games["DATE"]) + timedelta(days=1)
    
    # Visão da série
    last_games["DAYS_DIFF_TOTAL"] = (np.max(last_games["DATE"]) - np.min(last_games["DATE"])).days
    last_games["DAYS_DIFF_NEXT_GAME"] = [-x.days if not np.isnan(x.days) else 0 
                      for x in last_games["DATE"].sub(last_games["DATE"].shift(-1).fillna(data_ref))]
    last_games["DAYS_DIFF_NEXT_GAMES_STD"] = np.std(last_games["DAYS_DIFF_NEXT_GAME"])

    if(verbose):
        print(last_games[["team_home", "team_away", "PTS_home", "PTS_away", "DATE",
                          "WIN_HOME", "WIN_AWAY", "WIN", "WIN_HOME_PCT", "WIN_AWAY_PCT", "WIN_PCT",
                         "NUM_GAMES_HOME", "NUM_GAMES_AWAY", "DAYS_DIFF_TOTAL", "NUM_GAMES_LAST_2_DAYS", 
                         "DAYS_DIFF_NEXT_GAME", "DAYS_DIFF_NEXT_GAMES_STD"]])
    
    return(last_games)

def get_avg_last_games(last_games, team_name, n = 5, rivals = False):
    
    if(rivals == False):
        last_games_home = last_games[last_games["team_home"] == team_name][home_columns].groupby(["team_home"]).sum()
        last_games_away = last_games[last_games["team_away"] == team_name][away_columns].groupby(["team_away"]).sum()
        last_games_home.columns = [x.replace("_home","") for x in last_games_home.columns]
        last_games_away.columns = [x.replace("_away","") for x in last_games_away.columns]
    else:
        last_games_home = last_games[last_games["team_home"] != team_name][home_columns].drop("team_home", axis = 1).sum()
        last_games_away = last_games[last_games["team_away"] != team_name][away_columns].drop("team_away", axis = 1).sum()
        last_games_home.index = [x.replace("_home","_opponent") for x in last_games_home.index]
        last_games_away.index = [x.replace("_away","_opponent") for x in last_games_away.index]
    
    if(len(last_games_home) == 0):
        if(rivals):
            return (last_games_away/n).to_frame().transpose()
        else:
            return (last_games_away/n)
    
    if(len(last_games_away) == 0):
        if(rivals):
            return(last_games_home/n).to_frame().transpose()
        else:
            return(last_games_home/n)
    
    if(rivals):
        return((last_games_home + last_games_away) / n).to_frame().transpose()
    else:
        return((last_games_home + last_games_away) / n)    

In [312]:
teste = all_games[all_games["team_home"] == "ATL"]
teste = get_last_games(all_games, teste.iloc[10]["DATE"], "ATL", verbose=True).reset_index()
get_avg_last_games(teste, "ATL", rivals=False)

                     team_home team_away  PTS_home  PTS_away       DATE  \
GAME                                                                      
LAC @ ATL 2017-11-22       ATL       LAC       103       116 2017-11-22   
NYK @ ATL 2017-11-24       ATL       NYK       116       104 2017-11-24   
TOR @ ATL 2017-11-25       ATL       TOR        78       112 2017-11-25   
CLE @ ATL 2017-11-30       ATL       CLE       114       121 2017-11-30   
ATL @ BKN 2017-12-02       BKN       ATL       102       114 2017-12-02   

                      WIN_HOME  WIN_AWAY  WIN  WIN_HOME_PCT  WIN_AWAY_PCT  \
GAME                                                                        
LAC @ ATL 2017-11-22         0         0    0          0.00           0.0   
NYK @ ATL 2017-11-24         5         0    5          1.25           0.0   
TOR @ ATL 2017-11-25         0         0    0          0.00           0.0   
CLE @ ATL 2017-11-30         0         0    0          0.00           0.0   
ATL @ BKN 20

KeyError: "['WIN_HOME_home' 'WIN_AWAY_home' 'WIN_home' 'WIN_HOME_PCT_home'\n 'WIN_AWAY_PCT_home' 'WIN_PCT_home' 'NUM_GAMES_HOME_home'\n 'NUM_GAMES_AWAY_home' 'DAYS_DIFF_TOTAL_home' 'DAYS_DIFF_NEXT_GAME_home'\n 'DAYS_DIFF_NEXT_GAMES_STD_home'] not in index"

In [278]:
resp = []
season = all_games.reset_index()

for index, row in season.iterrows():
#for index, row in regular_season.groupby("season").first().iterrows():
        # Home team
        home_last_games = get_last_games(season, row["DATE"], row["team_home"])
        home_avg_last_games = get_avg_last_games(home_last_games, row["team_home"])
        home_rivals_last_games = get_avg_last_games(home_last_games, row["team_home"], rivals=True)

        home_avg_last_games["game_ref"] = [row.GAME]
        home_avg_last_games.set_index("game_ref", inplace=True)
        home_avg_last_games.drop(["team_home", "team_away"],axis=1 ,errors="ignore", inplace=True)

        home_rivals_last_games["game_ref"] = [row.GAME]
        home_rivals_last_games.set_index("game_ref", inplace=True)
        home_rivals_last_games.drop(["team_home", "team_away"],axis=1 ,errors="ignore", inplace=True)

        #print(home_rivals_last_games.index, home_avg_last_games.index)

        # Away team
        away_last_games = get_last_games(season, row["DATE"], row["team_away"]).reset_index()
        away_avg_last_games = get_avg_last_games(away_last_games, row["team_away"])
        away_rivals_last_games = get_avg_last_games(away_last_games, row["team_away"], rivals=True)

        away_avg_last_games["game_ref"] = [row.GAME]
        away_avg_last_games.set_index("game_ref", inplace=True)
        away_avg_last_games.drop(["team_home", "team_away"],axis=1 ,errors="ignore", inplace=True)

        away_rivals_last_games["game_ref"] = [row.GAME]
        away_rivals_last_games.set_index("game_ref", inplace=True)
        away_rivals_last_games.drop(["team_home", "team_away"],axis=1 ,errors="ignore", inplace=True)

        #print(away_rivals_last_games.index, away_avg_last_games.index)

        # Junta bases 
        rivals_last_games = home_rivals_last_games.join(away_rivals_last_games, how="inner",
                                            lsuffix='_home_last_5_games', rsuffix='_away_last_5_games')
    
        avg_last_games = home_avg_last_games.join(away_avg_last_games, how="inner", 
                             lsuffix='_home_last_5_games', rsuffix='_away_last_5_games')

        
        print(rivals_last_games.columns)
        
        game_line = avg_last_games.join(rivals_last_games,
                                        how="inner")

        game_line = pd.concat([row.to_frame().transpose().set_index("GAME"), game_line], axis=1)

        print(str(row.GAME), end="\r")
        
        resp.append(game_line)

KeyError: "['WIN_HOME_home' 'WIN_AWAY_home' 'WIN_home' 'WIN_HOME_PCT_home'\n 'WIN_AWAY_PCT_home' 'WIN_PCT_home' 'NUM_GAMES_HOME_home'\n 'NUM_GAMES_AWAY_home' 'NUM_GAMES_LAST_2_DAYS_home'\n 'NUM_GAMES_LAST_4_DAYS_home' 'NUM_GAMES_LAST_6_DAYS_home'\n 'NUM_GAMES_LAST_10_DAYS_home' 'NUM_GAMES_LAST_2_DAYS_HOME_home'\n 'NUM_GAMES_LAST_4_DAYS_HOME_home' 'NUM_GAMES_LAST_6_DAYS_HOME_home'\n 'NUM_GAMES_LAST_10_DAYS_HOME_home' 'NUM_GAMES_LAST_2_DAYS_AWAY_home'\n 'NUM_GAMES_LAST_4_DAYS_AWAY_home' 'NUM_GAMES_LAST_6_DAYS_AWAY_home'\n 'NUM_GAMES_LAST_10_DAYS_AWAY_home' 'DAYS_DIFF_TOTAL_home'\n 'DAYS_DIFF_NEXT_GAME_home' 'DAYS_DIFF_NEXT_GAMES_STD_home'] not in index"