In [61]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import geopy.distance
import gc

In [59]:
def get_last_games(df, data, team_name, n = 5, filter="all", verbose=False):
    if(filter == "all"):
        last_games = df[(df["DATE"] < data) & 
                        ((df["team_home"] == team_name) | (df["team_away"] == team_name))].tail(n)
        
    elif(filter == "home"):
        last_games = df[(df["DATE"] < data) & (df["team_home"] == team_name)].tail(n)
    elif(filter == "away"):
        last_games = df[(df["DATE"] < data) & (df["team_away"] == team_name)].tail(n)

    if(verbose):
        print(last_games[["team_home", "team_away", "PTS_home", "PTS_away", "DATE"]])
    
    if not isinstance(last_games, pd.DataFrame):
        last_games = last_games.to_frame().transpose()
        
    return(last_games)

def get_avg_last_games(last_games, team_name, n = 5, data_ref = None, rivals = False, 
                       to_drop=['fl_home_win', 'fl_playoff', 'fl_win','index', 'team_away_game_num',
                                'team_game_num', 'team_home_game_num', "TEAM_CITY_away", "TEAM_CITY_home",
                                "TEAM_CITY_home", "TEAM_ID_home", "TEAM_NAME_advanced_home",
                                "TEAM_NAME_fourfactors_home", "TEAM_NAME_hustle_home",
                                "TEAM_NAME_misc_home", "TEAM_NAME_home", "TEAM_NAME_playertrack_home",
                                "TEAM_NAME_scoring_home", 'TEAM_ID_home', 'TEAM_ID_away', "TEAM_NAME_advanced_away",
                                "TEAM_NAME_fourfactors_away", "TEAM_NAME_hustle_away",
                                "TEAM_NAME_misc_away", "TEAM_NAME_away", "TEAM_NAME_playertrack_away",
                                "TEAM_NAME_scoring_away", "DATE", "SEASON", 'GAME', 'GAME_ID_home', 'GAME_ID_away',
                               'GAME_DATE_home', 'GAME_PLACE_home']):
    
    if(rivals == False):
        last_games_home = last_games[last_games["team_home"] == team_name].sum().to_frame().transpose().drop(away_columns + to_drop + ["team_home"], axis=1, errors="ignore")
        last_games_away = last_games[last_games["team_away"] == team_name].sum().to_frame().transpose().drop(home_columns + to_drop + ["team_away"], axis=1, errors="ignore")
        
        last_games_home.columns = [x.replace("_home","") for x in last_games_home.columns]
        last_games_away.columns = [x.replace("_away","") for x in last_games_away.columns]
    else:
        last_games_home = last_games[last_games["team_home"] != team_name].sum().to_frame().transpose().drop(away_columns + to_drop + ["team_home"], axis=1, errors="ignore")
        last_games_away = last_games[last_games["team_away"] != team_name].sum().to_frame().transpose().drop(home_columns + to_drop + ["team_away"], axis=1, errors="ignore")
            
        last_games_home.columns = [x.replace("_home","_opponent") for x in last_games_home.columns]
        last_games_away.columns = [x.replace("_away","_opponent") for x in last_games_away.columns]
    
    if(n == 10000):
        n = len(last_games_home) + len(last_games_away)
    
    if(len(last_games_home) == 0):
        if(rivals):
            #resp = (last_games_away/n).to_frame().transpose()
            resp = (last_games_away/n)
        else:
            resp = last_games_away/n
    
    if(len(last_games_away) == 0):
        if(rivals):
            #resp = (last_games_home/n).to_frame().transpose()
            resp = (last_games_home/n)
        else:
            resp = last_games_home/n
            
    if(len(last_games_away) > 0 and len(last_games_home) > 0):
        
        if(rivals):
            #resp = ((last_games_home + last_games_away) / n).to_frame().transpose()
            resp = ((last_games_home + last_games_away) / n)
        else:
            resp = ((last_games_home + last_games_away) / n)    
    
    if not rivals:
        var_criadas = cria_variaveis_sumarizacao(last_games, team_name, n = 5, data_ref = None)
        resp = pd.concat([resp, var_criadas], axis=1).sum().to_frame().transpose()
    
    return(resp)

def get_season(date):
    ano = date.year
    if(date.month >= 10):
        return ano + 1
    return ano

def is_playoff(date):
    # Playoffs 2016
    if date >= datetime(2016, 4, 16) and date < datetime(2016, 6, 30):
        return 1

    # Playoffs 2017
    elif date >= datetime(2017, 4, 15) and date < datetime(2017, 6, 30):
        return 1

     # Playoffs 2018
    elif date >= datetime(2018, 4, 14) and date < datetime(2018, 6, 30):
        return 1

    return 0

def get_dist_last_game(df, data, df_dist, team_home, team_away, is_home=True):    
    """
        Retorna a distância em KM percorrida pelo time específico para chegar a um jogo
    """

    if(is_home):
        last_game = get_last_games(df, data, team_home, n = 1)

        if(len(last_game) == 0):
            return(0)

        if (last_game.team_home.iloc[0] == team_home):
            return(0)
        else:
            return(df_dist.loc[team_home, last_game.team_home.iloc[0]])        
    else:
        last_game = get_last_games(df, data, team_away, n = 1)

        if(len(last_game) == 0):
            return(df_dist.loc[team_home, team_away])

        if (last_game.team_away.iloc[0] == team_away):
            return(df_dist.loc[team_home, last_game.team_away.iloc[0]])
        else:
            return(df_dist.loc[team_home, team_away])

def get_days_from_last_game(df, data, team_name):    
    """
        Retorna o número de dias entre o jogo atual e o jogo passado
    """

    last_game = get_last_games(df, data, team_name, n = 1)

    if(len(last_game) == 0):
        return(np.nan)

    return(-(last_game.DATE - data).iloc[0].days)

def cria_variaveis_sumarizacao(last_games, team_name, n = 5, data_ref = None, verbose = False):
    resp = {}
    
    # Cria variávies de Win %
    # Visão home
    resp["N_WINS_HOME"] = [np.where((last_games["team_home"] == team_name) &
                             (last_games["fl_home_win"] == 1) , 1, 0).sum()]
    resp["N_GAMES_HOME"] = [np.where(last_games["team_home"] == team_name, 1, 0).sum()]
    resp["WIN_HOME_PCT"] = [(resp["N_WINS_HOME"][0] / resp["N_GAMES_HOME"][0])]
    
    # Visão Away
    resp["N_WINS_AWAY"] = [np.where((last_games["team_away"] == team_name) & 
                                      (last_games["fl_home_win"] == 0), 1, 0).sum()]
    resp["N_GAMES_AWAY"] = [np.where(last_games["team_away"] == team_name, 1, 0).sum()]
    resp["WIN_AWAY_PCT"] = [(resp["N_WINS_AWAY"][0] / resp["N_GAMES_AWAY"][0])]
    
    # Visão geral
    resp["N_WINS_TOTAL"] = [resp["N_WINS_AWAY"][0] + resp["N_WINS_HOME"][0]]
    resp["WIN_PCT"] = [resp["N_WINS_TOTAL"][0]/n]
    
    if verbose:
        print("Win_PCT", resp["WIN_PCT"][0], resp["N_WINS_AWAY"][0], resp["N_WINS_HOME"][0])
    
    # Cria variáveis de data
    if(data_ref is None):
        data_ref = np.max(last_games["DATE"]) + timedelta(days=1)
    
    # Visão da série
    resp["TOTAL_DAYS_DIFF"] = [(np.max(last_games["DATE"]) - np.min(last_games["DATE"])).days]
    days_diff_last_games = [-x.days if not np.isnan(x.days) else 0 
                            for x in last_games["DATE"].sub(last_games["DATE"].shift(-1).fillna(data_ref))]
    resp["DAYS_DIFF_LG_STD"] = [np.std(days_diff_last_games)]
    resp["DAYS_DIFF_LG_MEAN"] = [np.mean(days_diff_last_games)]
    
    if verbose:
        print("Days_Diff", days_diff_last_games, resp["DAYS_DIFF_LG_MEAN"][0], resp["TOTAL_DAYS_DIFF"][0]) 
    
    # All
    resp["N_GAMES_L2_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=2))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    resp["N_GAMES_L4_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=4))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
                             
    resp["N_GAMES_L6_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=6))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    resp["N_GAMES_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=8))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    resp["N_GAMES_L10_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=10))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    # Away
    resp["N_GAMES_AWAY_L2_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=2))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L4_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=4))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L6_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=6))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()] 
    
    resp["N_GAMES_AWAY_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=8))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=10))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    # Home    
    resp["N_GAMES_AWAY_L2_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=2))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L4_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=4))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L6_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=6))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=8))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L10_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=10))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    if verbose:
        print("Num Games Last X Days", resp["N_GAMES_L6_days"][0], resp["N_GAMES_AWAY_L6_days"][0])
        
        
    # Distance KM
    resp["SUM_DIST_KM"] = [last_games[last_games["team_home"] == team_name]["DISTANCE_KM_home"].sum()
                           + last_games[last_games["team_away"] == team_name]["DISTANCE_KM_away"].sum()]
    
    dist_list = (list(last_games[last_games["team_home"] == team_name]["DISTANCE_KM_home"])
                 + list(last_games[last_games["team_away"] == team_name]["DISTANCE_KM_away"]))
    
    resp["AVG_DIST_KM"] = [np.average(dist_list)]
    
    # Back to Back
    resp["BACK_TO_BACK"] = [np.where(pd.Series(dist_list) > 3500, 1, 0).sum()]
    
    
    # Days from last Game
    
    days_from_last_games_list = (list(last_games[last_games["team_home"] == team_name]["DAYS_FROM_LAST_GAME_home"])
                                 + list(last_games[last_games["team_away"] == team_name]["DAYS_FROM_LAST_GAME_away"]))
    
    if(len(days_from_last_games_list) > 0):
        resp["AVG_DAYS_FROM_LG"] = [np.average(days_from_last_games_list)]
        resp["STD_DAYS_FROM_LG"] = [np.std(days_from_last_games_list)]
        resp["MIN_DAYS_FROM_LG"] = [np.min(days_from_last_games_list)]
    else:
        resp["AVG_DAYS_FROM_LG"] = [np.nan]
        resp["STD_DAYS_FROM_LG"] = [np.nan]
        resp["MIN_DAYS_FROM_LG"] = [np.nan]
    
    return(pd.DataFrame(resp))

In [68]:
def prepara_base(base):
    base["fl_home"] = np.where(base["GAME"].str[6:9] == base["TEAM_ABBREVIATION"], 1, 0)
    
    home_games = base[base["fl_home"] == 1].set_index("GAME")
    away_games = base[base["fl_home"] == 0].set_index("GAME")
    
    all_games = home_games.join(away_games, how="inner", lsuffix="_home", rsuffix="_away")
    all_games.drop(["GAME_ID_away", "GAME_DATE_away", "GAME_PLACE_away",
                "MIN_home", "MIN_away", 'PTS_hustle_home', 'PTS_hustle_away',
                "fl_home_away", "fl_home_home"], axis=1, inplace=True)
    
    all_games["DATE"] = [datetime.strptime(str(x), '%Y-%m-%d') for x in all_games.GAME_DATE_home]
    all_games["SEASON"] = [get_season(x) for x in all_games.DATE]
    all_games["fl_playoff"] = [is_playoff(x) for x in all_games.DATE]
    all_games['fl_home_win'] = np.where(all_games['PTS_home'] > all_games['PTS_away'], 1, 0)
    all_games = all_games.sort_values('DATE')
    
    all_games = all_games.rename(columns={'TEAM_ABBREVIATION_home': "team_home",
                                     'TEAM_ABBREVIATION_away': "team_away"})
    
    all_games["team_home_game_num"] = all_games.groupby(['team_home']).cumcount() + 1
    all_games["team_away_game_num"] = all_games.groupby(['team_away']).cumcount() + 1
    
    return(all_games)

def cria_features(base):
    df_dist = pd.read_csv("../old_files/dist_matrix_km.csv", index_col=0)
    
    base["DISTANCE_KM_home"] = [get_dist_last_game(base, x.DATE, df_dist, x.team_home, x.team_away, is_home=True) 
                                for _, x in base.iterrows()]
    base["DISTANCE_KM_away"] = [get_dist_last_game(base, x.DATE, df_dist, x.team_home, x.team_away, is_home=False) 
                                for _, x in base.iterrows()]
    
    base["DAYS_FROM_LAST_GAME_home"] = [get_days_from_last_game(base, x.DATE, x.team_home) 
                                        for _, x in base.iterrows()]
    base["DAYS_FROM_LAST_GAME_away"] = [get_days_from_last_game(base, x.DATE, x.team_away) 
                                        for _, x in base.iterrows()]
    
    
def gera_last_N_games(all_games, N = [1, 5, 10, 10000]):
    gc.enable()
    resp = []
    
    home_columns = [x for x in all_games.columns if x.endswith("_home") and x not in ['GAME_ID_home', 'TEAM_CITY_home', 'GAME_DATE_home', 'GAME_PLACE_home', 'TEAM_NICKNAME_home']]
    away_columns = [x for x in all_games.columns if x.endswith("_away") and x not in ['TEAM_CITY_away', 'TEAM_NICKNAME_away']]

    for index, row in all_games.reset_index().iterrows():
        game_line_n = []
        for n_games in N:
            # Home team
            home_last_games = get_last_games(all_games, row["DATE"], row["team_home"], n=n_games)
            home_last_games_as_home = get_last_games(all_games, row["DATE"], row["team_home"], filter="home", n=n_games)

            home_avg_last_games = get_avg_last_games(home_last_games, row["team_home"], data_ref=row["DATE"])
            home_avg_last_games_as_home = get_avg_last_games(home_last_games_as_home, row["team_home"], data_ref=row["DATE"])
            home_rivals_last_games = get_avg_last_games(home_last_games, row["team_home"], rivals=True, data_ref=row["DATE"])

            home_avg_last_games["game_ref"] = [row.GAME]
            home_avg_last_games.set_index("game_ref", inplace=True)
            home_avg_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

            home_avg_last_games_as_home["game_ref"] = [row.GAME]
            home_avg_last_games_as_home.set_index("game_ref", inplace=True)
            home_avg_last_games_as_home.drop(["team_home", "team_away", 0], axis=1 ,errors="ignore", inplace=True)

            home_rivals_last_games["game_ref"] = [row.GAME]
            home_rivals_last_games.set_index("game_ref", inplace=True)
            home_rivals_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

            #print(home_rivals_last_games.index, home_avg_last_games.index)

            # Away team
            away_last_games = get_last_games(all_games, row["DATE"], row["team_away"], n=n_games).reset_index()
            away_last_games_as_away = get_last_games(all_games, row["DATE"], row["team_away"], filter="away", n=n_games).reset_index()

            away_avg_last_games = get_avg_last_games(away_last_games, row["team_away"], data_ref=row["DATE"])
            away_avg_last_games_as_away = get_avg_last_games(away_last_games_as_away, row["team_away"], data_ref=row["DATE"])
            away_rivals_last_games = get_avg_last_games(away_last_games, row["team_away"], rivals=True, data_ref=row["DATE"])

            away_avg_last_games["game_ref"] = [row.GAME]
            away_avg_last_games.set_index("game_ref", inplace=True)
            away_avg_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

            away_avg_last_games_as_away["game_ref"] = [row.GAME]
            away_avg_last_games_as_away.set_index("game_ref", inplace=True)
            away_avg_last_games_as_away.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

            away_rivals_last_games["game_ref"] = [row.GAME]
            away_rivals_last_games.set_index("game_ref", inplace=True)
            away_rivals_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

            #print(away_rivals_last_games.index, away_avg_last_games.index)

            # Junta bases 

            if (n_games == 10000):
                n_games_str = "ALL"
            else:
                n_games_str = str(n_games)

            avg_last_games = home_avg_last_games.join(away_avg_last_games, how="inner", 
                                 lsuffix='_home_L' + n_games_str, rsuffix='_away_L' + n_games_str).drop('level_0', axis=1, errors="ignore")

            avg_last_games_as = home_avg_last_games_as_home.join(away_avg_last_games_as_away, how="inner", 
                                     lsuffix='_home_L' + n_games_str + '_HOME', rsuffix='_away_L' + n_games_str + '_AWAY').drop('level_0', axis=1, errors="ignore")


            rivals_last_games = home_rivals_last_games.join(away_rivals_last_games, how="inner",
                                    lsuffix='_home_L' + n_games_str, rsuffix='_away_L' + n_games_str).drop('level_0', axis=1, errors="ignore")


            #print(rivals_last_games.columns)

            game_line = avg_last_games.join(rivals_last_games, how="inner")

            game_line = game_line.join(avg_last_games_as, how="inner")

            game_line = pd.concat([row.to_frame().transpose().set_index("GAME"), game_line], axis=1)

            print(str(row.GAME) + " " + str(n_games), end="\r")

            game_line_n.append(game_line)

        resp.append(game_line_n)

        del home_last_games
        del home_last_games_as_home
        del home_avg_last_games
        del home_avg_last_games_as_home
        del home_rivals_last_games

        del away_last_games
        del away_last_games_as_away
        del away_avg_last_games
        del away_avg_last_games_as_away
        del away_rivals_last_games

        del avg_last_games
        del avg_last_games_as
        del game_line
        del game_line_n

        gc.collect()
        
    return(resp)

In [45]:
base_nba_per_game = pd.read_csv("../bases_nba_stats/base_nba_per_game_16_17.csv")

In [20]:
all_games = prepara_base(base_nba_per_game)

In [55]:
cria_features(all_games)

In [69]:
gera_last_N_games(all_games, N=[5])

NameError: name 'away_columns' is not defined