In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import geopy.distance

In [2]:
base_nba_per_game = pd.read_csv("bases_nba_stats/base_nba_per_game_17_18_arrumada.csv", sep=";")

In [31]:
base_nba_per_game["fl_home"] = np.where(base_nba_per_game["GAME"].str[6:9] == base_nba_per_game["TEAM_ABBREVIATION"], 1, 0)

In [32]:
home_games = base_nba_per_game[base_nba_per_game["fl_home"] == 1].set_index("GAME")
away_games = base_nba_per_game[base_nba_per_game["fl_home"] == 0].set_index("GAME")

In [33]:
print("Home", home_games.shape)
print("Away", away_games.shape)

Home (1230, 112)
Away (1230, 112)


In [34]:
all_games = home_games.join(away_games, how="inner", lsuffix="_home", rsuffix="_away")
all_games.drop(["GAME_ID_away", "GAME_DATE_away", "GAME_PLACE_away",
                "MIN_home", "MIN_away", 'PTS_hustle_home', 'PTS_hustle_away',
                "fl_home_away", "fl_home_home"], axis=1, inplace=True)

In [35]:
all_games[["TEAM_ABBREVIATION_away", "PTS_away", "TEAM_ABBREVIATION_home", "PTS_home"]].head()

Unnamed: 0_level_0,TEAM_ABBREVIATION_away,PTS_away,TEAM_ABBREVIATION_home,PTS_home
GAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BOS @ CLE 2017-10-17,BOS,99,CLE,102
HOU @ GSW 2017-10-17,HOU,122,GSW,121
CHA @ DET 2017-10-18,CHA,90,DET,102
BKN @ IND 2017-10-18,BKN,131,IND,140
MIA @ ORL 2017-10-18,MIA,109,ORL,116


In [12]:
def get_season(date):
    ano = date.year
    if(date.month >= 10):
        return ano + 1
    return ano

def is_playoff(date):
    # Playoffs 2016
    if date >= datetime(2016, 4, 16) and date < datetime(2016, 6, 30):
        return 1
    
    # Playoffs 2017
    elif date >= datetime(2017, 4, 15) and date < datetime(2017, 6, 30):
        return 1
    
     # Playoffs 2018
    elif date >= datetime(2018, 4, 14) and date < datetime(2018, 6, 30):
        return 1
    
    return 0

In [13]:
all_games["DATE"] = [datetime.strptime(str(x), '%d/%m/%Y') for x in all_games.GAME_DATE_home]
all_games["SEASON"] = [get_season(x) for x in all_games.DATE]
all_games["fl_playoff"] = [is_playoff(x) for x in all_games.DATE]
all_games['fl_home_win'] = np.where(all_games['PTS_home'] > all_games['PTS_away'], 1, 0)
all_games = all_games.sort_values('DATE')

In [39]:
all_games = all_games.rename(columns={'TEAM_ABBREVIATION_home': "team_home",
                                     'TEAM_ABBREVIATION_away': "team_away"})

In [54]:
all_games["team_home_game_num"] = all_games.groupby(['team_home']).cumcount() + 1
all_games["team_away_game_num"] = all_games.groupby(['team_away']).cumcount() + 1

In [14]:
all_games.to_csv("./bases_nba_stats/all_games_nba_17_18.csv")

### Matriz de Distância entre os Times

In [10]:
base_lat_long = pd.read_excel("./lat_long_teams.xlsx").drop("team_id", axis=1)

In [11]:
base_lat_long.head()

Unnamed: 0,abreviation,team_name,lat,lon
0,MIA,Miami Heat,25.7814,-80.1878
1,ORL,Orlando Magic,28.5392,-81.3836
2,SAS,San Antonio Spurs,29.4269,-98.4375
3,HOU,Houston Rockets,29.7508,-95.3622
4,NOP,New Orleans Pelicans,29.9489,-90.0822


In [12]:
teams = base_lat_long.abreviation.unique()

In [13]:
teams = base_lat_long.abreviation.unique()
df_dist = pd.DataFrame(columns=teams, index=teams)

for team in teams:
    for team_2 in teams:
        team1_location = (base_lat_long[base_lat_long.abreviation == team].lat.iloc[0], 
                          base_lat_long[base_lat_long.abreviation == team].lon.iloc[0])
        
        team2_location = (base_lat_long[base_lat_long.abreviation == team_2].lat.iloc[0], 
                          base_lat_long[base_lat_long.abreviation == team_2].lon.iloc[0])
        
        df_dist.loc[team, team_2] = geopy.distance.vincenty(team1_location, team2_location).km        

In [14]:
df_dist.to_csv("dist_matrix_km.csv")

### Cria features de Número de Jogos

In [3]:
all_games = pd.read_csv("./bases_nba_stats/all_games_nba_17_18.csv").drop("Unnamed: 0", axis=1)

In [4]:
all_games["DATE"] = [datetime.strptime(str(x), '%d/%m/%Y') for x in all_games.GAME_DATE_home]

In [4]:
def get_dist_last_game(df, data, team_home, team_away, is_home=True):    
    """
        Retorna a distância em KM percorrida pelo time específico para chegar a um jogo
    """
    
    if(is_home):
        last_game = get_last_games(df, data, team_home, n = 1)
        
        if(len(last_game) == 0):
            return(0)
        
        if (last_game.team_home.iloc[0] == team_home):
            return(0)
        else:
            return(df_dist.loc[team_home, last_game.team_home.iloc[0]])        
    else:
        last_game = get_last_games(df, data, team_away, n = 1)
        
        if(len(last_game) == 0):
            return(df_dist.loc[team_home, team_away])
        
        if (last_game.team_away.iloc[0] == team_away):
            return(df_dist.loc[team_home, last_game.team_away.iloc[0]])
        else:
            return(df_dist.loc[team_home, team_away])    

In [15]:
all_games["DISTANCE_KM_home"] = [get_dist_last_game(all_games, x.DATE, x.team_home, x.team_away, is_home=True) for x in all_games.itertuples()]
all_games["DISTANCE_KM_away"] = [get_dist_last_game(all_games, x.DATE, x.team_home, x.team_away, is_home=False) for x in all_games.itertuples()]

In [16]:
def get_days_from_last_game(df, data, team_name):    
    """
        Retorna o número de dias entre o jogo atual e o jogo passado
    """
    
    last_game = get_last_games(df, data, team_name, n = 1)

    if(len(last_game) == 0):
        return(np.nan)

    return(-(last_game.DATE - data).iloc[0].days)

In [17]:
all_games["DAYS_FROM_LAST_GAME_home"] = [get_days_from_last_game(all_games, x.DATE, x.team_home) for x in all_games.itertuples()]
all_games["DAYS_FROM_LAST_GAME_away"] = [get_days_from_last_game(all_games, x.DATE, x.team_away) for x in all_games.itertuples()]

In [12]:
all_games = all_games[["GAME", "DATE", "team_home", "team_away", "fl_home_win", "FG_PCT_home", 
                       "FG_PCT_away", "FG3_PCT_home", "FG3_PCT_away"]]
all_games.head()

Unnamed: 0,GAME,DATE,team_home,team_away,fl_home_win,FG_PCT_home,FG_PCT_away,FG3_PCT_home,FG3_PCT_away
0,BOS @ CLE 2017-10-17,2017-10-17,CLE,BOS,1,0.458,0.409,0.227,0.25
1,HOU @ GSW 2017-10-17,2017-10-17,GSW,HOU,0,0.537,0.485,0.533,0.366
2,NOP @ MEM 2017-10-18,2017-10-18,MEM,NOP,1,0.424,0.38,0.31,0.28
3,HOU @ SAC 2017-10-18,2017-10-18,SAC,HOU,0,0.477,0.398,0.348,0.267
4,DEN @ UTA 2017-10-18,2017-10-18,UTA,DEN,1,0.506,0.468,0.375,0.481


In [92]:
# all_games.to_csv("./bases_nba_stats/all_games_nba_17_18.csv")

### Últimos 5 Jogos

In [5]:
home_columns = [x for x in all_games.columns if x.endswith("_home") and x not in ['GAME_ID_home', 'TEAM_CITY_home', 'GAME_DATE_home', 'GAME_PLACE_home', 'TEAM_NICKNAME_home']]
away_columns = [x for x in all_games.columns if x.endswith("_away") and x not in ['TEAM_CITY_away', 'TEAM_NICKNAME_away']]

In [7]:
# "['WIN_HOME_away' 'WIN_AWAY_away' 'WIN_away' 'WIN_HOME_PCT_away'\n 'WIN_AWAY_PCT_away'
# 'WIN_PCT_away' 'NUM_GAMES_HOME_away'\n 'NUM_GAMES_AWAY_away' 'DAYS_DIFF_TOTAL_away' 
# 'DAYS_DIFF_NEXT_GAME_away'\n 'DAYS_DIFF_NEXT_GAMES_STD_away']

In [8]:
# extra_columns = []

# home_columns += [x + "_home" for x in extra_columns]
# away_columns += [x + "_away" for x in extra_columns]

In [6]:
def cria_variaveis_sumarizacao(last_games, team_name, n = 5, data_ref = None, verbose = False):
    resp = {}
    
    # Cria variávies de Win %
    # Visão home
    resp["N_WINS_HOME"] = [np.where((last_games["team_home"] == team_name) &
                             (last_games["fl_home_win"] == 1) , 1, 0).sum()]
    resp["N_GAMES_HOME"] = [np.where(last_games["team_home"] == team_name, 1, 0).sum()]
    resp["WIN_HOME_PCT"] = [(resp["N_WINS_HOME"][0] / resp["N_GAMES_HOME"][0])]
    
    # Visão Away
    resp["N_WINS_AWAY"] = [np.where((last_games["team_away"] == team_name) & 
                                      (last_games["fl_home_win"] == 0), 1, 0).sum()]
    resp["N_GAMES_AWAY"] = [np.where(last_games["team_away"] == team_name, 1, 0).sum()]
    resp["WIN_AWAY_PCT"] = [(resp["N_WINS_AWAY"][0] / resp["N_GAMES_AWAY"][0])]
    
    # Visão geral
    resp["N_WINS_TOTAL"] = [resp["N_WINS_AWAY"][0] + resp["N_WINS_HOME"][0]]
    resp["WIN_PCT"] = [resp["N_WINS_TOTAL"][0]/n]
    
    if verbose:
        print("Win_PCT", resp["WIN_PCT"][0], resp["N_WINS_AWAY"][0], resp["N_WINS_HOME"][0])
    
    # Cria variáveis de data
    if(data_ref is None):
        data_ref = np.max(last_games["DATE"]) + timedelta(days=1)
    
    # Visão da série
    resp["TOTAL_DAYS_DIFF"] = [(np.max(last_games["DATE"]) - np.min(last_games["DATE"])).days]
    days_diff_last_games = [-x.days if not np.isnan(x.days) else 0 
                            for x in last_games["DATE"].sub(last_games["DATE"].shift(-1).fillna(data_ref))]
    resp["DAYS_DIFF_LG_STD"] = [np.std(days_diff_last_games)]
    resp["DAYS_DIFF_LG_MEAN"] = [np.mean(days_diff_last_games)]
    
    if verbose:
        print("Days_Diff", days_diff_last_games, resp["DAYS_DIFF_LG_MEAN"][0], resp["TOTAL_DAYS_DIFF"][0]) 
    
    # All
    resp["N_GAMES_L2_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=2))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    resp["N_GAMES_L4_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=4))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
                             
    resp["N_GAMES_L6_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=6))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    resp["N_GAMES_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=8))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    resp["N_GAMES_L10_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=10))) & 
                            (last_games["DATE"] < data_ref) &
                            ((last_games["team_home"] == team_name) |
                             (last_games["team_away"] == team_name)), 1, 0).sum()]
    
    # Away
    resp["N_GAMES_AWAY_L2_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=2))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L4_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=4))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L6_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=6))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()] 
    
    resp["N_GAMES_AWAY_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=8))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=10))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_away"] == team_name), 1, 0).sum()]
    
    # Home    
    resp["N_GAMES_AWAY_L2_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=2))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L4_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=4))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L6_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=6))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L8_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=8))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    resp["N_GAMES_AWAY_L10_days"] = [np.where((last_games["DATE"] >= (data_ref - timedelta(days=10))) & 
                                    (last_games["DATE"] < data_ref) &
                                    (last_games["team_home"] == team_name), 1, 0).sum()]
    
    if verbose:
        print("Num Games Last X Days", resp["N_GAMES_L6_days"][0], resp["N_GAMES_AWAY_L6_days"][0])
        
        
    # Distance KM
    resp["SUM_DIST_KM"] = [last_games[last_games["team_home"] == team_name]["DISTANCE_KM_home"].sum()
                           + last_games[last_games["team_away"] == team_name]["DISTANCE_KM_away"].sum()]
    
    dist_list = (list(last_games[last_games["team_home"] == team_name]["DISTANCE_KM_home"])
                 + list(last_games[last_games["team_away"] == team_name]["DISTANCE_KM_away"]))
    
    resp["AVG_DIST_KM"] = [np.average(dist_list)]
    
    # Back to Back
    resp["BACK_TO_BACK"] = [np.where(pd.Series(dist_list) > 3500, 1, 0).sum()]
    
    
    # Days from last Game
    
    days_from_last_games_list = (list(last_games[last_games["team_home"] == team_name]["DAYS_FROM_LAST_GAME_home"])
                                 + list(last_games[last_games["team_away"] == team_name]["DAYS_FROM_LAST_GAME_away"]))
    
    if(len(days_from_last_games_list) > 0):
        resp["AVG_DAYS_FROM_LG"] = [np.average(days_from_last_games_list)]
        resp["STD_DAYS_FROM_LG"] = [np.std(days_from_last_games_list)]
        resp["MIN_DAYS_FROM_LG"] = [np.min(days_from_last_games_list)]
    else:
        resp["AVG_DAYS_FROM_LG"] = [np.nan]
        resp["STD_DAYS_FROM_LG"] = [np.nan]
        resp["MIN_DAYS_FROM_LG"] = [np.nan]
    
    return(pd.DataFrame(resp))

In [7]:
def get_last_games(df, data, team_name, n = 5, filter="all", verbose=False):
    if(filter == "all"):
        last_games = df[(df["DATE"] < data) & 
                        ((df["team_home"] == team_name) | (df["team_away"] == team_name))].tail(n)
        
    elif(filter == "home"):
        last_games = df[(df["DATE"] < data) & (df["team_home"] == team_name)].tail(n)
    elif(filter == "away"):
        last_games = df[(df["DATE"] < data) & (df["team_away"] == team_name)].tail(n)

    if(verbose):
        print(last_games[["team_home", "team_away", "PTS_home", "PTS_away", "DATE"]])
    
    if not isinstance(last_games, pd.DataFrame):
        last_games = last_games.to_frame().transpose()
        
    return(last_games)

def get_avg_last_games(last_games, team_name, n = 5, data_ref = None, rivals = False, 
                       to_drop=['fl_home_win', 'fl_playoff', 'fl_win','index', 'team_away_game_num',
                                'team_game_num', 'team_home_game_num', "TEAM_CITY_away", "TEAM_CITY_home",
                                "TEAM_CITY_home", "TEAM_ID_home", "TEAM_NAME_advanced_home",
                                "TEAM_NAME_fourfactors_home", "TEAM_NAME_hustle_home",
                                "TEAM_NAME_misc_home", "TEAM_NAME_home", "TEAM_NAME_playertrack_home",
                                "TEAM_NAME_scoring_home", 'TEAM_ID_home', 'TEAM_ID_away', "TEAM_NAME_advanced_away",
                                "TEAM_NAME_fourfactors_away", "TEAM_NAME_hustle_away",
                                "TEAM_NAME_misc_away", "TEAM_NAME_away", "TEAM_NAME_playertrack_away",
                                "TEAM_NAME_scoring_away", "DATE", "SEASON", 'GAME', 'GAME_ID_home', 'GAME_ID_away',
                               'GAME_DATE_home', 'GAME_PLACE_home']):
    
    if(rivals == False):
        last_games_home = last_games[last_games["team_home"] == team_name].sum().to_frame().transpose().drop(away_columns + to_drop + ["team_home"], axis=1, errors="ignore")
        last_games_away = last_games[last_games["team_away"] == team_name].sum().to_frame().transpose().drop(home_columns + to_drop + ["team_away"], axis=1, errors="ignore")
        
        last_games_home.columns = [x.replace("_home","") for x in last_games_home.columns]
        last_games_away.columns = [x.replace("_away","") for x in last_games_away.columns]
    else:
        last_games_home = last_games[last_games["team_home"] != team_name].sum().to_frame().transpose().drop(away_columns + to_drop + ["team_home"], axis=1, errors="ignore")
        last_games_away = last_games[last_games["team_away"] != team_name].sum().to_frame().transpose().drop(home_columns + to_drop + ["team_away"], axis=1, errors="ignore")
            
        last_games_home.columns = [x.replace("_home","_opponent") for x in last_games_home.columns]
        last_games_away.columns = [x.replace("_away","_opponent") for x in last_games_away.columns]
    
    if(n == 10000):
        n = len(last_games_home) + len(last_games_away)
    
    if(len(last_games_home) == 0):
        if(rivals):
            #resp = (last_games_away/n).to_frame().transpose()
            resp = (last_games_away/n)
        else:
            resp = last_games_away/n
    
    if(len(last_games_away) == 0):
        if(rivals):
            #resp = (last_games_home/n).to_frame().transpose()
            resp = (last_games_home/n)
        else:
            resp = last_games_home/n
            
    if(len(last_games_away) > 0 and len(last_games_home) > 0):
        
        if(rivals):
            #resp = ((last_games_home + last_games_away) / n).to_frame().transpose()
            resp = ((last_games_home + last_games_away) / n)
        else:
            resp = ((last_games_home + last_games_away) / n)    
    
    if not rivals:
        var_criadas = cria_variaveis_sumarizacao(last_games, team_name, n = 5, data_ref = None)
        resp = pd.concat([resp, var_criadas], axis=1).sum().to_frame().transpose()
    
    return(resp)

In [8]:
teste = all_games[all_games["team_away"] == "POR"]
teste = get_last_games(all_games, teste.iloc[10]["DATE"], "POR", verbose=False, filter="home").reset_index()

In [9]:
teste[["team_home", "team_away", "FG_PCT_home", "FG_PCT_away"]]

Unnamed: 0,team_home,team_away,FG_PCT_home,FG_PCT_away
0,POR,SAC,0.543,0.382
1,POR,MIL,0.422,0.483
2,POR,NOP,0.448,0.523
3,POR,WAS,0.41,0.472
4,POR,HOU,0.511,0.533


In [10]:
cria_variaveis_sumarizacao(teste, "POR", verbose = True)

Win_PCT 0.2 0 1
Days_Diff [12, 2, 3, 4, 1] 4.4 21
Num Games Last X Days 2 2


  from ipykernel import kernelapp as app


Unnamed: 0,N_WINS_HOME,N_GAMES_HOME,WIN_HOME_PCT,N_WINS_AWAY,N_GAMES_AWAY,WIN_AWAY_PCT,N_WINS_TOTAL,WIN_PCT,TOTAL_DAYS_DIFF,DAYS_DIFF_LG_STD,...,N_GAMES_AWAY_L4_days,N_GAMES_AWAY_L6_days,N_GAMES_AWAY_L8_days,N_GAMES_AWAY_L10_days,SUM_DIST_KM,AVG_DIST_KM,BACK_TO_BACK,AVG_DAYS_FROM_LG,STD_DAYS_FROM_LG,MIN_DAYS_FROM_LG
0,1,5,0.2,0,0,,1,0.2,21,3.929377,...,1,2,3,4,4704.277771,940.855554,1,2.6,1.019804,1.0


In [10]:
resp = get_avg_last_games(teste, "POR", rivals=False)

  from ipykernel import kernelapp as app


In [11]:
resp

Unnamed: 0,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,...,N_GAMES_AWAY_L4_days,N_GAMES_AWAY_L6_days,N_GAMES_AWAY_L8_days,N_GAMES_AWAY_L10_days,SUM_DIST_KM,AVG_DIST_KM,BACK_TO_BACK,AVG_DAYS_FROM_LG,STD_DAYS_FROM_LG,MIN_DAYS_FROM_LG
0,39.4,84.8,0.4668,11.4,29.4,0.3868,13.4,16.4,0.8312,7.6,...,1.0,2.0,3.0,4.0,4704.277771,940.855554,1.0,2.6,1.019804,1.0


In [13]:
resp = []
season = all_games.reset_index()

for index, row in season.iterrows():
#for index, row in regular_season.groupby("season").first().iterrows():
    game_line_n = []
    for n_games in [1, 5, 10, 10000]:
        # Home team
        home_last_games = get_last_games(season, row["DATE"], row["team_home"], n=n_games)
        home_last_games_as_home = get_last_games(season, row["DATE"], row["team_home"], filter="home", n=n_games)
        
        home_avg_last_games = get_avg_last_games(home_last_games, row["team_home"], data_ref=row["DATE"])
        home_avg_last_games_as_home = get_avg_last_games(home_last_games_as_home, row["team_home"], data_ref=row["DATE"])
        home_rivals_last_games = get_avg_last_games(home_last_games, row["team_home"], rivals=True, data_ref=row["DATE"])

        home_avg_last_games["game_ref"] = [row.GAME]
        home_avg_last_games.set_index("game_ref", inplace=True)
        home_avg_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)
        
        home_avg_last_games_as_home["game_ref"] = [row.GAME]
        home_avg_last_games_as_home.set_index("game_ref", inplace=True)
        home_avg_last_games_as_home.drop(["team_home", "team_away", 0], axis=1 ,errors="ignore", inplace=True)

        home_rivals_last_games["game_ref"] = [row.GAME]
        home_rivals_last_games.set_index("game_ref", inplace=True)
        home_rivals_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

        #print(home_rivals_last_games.index, home_avg_last_games.index)

        # Away team
        away_last_games = get_last_games(season, row["DATE"], row["team_away"], n=n_games).reset_index()
        away_last_games_as_away = get_last_games(season, row["DATE"], row["team_away"], filter="away", n=n_games).reset_index()
        
        away_avg_last_games = get_avg_last_games(away_last_games, row["team_away"], data_ref=row["DATE"])
        away_avg_last_games_as_away = get_avg_last_games(away_last_games_as_away, row["team_away"], data_ref=row["DATE"])
        away_rivals_last_games = get_avg_last_games(away_last_games, row["team_away"], rivals=True, data_ref=row["DATE"])

        away_avg_last_games["game_ref"] = [row.GAME]
        away_avg_last_games.set_index("game_ref", inplace=True)
        away_avg_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)
        
        away_avg_last_games_as_away["game_ref"] = [row.GAME]
        away_avg_last_games_as_away.set_index("game_ref", inplace=True)
        away_avg_last_games_as_away.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

        away_rivals_last_games["game_ref"] = [row.GAME]
        away_rivals_last_games.set_index("game_ref", inplace=True)
        away_rivals_last_games.drop(["team_home", "team_away", 0],axis=1 ,errors="ignore", inplace=True)

        #print(away_rivals_last_games.index, away_avg_last_games.index)

        # Junta bases 
        
        if (n_games == 10000):
            n_games_str = "ALL"
        else:
            n_games_str = str(n_games)
            
        avg_last_games = home_avg_last_games.join(away_avg_last_games, how="inner", 
                             lsuffix='_home_L' + n_games_str, rsuffix='_away_L' + n_games_str).drop('level_0', axis=1, errors="ignore")
        
        avg_last_games_as = home_avg_last_games_as_home.join(away_avg_last_games_as_away, how="inner", 
                                 lsuffix='_home_L' + n_games_str + '_HOME', rsuffix='_away_L' + n_games_str + '_AWAY').drop('level_0', axis=1, errors="ignore")
        
        
        rivals_last_games = home_rivals_last_games.join(away_rivals_last_games, how="inner",
                                lsuffix='_home_L' + n_games_str, rsuffix='_away_L' + n_games_str).drop('level_0', axis=1, errors="ignore")

        
        #print(rivals_last_games.columns)
        
        game_line = avg_last_games.join(rivals_last_games, how="inner")
        
        game_line = game_line.join(avg_last_games_as, how="inner")

        game_line = pd.concat([row.to_frame().transpose().set_index("GAME"), game_line], axis=1)

        print(str(row.GAME) + " " + str(n_games), end="\r")
        
        game_line_n.append(game_line)
        
    resp.append(game_line_n)

    del home_last_games
    del home_last_games_as_home
    del home_avg_last_games
    del home_avg_last_games_as_home
    del home_rivals_last_games

    del away_last_games
    del away_last_games_as_away
    del away_avg_last_games
    del away_avg_last_games_as_away
    del away_rivals_last_games

    del avg_last_games
    del avg_last_games_as
    del game_line
    del game_line_n

    gc.collect()

  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)


BOS @ PHI 2017-10-20 10000

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


HOU @ SAC 2018-04-11 10000

In [None]:
len(resp)

In [14]:
import pickle

In [17]:
resp2 = []

for r in resp:
    resp2.append(pd.concat(r, axis=1))

In [15]:
pickle.dump(resp, open("resp.p", "wb"))

In [16]:
gc.collect()

367

In [19]:
df_resp = pd.concat(resp2)

In [20]:
df_resp.to_csv("df_resp.csv")

In [8]:
df_resp = pd.read_csv("df_resp.csv").drop('Unnamed: 0', axis=1)

In [21]:
del resp
del resp2

In [22]:
import gc
gc.collect()

43375

In [23]:
lista = []
for var in df_resp.columns:
    if "_home_L5" in var:
        lista.append(var.replace("_home_L5_HOME", "").replace("_opponent_home_L5", "").replace("_home_L5", ""))

In [24]:
lista_features = set(lista)

In [206]:
df_resp["week"] = [x.isocalendar()[1] for x in df_resp.DATE]

In [207]:
df_resp.to_csv("bases_nba_stats/base_last_5_+_jogos.csv")

### Variáveis Cross

In [26]:
filtrada = df_resp

In [27]:
filtrada.columns

Index(['index', 'GAME_ID_home', 'TEAM_ID_home', 'TEAM_NAME_home', 'team_home',
       'TEAM_CITY_home', 'FGM_home', 'FGA_home', 'FG_PCT_home', 'FG3M_home',
       ...
       'N_GAMES_AWAY_L4_days_away_LALL_AWAY',
       'N_GAMES_AWAY_L6_days_away_LALL_AWAY',
       'N_GAMES_AWAY_L8_days_away_LALL_AWAY',
       'N_GAMES_AWAY_L10_days_away_LALL_AWAY', 'SUM_DIST_KM_away_LALL_AWAY',
       'AVG_DIST_KM_away_LALL_AWAY', 'BACK_TO_BACK_away_LALL_AWAY',
       'AVG_DAYS_FROM_LG_away_LALL_AWAY', 'STD_DAYS_FROM_LG_away_LALL_AWAY',
       'MIN_DAYS_FROM_LG_away_LALL_AWAY'],
      dtype='object', length=3688)

In [42]:
columns_subtract = lista_features

for column in columns_subtract:
    for n_games in [1, 5, 10, 10000]:
        if (n_games == 10000):
            n_games_str = "ALL"
        else:
            n_games_str = str(n_games)
        
        filtrada["D1_" + column + "_L " + n_games_str] = filtrada[column + "_home_L" + n_games_str] - filtrada[column + "_away_L" + n_games_str]
        filtrada["D2_" + column + "_L " + n_games_str] = filtrada[column + "_home_L" + n_games_str + "_HOME"] - filtrada[column + "_away_L" + n_games_str + "_AWAY"]
        try:
            filtrada["C1_" + column + "_L " + n_games_str] = filtrada[column + "_opponent_home_L" + n_games_str] - filtrada[column + "_away_L" + n_games_str]
            filtrada["C2_" + column + "_L " + n_games_str] = filtrada[column + "_opponent_away_L" + n_games_str] - filtrada[column + "_home_L" + n_games_str]
        except KeyError:
            pass

In [43]:
to_keep_delta = []
for var in filtrada.columns:
    if "D1_" in var or "C1_" in var or "D2_" in var or "C2_" in var:
        to_keep_delta.append(var)

In [30]:
keep_home = list(sorted(lista_features))[:-31]

In [33]:
filtrada[[x + "_home" for x in keep_home] + to_keep_delta + ["fl_home_win"]].to_csv("bases_nba_stats/base_delta_cross_+_home.csv")

In [24]:
filtrada[[x + "_home" for x in lista_features] + to_keep_delta + ["fl_home_win"]].to_csv("bases_nba_stats/base_delta_cross_home.csv")

In [None]:
filtrada.drop(lista_features)

In [48]:
 filtrada.loc[:,~filtrada.columns.duplicated()][to_keep_delta + ["fl_home_win"]].to_csv("base_delta_cross.csv")

In [44]:
filtrada[to_keep_delta + ["fl_home_win"]]

Unnamed: 0,D1_OPP_FTA_RATE,D2_OPP_FTA_RATE,C1_OPP_FTA_RATE,C2_OPP_FTA_RATE,D1_TO,D2_TO,C1_TO,C2_TO,D1_WIN_HOME_PCT,D2_WIN_HOME_PCT,...,C1_OPP_TOV_PCT_L 10,C2_OPP_TOV_PCT_L 10,D1_OPP_TOV_PCT_L ALL,D2_OPP_TOV_PCT_L ALL,C1_OPP_TOV_PCT_L ALL,C2_OPP_TOV_PCT_L ALL,fl_home_win,fl_home_win.1,fl_home_win.2,fl_home_win.3
BOS @ CLE 2017-10-17,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,1,1,1,1
HOU @ GSW 2017-10-17,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,0,0,0,0
NOP @ MEM 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,1,1,1,1
HOU @ SAC 2017-10-18,-0.0526,-0.0526,-0.0526,0.0392,-2.4,-2.4,-2.4,3.4,0.000000,0.000000,...,-0.034,0.024,-0.0340,-0.0340,-0.034,0.024,0,0,0,0
DEN @ UTA 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,1,1,1,1
ATL @ DAL 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,0,0,0,0
BKN @ IND 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,1,1,1,1
POR @ PHX 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,0,0,0,0
PHI @ WAS 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,1,1,1,1
MIA @ ORL 2017-10-18,0.0000,0.0000,0,0,0.0,0.0,0,0,0.000000,0.000000,...,0,0,0.0000,0.0000,0,0,1,1,1,1


### Cria base de Modelagem

In [209]:
to_drop_modelagem = ['FGM_home', 'FGA_home', 'FG_PCT_home', 'FG3M_home', 'FG3A_home', 
                     'FG3_PCT_home', 'FTM_home', 'FTA_home', 'FT_PCT_home', 'OREB_home', 
                     'DREB_home', 'REB_home', 'AST_home', 'STL_home', 'BLK_home', 'TO_home',
                     'PF_home', 'TEAM_NAME_advanced_home', 'AST_PCT_home',
                     'AST_TOV_home', 'AST_RATIO_home', 'OREB_PCT_home', 'DREB_PCT_home',
                     'REB_PCT_home', 'TM_TOV_PCT_home', 'EFG_PCT_home', 'TS_PCT_home', 
                     'USG_PCT_home', 'PACE_home', 'PIE_home', 'TEAM_NAME_scoring_home', 
                     'PCT_FGA_2PT_home', 'PCT_FGA_3PT_home', 'PCT_PTS_2PT_home', 'PCT_PTS_2PT_MR_home', 
                     'PCT_PTS_3PT_home', 'PCT_PTS_FB_home', 'PCT_PTS_FT_home', 'PCT_PTS_OFF_TOV_home', 
                     'PCT_PTS_PAINT_home', 'PCT_AST_2PM_home', 'PCT_UAST_2PM_home', 'PCT_AST_3PM_home', 
                     'PCT_UAST_3PM_home', 'PCT_AST_FGM_home', 'PCT_UAST_FGM_home', 'TEAM_NAME_misc_home',
                     'PTS_OFF_TOV_home', 'PTS_2ND_CHANCE_home', 'PTS_FB_home', 'PTS_PAINT_home', 
                     'OPP_PTS_OFF_TOV_home', 'OPP_PTS_2ND_CHANCE_home', 'OPP_PTS_FB_home', 
                     'OPP_PTS_PAINT_home', 'BLK_misc_home', 'BLKA_home', 'PF_misc_home', 'PFD_home', 
                     'TEAM_NAME_fourfactors_home', 'EFG_PCT_fourfactors_home', 'FTA_RATE_home', 
                     'TM_TOV_PCT_fourfactors_home', 'OREB_PCT_fourfactors_home', 'OPP_EFG_PCT_home',
                     'OPP_FTA_RATE_home', 'OPP_TOV_PCT_home', 'OPP_OREB_PCT_home', 'TEAM_NAME_playertrack_home',
                     'DIST_home', 'ORBC_home', 'DRBC_home', 'RBC_home', 'TCHS_home', 'SAST_home', 'FTAST_home', 
                     'PASS_home', 'AST_playertrack_home', 'CFGM_home', 'CFGA_home', 'CFG_PCT_home', 'UFGM_home',
                     'UFGA_home', 'UFG_PCT_home', 'FG_PCT_playertrack_home', 'DFGM_home', 'DFGA_home', 
                     'DFG_PCT_home', 'TEAM_NAME_hustle_home', 'CONTESTED_SHOTS_home', 'CONTESTED_SHOTS_2PT_home', 
                     'CONTESTED_SHOTS_3PT_home', 'DEFLECTIONS_home', 'LOOSE_BALLS_RECOVERED_home', 'CHARGES_DRAWN_home',
                     'SCREEN_ASSISTS_home', 'BOX_OUTS_home', 'FGM_away', 'FGA_away', 'FG_PCT_away', 
                     'FG3M_away', 'FG3A_away', 'FG3_PCT_away', 'FTM_away', 'FTA_away', 'FT_PCT_away', 'OREB_away',
                     'DREB_away', 'REB_away', 'AST_away', 'STL_away', 'BLK_away', 'TO_away', 'PF_away',
                     'PLUS_MINUS_away', 'TEAM_NAME_advanced_away', 'AST_PCT_away', 'AST_TOV_away',
                     'AST_RATIO_away', 'OREB_PCT_away',
                     'DREB_PCT_away', 'REB_PCT_away', 'TM_TOV_PCT_away', 'EFG_PCT_away', 'TS_PCT_away', 
                     'USG_PCT_away', 'PACE_away', 'PIE_away', 'TEAM_NAME_scoring_away', 'PCT_FGA_2PT_away',
                     'PCT_FGA_3PT_away', 'PCT_PTS_2PT_away', 'PCT_PTS_2PT_MR_away', 'PCT_PTS_3PT_away', 
                     'PCT_PTS_FB_away', 'PCT_PTS_FT_away', 'PCT_PTS_OFF_TOV_away', 'PCT_PTS_PAINT_away',
                     'PCT_AST_2PM_away', 'PCT_UAST_2PM_away', 'PCT_AST_3PM_away', 'PCT_UAST_3PM_away', 
                     'PCT_AST_FGM_away', 'PCT_UAST_FGM_away', 'TEAM_NAME_misc_away', 'PTS_OFF_TOV_away', 
                     'PTS_2ND_CHANCE_away', 'PTS_FB_away', 'PTS_PAINT_away', 'OPP_PTS_OFF_TOV_away', 
                     'OPP_PTS_2ND_CHANCE_away', 'OPP_PTS_FB_away', 'OPP_PTS_PAINT_away', 'BLK_misc_away',
                     'BLKA_away', 'PF_misc_away', 'PFD_away', 'TEAM_NAME_fourfactors_away', 'EFG_PCT_fourfactors_away', 
                     'FTA_RATE_away', 'TM_TOV_PCT_fourfactors_away', 'OREB_PCT_fourfactors_away', 'OPP_EFG_PCT_away', 
                     'OPP_FTA_RATE_away', 'OPP_TOV_PCT_away', 'OPP_OREB_PCT_away', 'TEAM_NAME_playertrack_away',
                     'DIST_away', 'ORBC_away', 'DRBC_away', 'RBC_away', 'TCHS_away', 'SAST_away', 'FTAST_away', 
                     'PASS_away', 'AST_playertrack_away', 'CFGM_away', 'CFGA_away', 'CFG_PCT_away', 'UFGM_away',
                     'UFGA_away', 'UFG_PCT_away', 'FG_PCT_playertrack_away', 'DFGM_away', 'DFGA_away', 'DFG_PCT_away', 
                     'TEAM_NAME_hustle_away', 'CONTESTED_SHOTS_away', 'CONTESTED_SHOTS_2PT_away', 
                     'CONTESTED_SHOTS_3PT_away', 'DEFLECTIONS_away', 'LOOSE_BALLS_RECOVERED_away', 
                     'CHARGES_DRAWN_away', 'SCREEN_ASSISTS_away', 'BOX_OUTS_away']

In [210]:
df_modelagem = df_resp.drop(to_drop_modelagem, axis=1).rename(columns={"week":"year_week"})

In [211]:
df_modelagem.to_csv("bases_nba_stats/base_modelagem_nba_17_18.csv")

In [212]:
df_truncada = df_modelagem.copy()
df_truncada[df_truncada.select_dtypes(include=['float64']).columns] = df_truncada[df_truncada.select_dtypes(include=['float64']).columns].applymap('{:,.4f}'.format)
df_truncada.to_csv("bases_nba_stats/base_modelagem_nba_17-18_4_digitos.csv")