#### Importa os pacotes

In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

#### Siglas dos Times

In [249]:
# Lookup table: team abbreviation ("sigla" column) -> full team name ("nome" column).
siglas_table = pd.read_csv("NBA_Games/de_para_siglas.csv")
de_para_siglas = siglas_table.set_index("sigla")["nome"].to_dict()

#### Separa as informações no padrão do nome do arquivo 

In [254]:
def parse_filename(file_name):
    """Extract game metadata from a box-score file name.

    The name is split on "_" (after removing ".csv") and read positionally,
    e.g. "ATL_@_BKN_Apr_02_2017_ATL":

    * field 0       -> away team abbreviation
    * field 2       -> home team abbreviation (field 1 is ignored)
    * fields 3, 4, 5 -> month abbreviation, day, year of the game
    * field 6       -> abbreviation of the team the statistics belong to

    Parameters
    ----------
    file_name : str
        File name or file-name prefix following the pattern above.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``home_team``, ``away_team``, ``date``,
        ``team``, ``home_full``, ``away_full`` and ``team_full``. The ``*_full``
        columns are the abbreviations translated through the module-level
        ``de_para_siglas`` mapping.

    Raises
    ------
    IndexError
        If the name has fewer than seven "_"-separated fields.
    ValueError
        If the date fields do not match the ``%b%d%Y`` format.
    """
    fields = file_name.replace(".csv", "").split("_")

    away_team = str(fields[0])
    home_team = str(fields[2])
    stat_team = fields[6]

    # Month abbreviation + day + year are glued together and parsed in one go.
    date_game = datetime.strptime(fields[3] + fields[4] + fields[5], '%b%d%Y')

    df = pd.DataFrame(
        [[home_team, away_team, date_game, stat_team]],
        columns=["home_team", "away_team", "date", "team"],
    )

    # The mapping keys are applied as regex patterns, so abbreviations are
    # substituted wherever they occur and unknown abbreviations pass through.
    df["home_full"] = df["home_team"].replace(de_para_siglas, regex=True)
    df["away_full"] = df["away_team"].replace(de_para_siglas, regex=True)
    df["team_full"] = df["team"].replace(de_para_siglas, regex=True)

    return df

#### Lê o dataframe e retorna sem a coluna extra

In [255]:
def load_df(file_name):
    """Read one box-score CSV from ``NBA_Games/dfs/`` without its index column.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``NBA_Games/dfs/``.

    Returns
    -------
    pandas.DataFrame
        The file contents minus the "Unnamed: 0" column.

    Raises
    ------
    KeyError
        If the file has no "Unnamed: 0" column.
    """
    csv_path = "NBA_Games/dfs/" + file_name
    raw = pd.read_csv(csv_path)
    return raw.drop(columns=["Unnamed: 0"])

#### Junta as estatísticas básicas com as avançadas

In [262]:
def join_basic_adv_df(file_name_prefix):
    """Combine the basic and advanced box scores of one team in one game.

    Loads ``<prefix>_Basic.csv`` and ``<prefix>_Advanced.csv`` through
    ``load_df``, places their columns side by side, converts the statistic
    columns to numbers and attaches the game metadata that ``parse_filename``
    extracts from the prefix.

    Parameters
    ----------
    file_name_prefix : str
        File name without the "_Basic.csv" / "_Advanced.csv" suffix,
        e.g. "ATL_@_BKN_Apr_02_2017_ATL".

    Returns
    -------
    pandas.DataFrame
        Columns are "Starters", then the metadata columns returned by
        ``parse_filename``, then every statistic column; one row per row of
        the basic box score.
    """
    df_adv = load_df(file_name_prefix + "_Advanced.csv")
    df_basic = load_df(file_name_prefix + "_Basic.csv")

    # "Starters" and "MP" exist in both files; keep the basic copy only.
    # NOTE(review): rows are paired by index, which assumes both files list
    # the players in the same order -- confirm against the scraper.
    df_full = pd.concat([df_basic, df_adv.drop(["Starters", "MP"], axis=1)], axis=1)
    columns_to_numeric = ['MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg']

    # "40:52" becomes "40.52" so the column parses as a number.
    # NOTE(review): the result reads as minutes.seconds, not decimal minutes.
    df_full["MP"] = df_full["MP"].str.replace(":", ".", regex=False)
    # Anything that is not a number turns into NaN because of errors='coerce'.
    df_full[columns_to_numeric] = df_full[columns_to_numeric].apply(pd.to_numeric, errors='coerce')

    # Use the function argument here: reading a global `file_name` instead
    # would tag the rows with whatever game that global happened to hold.
    info = parse_filename(file_name_prefix)
    info_columns = list(info.columns)
    for column in info_columns:
        # `info` has a single row; broadcast its value to every player row.
        df_full[column] = info[column].iloc[0]

    return df_full[["Starters"] + info_columns + columns_to_numeric]

#### Teste para um jogo

In [263]:
# Try the pipeline on one game; the value is the file-name prefix, i.e. the
# name without the "_Basic.csv" / "_Advanced.csv" suffix.
# NOTE(review): keep this variable named `file_name` -- code elsewhere in the
# notebook may read it as a module-level global; confirm before renaming.
file_name = "ATL_@_BKN_Apr_02_2017_ATL"
df_full = join_basic_adv_df(file_name)

In [264]:
# Show the first five rows of the single-game frame built above.
df_full.head(5)

Unnamed: 0,Starters,home_team,away_team,date,team,home_full,away_full,team_full,MP,FG,...,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg
0,Dennis Schroder,BKN,ATL,2017-04-02,ATL,Brooklyn Nets,Atlanta Hawks,Atlanta Hawks,40.52,7.0,...,2.3,10.7,6.2,43.1,2.4,2.4,20.0,26.9,76.0,92.0
1,Taurean Waller-Prince,BKN,ATL,2017-04-02,ATL,Brooklyn Nets,Atlanta Hawks,Atlanta Hawks,36.08,5.0,...,2.6,12.1,7.0,11.4,2.7,2.7,0.0,17.3,98.0,90.0
2,Tim Hardaway,BKN,ATL,2017-04-02,ATL,Brooklyn Nets,Atlanta Hawks,Atlanta Hawks,35.51,5.0,...,0.0,18.3,8.5,5.7,0.0,0.0,29.8,20.6,79.0,97.0
3,Dwight Howard,BKN,ATL,2017-04-02,ATL,Brooklyn Nets,Atlanta Hawks,Atlanta Hawks,25.25,4.0,...,3.7,42.9,21.9,0.0,0.0,3.9,39.2,22.1,80.0,88.0
4,Ersan Ilyasova,BKN,ATL,2017-04-02,ATL,Brooklyn Nets,Atlanta Hawks,Atlanta Hawks,20.18,1.0,...,0.0,0.0,0.0,17.1,2.4,4.8,14.5,14.9,70.0,93.0


#### Carrega a lista de arquivos de todos os jogos

In [265]:
directory = "NBA_Games/dfs/"

# Each game/team pair is stored as two files sharing one prefix:
# "<prefix>_Basic.csv" and "<prefix>_Advanced.csv".
box_score_suffixes = ("_Basic.csv", "_Advanced.csv")

# Collect each prefix once. Files that do not end in one of the two known
# suffixes are skipped, so a stray CSV cannot end up as a bogus prefix.
game_prefixes = set()
for entry in os.listdir(directory):
    for suffix in box_score_suffixes:
        if entry.endswith(suffix):
            game_prefixes.add(entry[:-len(suffix)])
            break

# os.listdir returns names in arbitrary order; sort for reproducible runs.
files_list = sorted(game_prefixes)

#### Gera DataFrame completo com informações

In [274]:
# Build the per-game frames first and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
# A plain for-loop (not a comprehension) keeps `file_name` a module-level
# name, since helper code in this notebook may read it as a global.
game_frames = []
for file_name in files_list:
    game_frames.append(join_basic_adv_df(file_name))

df_resp = pd.concat(game_frames, axis=0).reset_index(drop=True)

# Year and month packed into one integer, e.g. 201704 for April 2017.
df_resp['year_month'] = df_resp['date'].map(lambda x: 100*x.year + x.month)

# Per-row game label such as "ATL @ BKN 2017-04-02". The strings are joined
# element-wise; wrapping the expression in str() would produce the repr of
# the whole Series instead of one label per row.
df_resp["game"] = (
    df_resp['away_team'] + " @ " + df_resp['home_team'] + " "
    + df_resp["date"].map(lambda x: x.strftime('%Y-%m-%d'))
)

In [296]:
# Save the combined player-level dataset to the working directory.
df_resp.to_csv("nba_games_dataset.csv")

In [316]:
# List the columns of the combined dataset.
df_resp.columns

Index(['Starters', 'home_team', 'away_team', 'date', 'team', 'home_full',
       'away_full', 'team_full', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', '+/-', 'TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'ORtg', 'DRtg',
       'year_month', 'game'],
      dtype='object')

In [None]:
# Basic Box Score Stats

# MP -- Minutes Played
# FG -- Field Goals
# FGA -- Field Goal Attempts
# FG% -- Field Goal Percentage
# 3P -- 3-Point Field Goals
# 3PA -- 3-Point Field Goal Attempts
# 3P% -- 3-Point Field Goal Percentage
# FT -- Free Throws
# FTA -- Free Throw Attempts
# FT% -- Free Throw Percentage
# ORB -- Offensive Rebounds
# DRB -- Defensive Rebounds
# TRB -- Total Rebounds
# AST -- Assists
# STL -- Steals
# BLK -- Blocks
# TOV -- Turnovers
# PF -- Personal Fouls
# PTS -- Points
# +/- -- Plus/Minus

# Advanced Box Score Stats
# MP -- Minutes Played
# TS% -- True Shooting Percentage
# A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws.
# eFG% -- Effective Field Goal Percentage
# This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.
# 3PAr -- 3-Point Attempt Rate
# Percentage of FG Attempts from 3-Point Range
# FTr -- Free Throw Attempt Rate
# Number of FT Attempts Per FG Attempt
# ORB% -- Offensive Rebound Percentage
# An estimate of the percentage of available offensive rebounds a player grabbed while he was on the floor.
# DRB% -- Defensive Rebound Percentage
# An estimate of the percentage of available defensive rebounds a player grabbed while he was on the floor.
# TRB% -- Total Rebound Percentage
# An estimate of the percentage of available rebounds a player grabbed while he was on the floor.
# AST% -- Assist Percentage
# An estimate of the percentage of teammate field goals a player assisted while he was on the floor.
# STL% -- Steal Percentage
# An estimate of the percentage of opponent possessions that end with a steal by the player while he was on the floor.
# BLK% -- Block Percentage
# An estimate of the percentage of opponent two-point field goal attempts blocked by the player while he was on the floor.
# TOV% -- Turnover Percentage
# An estimate of turnovers committed per 100 plays.
# USG% -- Usage Percentage
# An estimate of the percentage of team plays used by a player while he was on the floor.
# ORtg -- Offensive Rating
# An estimate of points produced (players) or scored (teams) per 100 possessions
# DRtg -- Defensive Rating
# An estimate of points allowed per 100 possessions

In [None]:
# Two groups of statistic columns, presumably meant to be aggregated by sum
# and by mean respectively; neither list is referenced by the visible cells.
# NOTE(review): '3PAr', 'FTr', 'ORtg' and 'DRtg' are rates/ratings per the
# glossary above -- confirm that summing them is intended.
columns_sum = [
    'MP',
    'FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-',
    '3PAr', 'FTr', 'ORtg', 'DRtg',
]
columns_avg = [
    'FG%', '3P%', 'FT%', 'TS%', 'eFG%',
    'ORB%', 'DRB%', 'TRB%',
    'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
]

In [325]:
# Keep only the "Team Totals" rows and aggregate them per (game, team).
# numeric_only=True limits the sum to the numeric statistic columns; without
# it, current pandas also tries to sum the text and date columns.
game_totals = (
    df_resp[df_resp["Starters"] == "Team Totals"]
    .groupby(["game", "team"], as_index=False)
    .sum(numeric_only=True)
)

In [326]:
# Save the per-team game totals to the working directory.
game_totals.to_csv("game_totals.csv")

In [12]:
# Reload the saved totals and discard the "Unnamed: 0" column found in the file.
game_totals = pd.read_csv("game_totals.csv").drop(columns=["Unnamed: 0"])

In [13]:
# Display the reloaded per-team game totals.
game_totals

Unnamed: 0,game,team,MP,FG,FGA,FG%,3P,3PA,3P%,FT,...,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,year_month
0,ATL @ BKN 2015-11-17,ATL,240.0,33.0,76.0,0.434,12.0,29.0,0.414,10.0,...,80.0,50.0,66.7,7.6,10.9,19.6,100.0,96.1,98.3,201511
1,ATL @ BKN 2015-11-17,BKN,240.0,37.0,84.0,0.440,9.0,20.0,0.450,7.0,...,81.4,50.0,59.5,15.3,6.4,10.9,100.0,98.3,96.1,201511
2,ATL @ BKN 2017-01-10,ATL,240.0,44.0,92.0,0.478,7.0,23.0,0.304,22.0,...,73.1,52.5,56.8,10.9,20.7,9.9,100.0,116.0,96.2,201701
3,ATL @ BKN 2017-01-10,BKN,240.0,35.0,87.0,0.402,10.0,29.0,0.345,17.0,...,69.4,47.5,62.9,5.0,5.8,14.9,100.0,96.2,116.0,201701
4,ATL @ BKN 2017-04-02,ATL,240.0,30.0,80.0,0.375,7.0,24.0,0.292,15.0,...,81.8,46.3,63.3,9.0,8.2,17.4,100.0,82.4,91.4,201704
5,ATL @ BKN 2017-04-02,BKN,240.0,32.0,82.0,0.390,10.0,33.0,0.303,17.0,...,84.3,53.7,65.6,11.1,10.7,17.3,100.0,91.4,82.4,201704
6,ATL @ BKN 2017-10-22,ATL,240.0,32.0,94.0,0.340,7.0,27.0,0.259,33.0,...,71.1,48.5,65.6,5.7,3.3,12.7,100.0,99.5,110.9,201710
7,ATL @ BKN 2017-10-22,BKN,240.0,41.0,87.0,0.471,11.0,27.0,0.407,23.0,...,70.4,51.5,65.9,11.5,11.9,15.0,100.0,110.9,99.5,201710
8,ATL @ BKN 2017-12-02,ATL,240.0,47.0,99.0,0.475,10.0,27.0,0.370,10.0,...,73.9,50.5,38.3,12.4,12.5,7.1,100.0,118.0,105.6,201712
9,ATL @ BKN 2017-12-02,BKN,240.0,34.0,82.0,0.415,17.0,42.0,0.405,17.0,...,72.3,49.5,76.5,1.0,11.1,17.2,100.0,105.6,118.0,201712


In [319]:
#group = df_resp.groupby(["game", "team"], as_index=False)