In [1]:
import gc
import json
import os
import pickle
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

%matplotlib inline

In [2]:
# Endpoint used for this exploratory request.
url = 'https://stats.nba.com/stats/boxscoredefensive'

# Query-string parameters sent with the request (also reused by later cells).
params = {
    'EndPeriod':10,
    'EndRange':28800,
    'GameID':'0021600008',
    'RangeType':0,
    # Must be a string: the bare expression 2017-18 is integer subtraction (1999).
    # NOTE(review): the GameID above starts with 00216 - confirm the intended season.
    'Season':'2017-18',
    # requests URL-encodes parameter values itself, so a literal '+' would be
    # sent as %2B; pass the plain space and let requests encode it.
    'SeasonType':'Regular Season',
    'StartPeriod':1,
    'StartRange':0
}

# Browser-like User-Agent sent with every request in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

# A timeout keeps the cell from hanging forever if the server never answers.
resp = requests.get(url=url, params=params, headers=headers, timeout=30)

In [14]:
# Same params/headers as before, this time against the hustle-stats endpoint.
resp = requests.get(
    "https://stats.nba.com/stats/hustlestatsboxscore",
    headers=headers,
    params=params,
)

In [31]:
# Same params/headers as before, this time against the defensive box-score endpoint.
resp = requests.get(
    "https://stats.nba.com/stats/boxscoredefensive",
    headers=headers,
    params=params,
)

In [None]:
# Display the decoded JSON body of the most recent response.
resp.json()

In [None]:
# Preview result set 0 of the most recent response as a DataFrame.
# NOTE(review): get_df_nba_json is defined in a later cell, so this line fails
# on a fresh top-to-bottom run of the notebook.
get_df_nba_json(resp.json(), rs=0)

In [3]:
def get_date_place(game_id, year = "2017"):
    """Fetch two fields of a game's detail JSON from data.nba.com.

    Parameters
    ----------
    game_id : str or int
        Game identifier inserted into the file name, e.g. "0021700784".
    year : str or int, default "2017"
        Value inserted into the URL path right before ``/scores/gamedetail/``.

    Returns
    -------
    tuple
        ``(data, place)`` where ``data`` is ``["g"]["gdte"]`` and ``place`` is
        ``["g"]["an"]`` of the response (presumably the game date and the
        arena name - confirm against the feed). Both are ``np.nan`` when the
        request or the parsing fails; the error is printed, not raised.

    Notes
    -----
    Reads the notebook-level ``headers`` dict for the request headers.
    """
    detail_url = ("https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/"
                  + str(year) + "/scores/gamedetail/" + str(game_id) + "_gamedetail.json")
    try:
        # The timeout prevents a stalled connection from blocking the whole scrape.
        resp = requests.get(url=detail_url, headers=headers, timeout=30)
        # Surface HTTP errors with a readable message instead of a JSON decode error.
        resp.raise_for_status()
        # Decode the body once and read both fields from the same object.
        game = resp.json()["g"]
        data = game["gdte"]
        place = game["an"]
    except Exception as e:
        # Deliberate best-effort: callers receive NaN placeholders on any failure.
        print(e)
        data = np.nan
        place = np.nan

    return (data, place)

In [4]:
def get_df_nba_json(resp_json, date_place = False, rs=0):
    """Convert one result set of a stats.nba.com style JSON payload to a DataFrame.

    Parameters
    ----------
    resp_json : dict
        Decoded payload; ``resp_json['resultSets'][rs]`` must provide the keys
        ``"headers"`` (column names) and ``"rowSet"`` (list of rows).
    date_place : bool, default False
        When true, ``get_date_place`` is called with the first ``GAME_ID`` and
        the columns ``GAME_DATE``, ``GAME_PLACE`` and ``GAME`` are added.
        ``GAME`` is built from the first two unique ``TEAM_ABBREVIATION``
        values and the date, so this mode needs a non-empty frame with at
        least two teams and a string date.
    rs : int, default 0
        Index of the result set to convert.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``TEAM_NAME`` if present, else ``TEAM_NICKNAME``,
        else ``TEAM_ABBREVIATION``. The original row index is kept.

    Raises
    ------
    KeyError
        If none of the three sort columns exists in the result set.
    """
    result_set = resp_json['resultSets'][rs]
    # Passing the headers to the constructor (rather than assigning .columns
    # afterwards) keeps an empty rowSet from raising a length-mismatch
    # ValueError: the result is simply an empty frame with the right columns.
    df_resp = pd.DataFrame(result_set["rowSet"], columns=result_set["headers"])

    if date_place:
        game_date, game_place = get_date_place(df_resp.GAME_ID.iloc[0])

        df_resp["GAME_DATE"] = np.repeat(game_date, len(df_resp))
        df_resp["GAME_PLACE"] = np.repeat(game_place, len(df_resp))

        teams = df_resp.TEAM_ABBREVIATION.unique()

        df_resp["GAME"] = np.repeat(teams[0] + " @ " + teams[1] + " " + game_date, len(df_resp))

    # Sort by the first team column that exists, in order of preference.
    for sort_col in ("TEAM_NAME", "TEAM_NICKNAME"):
        if sort_col in df_resp.columns:
            return df_resp.sort_values(sort_col)
    return df_resp.sort_values("TEAM_ABBREVIATION")

In [5]:
def junta_df_tipos(df_tipos, cols_drop = ['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 
                                          'PLAYER_ID', 'PLAYER_NAME', 'START_POSITION', 
                                          'COMMENT', 'MIN', 'MINUTES'],
                   suffixes = None):
    """Merge a list of box-score frames side by side into one wide frame.

    The first frame is kept as is. From every following frame the columns in
    ``cols_drop`` are removed, and each remaining column whose name already
    exists in the accumulated result gets ``_<suffix>`` appended, where the
    suffix is taken at the frame's position in ``suffixes``.

    Parameters
    ----------
    df_tipos : list of pandas.DataFrame
        Frames to combine; must contain at least one element.
    cols_drop : list of str
        Columns removed from every frame except the first. Missing names are
        ignored. The default list is never mutated.
    suffixes : sequence of str, optional
        Suffix per position in ``df_tipos``. Defaults to the notebook-level
        ``lista_sites`` list, which is what the function used implicitly.

    Returns
    -------
    pandas.DataFrame
        Left merge on the row index of all frames.

    Notes
    -----
    The merge is by index, not by player or team id, so it assumes the frames
    carry matching index labels for matching rows - TODO confirm for every
    endpoint.
    """
    if suffixes is None:
        # Backward-compatible fallback to the module-level list.
        suffixes = lista_sites

    resp = df_tipos[0]
    for i in range(1, len(df_tipos)):
        junta = df_tipos[i]
        # Names shared with the accumulated result, computed before dropping.
        colunas_repetidas = set(junta.columns).intersection(resp.columns)

        junta = junta.drop(columns=list(cols_drop), errors="ignore")
        junta.columns = [str(col) + '_' + suffixes[i]
                         if col in colunas_repetidas else str(col)
                         for col in junta.columns]

        resp = resp.merge(junta, how="left",
                          left_index=True, right_index=True)
    return resp

In [6]:
# Box-score variants to download for every game. The position in this list is
# also used by junta_df_tipos as the suffix for clashing column names, so the
# order matters.
lista_sites = [
    "traditional",
    "advanced",
    "scoring",
    "misc",
    "usage",
    "fourfactors",
    "playertrack",
    "hustle",
    "defensive",
]

In [7]:
# Starting year of the season to scrape; also passed to get_date_place.
year = 2015

# Game ids have the form "002" + two-digit year + five-digit game number
# (e.g. "0021500001"). Deriving the prefix from `year` keeps the ids and the
# year from drifting apart when the season is changed.
game_ids = ["002" + str(year)[-2:] + '{0:0>5}'.format(game_number)
            for game_number in range(1, 1231)]

In [None]:
# Query-string parameters shared by every box-score request; GameID is
# overwritten for each game inside the loop.
params = {
    'EndPeriod':10,
    'EndRange':28800,
    'GameID':'0021600784',
    'RangeType':0,
    # Must be a string: the bare expression 2016-17 is integer subtraction (1999).
    # NOTE(review): the ids in game_ids are built for `year`; confirm that this
    # season label matches the season actually being scraped.
    'Season':'2016-17',
    # requests URL-encodes parameter values itself, so a literal '+' would be
    # sent as %2B; pass the plain space and let requests encode it.
    'SeasonType':'Regular Season',
    'StartPeriod':1,
    'StartRange':0
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

df_full = []        # one merged frame per game, built from the first result set kept per endpoint
df_full_jogo = []   # one merged frame per game, built from the second result set kept per endpoint
erros = []          # ids of games that raised and were skipped

for game_id in game_ids:

    df_tipos = []
    df_tipos_jogo = []

    try:
        game_date, game_place = get_date_place(game_id, year)

        # get_date_place returns NaN on failure. The game label below needs a
        # string date, so fail now instead of after nine wasted requests; the
        # game still ends up in `erros` exactly as before.
        if not isinstance(game_date, str):
            raise ValueError("no game date available for " + str(game_id))

        for site in lista_sites:
            # Pick the URL and the result-set indices to keep for this endpoint.
            if site == "hustle":
                url = "https://stats.nba.com/stats/hustlestatsboxscore"
                rs_first, rs_second = 1, 2

            elif site == "defensive":
                url = "https://stats.nba.com/stats/boxscore" + site
                # Only one result set is kept for this endpoint.
                rs_first, rs_second = 0, None

            else:
                url = "https://stats.nba.com/stats/boxscore" + site + "v2"
                rs_first, rs_second = 0, 1

            print(game_id + " " + str(game_date) + " - " + site + "              ", end="\r")

            params["GameID"] = game_id

            # The timeout keeps one stalled connection from freezing the whole run.
            resp = requests.get(url=url, params=params, headers=headers, timeout=30)
            resp.raise_for_status()
            #time.sleep(0.5)

            # Decode the body once and reuse it for both result sets.
            resp_json = resp.json()
            df_tipos.append(get_df_nba_json(resp_json, rs=rs_first))
            if rs_second is not None:
                df_tipos_jogo.append(get_df_nba_json(resp_json, rs=rs_second))

        df_resp = junta_df_tipos(df_tipos)
        df_resp_jogo = junta_df_tipos(df_tipos_jogo)

        df_resp["GAME_DATE"] = game_date
        df_resp["GAME_PLACE"] = game_place

        df_resp_jogo["GAME_DATE"] = game_date
        df_resp_jogo["GAME_PLACE"] = game_place

        # Label built from the first two distinct team abbreviations and the date.
        teams = df_resp.TEAM_ABBREVIATION.unique()
        game_str = teams[0] + " @ " + teams[1] + " " + game_date

        df_resp["GAME"] = game_str
        df_resp_jogo["GAME"] = game_str

        df_full.append(df_resp.set_index("GAME"))
        df_full_jogo.append(df_resp_jogo.set_index("GAME"))

        #pickle.dump(df_full, open("df_full.p", "wb"))
        #pickle.dump(df_full_jogo, open("df_full_jogo.p", "wb"))
    except Exception as e:
        # Best-effort scrape: remember the failed game, back off, and continue.
        erros.append(game_id)
        print(e)
        time.sleep(3)
    

0021500011 2015-10-28 - traditional              

In [59]:
# Stack the per-game frames collected in df_full into a single table.
# pd.concat raises ValueError when df_full is empty (i.e. every game failed).
base_nba = pd.concat(df_full)

In [60]:
# Stack the per-game frames collected in df_full_jogo into a single table.
# pd.concat raises ValueError when df_full_jogo is empty (i.e. every game failed).
base_nba_jogo = pd.concat(df_full_jogo)

In [62]:
# Create the output folder first so that to_csv does not fail when it is missing.
# NOTE(review): the file name says 16_17 while game_ids are built with year 2015 - confirm.
os.makedirs("bases_nba_stats/aux_files", exist_ok=True)
base_nba.to_csv("bases_nba_stats/aux_files/base_nba_per_player_16_17.csv")

In [63]:
# Create the output folder first so that to_csv does not fail when it is missing.
# NOTE(review): the file name says 16_17 while game_ids are built with year 2015 - confirm.
os.makedirs("bases_nba_stats/aux_files", exist_ok=True)
base_nba_jogo.to_csv("bases_nba_stats/aux_files/base_nba_per_game_16_17.csv")

In [65]:
# Run a full garbage-collection pass; the value displayed is the number of
# unreachable objects found.
gc.collect()

0

### ANTIGO (old approach: Selenium page scraping)

In [20]:
#driver = webdriver.Chrome("C:/Users/rafae/Downloads/chromedriver_win32/chromedriver.exe")
# Start an Internet Explorer session through the local IEDriverServer binary.
# NOTE(review): absolute, machine-specific path - works only on the author's PC.
ie_driver_path = "C:/Users/rafae/Downloads/IEDriverServer_x64_3.14.0/IEDriverServer.exe"
driver = webdriver.Ie(ie_driver_path)

In [21]:
# Load one box-score JSON endpoint in the browser session (manual check).
driver.get("https://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0021700784&RangeType=0&Season=2017-18&SeasonType=Regular+Season&StartPeriod=1&StartRange=0")

In [3]:
# Lookup table read from de_para_siglas.csv: maps each value of the 'nome'
# column to the matching value of the 'sigla' column.
de_para_siglas = (
    pd.read_csv('de_para_siglas.csv')
    .set_index('nome')
    .to_dict()
    ['sigla']
)

In [5]:
def trata_df(table_html):
    """Parse the inner HTML of a ``<table>`` element into a DataFrame.

    Parameters
    ----------
    table_html : str
        Markup found between the ``<table>`` tags (rows, header, body); the
        wrapping ``<table>`` element is added here.

    Returns
    -------
    pandas.DataFrame
        All tables found by ``pd.read_html``, concatenated.

    Raises
    ------
    ValueError
        Propagated from ``pd.read_html`` when no table can be parsed.
    """
    from io import StringIO  # local import: only this helper needs it

    # Recent pandas versions deprecate passing literal HTML to read_html;
    # a file-like wrapper is accepted by old and new versions alike.
    tables = pd.read_html(StringIO("<table>" + table_html + "</table>"))
    return pd.concat(tables)

In [None]:
#https://stats.nba.com/stats/boxscoreusagev2?EndPeriod=10&EndRange=28800&GameID=0021700784&RangeType=0&Season=2017-18&SeasonType=Regular+Season&StartPeriod=1&StartRange=0
#https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2017/scores/gamedetail/0021700784_gamedetail.json
#https://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0021700784&RangeType=0&Season=2017-18&SeasonType=Regular+Season&StartPeriod=1&StartRange=0

In [10]:
#game_pages = ["https://stats.nba.com/game/0021700784/"]
# One game-page URL per game number 1..1230, zero-padded to four digits.
game_pages = [
    "https://stats.nba.com/game/002170" + '{0:0>4}'.format(numero) + "/"
    for numero in range(1, 1231)
]

In [11]:
# URL suffixes appended to a game page, one per stats tab; the empty string
# stands for the page as first loaded.
tipo_dados = [
    "",
    "advanced",
    "scoring",
    "misc",
    "usage",
    "four-factors",
    "tracking",
    "hustle",
    "defense",
]

In [12]:
def junta_df_tipos(df_tipos):
    """Combine a list of frames column-wise, keeping each column name once.

    The first frame is the base. From every later frame, columns whose names
    already exist in the accumulated result are discarded and the remaining
    ones are attached with a left merge on the row index.

    NOTE(review): an earlier cell defines a function with this same name that
    renames clashing columns instead of discarding them; running this cell
    replaces that definition for the rest of the session.
    """
    acumulado = df_tipos[0]
    for proximo in df_tipos[1:]:
        # Column names present on both sides; only the new ones are kept.
        ja_existentes = list(set(proximo.columns) & set(acumulado.columns))
        apenas_novas = proximo.drop(columns=ja_existentes, errors="ignore")

        acumulado = acumulado.merge(apenas_novas, how="left",
                                    left_index=True, right_index=True)
    return acumulado

In [16]:
# Element look-ups keep polling for up to 30 seconds before failing.
driver.implicitly_wait(30)
# Page loads are aborted after 100 seconds.
driver.set_page_load_timeout(100)

In [None]:
resp = []   # one merged DataFrame per successfully scraped game
erro = []   # game pages given up on after repeated failures

# Number of failed attempts for the game currently being processed.
tent = 0

# A while loop is used (instead of `for g in range(...)`) so that a failed game
# can really be retried: reassigning the loop variable of a for loop has no
# effect on the next iteration.
g = 33
while g < len(game_pages):
    try:
        driver.get(game_pages[g])
        time.sleep(5)

        game_date = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'game-summary__date')))

        #game_date = driver.find_element_by_class_name("game-summary__date").get_attribute("innerHTML")
        game_date = game_date.get_attribute("innerHTML")
        game_date = datetime.strptime(game_date, '%b %d, %Y')

        game_str = ""
        team_names_siglas = []

        df_tipos = []

        for tipo in tipo_dados:
            print(str(g) + "... " + game_pages[g] + tipo + "     ", end="\r")
            if(tipo != ""):
                driver.get(game_pages[g] + tipo)
                time.sleep(30)

            # Generic find_element(s) with By locators: the find_*_by_* helper
            # methods are deprecated and missing from newer Selenium releases.
            teams = driver.find_elements(By.TAG_NAME, "nba-stat-table")
            team_names = driver.find_elements(By.CLASS_NAME, "game-summary-team__name")

            df_stats = []

            for i, team in enumerate(teams):
                # Team labels are only read on the first tab and then reused.
                if(tipo == ""):
                    team_name = team_names[i].find_element(By.TAG_NAME, "a").get_attribute("innerHTML")

                    team_name_sigla = team_name
                    for key in de_para_siglas.keys():
                        team_name_sigla = str(team_name_sigla).replace(key, de_para_siglas[key])

                    team_names_siglas.append(team_name_sigla)

                # Reset before every table so that a failed read can never fall
                # back on the HTML of a previously parsed table.
                tb_stats = None
                tentativas = 0
                while(tentativas < 3):
                    try:
                        tb_stats = team.find_elements(By.TAG_NAME, "table")[0].get_attribute('innerHTML')

                        if tb_stats is not None:
                            break
                    except IndexError:
                        # Table not rendered yet; wait and look again.
                        pass
                    time.sleep(2)
                    tentativas += 1

                if tb_stats is None:
                    raise RuntimeError("stats table not found for " + game_pages[g] + tipo)

                df_stat_team = trata_df(tb_stats)
                df_stat_team["team"] = team_names_siglas[i]
                df_stat_team["key"] = df_stat_team["Player"] + "_" +  df_stat_team["team"]
                df_stat_team = df_stat_team.set_index("key")

                df_stats.append(df_stat_team)

            df_tipos.append(pd.concat(df_stats))

        df_game = junta_df_tipos(df_tipos)

        game_str = team_names_siglas[0] + " @ " + team_names_siglas[1] + " " + game_date.strftime('%Y-%m-%d')

        print(str(g) + ": " + game_pages[g] + tipo + " [" + game_str + "] " + str(len(df_game)))

        df_game["game"] = game_str
        # NOTE(review): the original called df_game.set_index("game") here and
        # discarded the result, so the frame has always been saved with the
        # player/team key as index. That output format is kept; confirm whether
        # indexing by game was intended.

        df_game.to_csv("C:/Users/rafae/Google Drive/Projetos/NBA_Games/dfs_nba/"+ str(g) +"_" +game_str+".csv")

        resp.append(df_game)
    except Exception as e:
        print(e)
        if(tent > 3):
            # Too many failures for this game: record it and move on.
            erro.append(game_pages[g])
            try:
                driver.refresh()
            except Exception as refresh_error:
                # A failed refresh must not abort the whole run.
                print(refresh_error)
            tent = 0
            g += 1
        else:
            # Restart the browser and retry the same game after a pause.
            try:
                driver.quit()
            except Exception as quit_error:
                print(quit_error)
            driver = webdriver.Ie("C:/Users/rafae/Downloads/IEDriverServer_x64_3.14.0/IEDriverServer.exe")
            driver.implicitly_wait(50)
            driver.set_page_load_timeout(100)
            time.sleep(60)
            tent += 1
    else:
        # Game scraped successfully: reset the retry counter and advance.
        tent = 0
        g += 1