In [131]:
import selenium
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json
import requests
%matplotlib inline
import pickle

In [120]:
# Probe request: download the defensive box score of one sample game so the
# JSON layout returned by stats.nba.com can be inspected.
url = 'https://stats.nba.com/stats/boxscoredefensive'

params = {
    'EndPeriod': 10,
    'EndRange': 28800,
    'GameID': '0021700784',
    'RangeType': 0,
    # Must be a string: the bare expression 2017-18 is integer subtraction
    # and evaluates to 1999.
    'Season': '2017-18',
    # requests URL-encodes the values itself, so a literal space is passed
    # here; a literal '+' would be sent as %2B instead of a space.
    'SeasonType': 'Regular Season',
    'StartPeriod': 1,
    'StartRange': 0
}

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

# The timeout keeps the cell from blocking forever if the server never answers.
resp = requests.get(url=url, params=params, headers=headers, timeout=30)

In [65]:
# Probe request: fetch the game-detail feed of the same sample game from
# data.nba.com.
# NOTE(review): `params` holds the stats.nba.com box-score query parameters and
# is sent to this feed as well — presumably unintentional; confirm whether the
# feed needs any of them.
resp = requests.get(url="https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2017/scores/gamedetail/0021700784_gamedetail.json",
                    params=params, headers=headers)

In [142]:
def get_date_place(game_id):
    """Look up the date and place of a game in the data.nba.com game-detail feed.

    Parameters
    ----------
    game_id : str
        Game identifier as it appears in the feed URL, e.g. "0021700784".

    Returns
    -------
    tuple
        ``(date, place)`` taken from the ``g.gdte`` and ``g.an`` fields of the
        JSON response (``an`` is presumably the arena name — confirm against
        the feed). Both elements are the string "NaN" when the request fails,
        the body is not valid JSON, or the expected fields are missing.
    """
    try:
        resp = requests.get(url="https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2017/scores/gamedetail/"+ game_id +"_gamedetail.json",
                            headers=headers,
                            timeout=30)  # never block forever on a stalled connection
        # Decode the body once and read both fields from the same object.
        game = resp.json()["g"]
        data = game["gdte"]
        place = game["an"]
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: network errors, invalid JSON (ValueError) and an
        # unexpected payload shape all fall back to the placeholder. Anything
        # else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        data = "NaN"
        place = "NaN"

    return (data, place)

In [143]:
def get_df_nba_json(resp_json, date_place = False):
    """Turn the first result set of a stats.nba.com response into a DataFrame.

    Parameters
    ----------
    resp_json : dict
        Decoded JSON payload. ``resp_json['resultSets'][0]`` must provide
        ``headers`` (column names) and ``rowSet`` (list of rows).
    date_place : bool, default False
        When True and the frame has at least one row, the columns GAME_DATE,
        GAME_PLACE and GAME are added. Date and place come from
        ``get_date_place`` called with the GAME_ID of the first row; GAME is
        built from the first two distinct TEAM_ABBREVIATION values, so the
        frame is expected to contain two teams.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``rowSet``, with ``headers`` as column names. An
        empty ``rowSet`` yields an empty frame that still has all columns.
    """
    dict_resp = resp_json['resultSets'][0]
    # Passing the headers to the constructor keeps the column names even when
    # rowSet is empty; assigning .columns afterwards raises a length-mismatch
    # error on an empty frame.
    df_resp = pd.DataFrame(dict_resp["rowSet"], columns=dict_resp["headers"])

    # With no rows there is no GAME_ID to look up, so the enrichment is skipped.
    if date_place and not df_resp.empty:
        game_date, game_place = get_date_place(df_resp.GAME_ID.iloc[0])

        # Scalars are broadcast to every row by pandas.
        df_resp["GAME_DATE"] = game_date
        df_resp["GAME_PLACE"] = game_place

        teams = df_resp.TEAM_ABBREVIATION.unique()

        df_resp["GAME"] = teams[0] + " @ " + teams[1] + " " + game_date

    return df_resp

In [144]:
def junta_df_tipos(df_tipos):
    """Combine several DataFrames column-wise with index-aligned left merges.

    The first frame is the base. For every following frame, the columns whose
    names already exist in the accumulated result are discarded, and the
    remaining ones are left-merged on the index, so the first occurrence of a
    column name wins.

    Parameters
    ----------
    df_tipos : sequence of pandas.DataFrame
        Frames to combine; must contain at least one element.

    Returns
    -------
    pandas.DataFrame
        The base frame extended with the new columns of the other frames.
    """
    combined = df_tipos[0]
    for extra in df_tipos[1:]:
        # Keep only the columns the accumulated frame does not have yet.
        already_present = extra.columns.isin(combined.columns)
        only_new = extra.loc[:, ~already_present]

        combined = combined.merge(only_new, how="left",
                                  left_index=True, right_index=True)
    return combined

In [117]:
# Box-score variants downloaded for every game; the scraping loop turns each
# name into a stats.nba.com endpoint URL.
lista_sites = [
    "traditional",
    "advanced",
    "scoring",
    "misc",
    "usage",
    "fourfactors",
    "playertrack",
    "hustle",
    "defensive",
]

In [99]:
# Game ids "0021700001" .. "0021701230": the fixed prefix "002170" followed by
# the running game number left-padded with zeros to four digits.
game_ids = ["002170" + ('{0:0>4}'.format(game_number))
            for game_number in range(1, 1231)]

In [141]:
# Game ids whose download failed; the scraping loop appends to this list.
# Re-running this cell discards the failures recorded so far.
erros = []

In [160]:
# Query-string parameters shared by every box-score request; only GameID is
# changed from game to game.
params = {
    'EndPeriod': 10,
    'EndRange': 28800,
    'GameID': '0021700784',
    'RangeType': 0,
    # Must be a string: the bare expression 2017-18 is integer subtraction
    # and evaluates to 1999.
    'Season': '2017-18',
    # requests URL-encodes the values itself, so a literal space is passed
    # here; a literal '+' would be sent as %2B instead of a space.
    'SeasonType': 'Regular Season',
    'StartPeriod': 1,
    'StartRange': 0
}

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# One DataFrame per successfully downloaded game.
df_full = []

# Only the first 184 game ids are processed by this cell.
for game_id in game_ids[:184]:

    df_tipos = []

    try:
        game_date, game_place = get_date_place(game_id)

        for site in lista_sites:
            # The endpoint naming is not uniform across box-score variants.
            if site == "hustle":
                url = "https://stats.nba.com/stats/hustlestatsboxscore"

            elif site == "defensive":
                url = "https://stats.nba.com/stats/boxscore" + site

            else:
                url = "https://stats.nba.com/stats/boxscore" + site + "v2"

            # "\r" rewrites the same output line, giving a one-line progress display.
            print(game_id + " - " + site + "              ", end="\r")

            params["GameID"] = game_id

            resp = requests.get(url=url, params=params, headers=headers,
                                timeout=REQUEST_TIMEOUT)

            df_tipos.append(get_df_nba_json(resp.json()))

        # Merge the variants into one wide frame and tag it with the game info
        # (scalars are broadcast to every row).
        df_resp = junta_df_tipos(df_tipos)
        df_resp["GAME_DATE"] = game_date
        df_resp["GAME_PLACE"] = game_place

        teams = df_resp.TEAM_ABBREVIATION.unique()
        df_resp["GAME"] = teams[0] + " @ " + teams[1] + " " + game_date

        df_full.append(df_resp.set_index("GAME"))

        # Checkpoint after every game so an interrupted run keeps its progress;
        # the with-block guarantees the file is flushed and closed each time.
        with open("df_full_1.p", "wb") as checkpoint:
            pickle.dump(df_full, checkpoint)
    except Exception:
        # Record the failed game and back off briefly before the next one.
        # `Exception` (rather than a bare except) lets Ctrl-C stop the loop.
        erros.append(game_id)
        time.sleep(3)
    

0021700184 - defensive                

In [161]:
# Stack the per-game frames currently held in df_full into a single table.
df_full_1 = pd.concat(df_full)

In [147]:
# Stack the per-game frames held in df_full into a single table.
# NOTE(review): this cell's execution count (147) is lower than that of the
# scraping cell above (160), so it was presumably run against an earlier
# content of df_full. On a fresh top-to-bottom run it yields the same rows as
# df_full_1.
df_full_junto = pd.concat(df_full)

In [165]:
# Combine the two batches into the full table.
# NOTE(review): this only works as intended when df_full_1 and df_full_junto
# hold different games; if both were built from the same df_full, every row
# ends up duplicated.
base_nba = pd.concat([df_full_1, df_full_junto])

In [173]:
# (rows, columns) of the combined table.
base_nba.shape

(30980, 121)

In [175]:
# Show every column name of the combined table.
print(list(base_nba.columns))

['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID', 'PLAYER_NAME', 'START_POSITION', 'COMMENT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT', 'USG_PCT', 'PACE', 'PIE', 'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_2PT', 'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 'PCT_PTS_FB', 'PCT_PTS_FT', 'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT', 'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM', 'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 'PTS_OFF_TOV', 'PTS_2ND_CHANCE', 'PTS_FB', 'PTS_PAINT', 'OPP_PTS_OFF_TOV', 'OPP_PTS_2ND_CHANCE', 'OPP_PTS_FB', 'OPP_PTS_PAINT', 'BLKA', 'PFD', 'PCT_FGM', 'PCT_FGA', 'PCT_FG3M', 'PCT_FG3A', 'PCT_FTM', 'PCT_FTA', 'PCT_OREB', 'PCT_DREB', 'PCT_REB', 'PCT_AST', 'PCT_TOV', 'PCT_STL', 'PCT_BLK', 'PCT_BLKA', 'PCT_PF', 'PCT_PF

In [176]:
# Write the combined table (index included) to a CSV file in the working directory.
base_nba.to_csv("base_nba_full_17_18.csv")

In [156]:
# Write the df_full_junto batch (index included) to a CSV file in the working directory.
df_full_junto.to_csv("df_full_junto.csv")

In [167]:
# Number of distinct GAME_ID values in the combined table.
len(list(base_nba.GAME_ID.unique()))

1230

In [170]:
import missingno

In [149]:
# Force a full garbage-collection pass; the value displayed by the cell is
# gc.collect()'s return value.
import gc
gc.collect()

148

### ANTIGO (old version: Selenium-based scraper)

In [20]:
# Start an Internet Explorer WebDriver session from a local IEDriverServer
# binary (the Chrome variant is kept commented out).
# NOTE(review): the absolute path is specific to one machine.
#driver = webdriver.Chrome("C:/Users/rafae/Downloads/chromedriver_win32/chromedriver.exe")
driver = webdriver.Ie("C:/Users/rafae/Downloads/IEDriverServer_x64_3.14.0/IEDriverServer.exe")

In [21]:
# Open the traditional box-score endpoint of one sample game in the browser.
driver.get("https://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0021700784&RangeType=0&Season=2017-18&SeasonType=Regular+Season&StartPeriod=1&StartRange=0")

In [3]:
# Lookup dict built from de_para_siglas.csv: each value of the 'nome' column
# maps to the matching value of the 'sigla' column. The scraping loop below
# uses it to replace team names with their abbreviations.
de_para_siglas = pd.read_csv('de_para_siglas.csv').set_index('nome').to_dict()['sigla']

In [5]:
def trata_df(table_html):
    """Parse the inner HTML of a table element into a single DataFrame.

    Parameters
    ----------
    table_html : str
        Markup found between the ``<table>`` tags (the tags themselves are
        added here).

    Returns
    -------
    pandas.DataFrame
        Concatenation of every table pandas finds in the wrapped markup.
    """
    from io import StringIO  # local import: only this helper needs it

    # Passing literal HTML to read_html is deprecated in recent pandas; a
    # file-like object is accepted by old and new versions alike.
    tables = pd.read_html(StringIO("<table>" + table_html + "</table>"))
    # read_html returns a list with one frame per table found.
    return pd.concat(tables)

In [None]:
#https://stats.nba.com/stats/boxscoreusagev2?EndPeriod=10&EndRange=28800&GameID=0021700784&RangeType=0&Season=2017-18&SeasonType=Regular+Season&StartPeriod=1&StartRange=0
#https://data.nba.com/data/10s/v2015/json/mobile_teams/nba/2017/scores/gamedetail/0021700784_gamedetail.json
#https://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0021700784&RangeType=0&Season=2017-18&SeasonType=Regular+Season&StartPeriod=1&StartRange=0

In [10]:
# Game page URLs for game numbers 1..1230: the fixed prefix ending in "002170"
# followed by the game number left-padded with zeros to four digits.
#game_pages = ["https://stats.nba.com/game/0021700784/"]
game_pages = [
    "https://stats.nba.com/game/002170" + ('{0:0>4}'.format(game_number)) + "/"
    for game_number in range(1, 1231)
]

In [11]:
# URL suffixes of the statistics tabs of a game page; the empty string is the
# game page itself (no suffix appended).
tipo_dados = [
    "",
    "advanced",
    "scoring",
    "misc",
    "usage",
    "four-factors",
    "tracking",
    "hustle",
    "defense",
]

In [12]:
def junta_df_tipos(df_tipos):
    """Combine several DataFrames column-wise with index-aligned left merges.

    The first frame is the base. For every following frame, the columns whose
    names already exist in the accumulated result are discarded, and the
    remaining ones are left-merged on the index, so the first occurrence of a
    column name wins.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; whichever definition runs last is the one in effect.

    Parameters
    ----------
    df_tipos : sequence of pandas.DataFrame
        Frames to combine; must contain at least one element.

    Returns
    -------
    pandas.DataFrame
        The base frame extended with the new columns of the other frames.
    """
    combined = df_tipos[0]
    for extra in df_tipos[1:]:
        # Keep only the columns the accumulated frame does not have yet.
        already_present = extra.columns.isin(combined.columns)
        only_new = extra.loc[:, ~already_present]

        combined = combined.merge(only_new, how="left",
                                  left_index=True, right_index=True)
    return combined

In [16]:
# Selenium timeouts, in seconds: how long element lookups keep polling before
# failing, and how long a page load may take.
driver.implicitly_wait(30)
driver.set_page_load_timeout(100)

In [None]:
# Scraped games (one DataFrame each) and the game pages that were given up on.
resp = []
erro = []

# Failed attempts for the game currently being scraped.
tent = 0

# A while loop is used instead of `for g in range(...)` so that a failed game
# can really be retried: reassigning the loop variable of a for-loop has no
# effect on the next iteration.
g = 33
while g < len(game_pages):
    try:
        driver.get(game_pages[g])
        time.sleep(5)

        # Wait for the date element, then parse its text, e.g. "Feb 05, 2018".
        game_date = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'game-summary__date')))
        game_date = game_date.get_attribute("innerHTML")
        game_date = datetime.strptime(game_date, '%b %d, %Y')

        team_names_siglas = []

        df_tipos = []

        for tipo in tipo_dados:
            print(str(g) + "... " + game_pages[g] + tipo + "     ", end="\r")
            # The empty suffix is the game page that is already loaded.
            if tipo != "":
                driver.get(game_pages[g] + tipo)
                time.sleep(30)

            teams = driver.find_elements_by_tag_name("nba-stat-table")
            team_names = driver.find_elements_by_class_name("game-summary-team__name")

            df_stats = []

            for i, team in enumerate(teams):
                # Team abbreviations are collected once, on the first tab, and
                # reused (by position) for every other tab.
                if tipo == "":
                    team_name = team_names[i].find_element_by_tag_name("a").get_attribute("innerHTML")

                    team_name_sigla = team_name
                    for key in de_para_siglas.keys():
                        team_name_sigla = str(team_name_sigla).replace(key, de_para_siglas[key])

                    team_names_siglas.append(team_name_sigla)

                # Up to three attempts to read the table markup. Starting from
                # None guarantees that markup left over from a previous team
                # is never parsed by mistake.
                tb_stats = None
                for _ in range(3):
                    tables = team.find_elements_by_tag_name("table")
                    if tables:
                        tb_stats = tables[0].get_attribute('innerHTML')
                        if tb_stats is not None:
                            break
                    time.sleep(2)
                if tb_stats is None:
                    # Handled by the retry logic of the outer except block.
                    raise RuntimeError("no stats table found for " + game_pages[g] + tipo)

                df_stat_team = trata_df(tb_stats)
                df_stat_team["team"] = team_names_siglas[i]
                df_stat_team["key"] = df_stat_team["Player"] + "_" + df_stat_team["team"]
                df_stat_team = df_stat_team.set_index("key")

                df_stats.append(df_stat_team)

            df_tipos.append(pd.concat(df_stats))

        df_game = junta_df_tipos(df_tipos)

        game_str = team_names_siglas[0] + " @ " + team_names_siglas[1] + " " + game_date.strftime('%Y-%m-%d')

        print(str(g) + ": " + game_pages[g] + tipo + " [" + game_str + "] " + str(len(df_game)))

        df_game["game"] = game_str
        # set_index returns a new frame; the result has to be kept for the
        # index change to take effect.
        df_game = df_game.set_index("game")

        df_game.to_csv("C:/Users/rafae/Google Drive/Projetos/NBA_Games/dfs_nba/"+ str(g) +"_" +game_str+".csv")

        resp.append(df_game)
    except Exception as e:
        print(e)
        if tent > 3:
            # Too many failures for this game: record it and move on.
            erro.append(game_pages[g])
            try:
                driver.refresh()
            except Exception as refresh_error:
                print(refresh_error)
            tent = 0
            g += 1
        else:
            # Restart the browser, wait, and retry the same game
            # (g is left unchanged on purpose).
            driver.quit()
            driver = webdriver.Ie("C:/Users/rafae/Downloads/IEDriverServer_x64_3.14.0/IEDriverServer.exe")
            driver.implicitly_wait(50)
            driver.set_page_load_timeout(100)
            time.sleep(60)
            tent += 1
    else:
        # Success: advance to the next game with a fresh attempt counter.
        tent = 0
        g += 1