In [20]:
from numpy.core.fromnumeric import mean
from flask import jsonify
import pandas as pd
import numpy as np
import json
import re
import http
import xgboost as xgb
import time
import urllib
from urllib.error import URLError, HTTPError, ContentTooShortError
from datetime import datetime
from itertools import chain, starmap
%load_ext jupyternotify

# Season (year) processed by this run. Original note: years completed 2002-2020.
# NOTE(review): despite the `_arr` suffix this holds a single year; it is
# formatted into the schedule file name and compared against the `season` column.
years_arr = 2020
def download(url, num_retries=5):
    """Fetch ``url`` and return the raw response body, or ``None`` on failure.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        num_retries: How many more attempts may be made after a retryable
            failure. Each retry is preceded by a 10 second pause.

    Returns:
        The bytes read from the response, or ``None`` when the request failed
        with a non-retryable error or the retries were exhausted.

    Only two kinds of failure are retried: HTTP errors with a 5xx status code
    and truncated bodies (``http.client.IncompleteRead``). Every failure is
    reported on stdout with the offending URL.
    """
    # ``import urllib`` / ``import http`` alone do not guarantee that the
    # submodules used below have been loaded, so import them explicitly.
    import http.client
    import urllib.request

    try:
        # The context manager closes the connection even if ``read`` fails.
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (URLError, HTTPError, ContentTooShortError, http.client.HTTPException, http.client.IncompleteRead) as e:
        print('Download error:', url)
        server_error = hasattr(e, 'code') and 500 <= e.code < 600
        # ``e`` is an exception instance, so it must be tested with isinstance;
        # comparing it to the class with ``==`` is never true.
        truncated_body = isinstance(e, http.client.IncompleteRead)
        if num_retries > 0 and (server_error or truncated_body):
            time.sleep(10)
            return download(url, num_retries - 1)
        return None

def flatten_json_iterative(dictionary, sep = '.', ind_start = 0):
    """Return a flat copy of a nested, JSON-like dictionary.

    Nested keys are joined with ``sep``; list elements are keyed by their
    position, counted from ``ind_start``. Empty dicts and empty lists
    contribute no entries, so their keys disappear from the result. The input
    mapping itself is not modified.
    """

    def expand(prefix, node):
        """Peel exactly one nesting level off ``node`` as (key, value) pairs."""
        if isinstance(node, dict):
            return [(prefix + sep + name, child) for name, child in node.items()]
        if isinstance(node, list):
            return [
                (prefix + sep + str(ind_start + offset), child)
                for offset, child in enumerate(node)
            ]
        # Atomic value: passes through unchanged
        return [(prefix, node)]

    flat = dictionary
    while True:
        # One pass removes one level of nesting from every value
        flat = {
            key: value
            for outer_key, outer_value in flat.items()
            for key, value in expand(outer_key, outer_value)
        }
        # Done once no value is a dict or a list any more
        if not any(isinstance(value, (dict, list)) for value in flat.values()):
            return flat

def _wbb_team_fields(competitor):
    """Extract the identifying fields of one ``competitors`` entry from the game header."""
    team = competitor['team']
    location = str(team['location'])
    return {
        'id': int(team['id']),
        'name': location,
        'mascot': str(team['name']),
        'abbrev': str(team['abbreviation']),
        # Replaces everything from the first "Stat" onward with "St",
        # e.g. "Ohio State" -> "Ohio St"
        'name_alt': re.sub("Stat(.+)", "St", location),
    }


def _wbb_game_spread(pickcenter):
    """Return ``(gameSpread, homeFavorite)`` taken from the pickcenter payload.

    With more than one entry, the second entry is used when it has a
    ``spread`` key and the first entry otherwise. With zero or one entries the
    defaults ``(2.5, True)`` are returned.
    NOTE(review): a single-entry pickcenter is ignored in favour of the
    defaults; this mirrors the previous behaviour - confirm it is intended.
    """
    if len(pickcenter) > 1:
        line = pickcenter[1] if 'spread' in pickcenter[1] else pickcenter[0]
        return line['spread'], line['homeTeamOdds']['favorite']
    return 2.5, True


def wbb_pbp(gameId):
    """Pull one women's college basketball game by id and assemble its play-by-play.

    Data from API endpoints:
    * womens-college-basketball/playbyplay (cdn.espn.com)
    * womens-college-basketball/summary (site.api.espn.com), used for the
      ``pickcenter`` and ``winprobability`` entries

    Args:
        gameId: Game id, formatted into both request URLs.

    Returns:
        ``(pbp_json, season)`` where ``pbp_json`` is a dict with the keys
        ``gameId``, ``plays`` (list of per-play records), ``winprobability``,
        ``boxscore``, ``header``, ``homeTeamSpread``, ``broadcasts``,
        ``videos``, ``standings``, ``pickcenter``, ``espnWP``, ``gameInfo`` and
        ``season``; ``season`` is ``header['season']['year']``.
        ``None`` is returned when the play-by-play download fails.

    Raises:
        KeyError: if the payload lacks ``header`` or ``plays`` (or fields
            nested inside them that are read below).
        TypeError: if the summary download fails, because ``json.loads`` is
            then handed ``None``.
    """
    # play by play
    pbp_url = "http://cdn.espn.com/core/womens-college-basketball/playbyplay?gameId={}&xhr=1&render=false&userab=18".format(gameId)
    pbp_resp = download(url=pbp_url)
    if pbp_resp is None:
        return None

    pbp_d = json.loads(pbp_resp)
    pbp_txt = pbp_d['gamepackageJSON']
    pbp_txt['gameId'] = pbp_d['gameId']
    # Optional sections default to empty so a sparse payload still produces a
    # result instead of failing when the output dict is built.
    for optional_key in ('boxscore', 'broadcasts', 'standings', 'videos', 'winprobability'):
        pbp_txt.setdefault(optional_key, [])
    pbp_txt.pop('news', None)
    pbp_txt.pop('shop', None)

    # summary endpoint for the pickcenter array and ESPN's win probability
    summary_url = "http://site.api.espn.com/apis/site/v2/sports/basketball/womens-college-basketball/summary?event={}".format(gameId)
    summary_resp = download(url=summary_url)
    # A failed summary download surfaces as TypeError here (see Raises)
    summary = json.loads(summary_resp)
    pickcenter = summary.get('pickcenter', {})
    espn_wp = summary.get('winprobability', [])

    header = pbp_txt['header']
    game_info = header['competitions'][0]
    season = header['season']['year']

    # Home and Away identification variables
    # NOTE(review): assumes competitors[0] is the home team and competitors[1]
    # the away team - confirm against the API payload.
    home = _wbb_team_fields(game_info['competitors'][0])
    away = _wbb_team_fields(game_info['competitors'][1])

    # Flatten every play so each nested field becomes its own dotted column
    pbp_txt['plays_mod'] = [flatten_json_iterative(play) for play in pbp_txt['plays']]
    plays = pd.json_normalize(pbp_txt, 'plays_mod')

    # Spread definition
    game_spread, home_favorite = _wbb_game_spread(pickcenter)
    # Positive when the home team is the favorite, negative otherwise.
    # ``== True`` is kept on purpose so only a true-equal flag counts as favorite.
    home_team_spread = np.where(home_favorite == True, abs(game_spread), -1 * abs(game_spread))

    if len(plays) > 1:
        # Game-level constants repeated on every play row. The insertion order
        # below fixes the column order of the exported records.
        game_columns = {
            'season': season,
            'seasonType': header['season']['type'],
            'awayTeamId': away['id'],
            'awayTeamName': away['name'],
            'awayTeamMascot': away['mascot'],
            'awayTeamAbbrev': away['abbrev'],
            'awayTeamNameAlt': away['name_alt'],
            'homeTeamId': home['id'],
            'homeTeamName': home['name'],
            'homeTeamMascot': home['mascot'],
            'homeTeamAbbrev': home['abbrev'],
            'homeTeamNameAlt': home['name_alt'],
            'homeTeamSpread': home_team_spread,
            'gameSpread': game_spread,
            'homeFavorite': home_favorite,
        }
        for column, value in game_columns.items():
            plays[column] = value
        plays['period.number'] = plays['period.number'].apply(int)

        #----- Time ---------------
        clock_parts = plays['clock.displayValue'].str.split(pat=':')
        plays[['clock.minutes', 'clock.seconds']] = clock_parts.to_list()
        # Periods 1-2 are labelled half "1", anything later half "2"
        plays['half'] = np.where(plays['period.number'] <= 2, "1", "2")
        plays['lag_half'] = plays['half'].shift(1)
        plays['lead_half'] = plays['half'].shift(-1)

        plays['game_play_number'] = np.arange(len(plays)) + 1
        plays['text'] = plays['text'].astype(str)
        plays['id'] = plays['id'].apply(int)

    # The np.array(...).tolist() round trips turn any numpy containers or
    # scalars into plain Python objects so the result can be dumped with json.
    pbp_json = {
        "gameId": pbp_txt['gameId'],
        "plays" : plays.to_dict(orient='records'),
        "winprobability" : np.array(pbp_txt['winprobability']).tolist(),
        "boxscore" : pbp_txt['boxscore'],
        "header" : header,
        "homeTeamSpread" : np.array(home_team_spread).tolist(),
        "broadcasts" : np.array(pbp_txt['broadcasts']).tolist(),
        "videos" : np.array(pbp_txt['videos']).tolist(),
        "standings" : pbp_txt['standings'],
        "pickcenter" : np.array(pickcenter).tolist(),
        "espnWP" : np.array(espn_wp).tolist(),
        "gameInfo" : np.array(game_info).tolist(),
        "season" : np.array(season).tolist()
    }
    return pbp_json, pbp_json['season']
        
    
# Load the schedule file for the selected season, ordered newest first.
schedule = pd.read_csv(
    'wbb/schedules/wbb_games_info_{}.csv'.format(years_arr),
    encoding='latin-1',
)
schedule = schedule.sort_values(by=['season', 'date'], ascending = False)

# Ids of the rows for that season whose status is 'STATUS_FINAL',
# re-indexed from 0 so they can be addressed by position.
games = schedule[
    (schedule['season'] == years_arr)
    & (schedule['status.type.name'] == 'STATUS_FINAL')
].reset_index()['id']
print(f"Number of Games: {len(games)}, first gameId: {games[0]}")

# Smoke test: pull the first game and inspect the pieces of the result.
g, y = wbb_pbp(gameId = games[0])
# The dumped string is discarded; the call only fails if `g` is not JSON-serializable.
json.dumps(g,indent=4)
print(g.keys())
print(g['pickcenter'])
print(g['homeTeamSpread'])
print(pd.DataFrame(g['plays']))
# Last expression of the cell: the notebook displays the season.
y

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify
Number of Games: 5434, first gameId: 401211959
dict_keys(['gameId', 'plays', 'winprobability', 'boxscore', 'header', 'homeTeamSpread', 'broadcasts', 'videos', 'standings', 'pickcenter', 'espnWP', 'gameInfo', 'season'])
{}
2.5
     shootingPlay sequenceNumber period.displayValue  period.number  \
0            True      101906201         1st Quarter              1   
1           False      101906202         1st Quarter              1   
2           False      101908101         1st Quarter              1   
3           False      101908201         1st Quarter              1   
4            True      101908501         1st Quarter              1   
..            ...            ...                 ...            ...   
361          True      104999105         4th Quarter              4   
362         False      104999106         4th Quarter              4   
363         False      104999701         

2020

In [35]:
%%notify
# Positions 2888, 5077 and 5091 have no header (original note).
# `i` is the position in `games` to start from; it is advanced only after a
# game finishes, so after a failure it still points at the game that broke.
i = 5092
for game in games[i:]:
    print(f"Working on game {i+1} of {len(games)}, gameId: {games[i]}")
    # Ids shorter than 9 characters are skipped without a request
    if len(str(game)) >= 9:
        try:
            g, y = wbb_pbp(gameId=game)
        except (TypeError) as e:
            # wbb_pbp returns None when its download fails, which makes the
            # tuple unpacking raise TypeError; try the game one more time.
            print("TypeError: yo", e)
            g, y = wbb_pbp(gameId=game)
        # One JSON file per game, grouped by season directory
        fp = "wbb/{}/{}.json".format(y, game)
        with open(fp,'w') as f:
            json.dump(g, f, indent=4, sort_keys=False)
    i+=1

Working on game 5093 of 5434, gameId: 401178647
Working on game 5094 of 5434, gameId: 401175503
Working on game 5095 of 5434, gameId: 401174215
Working on game 5096 of 5434, gameId: 401171097
Working on game 5097 of 5434, gameId: 401171096
Working on game 5098 of 5434, gameId: 401169175
Working on game 5099 of 5434, gameId: 401177197
Working on game 5100 of 5434, gameId: 401179995
Working on game 5101 of 5434, gameId: 401180103
Working on game 5102 of 5434, gameId: 401180549
Working on game 5103 of 5434, gameId: 401174528
Working on game 5104 of 5434, gameId: 401179469
Working on game 5105 of 5434, gameId: 401177181
Working on game 5106 of 5434, gameId: 401179654
Working on game 5107 of 5434, gameId: 401177495
Working on game 5108 of 5434, gameId: 401173853
Working on game 5109 of 5434, gameId: 401171095
Working on game 5110 of 5434, gameId: 401169173
Working on game 5111 of 5434, gameId: 401177311
Working on game 5112 of 5434, gameId: 401166792
Working on game 5113 of 5434, gameId: 40

Working on game 5259 of 5434, gameId: 401175502
Working on game 5260 of 5434, gameId: 401173902
Working on game 5261 of 5434, gameId: 401176202
Working on game 5262 of 5434, gameId: 401174208
Working on game 5263 of 5434, gameId: 401178945
Working on game 5264 of 5434, gameId: 401176809
Working on game 5265 of 5434, gameId: 401166926
Working on game 5266 of 5434, gameId: 401178415
Working on game 5267 of 5434, gameId: 401176804
Working on game 5268 of 5434, gameId: 401173990
Working on game 5269 of 5434, gameId: 401173926
Working on game 5270 of 5434, gameId: 401177270
Working on game 5271 of 5434, gameId: 401176805
Working on game 5272 of 5434, gameId: 401177263
Working on game 5273 of 5434, gameId: 401179684
Working on game 5274 of 5434, gameId: 401178472
Working on game 5275 of 5434, gameId: 401178061
Working on game 5276 of 5434, gameId: 401178060
Working on game 5277 of 5434, gameId: 401176806
Working on game 5278 of 5434, gameId: 401176276
Working on game 5279 of 5434, gameId: 40

Working on game 5423 of 5434, gameId: 401176803
Working on game 5424 of 5434, gameId: 401178414
Working on game 5425 of 5434, gameId: 401169156
Working on game 5426 of 5434, gameId: 401180373
Working on game 5427 of 5434, gameId: 401180007
Working on game 5428 of 5434, gameId: 401178942
Working on game 5429 of 5434, gameId: 401178497
Working on game 5430 of 5434, gameId: 401177653
Working on game 5431 of 5434, gameId: 401177255
Working on game 5432 of 5434, gameId: 401175523
Working on game 5433 of 5434, gameId: 401177277
Working on game 5434 of 5434, gameId: 401178053


<IPython.core.display.Javascript object>