In [7]:
from numpy.core.fromnumeric import mean
import pandas as pd
import numpy as np
import json
import re
import urllib
import http
from urllib.error import URLError, HTTPError, ContentTooShortError
from datetime import datetime
from flask import jsonify
import xgboost as xgb
import time
# Seasons selected for this run (note from the author: years completed 2002-2020).
years_arr = [2019]

# Play `type.text` labels that cfb_pbp treats as kickoffs when it decides
# which team has possession at the start of a play.
kickoff_vec = [
    "Kickoff", "Kickoff Return (Offense)",
    "Kickoff Return Touchdown", "Kickoff Touchdown",
    "Kickoff Team Fumble Recovery", "Kickoff Team Fumble Recovery Touchdown",
    "Kickoff (Safety)", "Penalty (Kickoff)",
]

# Module-level placeholder; cfb_pbp receives its own `gameId` argument.
gameId = 0
def download(url, num_retries=5, delay=1, retry_delay=10):
    """Fetch ``url`` and return the raw response body.

    The function sleeps ``delay`` seconds before every request. If the
    request fails with a 5xx HTTP status it waits ``retry_delay`` seconds and
    tries again, at most ``num_retries`` more times. Every other failure is
    printed and not retried.

    Args:
        url: Address to fetch.
        num_retries: Remaining retries allowed for 5xx responses.
        delay: Seconds to sleep before each request.
        retry_delay: Seconds to sleep before retrying after a 5xx response.

    Returns:
        The response body as ``bytes``, or ``None`` when the download failed.
    """
    # A bare ``import urllib`` / ``import http`` does not load these
    # submodules, so import them explicitly instead of relying on another
    # package having done it.
    import http.client
    import urllib.request

    try:
        time.sleep(delay)
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (URLError, HTTPError, ContentTooShortError, http.client.HTTPException) as e:
        # http.client.HTTPException carries no ``reason`` attribute; fall
        # back to the exception itself so the handler cannot fail.
        print('Download error:', getattr(e, 'reason', e), url)
        status = getattr(e, 'code', None)
        if num_retries > 0 and status is not None and 500 <= status < 600:
            time.sleep(retry_delay)
            # recursively retry 5xx HTTP errors
            return download(url, num_retries - 1, delay, retry_delay)
    return None

def _fetch_json(url, label, gameId):
    """Download ``url`` and decode it as JSON, failing loudly if nothing came back."""
    body = download(url=url)
    if body is None:
        raise ValueError("No {} response for gameId {}: {}".format(label, gameId, url))
    return json.loads(body)


def _team_info(competitor):
    """Collect the id and the name variants of one competitor from the game header."""
    team = competitor['team']
    location = str(team['location'])
    return {
        'id': int(team['id']),
        'name': location,
        'mascot': str(team['name']),
        'abbrev': str(team['abbreviation']),
        # e.g. "... State" -> "... St"
        'alt': re.sub("Stat(.+)", "St", location),
    }


def _spread_from_pickcenter(pickcenter):
    """Return ``(gameSpread, homeFavorite)``, defaulting to ``(2.5, True)``."""
    # NOTE(review): a pickcenter list with exactly one entry falls through to
    # the default, as in the original code - confirm this is intended.
    if len(pickcenter) > 1:
        provider = pickcenter[1] if 'spread' in pickcenter[1].keys() else pickcenter[0]
        return provider['spread'], provider['homeTeamOdds']['favorite']
    return 2.5, True


def _flatten_drive_plays(drive_data):
    """Unnest drive records into one row per play, drive fields prefixed 'drive.'."""
    drive_meta = [
        'id', 'displayResult', 'isScore',
        ['team', 'shortDisplayName'],
        ['team', 'displayName'],
        ['team', 'name'],
        ['team', 'abbreviation'],
        'yards', 'offensivePlays', 'result',
        'description',
        'shortDisplayResult',
        ['timeElapsed', 'displayValue'],
        ['start', 'period', 'number'],
        ['start', 'period', 'type'],
        ['start', 'yardLine'],
        ['start', 'clock', 'displayValue'],
        ['start', 'text'],
        ['end', 'period', 'number'],
        ['end', 'period', 'type'],
        ['end', 'yardLine'],
        ['end', 'clock', 'displayValue'],
    ]
    return pd.json_normalize(
        data=drive_data,
        record_path='plays',
        meta=drive_meta,
        meta_prefix='drive.',
        errors='ignore',
    )


def _add_game_columns(plays, season, home, away, homeTeamSpread, gameSpread, homeFavorite):
    """Broadcast the per-game constants (season, teams, spread) onto every play row."""
    plays['season'] = season['year']
    plays['seasonType'] = season['type']
    for field, suffix in (('id', 'TeamId'), ('name', 'TeamName'), ('mascot', 'TeamMascot'),
                          ('abbrev', 'TeamAbbrev'), ('alt', 'TeamNameAlt')):
        plays['home' + suffix] = home[field]
        plays['away' + suffix] = away[field]
    plays["homeTeamSpread"] = homeTeamSpread
    plays["gameSpread"] = gameSpread
    plays["homeFavorite"] = homeFavorite


def _add_clock_columns(plays):
    """Split the game clock and derive the half and start-of-play seconds-remaining columns."""
    plays[['clock.minutes', 'clock.seconds']] = (
        plays['clock.displayValue'].str.split(pat=':').to_list()
    )
    period = plays['period.number']
    plays['half'] = np.where(period <= 2, "1", "2")
    plays['lag_half'] = plays['half'].shift(1)
    plays['lead_half'] = plays['half'].shift(-1)
    clock_secs = 60 * plays['clock.minutes'].astype(int) + plays['clock.seconds'].astype(int)
    # Seconds left in the half: periods 1 and 3 still have a full 900-second
    # period after them.
    plays['start.TimeSecsRem'] = np.where(period.isin([1, 3]), 900 + clock_secs, clock_secs)
    # Seconds left in the game; any period other than 1-4 uses the bare clock.
    plays['start.adj_TimeSecsRem'] = np.select(
        [period == 1, period == 2, period == 3, period == 4],
        [2700 + clock_secs, 1800 + clock_secs, 900 + clock_secs, clock_secs],
        default=clock_secs,
    )


def _add_possession_columns(plays):
    """Derive possession / defense team ids, names and home flags at play start and end."""
    home_ids = plays['homeTeamId'].astype(int)
    away_ids = plays['awayTeamId'].astype(int)
    start_team = plays['start.team.id'].astype(int)
    is_kickoff = plays['type.text'].isin(kickoff_vec)
    # On kickoff-type plays the possession team is the opponent of the
    # listed start team.
    plays['start.pos_team.id'] = np.select(
        [is_kickoff & (start_team == home_ids), is_kickoff & (start_team == away_ids)],
        [away_ids, home_ids],
        default=start_team,
    )
    start_is_home = plays['start.pos_team.id'].astype(int) == home_ids
    plays['start.def_pos_team.id'] = np.where(start_is_home, away_ids, home_ids)
    end_is_home = plays["end.team.id"].astype(int) == home_ids
    plays["end.def_team.id"] = np.where(end_is_home, away_ids, home_ids)
    plays['end.pos_team.id'] = plays['end.team.id'].apply(int)
    plays['end.def_pos_team.id'] = plays['end.def_team.id'].apply(int)
    home_name = plays['homeTeamName']
    away_name = plays['awayTeamName']
    plays['start.pos_team.name'] = np.where(start_is_home, home_name, away_name)
    plays['start.def_pos_team.name'] = np.where(start_is_home, away_name, home_name)
    plays['end.pos_team.name'] = np.where(end_is_home, home_name, away_name)
    plays['end.def_pos_team.name'] = np.where(end_is_home, away_name, home_name)
    plays['start.is_home'] = np.where(start_is_home, True, False)
    plays['end.is_home'] = np.where(end_is_home, True, False)


def _mentions_any(text, names):
    """Return a boolean Series: does each text contain any of ``names`` (case-insensitive)?"""
    hit = pd.Series(False, index=text.index)
    for name in names:
        # regex=False: team names such as "Miami (OH)" contain regex
        # metacharacters and must be matched literally.
        hit = hit | text.str.contains(str(name), case=False, regex=False)
    return hit


def _timeout_play_ids(plays, called_col, first_half):
    """Return the ids of the plays flagged in ``called_col`` for one half."""
    period = plays['period.number']
    in_half = (period <= 2) if first_half else (period > 2)
    return plays.loc[plays[called_col] & in_half].reset_index()['id'].apply(int)


def _timeouts_used(row, first_half_ids, second_half_ids):
    """Count the timeouts in the row's half whose play id is <= the row's play id."""
    ids = first_half_ids if row['period.number'] <= 2 else second_half_ids
    return int((ids <= row['id']).sum())


def _add_timeout_columns(plays, home, away):
    """Flag timeout plays per team, add timeouts-remaining columns, return the timeout ids."""
    is_timeout = plays['type.text'] == 'Timeout'
    for side, team in (('home', home), ('away', away)):
        names = [team['abbrev'], team['name'], team['mascot'], team['alt']]
        plays[side + 'TimeoutCalled'] = np.where(
            is_timeout & _mentions_any(plays['text'], names), True, False
        )
    timeout_ids = {}
    for side, team in (('home', home), ('away', away)):
        timeout_ids[team['id']] = {
            "1": _timeout_play_ids(plays, side + 'TimeoutCalled', first_half=True),
            "2": _timeout_play_ids(plays, side + 'TimeoutCalled', first_half=False),
        }
    for side, team in (('home', home), ('away', away)):
        first_ids = timeout_ids[team['id']]["1"]
        second_ids = timeout_ids[team['id']]["2"]
        plays['end.' + side + 'TeamTimeouts'] = 3 - plays.apply(
            lambda row, a=first_ids, b=second_ids: _timeouts_used(row, a, b), axis=1
        )
    # Each team has 3 timeouts on the first play and on the first play of
    # the second half; otherwise the previous row's end value carries over.
    fresh_half = (plays['game_play_number'] == 1) | (
        (plays['half'] == "2") & (plays['lag_half'] == "1")
    )
    for side in ('home', 'away'):
        carried = plays['end.' + side + 'TeamTimeouts'].shift(1)
        plays['start.' + side + 'TeamTimeouts'] = np.where(fresh_half, 3, carried)
    for side in ('home', 'away'):
        column = 'start.' + side + 'TeamTimeouts'
        plays[column] = plays[column].apply(int)
    # Plain lists of ints so the result is JSON-serialisable.
    return {
        team_id: {half: np.array(ids).tolist() for half, ids in halves.items()}
        for team_id, halves in timeout_ids.items()
    }


def _add_end_clock_columns(plays):
    """Add end-of-play seconds-remaining columns from the previous row's start values."""
    # NOTE(review): "end" time is taken from the *previous* row's start time,
    # exactly as the original code did - confirm against the row ordering.
    first_play = plays['game_play_number'] == 1
    second_half_start = (plays['half'] == "2") & (plays['lag_half'] == "1")
    plays['end.TimeSecsRem'] = np.where(
        first_play | second_half_start, 1800, plays['start.TimeSecsRem'].shift(1)
    )
    plays['end.adj_TimeSecsRem'] = np.select(
        [first_play, second_half_start],
        [3600, 1800],
        default=plays['start.adj_TimeSecsRem'].shift(1),
    )


def _add_possession_timeout_columns(plays):
    """Map the home/away timeout counts onto the possession and defense teams."""
    home_ids = plays['homeTeamId']
    for moment in ('start', 'end'):
        home_left = plays[moment + '.homeTeamTimeouts']
        away_left = plays[moment + '.awayTeamTimeouts']
        plays[moment + '.posTeamTimeouts'] = np.where(
            plays[moment + '.pos_team.id'] == home_ids, home_left, away_left
        )
        plays[moment + '.defPosTeamTimeouts'] = np.where(
            plays[moment + '.def_pos_team.id'] == home_ids, home_left, away_left
        )


def _add_yard_columns(plays, homeTeamId):
    """Add start/end yard columns and patch the yards-to-endzone columns."""
    plays['start.yard'] = np.where(
        plays['start.team.id'] == homeTeamId,
        100 - plays['start.yardLine'],
        plays['start.yardLine'],
    )
    plays['start.yardsToEndzone'] = np.where(
        plays['start.yardLine'].notna(),
        plays['start.yardsToEndzone'],
        plays['start.yard'],
    )
    plays['start.yardsToEndzone'] = np.where(
        plays['start.yardsToEndzone'] == 0,
        plays['start.yard'],
        plays['start.yardsToEndzone'],
    )
    plays['end.yard'] = np.where(
        plays['end.team.id'] == homeTeamId,
        100 - plays['end.yardLine'],
        plays['end.yardLine'],
    )
    # A declined penalty leaves the ball where the play started.
    declined_penalty = (plays['type.text'] == "Penalty") & (
        plays["text"].str.contains("declined", case=False, flags=0, na=False, regex=True)
    )
    plays['end.yard'] = np.where(declined_penalty, plays['start.yard'], plays['end.yard'])
    plays['end.yardsToEndzone'] = np.where(
        plays['end.yardLine'].notna(),
        plays['end.yardsToEndzone'],
        plays['end.yard'],
    )
    plays['end.yardsToEndzone'] = np.where(
        declined_penalty, plays['start.yardsToEndzone'], plays['end.yardsToEndzone']
    )


def _relabel_kick_plays(plays):
    """Rewrite 'type.text' for field goals / extra points and snapshot 'playType'."""
    if 'scoringType.displayName' in plays.keys():
        scoring = plays['scoringType.displayName']
        plays['type.text'] = np.where(
            scoring == 'Field Goal', 'Field Goal Good', plays['type.text']
        )
        plays['type.text'] = np.where(
            scoring == 'Extra Point', 'Extra Point Good', plays['type.text']
        )
    # playType is captured before the text-based relabelling below.
    plays['playType'] = np.where(plays['type.text'].notna(), plays['type.text'], "Unknown")
    lowered = plays['text'].str.lower()
    # Applied in order; a later rule overrides an earlier one on the same row.
    relabel_rules = (
        ("extra point", "no good", 'Extra Point Missed'),
        ("extra point", "blocked", 'Extra Point Missed'),
        ("field goal", "blocked", 'Blocked Field Goal'),
        ("field goal", "no good", 'Field Goal Missed'),
    )
    for kick, outcome, label in relabel_rules:
        matches = lowered.str.contains(kick, regex=False) & lowered.str.contains(outcome, regex=False)
        plays['type.text'] = np.where(matches, label, plays['type.text'])


def cfb_pbp(gameId):
    """cfb_pbp()
    Pull the game by id
    Data from API endpoints:
    * college-football/playbyplay
    * college-football/summary

    Downloads the play-by-play package and the summary for ``gameId``,
    flattens the drives into one row per play and adds derived columns
    (clock, possession, timeouts, yard line, relabelled kick types).

    Args:
        gameId: ESPN game id, substituted into both endpoint URLs.

    Returns:
        A tuple ``(pbp_json, year)``: ``pbp_json`` is a dict with the keys
        gameId, drives, plays (list of row dicts), boxscore, header,
        standings, timeouts, scoringPlays, winprobability, homeTeamSpread,
        broadcasts, videos, pickcenter, espnWP, gameInfo and season;
        ``year`` is ``pbp_json['season']['year']``.

    Raises:
        ValueError: if either endpoint returned no body.
        KeyError: if the payload lacks a field this function requires
            (for example 'header' or 'boxscore').
    """
    # play by play
    pbp_url = "http://cdn.espn.com/core/college-football/playbyplay?gameId={}&xhr=1&render=false&userab=18".format(gameId)
    pbp_d = _fetch_json(pbp_url, "play-by-play", gameId)
    pbp_txt = pbp_d['gamepackageJSON']
    pbp_txt['gameId'] = pbp_d['gameId']
    pbp_txt['timeouts'] = {}

    # summary endpoint for the pickcenter array and ESPN's win probability
    summary_url = "http://site.api.espn.com/apis/site/v2/sports/football/college-football/summary?event={}".format(gameId)
    summary = _fetch_json(summary_url, "summary", gameId)
    # A summary without 'pickcenter' takes the default-spread path below.
    pbp_txt['pickcenter'] = summary.get('pickcenter', [])
    pbp_txt['espnWP'] = summary.get("winprobability", [])

    pbp_txt.pop('news', None)
    pbp_txt.pop('shop', None)
    header = pbp_txt['header']
    competition = header['competitions'][0]
    pbp_txt['gameInfo'] = competition
    pbp_txt['season'] = header['season']

    # Home and Away identification variables
    # NOTE(review): assumes competitors[0] is the home team and
    # competitors[1] the away team - confirm against the payload.
    home = _team_info(competition['competitors'][0])
    away = _team_info(competition['competitors'][1])

    # Spread definition: positive when the home team is the favorite.
    gameSpread, homeFavorite = _spread_from_pickcenter(pbp_txt['pickcenter'])
    homeTeamSpread = abs(gameSpread) if homeFavorite else -1 * abs(gameSpread)
    pbp_txt['homeTeamSpread'] = homeTeamSpread

    if 'drives' in pbp_txt:
        # negotiating the drive meta keys into columns after unnesting drive plays
        # concatenating the previous and current drives categories when necessary
        drives = pbp_txt['drives']
        plays = _flatten_drive_plays(drives['previous'])
        if len(drives.keys()) > 1:
            # NOTE(review): the current drive is placed *before* the previous
            # drives, as in the original code - confirm the intended order.
            plays = pd.concat(
                [_flatten_drive_plays(drives['current']), plays], ignore_index=True
            )
        # Round trip through records kept from the original; presumably it
        # lets pandas re-infer the dtypes of the meta columns - TODO confirm.
        plays = pd.DataFrame(plays.to_dict(orient='records'))

        _add_game_columns(plays, header['season'], home, away,
                          homeTeamSpread, gameSpread, homeFavorite)
        plays['period.number'] = plays['period.number'].apply(int)

        #----- Time ---------------
        _add_clock_columns(plays)

        # Pos Team - Start and End Id
        plays['game_play_number'] = np.arange(len(plays)) + 1
        plays['text'] = plays['text'].astype(str)
        plays['id'] = plays['id'].apply(int)
        # Series.ffill() replaces the deprecated fillna(method='ffill').
        plays["start.team.id"] = plays["start.team.id"].ffill().apply(int)
        if "end.team.id" not in plays.keys():
            plays['end.team.id'] = plays["start.team.id"]
        plays["end.team.id"] = (
            plays["end.team.id"].fillna(value=plays["start.team.id"]).apply(int)
        )
        _add_possession_columns(plays)

        #----- Figuring out Timeouts ---------
        pbp_txt['timeouts'] = _add_timeout_columns(plays, home, away)
        _add_end_clock_columns(plays)
        _add_possession_timeout_columns(plays)

        # NOTE(review): only the first row can evaluate to the home id here;
        # every other row receives the away id - confirm this is intended.
        plays['firstHalfKickoffTeamId'] = np.where(
            (plays['game_play_number'] == 1) &
            (plays['type.text'].isin(kickoff_vec)) &
            (plays['start.team.id'] == plays['homeTeamId']),
            plays['homeTeamId'],
            plays['awayTeamId'],
        )
        plays['period'] = plays['period.number']
        _add_yard_columns(plays, home['id'])
        _relabel_kick_plays(plays)
    else:
        pbp_txt['drives'] = {}
        # No drives means no play rows; an empty frame serialises to [].
        plays = pd.DataFrame()
    pbp_txt['plays'] = plays

    def as_list(key):
        # Missing keys become [] so the result stays JSON-serialisable.
        return np.array(pbp_txt.get(key, [])).tolist()

    pbp_json = {
        "gameId": pbp_txt['gameId'],
        "drives": pbp_txt['drives'],
        "plays": plays.to_dict(orient='records'),
        "boxscore": pbp_txt['boxscore'],
        "header": pbp_txt['header'],
        # Previously defaulted to a raw numpy array, which json.dump rejects.
        "standings": pbp_txt.get('standings', []),
        "timeouts": pbp_txt['timeouts'],
        "scoringPlays": as_list('scoringPlays'),
        "winprobability": as_list('winprobability'),
        "homeTeamSpread": as_list('homeTeamSpread'),
        "broadcasts": as_list('broadcasts'),
        "videos": as_list('videos'),
        "pickcenter": as_list('pickcenter'),
        "espnWP": as_list('espnWP'),
        "gameInfo": as_list('gameInfo'),
        "season": as_list('season'),
    }

    return pbp_json, pbp_json['season']['year']
    

# Load the schedule of games and order it with the latest season/week first.
schedule = pd.read_csv('cfb_games_info_2002_2020.csv', encoding='latin-1').sort_values(
    by=['season', 'week'], ascending=False
)

# Game ids of the selected seasons, re-indexed from 0.
games = schedule.loc[schedule['season'].isin(years_arr), 'game_id'].reset_index(drop=True)
print(f"Number of Games: {len(games)}, first gameId: {games[0]}")

# Smoke test on the first game: fetch it, make sure the result serialises
# to JSON, then show its top-level keys and the play columns.
g, y = cfb_pbp(gameId=games[0])
# for key in g.keys():
#     print(f"{key}: {type(g[key])}")
json.dumps(g, indent=4)
print(g.keys())
print(pd.DataFrame(g['plays']).columns)
y

Number of Games: 888, first gameId: 401132973
dict_keys(['gameId', 'drives', 'plays', 'boxscore', 'header', 'standings', 'timeouts', 'scoringPlays', 'winprobability', 'homeTeamSpread', 'broadcasts', 'videos', 'pickcenter', 'espnWP', 'gameInfo', 'season'])
Index(['homeScore', 'scoringPlay', 'priority', 'statYardage', 'awayScore',
       'wallclock', 'modified', 'id', 'text', 'period.number',
       ...
       'end.adj_TimeSecsRem', 'start.posTeamTimeouts',
       'start.defPosTeamTimeouts', 'end.posTeamTimeouts',
       'end.defPosTeamTimeouts', 'firstHalfKickoffTeamId', 'period',
       'start.yard', 'end.yard', 'playType'],
      dtype='object', length=106)


2019

In [None]:
import os

# Author's note: 322,428,457,619,643,746-769,771 of 2002
# (presumably positions in `games` noted during the 2002 run - TODO confirm).
i = 771  # position in `games` to start from
# enumerate() keeps `i` in step with `game`; its arguments are evaluated once,
# before the loop rebinds `i`.
for i, game in enumerate(games[i:], start=i):
    print(f"Working on game {i+1} of {len(games)}, gameId: {game}")
    # Ids shorter than 9 characters are skipped.
    if len(str(game)) < 9:
        continue
    g, y = cfb_pbp(gameId=game)
    fp = "cfb/{}/{}.json".format(y, game)
    # Create the per-season folder on first use so open() cannot fail on it.
    os.makedirs(os.path.dirname(fp), exist_ok=True)
    with open(fp, 'w') as f:
        json.dump(g, f, indent=4, sort_keys=False)
    time.sleep(2)