In [8]:
from numpy.core.fromnumeric import mean
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import re
import json
import time
import http
import urllib
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError
from datetime import datetime
from itertools import chain, starmap
import pandas as pd
from pathlib import Path
season = ''
dates = ''
week = ''
groups = ''
season_type = ''
path_to_json = '/'

def __init__(season = 0, dates = '', week = '', season_type = '', groups='', path_to_json = '/'):
    season = int(season)
    dates = int(dates)
    week = week
    season_type = season_type
    groups = groups
    path_to_json = path_to_json

def download(url, num_retries=8):
    try:
        html = urllib.request.urlopen(url).read()
    except (URLError, HTTPError, ContentTooShortError, http.client.HTTPException, http.client.IncompleteRead) as e:
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
        if num_retries > 0:
            if e == http.client.IncompleteRead:
                return download(url, num_retries - 1)
    except (TypeError) as e:
        html = urllib.request.urlopen(url).read()
    return html

def key_check(obj, key, replacement = np.array([])):
    if key in obj.keys():
        obj_key = obj[key]
    else:
        obj_key = replacement
    return obj_key
def parse_event(event):
    bad_keys = ['linescores', 'statistics', 'leaders',  'records']
    for k in bad_keys:
        if k in event['competitions'][0]['competitors'][0].keys():
            del event['competitions'][0]['competitors'][0][k]
        if k in event['competitions'][0]['competitors'][1].keys():
            del event['competitions'][0]['competitors'][1][k]
    if 'links' in event['competitions'][0]['competitors'][0]['team'].keys():
        del event['competitions'][0]['competitors'][0]['team']['links']
    if 'links' in event['competitions'][0]['competitors'][1]['team'].keys():
        del event['competitions'][0]['competitors'][1]['team']['links']
    if event['competitions'][0]['competitors'][0]['homeAway']=='home':
        event['competitions'][0]['home'] = event['competitions'][0]['competitors'][0]['team']
    else: 
        event['competitions'][0]['away'] = event['competitions'][0]['competitors'][0]['team']
    if event['competitions'][0]['competitors'][1]['homeAway']=='away':
        event['competitions'][0]['away'] = event['competitions'][0]['competitors'][1]['team']
    else: 
        event['competitions'][0]['home'] = event['competitions'][0]['competitors'][1]['team']

    if event['competitions'][0]['competitors'][0]['homeAway']=='home':
        event['competitions'][0]['home_score'] = event['competitions'][0]['competitors'][0]['score']
    else: 
        event['competitions'][0]['away_score'] = event['competitions'][0]['competitors'][0]['score']
    if event['competitions'][0]['competitors'][1]['homeAway']=='away':
        event['competitions'][0]['away_score'] = event['competitions'][0]['competitors'][1]['score']
    else: 
        event['competitions'][0]['home_score'] = event['competitions'][0]['competitors'][1]['score']
    if 'curatedRank' in event['competitions'][0]['competitors'][0].keys():
        if event['competitions'][0]['competitors'][0]['homeAway']=='home':
            event['competitions'][0]['home_rank'] = event['competitions'][0]['competitors'][0]['curatedRank']['current']
        else: 
            event['competitions'][0]['away_rank'] = event['competitions'][0]['competitors'][0]['curatedRank']['current']
    if 'curatedRank' in event['competitions'][0]['competitors'][1].keys():
        if event['competitions'][0]['competitors'][1]['homeAway']=='away':
            event['competitions'][0]['away_rank'] = event['competitions'][0]['competitors'][1]['curatedRank']['current']
        else: 
            event['competitions'][0]['home_rank'] = event['competitions'][0]['competitors'][1]['curatedRank']['current']

    del_keys = ['competitions','broadcasts','geoBroadcasts', 'headlines']
    for k in del_keys:
        if k in event['competitions'][0].keys():
            del event['competitions'][0][k]
    return event

def cfb_schedule_date(groups, date, week, season_type):
    if groups is None:
        groups_param = '&groups=80'
    else:
        groups_param = '&groups=' + str(groups)
    if date is None:
        date_param = ''
    else:
        date_param = '&dates=' + str(date)
    if week is None:
        week_param = ''
    else:
        week_param = '&week=' + str(week)
    if season_type is None:
        season_type_param = ''
    else:
        season_type_param = '&seasontype=' + str(season_type)
    url = "http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100{}{}{}{}".format(date_param,date_param,week_param,season_type_param)
    print(url)
    try:
        resp_reg = download(url=url)
    except (TypeError) as e:
        resp_reg = download(url=url)
    events = json.loads(resp_reg)['events']
    ev = pd.DataFrame()
    for event in events:
        event = parse_event(event)
        ev = ev.append(pd.json_normalize(event['competitions'][0]))
    ev = json.loads(pd.DataFrame(ev).to_json(orient='records'))
    return ev

def cfb_schedule_year(groups, date,season_type):
    if groups is None:
        groups_param = '&groups=80'
    else:
        groups_param = '&groups=' + str(groups)
    if date is None:
        date_param = ''
    else:
        date_param = '&dates=' + str(date)
    if season_type is None:
        season_type_param = ''
    else:
        season_type_param = '&seasontype=' + str(season_type)
    
    url = "http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100{}{}{}".format(groups_param, date_param, season_type_param)
    
    schedule_table = pd.DataFrame()
    resp = download(url=url)
    
    txt = pd.json_normalize(json.loads(resp)['leagues'][0]['calendar'][0], 
                        record_path=['entries'], 
                        meta=['label','value','startDate','endDate'],
                        meta_prefix='seasontype.')

    txt2 = pd.json_normalize(json.loads(resp)['leagues'][0]['calendar'][1], 
                            record_path=['entries'], 
                            meta=['label','value','startDate','endDate'],
                            meta_prefix='seasontype.')
    txt = txt.append(txt2)
    
    for x,y in zip(txt['seasontype.value'],txt['value']):
        ev = cfb_schedule_date(groups=groups,date=date,week=y,season_type=x)
        schedule_table = schedule_table.append(ev)
    
    return schedule_table
    

In [9]:
path_to_schedules = "cfb/schedules"
final_file_name = "cfb_schedule_2002_2021.csv"

years_arr = range(2004,2022)
schedule_table = pd.DataFrame()
for year in years_arr:
# for val in [2,3]:
    try:
        year_schedule = cfb_schedule_year(groups='80',date=year,season_type=2)
    except (TypeError) as e:
        year_schedule = cfb_schedule_year(groups='80',date=year,season_type=2)
    #         print(year_schedule)
    #         schedule_table.append(year_schedule)
    year_schedule['game_id'] = year_schedule['id']

    year_schedule.to_csv(f"{path_to_schedules}/cfb_schedule_{year}.csv", index=False)


http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=1&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=2&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=3&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=4&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=5&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=6&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2004&dates=2004&week=7&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-

http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2007&dates=2007&week=12&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2007&dates=2007&week=13&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2007&dates=2007&week=14&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2007&dates=2007&week=15&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2007&dates=2007&week=16&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2007&dates=2007&week=1&seasontype=3
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2008&dates=2008&week=1&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/col

http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=7&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=8&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=9&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=10&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=11&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=12&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2011&dates=2011&week=13&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/coll

http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=2&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=3&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=4&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=5&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=6&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=7&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2015&dates=2015&week=8&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-

http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2018&dates=2018&week=15&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2018&dates=2018&week=1&seasontype=3
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2019&dates=2019&week=1&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2019&dates=2019&week=2&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2019&dates=2019&week=3&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2019&dates=2019&week=4&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college-football/scoreboard?limit=100&dates=2019&dates=2019&week=5&seasontype=2
http://site.api.espn.com/apis/site/v2/sports/football/college

In [10]:
path_to_raw = "cfb/schedules"
final_file_name = "cfb_schedule_2002_2021.csv"
csv_files = [pos_csv.replace('.csv', '') for pos_csv in os.listdir(path_to_schedules) if pos_csv.endswith('.csv')]
glued_data = pd.DataFrame()
for index, js in enumerate(csv_files):
    x = pd.read_csv(f"{path_to_schedules}/{js}.csv", low_memory=False)
    glued_data = pd.concat([glued_data,x],axis=0)
glued_data['game_id'] = glued_data['id']

glued_data.to_csv(final_file_name)