In [213]:
import os
import requests
from datetime import datetime

from tqdm import tqdm_notebook as tqdm

import pandas as pd
import numpy as np

In [229]:
class DownloadCollegeFootballData(object):
    """
    Object to download data from college football data api

    """
    base_url = 'https://api.collegefootballdata.com/'

    years = [int(2000 + year) for year in range(int(pd.Timestamp(datetime.now()).year - 2000))]

    endpoints = {
        'games':  'games/',
        'stats': 'games/teams'
    }

    def download_games(self) -> pd.DataFrame:
        """
        Download Game Information (no stats) for a year
        """
        df = []
        for year in (self.years):
            print(year)
            # Return data
            r = requests.get(self.base_url + self.endpoints['games'], params={'year': year})

            # Convert to dataframe
            df_year = pd.DataFrame.from_records(r.json())

            # Only retain useful fields, rename as necessary
            df_year = df_year[
                ['id', 'season', 'week', 'season_type', 'home_team', 'away_team', 'home_points', 'away_points']
            ].rename(columns={'id': 'game_id'})

            # Gather
            df.append(df_year)
        df = pd.concat(df).reset_index(drop=True)

        return df

    def download_stats(self, df_games: pd.DataFrame = None) -> pd.DataFrame:
        """
        Download stats for each game
        """
        if df_games is None:
            df_games = self.download_games()
        assert 'game_id' in df_games.columns
        print('stats')
        df_stats, not_working = [], {}
        for game_id in set(df_games['game_id']):
            params = {'gameId': game_id}
            r = requests.get(self.base_url + self.endpoints['stats'], params=params)
            if r.status_code != 200:
                continue
            if len(r.json()) == 0:
                continue
            try: 
                stats_by_team = r.json()[0]['teams']
                df_game = []
                for team in stats_by_team:
                    df_game_long = pd.DataFrame.from_records(team['stats'])
                    df_game_long['category'] = team['homeAway'] + '_' + df_game_long['category']
                    df_game.append(df_game_long)
                df_game = pd.concat(df_game)
                # Pivot
                df_game = pd.pivot_table(df_game, columns='category', values='stat', aggfunc='first').reset_index(drop=True)
                df_stats.append(df_game.assign(game_id=game_id))
            except:
                not_working[game_id] = r.json()
        df_stats = pd.concat(df_stats).reset_index(drop=True)
        
        return df_stats, not_working

In [230]:
downloader = DownloadCollegeFootballData()

In [243]:
params = {'gameId': 222430130}
r = requests.get('https://api.collegefootballdata.com/games/teams', params=params)

In [244]:
r.json()

[]

In [232]:
import os
DATA_DIR = os.path.join(os.getcwd(), 'data')

df_stats = pd.read_csv(os.path.join(DATA_DIR, 'df_stats.csv'))
df_stats.shape

(998, 71)

In [233]:
df_stats.head()

Unnamed: 0,away_completionAttempts,away_defensiveTDs,away_firstDowns,away_fourthDownEff,away_fumblesLost,away_fumblesRecovered,away_interceptionTDs,away_interceptionYards,away_interceptions,away_kickReturnTDs,away_kickReturnYards,away_kickReturns,away_kickingPoints,away_netPassingYards,away_passesDeflected,away_passesIntercepted,away_passingTDs,away_possessionTime,away_puntReturnTDs,away_puntReturnYards,away_puntReturns,away_qbHurries,away_rushingAttempts,away_rushingTDs,away_rushingYards,away_sacks,away_tackles,away_tacklesForLoss,away_thirdDownEff,away_totalFumbles,away_totalPenaltiesYards,away_totalYards,away_turnovers,away_yardsPerPass,away_yardsPerRushAttempt,game_id,home_completionAttempts,home_defensiveTDs,home_firstDowns,home_fourthDownEff,home_fumblesLost,home_fumblesRecovered,home_interceptionTDs,home_interceptionYards,home_interceptions,home_kickReturnTDs,home_kickReturnYards,home_kickReturns,home_kickingPoints,home_netPassingYards,home_passesDeflected,home_passesIntercepted,home_passingTDs,home_possessionTime,home_puntReturnTDs,home_puntReturnYards,home_puntReturns,home_qbHurries,home_rushingAttempts,home_rushingTDs,home_rushingYards,home_sacks,home_tackles,home_tacklesForLoss,home_thirdDownEff,home_totalFumbles,home_totalPenaltiesYards,home_totalYards,home_turnovers,home_yardsPerPass,home_yardsPerRushAttempt
0,12-18,,16.0,0-0,0.0,0,0.0,11.0,0.0,0.0,64.0,4.0,4.0,161.0,,1.0,2.0,20:03,0.0,0.0,3.0,,44.0,2.0,265.0,,,,9-15,,6-51,398.0,0.0,7.4,6.0,312672390,21-31,,18.0,0-1,0.0,0.0,,,1.0,0.0,117.0,5.0,6.0,272.0,,,2.0,16:37,0.0,17.0,2.0,,27.0,1.0,139.0,,,,4-10,,4-30,411.0,1.0,8.8,5.1
1,22-41,0.0,14.0,0-1,1.0,4,,,0.0,0.0,18.0,1.0,0.0,233.0,3.0,,3.0,22:14,,,,8.0,17.0,0.0,12.0,6.0,80.0,14.0,4-14,2.0,12-139,245.0,1.0,5.7,0.7,401117510,28-42,0.0,30.0,1-1,2.0,2.0,,,0.0,0.0,37.0,2.0,4.0,326.0,16.0,,2.0,37:46,0.0,27.0,3.0,4.0,39.0,3.0,145.0,3.0,52.0,14.0,7-14,6.0,7-61,471.0,2.0,7.8,3.7
2,19-26,0.0,14.0,0-0,0.0,1,0.0,59.0,1.0,,,,6.0,211.0,7.0,2.0,2.0,25:19,0.0,4.0,1.0,3.0,36.0,2.0,107.0,5.0,38.0,8.0,6-15,,7-70,318.0,1.0,8.1,3.0,401014980,15-31,0.0,15.0,0-1,1.0,0.0,0.0,2.0,2.0,0.0,97.0,4.0,4.0,127.0,2.0,1.0,0.0,34:41,0.0,24.0,4.0,1.0,36.0,1.0,102.0,1.0,29.0,6.0,7-18,2.0,10-74,229.0,3.0,4.1,2.8
3,12-27,0.0,26.0,0-1,0.0,0,,,1.0,0.0,116.0,5.0,6.0,149.0,1.0,,0.0,28:55,,,,2.0,41.0,3.0,286.0,2.0,27.0,7.0,5-12,1.0,15-143,435.0,1.0,5.5,7.0,401014990,11-18,0.0,25.0,0-0,0.0,0.0,0.0,-1.0,0.0,,,,6.0,167.0,3.0,1.0,2.0,31:05,,,,5.0,43.0,5.0,296.0,2.0,36.0,6.0,7-10,1.0,5-55,463.0,0.0,9.3,6.9
4,25-35,0.0,29.0,1-1,0.0,0,,,0.0,0.0,59.0,3.0,4.0,252.0,7.0,,2.0,31:24,,,,6.0,40.0,2.0,192.0,1.0,26.0,4.0,9-15,2.0,11-102,444.0,0.0,7.2,4.8,401015000,27-45,0.0,24.0,1-1,0.0,0.0,,,0.0,0.0,19.0,4.0,8.0,285.0,3.0,,1.0,28:36,,,,3.0,29.0,1.0,120.0,1.0,38.0,3.0,6-13,1.0,10-120,405.0,0.0,6.3,4.1


In [235]:
df_fails = pd.read_csv(os.path.join(DATA_DIR, 'df_failed_stats.csv'))
df_fails.shape

(220, 2)

In [238]:
df_fails.head()

Unnamed: 0,error,game_id
0,list index out of range,223412390
1,list index out of range,222430120
2,list index out of range,222430130
3,list index out of range,222430150
4,list index out of range,232490030
