In [253]:
import os
import requests
from datetime import datetime

from tqdm import tqdm_notebook as tqdm

import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np

In [229]:
class DownloadCollegeFootballData(object):
    """
    Download game results and per-game team statistics from the
    College Football Data API.

    Class attributes:
        base_url:  root URL of the API (ends with a slash).
        years:     seasons to download, 2000 up to (but excluding) the
                   current calendar year; computed once at class definition.
        endpoints: path (relative to ``base_url``) for each resource.
        timeout:   seconds to wait on each HTTP request before giving up.
    """
    base_url = 'https://api.collegefootballdata.com/'

    years = list(range(2000, datetime.now().year))

    endpoints = {
        'games':  'games/',
        'stats': 'games/teams'
    }

    # Without a timeout a single unresponsive connection blocks the whole run.
    timeout = 30

    # Fields retained from the games endpoint (before 'id' -> 'game_id' rename).
    _game_fields = [
        'id', 'season', 'week', 'season_type',
        'home_team', 'away_team', 'home_points', 'away_points'
    ]

    def download_games(self) -> pd.DataFrame:
        """
        Download game information (no stats) for every season in ``self.years``.

        Returns:
            One row per game with the columns in ``_game_fields``, where
            ``id`` is renamed to ``game_id``. Seasons for which the API
            returns no records are skipped; if nothing is returned at all
            the frame is empty but still carries the expected columns.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        url = self.base_url + self.endpoints['games']
        frames = []
        for year in self.years:
            print(year)
            r = requests.get(url, params={'year': year}, timeout=self.timeout)
            # Fail loudly here instead of with an obscure KeyError further down
            r.raise_for_status()

            records = r.json()
            if not records:
                # An empty frame has none of the expected columns to select
                continue

            df_year = pd.DataFrame.from_records(records)

            # Only retain useful fields, rename as necessary
            frames.append(
                df_year[self._game_fields].rename(columns={'id': 'game_id'})
            )

        if not frames:
            # pd.concat raises on an empty list
            return pd.DataFrame(columns=self._game_fields).rename(columns={'id': 'game_id'})

        return pd.concat(frames).reset_index(drop=True)

    @staticmethod
    def _pivot_game_stats(payload, game_id) -> pd.DataFrame:
        """Turn one game's stats payload into a single wide row tagged with ``game_id``."""
        frames = []
        for team in payload[0]['teams']:
            df_team = pd.DataFrame.from_records(team['stats'])
            # Prefix every category with 'home_' / 'away_' so both teams fit in one row
            df_team['category'] = team['homeAway'] + '_' + df_team['category']
            frames.append(df_team)
        df_long = pd.concat(frames)
        df_wide = pd.pivot_table(
            df_long, columns='category', values='stat', aggfunc='first'
        ).reset_index(drop=True)
        return df_wide.assign(game_id=game_id)

    def download_stats(self, df_games: pd.DataFrame = None) -> tuple:
        """
        Download team stats for each game.

        Args:
            df_games: frame with a ``game_id`` column. When omitted, the
                games are fetched first via ``download_games``.

        Returns:
            A ``(df_stats, not_working)`` tuple. ``df_stats`` has one row per
            game whose stats could be parsed (one column per
            ``<home|away>_<category>`` plus ``game_id``). ``not_working`` maps
            every other game id to the reason it was skipped: the exception
            text for a failed request, ``'HTTP <code>'`` for an error status,
            ``'No Response'`` for an empty payload, or the payload itself
            when it could not be parsed.
        """
        if df_games is None:
            df_games = self.download_games()
        assert 'game_id' in df_games.columns, "df_games must contain a 'game_id' column"
        print('stats')

        url = self.base_url + self.endpoints['stats']
        df_stats, not_working = [], {}
        for game_id in set(df_games['game_id']):
            try:
                r = requests.get(url, params={'gameId': game_id}, timeout=self.timeout)
            except requests.RequestException as exc:
                # One dropped connection must not abort the whole download
                not_working[game_id] = str(exc)
                continue

            if r.status_code != 200:
                not_working[game_id] = 'HTTP {}'.format(r.status_code)
                continue

            try:
                payload = r.json()
            except ValueError:
                # Body was not valid JSON; keep the raw text for inspection
                not_working[game_id] = r.text
                continue

            if len(payload) == 0:
                not_working[game_id] = 'No Response'
                continue

            try:
                df_stats.append(self._pivot_game_stats(payload, game_id))
            except Exception:
                # Unexpected payload shape: keep it for later inspection.
                # (Exception, not a bare except, so Ctrl-C still interrupts.)
                not_working[game_id] = payload

        if df_stats:
            df_stats = pd.concat(df_stats).reset_index(drop=True)
        else:
            # pd.concat raises on an empty list
            df_stats = pd.DataFrame(columns=['game_id'])

        return df_stats, not_working

In [230]:
# Create the downloader; nothing is requested until a download_* method is called.
downloader = DownloadCollegeFootballData()

In [243]:
# Probe the team-stats endpoint for a single game id.
# The timeout keeps the cell from hanging if the host never answers.
params = {'gameId': 222430130}
r = requests.get('https://api.collegefootballdata.com/games/teams', params=params, timeout=30)

In [244]:
# Show the decoded JSON body of the probe response.
r.json()

[]

In [250]:
import os
# Folder with the cached CSV exports, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data')

# Load the previously downloaded per-game stats and show (rows, columns).
stats_csv_path = os.path.join(DATA_DIR, 'df_stats.csv')
df_stats = pd.read_csv(stats_csv_path)
df_stats.shape

(12593, 71)

In [254]:
# Fraction of missing values in each column of df_stats.
df_stats.isna().mean()

away_completionAttempts     0.002065
away_defensiveTDs           0.736282
away_firstDowns             0.002065
away_fourthDownEff          0.002065
away_fumblesLost            0.002065
away_fumblesRecovered       0.000556
away_interceptionTDs        0.446200
away_interceptionYards      0.446200
away_interceptions          0.002065
away_kickReturnTDs          0.314460
away_kickReturnYards        0.314460
away_kickReturns            0.314460
away_kickingPoints          0.037878
away_netPassingYards        0.002065
away_passesDeflected        0.736282
away_passesIntercepted      0.446200
away_passingTDs             0.005162
away_possessionTime         0.005082
away_puntReturnTDs          0.232987
away_puntReturnYards        0.232987
away_puntReturns            0.232987
away_qbHurries              0.736282
away_rushingAttempts        0.002065
away_rushingTDs             0.002382
away_rushingYards           0.002065
away_sacks                  0.736282
away_tackles                0.736282
a

In [249]:
# Stat names without the home_/away_ prefix.
# NOTE(review): this looks like the subset of stats meant to be kept, but the
# list is only displayed here and never assigned to a name — confirm intent.
[
    'yardsPerRushAttempt', 
    'yardsPerPass', 
    'turnovers', 
    'totalYards', 
    'totalPenaltiesYards', 
    'thirdDownEff',
    'rushingYards',
    'rushingTDs',
    'rushingAttempts',
    'possessionTime',
    'passingTDs',
    'netPassingYards',
    'interceptions',
    'fumblesRecovered',
    'fumblesLost',
    'fourthDownEff',
    'firstDowns',
    'completionAttempts',
    'kickingPoints'
    ]

Unnamed: 0,away_completionAttempts,away_defensiveTDs,away_firstDowns,away_fourthDownEff,away_fumblesLost,away_fumblesRecovered,away_interceptionTDs,away_interceptionYards,away_interceptions,away_kickReturnTDs,away_kickReturnYards,away_kickReturns,away_kickingPoints,away_netPassingYards,away_passesDeflected,away_passesIntercepted,away_passingTDs,away_possessionTime,away_puntReturnTDs,away_puntReturnYards,away_puntReturns,away_qbHurries,away_rushingAttempts,away_rushingTDs,away_rushingYards,away_sacks,away_tackles,away_tacklesForLoss,away_thirdDownEff,away_totalFumbles,away_totalPenaltiesYards,away_totalYards,away_turnovers,away_yardsPerPass,away_yardsPerRushAttempt,game_id,home_completionAttempts,home_defensiveTDs,home_firstDowns,home_fourthDownEff,home_fumblesLost,home_fumblesRecovered,home_interceptionTDs,home_interceptionYards,home_interceptions,home_kickReturnTDs,home_kickReturnYards,home_kickReturns,home_kickingPoints,home_netPassingYards,home_passesDeflected,home_passesIntercepted,home_passingTDs,home_possessionTime,home_puntReturnTDs,home_puntReturnYards,home_puntReturns,home_qbHurries,home_rushingAttempts,home_rushingTDs,home_rushingYards,home_sacks,home_tackles,home_tacklesForLoss,home_thirdDownEff,home_totalFumbles,home_totalPenaltiesYards,home_totalYards,home_turnovers,home_yardsPerPass,home_yardsPerRushAttempt
0,11-19,,18.0,0-0,0.0,1.0,0.0,8.0,0.0,0.0,108.0,4.0,8.0,122.0,,1.0,2.0,30:00,0.0,1.0,1.0,,41.0,1.0,199.0,,,,4-11,,5-40,321.0,0.0,6.4,4.9,322732032,24-41,,23.0,1-3,1.0,0.0,,,1.0,0.0,66.0,5.0,7.0,208.0,,,1.0,29:06,0.0,2.0,1.0,,39.0,0.0,127.0,,,,7-18,,8-67,335.0,2.0,5.1,3.3
1,12-32,,7.0,1-3,3.0,2.0,0.0,1.0,2.0,0.0,165.0,11.0,0.0,93.0,,1.0,0.0,20:30,0.0,1.0,1.0,,36.0,0.0,74.0,,,,0-18,,7-35,140.0,5.0,2.1,2.1,302612483,9-20,,24.0,2-3,2.0,3.0,0.0,13.0,1.0,0.0,25.0,1.0,15.0,140.0,,2.0,4.0,16:10,0.0,68.0,7.0,,63.0,5.0,528.0,,,,8-16,,7-50,668.0,3.0,7.0,8.4
2,23-30,,26.0,1-1,2.0,4.0,0.0,0.0,1.0,,,,6.0,249.0,,1.0,2.0,23:34,1.0,94.0,5.0,,54.0,3.0,186.0,,,,5-15,,0-0,421.0,3.0,7.8,3.4,243040265,17-40,,11.0,1-1,4.0,2.0,1.0,28.0,1.0,,,,0.0,195.0,,1.0,1.0,13:05,0.0,19.0,3.0,,25.0,0.0,-9.0,,,,5-18,,0-0,156.0,5.0,4.1,-0.4
3,7-14,,9.0,0-0,1.0,0.0,0.0,27.0,1.0,,,,,78.0,,1.0,0.0,16:11,0.0,6.0,3.0,,35.0,0.0,49.0,,,,3-13,,0-0,104.0,2.0,3.9,1.4,243040276,19-35,,21.0,2-3,0.0,1.0,1.0,38.0,1.0,,,,8.0,167.0,,1.0,1.0,20:12,0.0,31.0,4.0,,42.0,0.0,176.0,,,,5-16,,0-0,331.0,1.0,4.4,4.2
4,16-29,,17.0,1-3,2.0,0.0,,,2.0,,,,,261.0,,,0.0,18:23,0.0,8.0,2.0,,39.0,0.0,121.0,,,,8-16,,0-0,356.0,4.0,8.1,3.1,243040278,15-23,,23.0,0-0,0.0,2.0,0.0,5.0,0.0,,,,6.0,203.0,,1.0,3.0,17:25,0.0,0.0,1.0,,41.0,3.0,269.0,,,,7-11,,0-0,465.0,0.0,8.5,6.6


In [245]:
# Load the table of games whose stats download failed and show (rows, columns).
fails_csv_path = os.path.join(DATA_DIR, 'df_failed_stats.csv')
df_fails = pd.read_csv(fails_csv_path)
df_fails.shape

(2902, 2)

In [246]:
# Preview the first rows of the failed-stats table.
df_fails.head()

Unnamed: 0,error,game_id
0,No Response,213352483
1,No Response,213352509
2,No Response,223412390
3,No Response,223412426
4,No Response,213352653


In [247]:
# Distinct values found in the 'error' column.
set(df_fails['error'])

{"HTTPSConnectionPool(host='api.collegefootballdata.com', port=443): Max retries exceeded with url: /games/teams?gameId=302890059 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000023E9A8A0400>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))",
 "HTTPSConnectionPool(host='api.collegefootballdata.com', port=443): Max retries exceeded with url: /games/teams?gameId=400547963 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000023E9D40F4E0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))",
 'No Response'}

In [255]:
# Fetch the 2005 poll rankings. `r` ends up holding the decoded JSON payload;
# the Response gets its own name so `r` is not bound to two different kinds
# of object within one cell.
response = requests.get(
    'https://api.collegefootballdata.com/rankings',
    params={'year': 2005},
    timeout=30,  # do not hang forever if the host never answers
)
response.raise_for_status()  # surface HTTP errors before trying to decode the body
r = response.json()

In [265]:
# Inspect the first poll of the first record in the rankings payload.
r[0]['polls'][0]

{'poll': 'Coaches Poll',
 'ranks': [{'rank': 22,
   'school': 'Michigan State',
   'conference': 'Big Ten',
   'firstPlaceVotes': 0,
   'points': 190},
  {'rank': 17,
   'school': 'Wisconsin',
   'conference': 'Big Ten',
   'firstPlaceVotes': 0,
   'points': 553},
  {'rank': 13,
   'school': 'Ohio State',
   'conference': 'Big Ten',
   'firstPlaceVotes': 0,
   'points': 742},
  {'rank': 25,
   'school': 'Minnesota',
   'conference': 'Big Ten',
   'firstPlaceVotes': 0,
   'points': 90},
  {'rank': 1,
   'school': 'USC',
   'conference': 'Pac-10',
   'firstPlaceVotes': 54,
   'points': 1540},
  {'rank': 9,
   'school': 'UCLA',
   'conference': 'Pac-10',
   'firstPlaceVotes': 0,
   'points': 1053},
  {'rank': 23,
   'school': 'California',
   'conference': 'Pac-10',
   'firstPlaceVotes': 0,
   'points': 175},
  {'rank': 16,
   'school': 'Oregon',
   'conference': 'Pac-10',
   'firstPlaceVotes': 0,
   'points': 588},
  {'rank': 20,
   'school': 'West Virginia',
   'conference': 'Big East',

In [263]:
# Tabulate the ranks of the first poll in the first rankings record.
first_poll = r[0]['polls'][0]
df_ranks = pd.DataFrame.from_records(first_poll['ranks'])
df_ranks.head()

Unnamed: 0,rank,school,conference,firstPlaceVotes,points
0,22,Michigan State,Big Ten,0,190
1,17,Wisconsin,Big Ten,0,553
2,13,Ohio State,Big Ten,0,742
3,25,Minnesota,Big Ten,0,90
4,1,USC,Pac-10,54,1540


In [264]:
# Collect one frame of ranks per (week, poll), each tagged with its poll name,
# week and season. `r` already holds the decoded rankings JSON (a list of
# weekly records), so it is iterated directly; calling r.json() on it fails.
df_all_ranks = []
for week_record in r:
    week = week_record['week']
    for poll in week_record['polls']:
        poll_name = poll['poll']
        df_ranks = pd.DataFrame.from_records(poll['ranks'])
        # Keep the frame; previously it was built and then discarded
        df_all_ranks.append(
            df_ranks.assign(poll=poll_name, week=week, year=week_record['season'])
        )
        

{'season': 2005,
 'seasonType': 'regular',
 'week': 8,
 'polls': [{'poll': 'Coaches Poll',
   'ranks': [{'rank': 22,
     'school': 'Michigan State',
     'conference': 'Big Ten',
     'firstPlaceVotes': 0,
     'points': 190},
    {'rank': 17,
     'school': 'Wisconsin',
     'conference': 'Big Ten',
     'firstPlaceVotes': 0,
     'points': 553},
    {'rank': 13,
     'school': 'Ohio State',
     'conference': 'Big Ten',
     'firstPlaceVotes': 0,
     'points': 742},
    {'rank': 25,
     'school': 'Minnesota',
     'conference': 'Big Ten',
     'firstPlaceVotes': 0,
     'points': 90},
    {'rank': 1,
     'school': 'USC',
     'conference': 'Pac-10',
     'firstPlaceVotes': 54,
     'points': 1540},
    {'rank': 9,
     'school': 'UCLA',
     'conference': 'Pac-10',
     'firstPlaceVotes': 0,
     'points': 1053},
    {'rank': 23,
     'school': 'California',
     'conference': 'Pac-10',
     'firstPlaceVotes': 0,
     'points': 175},
    {'rank': 16,
     'school': 'Oregon',
    

In [266]:
# Load the cached rankings export from DATA_DIR, like the other CSV loads
# in this notebook, rather than rebuilding the data path by hand.
df_rankings = pd.read_csv(os.path.join(DATA_DIR, 'df_rankings.csv'))
df_rankings.head()

Unnamed: 0,conference,firstPlaceVotes,points,rank,school,poll,week,year
0,ACC,,,17,Clemson,AP Top 25,1,2000
1,Big Ten,,,25,Michigan State,AP Top 25,1,2000
2,Big Ten,,,21,Illinois,AP Top 25,1,2000
3,Big Ten,,,4,Wisconsin,AP Top 25,1,2000
4,Big Ten,,,6,Michigan,AP Top 25,1,2000
