### Import packages, setup API config

In [2]:
#Main Imports
import os
import sqlite3

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import numpy as np
import pandas as pd
import requests
# Switch matplotlib to its 'ggplot' style sheet for figures created after this call.
plt.style.use('ggplot')

#### CFBD API Base Data

In [74]:
# Base URL that every CFBD endpoint path is appended to.
baseurl = "https://api.collegefootballdata.com/"

# The API token is read from the CFBD_API_KEY environment variable so the secret
# never lives in the notebook (or in its version history). Export it before
# starting the kernel.
# NOTE(review): a token was previously hardcoded in this cell; treat it as
# leaked and revoke/rotate it.
_cfbd_api_key = os.environ.get("CFBD_API_KEY", "")
if not _cfbd_api_key:
    print("Warning: CFBD_API_KEY is not set; requests will be sent without a valid token.")

# Authorization header sent with every CFBD request.
headers = {
    "Authorization" : f"BEARER {_cfbd_api_key}"
}

#### Get Game Data

In [75]:
# Download the games of every season in `years` from the CFBD `games` endpoint.
# games_data collects one entry per successfully fetched season; each entry is
# that season's decoded JSON payload (it is flattened in a later cell).
games_data = []
years = list(range(2005,2025))
for year in years:
    request_url = baseurl + 'games'
    try:
        # `params` lets requests build and encode the query string, and the
        # timeout keeps a stalled connection from hanging the kernel forever.
        response = requests.get(request_url, headers=headers, params={'year': year}, timeout=30)
    except requests.RequestException as exc:
        # Report the failed season and move on, the same way non-200 responses are handled.
        print(f'Failed to retrieve data for {year}: {exc}')
        continue
    if response.status_code == 200:
        games_data.append(response.json())
        print(f'Successfully added games for {year}')
    else:
        print(f'Failed to retrieve data for {year}: {response.status_code}')

Successfully added games for 2005
Successfully added games for 2006
Successfully added games for 2007
Successfully added games for 2008
Successfully added games for 2009
Successfully added games for 2010
Successfully added games for 2011
Successfully added games for 2012
Successfully added games for 2013
Successfully added games for 2014
Successfully added games for 2015
Successfully added games for 2016
Successfully added games for 2017
Successfully added games for 2018
Successfully added games for 2019
Successfully added games for 2020
Successfully added games for 2021
Successfully added games for 2022
Successfully added games for 2023
Successfully added games for 2024


In [83]:
# Flatten the per-season payloads into one list of game records.
# Entries that are not lists (i.e. already individual records) are kept as-is,
# so re-running this cell leaves an already-flattened list unchanged instead of
# iterating over each record's keys.
games_data = [
    game
    for season_games in games_data
    for game in (season_games if isinstance(season_games, list) else [season_games])
]

In [91]:
# Build a single DataFrame from the games_data list (flattened in the previous cell).
gamesdf = pd.DataFrame(games_data)

### Load In Plays Data

In [4]:
# Read the entire `plays` table from the local SQLite database into playsdf.
conn = sqlite3.connect('cfb_data.db')
try:
    query = "SELECT * FROM plays"
    playsdf = pd.read_sql_query(query,conn)
finally:
    # Close the connection even when the query raises. A `with conn:` block would
    # not do this: sqlite3's context manager only commits or rolls back.
    conn.close()

In [34]:
# Show every distinct raw play_type value so they can be bucketed below.
playsdf["play_type"].unique()

array(['Rush', 'Pass', 'Penalty', 'Punt', 'Extra Point Good', 'Kickoff',
       'Timeout', 'Field Goal Good', 'Field Goal Missed',
       'Extra Point Missed', 'End Period', 'Safety', 'End of Half',
       'Pass Incompletion', 'Pass Completion', 'Pass Interception',
       'Uncategorized', 'Sack', 'Fumble Return Touchdown',
       'Punt Return Touchdown', '2pt Conversion',
       'Kickoff Return (Offense)', 'Pass Reception',
       'Fumble Recovery (Opponent)', 'Fumble Recovery (Own)',
       'Passing Touchdown', 'Pass Interception Return',
       'Rushing Touchdown', 'End of Game', 'Blocked Field Goal',
       'Kickoff Return Touchdown', 'Interception Return Touchdown',
       'Blocked Punt', 'Blocked Field Goal Touchdown',
       'Defensive 2pt Conversion', 'Blocked Punt Touchdown',
       'Missed Field Goal Return', 'Interception',
       'Missed Field Goal Return Touchdown', 'placeholder',
       'Two Point Rush', 'End of Regulation', 'Two Point Pass'],
      dtype=object)

In [35]:
# Raw play_type values that are always labelled "Rush".
run_plays = [
    "Rush",
    "Rushing Touchdown",
    "Two Point Rush",
]

# Raw play_type values that are always labelled "Pass".
pass_plays = [
    "Pass",
    "Pass Incompletion",
    "Pass Completion",
    "Pass Interception",
    "Sack",
    "Pass Reception",
    "Passing Touchdown",
    "Pass Interception Return",
    "Interception Return Touchdown",
    "Interception",
    "Two Point Pass",
]

# Ambiguous play types: run_pass looks at play_text for these, with "Rush" as the fallback label.
need_to_split_defaultrun = [
    "Fumble Return Touchdown",
    "Fumble Recovery (Opponent)",
    "Fumble Recovery (Own)",
]

# Ambiguous play types: run_pass looks at play_text for these, with "Pass" as the fallback label.
need_to_split_defaultpass = ["Safety"]

In [45]:
def run_pass(row):
    """
    Collapse a play's detailed play_type into "Rush", "Pass" or "Other".

    Parameters
    ----------
    row : object with `play_type` and `play_text` attributes
        Typically one row of playsdf, as passed by DataFrame.apply(axis=1).

    Returns
    -------
    str
        "Rush" for types in run_plays and "Pass" for types in pass_plays.
        For types in need_to_split_defaultrun the result is "Pass" when the
        play text mentions a pass, interception or sack, otherwise "Rush".
        For types in need_to_split_defaultpass the result is "Rush" when the
        play text mentions a run or rush, otherwise "Pass".
        Every other play type is "Other".
    """
    play_type = row.play_type
    if play_type in run_plays:
        return "Rush"
    if play_type in pass_plays:
        return "Pass"

    # A missing play_text (None/NaN) is treated as empty text, so ambiguous plays
    # fall back to their default label instead of raising on `.lower()`.
    raw_text = getattr(row, "play_text", None)
    play_text = raw_text.lower() if isinstance(raw_text, str) else ""

    if play_type in need_to_split_defaultrun:
        # Each keyword must be tested against the text on its own: the expression
        # `'pass' or 'interception' or 'sack' in text` is always truthy.
        if any(keyword in play_text for keyword in ("pass", "interception", "sack")):
            return "Pass"
        return "Rush"

    if play_type in need_to_split_defaultpass:
        if any(keyword in play_text for keyword in ("run", "rush")):
            return "Rush"
        return "Pass"

    return "Other"
# Label every play row-by-row with run_pass and store the result in the play_type_simple column.
playsdf['play_type_simple'] = playsdf.apply(run_pass,axis=1)

In [49]:
# Write playsdf (including the new play_type_simple column) back to SQLite.
# if_exists='replace' drops and recreates the `plays` table, i.e. the same table
# playsdf was loaded from.
conn = sqlite3.connect('cfb_data.db')
try:
    playsdf.to_sql('plays',conn,if_exists='replace',index=False)
finally:
    # Close the connection even when the write raises.
    conn.close()

In [46]:
# Sanity check: list the distinct simplified labels produced by run_pass.
playsdf["play_type_simple"].unique().tolist()

['Rush', 'Pass', 'Other']

### Dynamic Window Exponentially Weighted Moving Average

We want to use EPA/Play as a key feature in the model, but we want to bias it toward more recent outcomes (done with the exponential weighting). The dynamic window lets us use the last 5 games if a team hasn't played more than 5 games in the most recent season. Once the team has played at least 5 games in the season, we just use all the games from that season. This lets us get rid of past-season priors as fast as we can.

In [50]:
def dynamic_window_ewma(x, min_span=10):
    """
    Calculate rolling exponentially weighted EPA with a dynamic window size.

    For each row, the result is the exponentially weighted mean of
    `epa_shifted` over that row and all rows before it (in the frame's current
    order). The EWM span is the row's `week` when `week > min_span`, otherwise
    `min_span`.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain `week` and `epa_shifted` columns.
        # presumably one team's games in chronological order - confirm with the caller
    min_span : int or float, default 10
        Smallest span used; also the week threshold above which the span
        follows `week`. The default reproduces the previously hard-coded 10.

    Returns
    -------
    pandas.Series
        Float values aligned to `x.index`, one per row of `x`.
    """
    weeks = x["week"]
    # Rows whose week is not above min_span (including missing weeks) use min_span.
    spans = weeks.where(weeks > min_span, min_span).to_numpy()
    epa = x["epa_shifted"]

    values = np.zeros(len(x))
    # An EWM value at row i depends only on rows 0..i, so a single pass per
    # distinct span gives the same numbers as recomputing the EWM on every
    # prefix, without the quadratic cost of a per-row iterrows loop.
    for span in pd.unique(spans):
        smoothed = epa.ewm(min_periods=1, span=span).mean().to_numpy()
        uses_span = spans == span
        values[uses_span] = smoothed[uses_span]

    return pd.Series(values, index=x.index)

#### Split EPA into the different categories

In [72]:
# Separate EPA (the `ppa` column) into rushing offense, rushing defense, passing offense and
# passing defense: the mean ppa per team, year and week.
# TODO: the overall offense / defense splits are not computed in this cell yet.
# NOTE(review): the recorded run of this cell failed with KeyError: 'year', i.e. playsdf as loaded
# has no `year` column. Presumably it has to be joined in from the games data first - confirm
# the join key and column names before relying on these frames.
def _mean_ppa_by_team_week(plays, side, play_kind):
    """
    Mean `ppa` per (team, year, week) for one simplified play type.

    Parameters
    ----------
    plays : pandas.DataFrame
        Play-level data with `play_type_simple`, `ppa`, `year`, `week` and the `side` column.
    side : str
        Column naming the team to group by ('offense' or 'defense').
    play_kind : str
        Value of `play_type_simple` to keep ('Rush' or 'Pass').

    Returns
    -------
    pandas.DataFrame
        One row per (side, year, week) with the mean of `ppa`.

    Raises
    ------
    KeyError
        If any required column is missing from `plays`.
    """
    group_cols = [side, 'year', 'week']
    missing = [col for col in group_cols + ['ppa', 'play_type_simple'] if col not in plays.columns]
    if missing:
        # Report every missing column at once instead of only the first one groupby hits.
        raise KeyError(f"plays is missing required column(s): {missing}")
    selected = plays.loc[plays['play_type_simple'] == play_kind, :]
    return selected.groupby(group_cols, as_index=False)['ppa'].mean()


rushing_offense_epa = _mean_ppa_by_team_week(playsdf, 'offense', 'Rush')

rushing_defense_epa = _mean_ppa_by_team_week(playsdf, 'defense', 'Rush')

passing_offense_epa = _mean_ppa_by_team_week(playsdf, 'offense', 'Pass')

passing_defense_epa = _mean_ppa_by_team_week(playsdf, 'defense', 'Pass')

KeyError: 'year'