# Get CFB data from CFDB

This code extracts play-by-play data from the college football database API using the **cfbd** package. Additional cleaning and stat aggregation is completed prior to writing data to .csv file for R analysis.

**Further R analysis**
Play-by-play and aggregated player statistics are used in a multiple linear regression model that predicts fantasy points

In [1]:
from __future__ import print_function
import time
import cfbd
from cfbd.rest import ApiException
from pprint import pprint
import pandas as pd
import config 

configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = config.api_key
configuration.api_key_prefix['Authorization'] = config.api_key_prefix

# Player Stats - Game

In [2]:
# debug
api_instance = cfbd.GamesApi(cfbd.ApiClient(configuration))
games = api_instance.get_player_game_stats(year=2020, week=12, season_type='regular')

In [3]:
# create an instance of the API class
api_instance = cfbd.GamesApi(cfbd.ApiClient(configuration))

# Get data for each week
year_list = [2020,2021]

season_type = 'regular' # str | Season type filter

games_2020 = [] # list of games by week, week = index+1

for i in year_list:
    for j in range(1,17):
        games = api_instance.get_player_game_stats(year=i, week=j, season_type=season_type)
        games_2020.append(games)

In [4]:
# Wrap this in a function
def get_stats(api_response):

    temp = [g.teams for g in api_response]

    # Get lists of all stats
    team = [] 
    statCats = [] # stat category (i.e. passing, rushing, receiving)
    statName = [] # i.e. rushing TD, passing TD, etc.
    player = []
    playerID = []
    value = [] # raw value for each stat, (4.0 avg reception, 2 receiving TDs, etc.)
    opponent = []
    statCat = ['fumbles', 'receiving', 'rushing', 'passing'] # only want these stat categories

    for i in range(0, len(temp)):
        for j in range(0,2):
            for k in range(0, len(temp[i][j%2]['categories'])):
                if(temp[i][j%2]['categories'][k]['name'] in statCat):
                    for m in range(0, len(temp[i][j%2]['categories'][k]['types'])):
                        for n in range(0, len(temp[i][j%2]['categories'][k]['types'][m]['athletes'])):
                            team.append(temp[i][j%2]['school'])
                            opponent.append(temp[i][abs(j%2-1)]['school'])
                            statCats.append(temp[i][j%2]['categories'][k]['name'])
                            statName.append(temp[i][j%2]['categories'][k]['types'][m]['name'])
                            player.append(temp[i][j%2]['categories'][k]['types'][m]['athletes'][n]['name'])
                            playerID.append(temp[i][j%2]['categories'][k]['types'][m]['athletes'][n]['id'])
                            value.append(temp[i][j%2]['categories'][k]['types'][m]['athletes'][n]['stat'])
    
    # Fix San Jose State tilde
    team = [i.replace('é', 'e') for i in team]
    
    temp_list = []
    for t, sc, sn, p, pid, v, o in zip(team, statCats, statName, player, playerID, value, opponent):
        temp = { 'Team': t, 'StatCat':sc, 'StatName':sn, 'Player': p, 'PlayerID': pid, 'Value':v, 'Opponent':o }
        temp_list.append(temp)
    
    df = pd.DataFrame(temp_list)
    return(df)

In [5]:
# initialize data frame
df = pd.DataFrame(columns = ['Team', 'StatCat', 'StatName', 'Player', 'PlayerID', 'Value', 'Opponent'])

# Get data from each week
for i in range(0, len(games_2020)):
    temp = get_stats(games_2020[i])
    temp['Week'] = int(i%16+1) # get week to start over in 2021
    if(i<16):
        temp['Year'] = 2020
    else:
        temp['Year'] = 2021
    df = df.append(temp, ignore_index = True)

teams= list(pd.unique(df['Team'])) # use for search

In [6]:
# Duplicate rows where StatName=C/ATT
is_dup = df['StatName'] == 'C/ATT'
df_try =  df[is_dup]

# New rows = attempts
df_try.loc[df_try['StatName']=='C/ATT', 'StatName'] = 'attempts'

# Original rows = Comp
df.loc[df['StatName']=='C/ATT', 'StatName'] = 'comp'

# Append duplicated data frame with changed values to original df
df = df.append(df_try,ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [7]:
# Convert 26/36 to 26 comp, 36 attempts
mask1 = df.Value.str.contains('/') & df.StatName.str.contains('comp')
mask2 = df.Value.str.contains('/') & df.StatName.str.contains('attempts')

df.Value.loc[mask1] = df.Value[mask1].str.split('/').str[0]
df.Value.loc[mask2] = df.Value[mask2].str.split('/').str[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
# Drop rows where Player = 'Team'
df.drop(df[df['Player'] == ' Team'].index, inplace = True)

In [9]:
#df.loc[((df['Team'] == "Texas") & (df['Year'] == 2021) & (df['Player'] == 'Xavier Worthy'))]

# Betting Lines

In [10]:
api = cfbd.BettingApi(cfbd.ApiClient(configuration))
year = 2020 # int | Year/season filter for games (optional)
week = 1 # int | Week filter (optional)
season_type = 'regular'

api_response = api.get_lines(year=year, week=week, season_type=season_type)

In [11]:
# Get data for each week
year = 2020
year_list = [2020, 2021]
season_type = 'regular' # str | Season type filter

bets_2020 = [] # list of games by week, week = index+1

for i in year_list:
    for j in range(1,17):
        bets = api.get_lines(year=i, week=j, season_type=season_type)
        bets_2020.append(bets)
        week = i
        year = j
        print(week, year) # check 

2020 1
2020 2
2020 3
2020 4
2020 5
2020 6
2020 7
2020 8
2020 9
2020 10
2020 11
2020 12
2020 13
2020 14
2020 15
2020 16
2021 1
2021 2
2021 3
2021 4
2021 5
2021 6
2021 7
2021 8
2021 9
2021 10
2021 11
2021 12
2021 13
2021 14
2021 15
2021 16


In [12]:
 # Wrap in a function
def get_betting_info(api_response):
    bet_df = pd.DataFrame.from_records([dict(Away=b.away_team, Home=b.home_team, awayScore=b.away_score,
                                            homeScore=b.home_score, season=b.season, week=b.week) for b in api_response])
    
    bet_df['Away'] = bet_df['Away'].str.replace('é','e') # San Jose State tildé
    bet_df['Home'] = bet_df['Home'].str.replace('é','e') # San Jose State tildé
    
    lines = [b.lines for b in api_response]
    formattedSpread = []
    overUnder = []
    spread = []

    for i in range(0, len(lines)):
        for j in range(0, 1): 
            if(len(lines[i]) > 0): # only want the first line of spread info
                formattedSpread.append(lines[i][j%1]['formattedSpread'])
                spread.append(lines[i][j%1]['spread'])
                overUnder.append(lines[i][j%1]['overUnder'])
            else:
                formattedSpread.append('')
                spread.append('')
                overUnder.append('') 
    
    formattedSpread = [sub.replace('é', 'e') for sub in formattedSpread] # San Jose State tildé
    # combine lists (list of tuples)
    bet_list = list(zip(formattedSpread, spread, overUnder))

    # bet information df
    df = pd.DataFrame(bet_list, columns=['FormatSpread', 'Spread', 'overUnder'])

    # merge game information with bet information
    betting_df = pd.concat([bet_df, df], axis=1)
    return(betting_df)

In [13]:
# initialize data frame
bet_df = pd.DataFrame(columns = ['Away', 'Home', 'awayScore', 'homeScore', 'season', 'week', 'FormatSpread', 'Spread', 'overUnder'])

# Get data from each week
for i in range(0, 32):
    if(len(bets_2020[i]) > 0): # fixes error with empty list at index 29
        temp = get_betting_info(bets_2020[i])
        bet_df = pd.concat([bet_df,temp],axis=0).reset_index(drop=True)

# Change week=1 to week=0 for 2021 Week 0 games
bet_df['UID'] = bet_df['Away'] + '__' + bet_df['Home']
zero_list = ['Nebraska__Illinois', 'Connecticut__Fresno State',"Hawai'i__UCLA", "UTEP__New Mexico State", 
             "Southern Utah__San Jose State"]
bet_df['week'][bet_df['UID'].isin(zero_list)] = 0
# Drop UID
bet_df = bet_df.drop(columns=['UID'])

In [14]:
bet_df.to_csv(r'C:\Users\punco\OneDrive\Desktop\Fantasy Football\2021\CFB\CFB2020Betting.csv', index=False)

# Game Information Data

In [15]:
# create an instance of the API class
api_instance = cfbd.GamesApi(cfbd.ApiClient(configuration))

year_list = [2020,2021]
games_df = pd.DataFrame(columns=['Away', 'Home', 'Season', 'Week'])

for i in year_list:
    for j in range(1, 17):
        api_response = api_instance.get_games(year=i, week=j)
        temp_df = pd.DataFrame.from_records([dict(Away=p.away_team, Home=p.home_team, Season=p.season, Week=p.week) 
                                        for p in api_response])
        games_df = games_df.append(temp_df).reset_index(drop=True)

games_df['Away'] = games_df['Away'].str.replace('é','e') # San Jose State tildé
games_df['Home'] = games_df['Home'].str.replace('é','e') # San Jose State tildé

In [16]:
# 2021 doesn't code for week 0, change first 5 games of 2021 to week 0
for i in range(542,547):
    games_df.loc[i, 'Week'] = 0

# Merge in python? merge in R?
## Player Team Opponent Week Season Stat1 Stat2 Stat3 ...

# Connelly's SP+ ratings

In [17]:
api_instance = cfbd.RatingsApi(cfbd.ApiClient(configuration))
year_list = [2020, 2021] # int | Season filter (required if team not specified) (optional)
sp_list = []
# Historical SP+ ratings
for i in year_list:
    api_response = api_instance.get_sp_ratings(year=i)
    sp_list.append(api_response)

In [18]:
# SP+ ratings data frame
def get_sp(api_response):
    sp_df = pd.DataFrame.from_records([dict(Team=s.team, SP=s.rating) for s in api_response])
    
    # San Jose State tilde
    sp_df['Team'] = sp_df['Team'].str.replace('é','e')
    
    def_ratings = []
    off_ratings = []

    # Get offensive ratings
    temp = [s.offense for s in api_response]
    for i in range(0, len(temp)):
        off_ratings.append(temp[i]['rating'])

    # Get defensive ratings
    temp = [s.defense for s in api_response]
    for i in range(0, len(temp)):
        def_ratings.append(temp[i]['rating'])

    # Merge sp_df with offense and defense ratings
    ratings_df = pd.DataFrame(list(zip(off_ratings, def_ratings)),
                  columns=['OffenseSP','DefenseSP'])

    sp_df = pd.concat([sp_df, ratings_df], axis=1) # side-by-side
    return(sp_df)

In [19]:
sp_df = pd.DataFrame(columns=['Team', 'SP', 'OffenseSP', 'DefenseSP', 'Year'])
for i in range(0,2):
    temp = get_sp(sp_list[i])
    temp['Year'] = year_list[i]
    sp_df = pd.concat([sp_df, temp], axis=0).reset_index(drop=True)

# Positions

In [20]:
# create an instance of the API class
api_instance = cfbd.TeamsApi(cfbd.ApiClient(configuration))
year_list = [2020,2021] # int | Season year (optional)
rosters_year = []

for i in year_list:
    api_response = api_instance.get_roster(year=i)
    rosters_year.append(api_response)

In [21]:
# Do something
pos_df = pd.DataFrame(columns=['First', 'Last', 'Position', 'Team', 'Year'])
for j in range(0,2): # 2 years of data
    temp = pd.DataFrame.from_records([dict(First=i.first_name, Last=i.last_name, Position=i.position, Team=i.team) 
                                      for i in rosters_year[j]])
    temp['Year'] = j+2020
    pos_df = pos_df.append(temp).reset_index(drop=True)

# Change Xavier Worthy's position
pos_df.at[28514, 'Position'] = 'WR'

# Only offensive skill position players (QB, RB, WR, TE)
pos_df = pos_df[pos_df['Position'].isin(['QB','RB','WR','TE'])]

# San Jose State thing
pos_df['Team'] = pos_df['Team'].str.replace('é','e')

#Remove empty rows
nan_value = float("NaN")

pos_df.replace("", nan_value, inplace=True)

pos_df.dropna(subset = ["First", "Last", "Position"], inplace=True)
pos_df['Player'] = pos_df['First'] + ' ' + pos_df['Last']

# Name match
pos_df.replace("Christopher Rodriguez Jr.", "Chris Rodriguez Jr.", inplace = True)
pos_df.replace("Micale Cunningham", "Malik Cunningham", inplace = True)

In [22]:
pos_df.loc[pos_df['Last']=="Addison"]

Unnamed: 0,First,Last,Position,Team,Year,Player
9062,Jordan,Addison,WR,Pittsburgh,2020,Jordan Addison
23642,Jordan,Addison,WR,Pittsburgh,2021,Jordan Addison


In [23]:
# merge game stats with positions
stats_with_pos = pd.merge(df, pos_df, how="left", left_on=["Team","Player","Year"], right_on=['Team', "Player", "Year"])

# Remove NaNs (non-offensive players)
# Remove empty rows
nan_value = float("NaN")
stats_with_pos.replace("", nan_value, inplace=True)
stats_with_pos.dropna(subset = ["First", "Last", "Position"], inplace=True)

In [24]:
#stats_with_pos.loc[stats_with_pos['Team']=="Louisville"]

In [25]:
print(bet_df.head(3),'\n\n') # bet info
print(stats_with_pos.head(10),'\n\n') # game stats
print(games_df[:10], '\n\n') # game info
print(sp_df.head(10)) # sp+ ratings

                Away                  Home awayScore homeScore season week  \
0   Eastern Kentucky              Marshall         0        59   2020    1   
1      South Alabama  Southern Mississippi        32        21   2020    1   
2  Stephen F. Austin                  UTEP        14        24   2020    1   

                 FormatSpread Spread overUnder  
0              Marshall -25.5  -25.5        55  
1  Southern Mississippi -12.5  -12.5      54.5  
2                     UTEP -4     -4        54   


       Team    StatCat StatName             Player PlayerID Value  \
0   Memphis    fumbles      REC        Brady White  3893630     0   
3   Memphis    fumbles     LOST        Brady White  3893630     0   
6   Memphis    fumbles      FUM        Brady White  3893630     1   
9   Memphis  receiving     LONG         Asa Martin  4361333     4   
10  Memphis  receiving     LONG    Tahj Washington  4567506     7   
11  Memphis  receiving     LONG  Calvin Austin III  4243389     9   
12  M

## Merge data

In [26]:
# games_df with bet_df; keep games_df week data (has week 0 2021)
games_bets = pd.merge(games_df, bet_df, how='left', left_on = ['Away','Home','Season','Week'], right_on = ['Away','Home','season','week'])

In [27]:
# merge games_bets with SP+ to get away team
games_bets2 = pd.merge(games_bets, sp_df, how='left', left_on = ['Away', 'Season'], right_on = ['Team', 'Year'])

# Fill NAs with ''
games_bets2['AwaySP'] = games_bets2['SP'].fillna(0)
games_bets2['AwayDefSP'] = games_bets2['DefenseSP'].fillna(0)
games_bets2['AwayOffSP'] = games_bets2['OffenseSP'].fillna(0)

# drop unnecessary columns
games_bets2.drop(['Team', 'Year', 'SP', 'season', 'week', 'OffenseSP', 'DefenseSP'], axis=1, inplace=True)

# merge games_bets with SP+ to get home team
games_bets3 = pd.merge(games_bets2, sp_df, how='left', left_on = ['Home', 'Season'], right_on = ['Team', 'Year'])

# Fill NAs with ''
games_bets3['HomeSP'] = games_bets3['SP'].fillna(0)
games_bets3['HomeDefSP'] = games_bets3['DefenseSP'].fillna(0)
games_bets3['HomeOffSP'] = games_bets3['OffenseSP'].fillna(0)
games_bets3['Spread'] = games_bets3['Spread'].fillna(0)
games_bets3['overUnder'] = games_bets3['overUnder'].fillna(0)

# drop unnecessary columns
games_bets3.drop(['Team', 'Year', 'SP', 'OffenseSP', 'DefenseSP'], axis=1, inplace=True)

# Replace empty strings with 0
games_bets3.replace('', 0, inplace=True)

In [28]:
#print(games_bets3)

In [29]:
# Add implied team totals based on spread
games_bets3['overUnder'] = games_bets3['overUnder'].astype(float)
games_bets3['Spread'] = games_bets3['Spread'].astype(float)

games_bets3['HomeTotal'] = (games_bets3['overUnder'] / 2) - (games_bets3['Spread'] / 2)
games_bets3['AwayTotal'] = (games_bets3['overUnder'] / 2) + (games_bets3['Spread'] / 2)

# If over/under is n/a (0), then change team totals to 0
games_bets3.loc[games_bets3.overUnder==0, 'HomeTotal'] = 0
games_bets3.loc[games_bets3.overUnder==0, 'AwayTotal'] = 0

In [30]:
#print(games_bets3[games_bets3['Week']==0])

In [31]:
# Merge player game stats with games_bets
suff_A = ['_on_A_match_1', '_on_A_match_2']
suff_B = ['_on_B_match_1', '_on_B_match_2']

game_stats = pd.concat([stats_with_pos.merge(games_bets3, how='left', left_on=['Team', 'Opponent', 'Year'],
                                             right_on=['Away', 'Home', 'Season'],suffixes=suff_A), 
                stats_with_pos.merge(games_bets3, how='left', left_on=['Team', 'Opponent', 'Year'],
                                             right_on=['Home', 'Away', 'Season'], suffixes=suff_B)])

# Remove empty rows
nan_value = float("NaN")
game_stats.replace("", nan_value, inplace=True)
game_stats.dropna(subset = ["Away", "Home", "Season"], inplace=True)

# separate week 0 games from df to append to final DF below
#zeros_ind = (game_stats.Week_on_B_match_2==0)
#zeros_df = game_stats[zeros_ind]

# Create unique ID to drop dupes
game_stats['UID'] = game_stats['Player'].map(str) + game_stats['StatCat'].map(str) + game_stats['StatName'].map(str) + game_stats['Value'].map(str) + game_stats['Opponent'].map(str) + game_stats['Year'].map(str)

# Count times unique ID appears
game_stats['freq'] = game_stats.groupby('UID')['UID'].transform('count')

# Remove duplicates
game_stats_no_dupes = game_stats[game_stats['freq']==1].reset_index(drop=True)

# Drop columns
game_stats_no_dupes.drop(['UID', 'freq', 'Week_on_A_match_1','Week_on_B_match_1'], axis=1, inplace=True)

# 'Week_on_B_match_2' and 'Week_on_A_match_2' need to be merged into 1 column
week_df = game_stats_no_dupes[['Week_on_A_match_2', 'Week_on_B_match_2']]
game_stats_no_dupes['Week'] = week_df.bfill(axis=1).iloc[:, 0]
game_stats_no_dupes.drop(['Week_on_A_match_2','Week_on_B_match_2'], axis=1, inplace=True)

# Player Usage (rush share, pass share, etc)

In [32]:
# create an instance of the API class
api_instance = cfbd.PlayersApi(cfbd.ApiClient(configuration))
year_list = [2020,2021]
exclude_garbage_time = True # bool | Filter to remove garbage time plays from calculations (optional)

usage = []
for i in year_list:
    api_response = api_instance.get_player_usage(year=i, exclude_garbage_time=exclude_garbage_time)
    usage.append(api_response)

In [33]:
temp_dict = [[dict(Player=i.name, Team=i.team, Season=i.season, Position=i.position, Usage=i.usage) 
              for i in usage[j]] for j in range(0,2)]

player_list = []
team_list = []
season_list = []
pos_list = []
overall_usage = []
firstDown_usage = []
pass_usage = []
rush_usage = []
secondDown_usage = []
thirdDown_usage = []
passingDowns_usage = []
standardDowns_usage = []

for i in range(0, len(temp_dict)):
    for k in range(0, len(temp_dict[i])):
        player_list.append(temp_dict[i][k]['Player'])
        team_list.append(temp_dict[i][k]['Team'])
        season_list.append(temp_dict[i][k]['Season'])
        pos_list.append(temp_dict[i][k]['Position'])
        overall_usage.append(temp_dict[i][k]['Usage']['overall'])
        firstDown_usage.append(temp_dict[i][k]['Usage']['firstDown'])
        pass_usage.append(temp_dict[i][k]['Usage']['pass'])
        rush_usage.append(temp_dict[i][k]['Usage']['rush'])
        secondDown_usage.append(temp_dict[i][k]['Usage']['secondDown'])
        thirdDown_usage.append(temp_dict[i][k]['Usage']['thirdDown'])
        passingDowns_usage.append(temp_dict[i][k]['Usage']['passingDowns'])
        standardDowns_usage.append(temp_dict[i][k]['Usage']['standardDowns'])

# Zip lists
usage_list = []
for e,f,g,h,i,j,k,l,m,n,o,p in zip(player_list, team_list, season_list, pos_list, overall_usage, firstDown_usage,
                                   pass_usage, rush_usage, secondDown_usage, thirdDown_usage, passingDowns_usage, 
                                   standardDowns_usage):
    temp = { 'Player':e, 'Team':f, 'Season':g, 'Position':h, 'Overall': i, 'FirstDown':j, 'Passing':k, 'Rushing': l,
            'SecondDown': m, 'ThirdDown':n, 'PassingDowns':o, 'Standard':p }
    usage_list.append(temp)

# convert to df
usage_df = pd.DataFrame(usage_list)

In [34]:
#usage_df.loc[usage_df['Team']=='Ohio State']

# Merge usage with game stats

In [35]:
game_stats_with_usage = pd.merge(game_stats_no_dupes, usage_df, how = 'left',
                                 left_on = ['Team','Player','Year','Position'],
                                 right_on = ['Team','Player','Season','Position'])

In [36]:
game_stats_with_usage.to_csv(r'C:\Users\punco\OneDrive\Desktop\Fantasy Football\2021\CFB\CFBGameStats.csv', index=False)