Import Python modules.

In [1]:
import json
import pandas as pd
import numpy as np


1. Read the regular-season and postseason basic game stats JSON files for an input year into pandas objects.
2. Concatenate the regular-season and postseason games DataFrames into one larger DataFrame (N-games x 15) with game results and filtered game information.

In [2]:
# Ask for the season. Surrounding whitespace is stripped because the value is
# pasted directly into folder and file names below.
cfb_year = input('Input college football year?').strip()

json_regular_season = cfb_year + '_cfb_JSON-CSV/' + cfb_year + '-basicGameStatsCFB-JSON-regular.json'
json_post_season = cfb_year + '_cfb_JSON-CSV/' + cfb_year + '-basicGameStatsCFB-JSON-postseason.json'

regular_season = pd.read_json(json_regular_season)
post_season = pd.read_json(json_post_season)

# The 15 game-level columns kept from each raw frame. Defined once so the
# regular-season and postseason selections cannot drift apart.
GAME_COLUMNS = [
    'id', 'season', 'season_type', 'week', 'completed',
    'home_team', 'home_id', 'home_division', 'home_conference', 'home_points',
    'away_team', 'away_id', 'away_division', 'away_conference', 'away_points',
]

# New DataFrames restricted to the selected columns
regular_season_games = pd.DataFrame(regular_season, columns=GAME_COLUMNS)
postseason_games = pd.DataFrame(post_season, columns=GAME_COLUMNS)

# Stack regular-season rows on top of postseason rows. Each frame keeps its own
# row labels (ignore_index is not set), so labels repeat across the two parts.
cfb_season_games = pd.concat([regular_season_games, postseason_games], axis=0)


Input college football year? 2022


In [3]:
# Show the last five rows of the combined frame
cfb_season_games.tail(5)


Unnamed: 0,id,season,season_type,week,completed,home_team,home_id,home_division,home_conference,home_points,away_team,away_id,away_division,away_conference,away_points
37,401442011,2022,postseason,1,True,Illinois,356,fbs,Big Ten,10,Mississippi State,344,fbs,SEC,19
38,401442012,2022,postseason,1,True,Purdue,2509,fbs,Big Ten,7,LSU,99,fbs,SEC,63
39,401442013,2022,postseason,1,True,USC,30,fbs,Pac-12,45,Tulane,2655,fbs,American Athletic,46
40,401442014,2022,postseason,1,True,Utah,254,fbs,Pac-12,21,Penn State,213,fbs,Big Ten,35
41,401442010,2022,postseason,1,True,Georgia,61,fbs,SEC,65,TCU,2628,fbs,Big 12,7


1. Read detailed postseason game stats JSON file into a viewable pandas object.
2. Split the detailed postseason pandas object into [1] "id" and [2] "teams" pandas objects.
3. Split the "teams" object for each game into eight (8) lists...
   * for team_0 and for team_1: [1] school, [2] schoolId, [3] location (homeAway), and [4] points

In [4]:
# Build the path of the detailed postseason stats file for the chosen year
# and read it into a pandas object
json_detailed_postseason = ''.join([
    cfb_year,
    '_cfb_JSON-CSV/',
    cfb_year,
    '-postseason-1-detailedGamesCFB-JSON.json',
])
detailed_postseason = pd.read_json(json_detailed_postseason)

# id_only: the frame without its 'teams' column; its row count is the number of games
id_only = detailed_postseason.drop(columns=['teams'])
rows = len(id_only)
print(str(rows) + ' games')

# teams_only: the frame without its 'id' column
teams_only = detailed_postseason.drop(columns=['id'])

# split detailed_postseason 'DataFrame' into a [1] teams[j][0]['teams_key'] list
# & teams[j][1]['teams_key'] list (N-teams x 1)
def teams_column(all_games, team_number, teams_key, teams_frame=None):
    """Collect one field of one team from every game into a list.

    Parameters
    ----------
    all_games : int
        Number of games to read; games are looked up by the labels
        0 .. all_games - 1.
    team_number : int
        Position of the team inside each game's 'teams' entry
        (this notebook uses 0 and 1).
    teams_key : str
        Key to read from that team's record, e.g. 'school' or 'points'.
    teams_frame : DataFrame or mapping, optional
        Object whose 'teams' entry can be indexed by game number. When
        omitted, the notebook-level ``teams_only`` frame is used, which
        matches the original behaviour.

    Returns
    -------
    list
        One value per game, in game order.
    """
    if teams_frame is None:
        # Fall back to the notebook global so existing three-argument calls still work.
        teams_frame = teams_only
    per_game_teams = teams_frame['teams']
    return [per_game_teams[game][team_number][teams_key] for game in range(all_games)]

# Pull each field for both teams; every pair is (team0 values, team1 values)
team0_list, team1_list = [teams_column(rows, side, 'school') for side in (0, 1)]
team0_location_list, team1_location_list = [teams_column(rows, side, 'homeAway') for side in (0, 1)]
team0_points_list, team1_points_list = [teams_column(rows, side, 'points') for side in (0, 1)]
team0_school_id_list, team1_school_id_list = [teams_column(rows, side, 'schoolId') for side in (0, 1)]

# Assemble the eight lists into one DataFrame (one row per game, eight columns)
teams_detail = pd.DataFrame({
    'team0': team0_list,
    'team0.id': team0_school_id_list,
    'team0.location': team0_location_list,
    'team0.points': team0_points_list,
    'team1': team1_list,
    'team1.id': team1_school_id_list,
    'team1.location': team1_location_list,
    'team1.points': team1_points_list,
})

# Attach the game ids to the team0 / team1 information (joined on the row index)
id_teams_locations_df = id_only.join(teams_detail)
id_teams_locations_df


42 games


Unnamed: 0,id,team0,team0.id,team0.location,team0.points,team1,team1.id,team1.location,team1.points
0,401442010,TCU,2628,away,7,Georgia,61,home,65
1,401442011,Mississippi State,344,away,19,Illinois,356,home,10
2,401442012,LSU,99,away,63,Purdue,2509,home,7
3,401442013,Tulane,2655,away,46,USC,30,home,45
4,401442014,Utah,254,home,21,Penn State,213,away,35
5,401442018,Alabama,333,away,45,Kansas State,2306,home,20
6,401442016,TCU,2628,away,51,Michigan,130,home,45
7,401442015,Ohio State,194,away,41,Georgia,61,home,42
8,401443543,Ohio,195,away,30,Wyoming,2751,home,27
9,401442035,Maryland,120,away,16,NC State,152,home,12


In [5]:
# Practice reading a JSON file with nested dictionaries.
# For every game and for each of its two teams, read the values stored under
# that team's "stats" entry. The set of stats present can differ between teams.
#
# Each (game, team) pair becomes one single-row DataFrame; all of them are
# stacked into big_df at the end of the cell.

# Stat names used to label the empty template frame below, once per team
detailed_stat_names = [
    'firstDowns', 'fourthDownEff', 'yardsPerPass',
    'fumblesLost', 'fumblesRecovered', 'interceptions',
    'interceptionTDs', 'interceptionYards', 'kickReturns',
    'kickReturnTDs', 'kickReturnYards', 'netPassingYards',
    'completionAttempts', 'passesIntercepted', 'thirdDownEff',
    'passingTDs', 'totalPenaltiesYards', 'possessionTime',
    'puntReturns', 'puntReturnTDs', 'puntReturnYards',
    'rushingAttempts', 'rushingTDs', 'rushingYards',
    'sacks', 'tacklesForLoss', 'kickingPoints',
    'yardsPerRushAttempt', 'turnovers', 'totalYards',
    'passesDeflected', 'defensiveTDs', 'qbHurries',
]

# Empty frame with all 'team0.*' columns followed by all 'team1.*' columns.
# It is not referenced again in this cell.
team_detailed_stats_columns_df = pd.DataFrame(
    columns=['team' + str(side) + '.' + stat for side in (0, 1) for stat in detailed_stat_names]
)


def _team_stats_row(team_stats, prefix):
    """Turn one team's stats entry into a single-row, prefixed DataFrame.

    The entry is loaded into a DataFrame, flipped via a transposed numpy
    array, and the first row of the flipped frame is promoted to the column
    header. The remaining row(s) hold the values.
    Presumably the first field of each stat record is its name and the second
    its value -- TODO confirm against the JSON schema.
    """
    flipped = pd.DataFrame(pd.DataFrame(team_stats).to_numpy().T)
    header = flipped.iloc[0]
    values = flipped[1:]
    values.columns = header
    # prefix is 'team0.' or 'team1.' so both teams can share one frame
    return values.add_prefix(prefix)


accumulator = []
for game in range(rows):
    for team in range(2):
        accumulator.append(
            _team_stats_row(teams_only['teams'][game][team]['stats'], 'team' + str(team) + '.')
        )

big_df = pd.concat(accumulator)
big_df


Unnamed: 0,team0.rushingTDs,team0.passingTDs,team0.kickReturnYards,team0.kickReturnTDs,team0.kickReturns,team0.kickingPoints,team0.fumblesRecovered,team0.totalFumbles,team0.possessionTime,team0.interceptions,...,team0.tackles,team0.sacks,team0.qbHurries,team0.passesDeflected,team1.puntReturnYards,team1.puntReturnTDs,team1.puntReturns,team0.interceptionYards,team0.interceptionTDs,team0.passesIntercepted
1,1,0,135,0,6,1,0,1,23:01,2,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
1,0,1,,,,7,0,1,25:42,2,...,7,7,0,0,,,,,,
1,,,,,,,,,,,...,,,,,10,0,2,,,
1,4,4,22,0,1,10,0,,30:15,1,...,,,,,,,,122,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,,,,,,,,,,,...,,,,,,,,,,
1,0,1,119,0,5,1,2,3,27:31,0,...,1,1,0,2,,,,39,0,2
1,,,,,,,,,,,...,,,,,10,0,3,,,
1,0,1,4,0,1,4,0,5,26:44,2,...,44,6,1,3,,,,0,0,2


Group big_df every two rows (the team0 row and the team1 row of each game) so that each game becomes a single row.

In [6]:
# Rows of big_df alternate team0, team1 for each game. Integer-dividing the
# row position by 2 gives both rows of a game the same group number, and the
# sum collapses each pair into one row.
group_big_df = (
    big_df
    .groupby(np.arange(len(big_df)) // 2)
    .sum()
)
group_big_df


Unnamed: 0,team0.rushingTDs,team0.passingTDs,team0.kickReturnYards,team0.kickReturnTDs,team0.kickReturns,team0.kickingPoints,team0.fumblesRecovered,team0.totalFumbles,team0.possessionTime,team0.interceptions,...,team0.tackles,team0.sacks,team0.qbHurries,team0.passesDeflected,team1.puntReturnYards,team1.puntReturnTDs,team1.puntReturns,team0.interceptionYards,team0.interceptionTDs,team0.passesIntercepted
0,1,0,135,0,6,1,0,1,23:01,2,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,7,0,1,25:42,2,...,7,7,0,0,10,0,2,0,0,0
2,4,4,22,0,1,10,0,0,30:15,1,...,0,0,0,0,35,0,2,122,1,3
3,4,2,146,0,6,8,0,1,20:11,0,...,0,0,0,1,1,0,1,0,0,1
4,1,2,15,0,1,3,0,0,35:13,2,...,3,3,0,0,4,0,2,0,0,0
5,1,5,31,0,2,9,0,0,24:26,0,...,2,2,0,2,1,0,1,2,0,2
6,3,2,75,0,4,9,1,1,27:35,2,...,4,4,0,2,16,0,2,70,2,2
7,1,4,15,0,1,11,0,1,32:36,0,...,0,2,0,1,22,0,1,15,0,1
8,1,2,95,0,4,10,0,2,30:43,0,...,4,4,0,1,0,0,0,15,0,1
9,0,1,32,0,3,10,0,1,35:32,2,...,0,0,0,2,8,0,2,2,0,2
