Import Python modules.

In [1]:
import json
import pandas as pd
import numpy as np


1. Read regular and postseason basic game stats JSON file from an input year into pandas objects.
2. Concatenate the regular-season and postseason games DataFrames into one larger DataFrame (N-games x 15), with game results and filtered game information.

In [2]:
# Ask for the season year; strip stray whitespace so it cannot corrupt the file paths built below
cfb_year = input('Input college football year?').strip()

# Both input files live in the '<year>_cfb_JSON-CSV/' folder
json_regular_season = cfb_year + '_cfb_JSON-CSV/' + cfb_year + '-basicGameStatsCFB-JSON-regular.json'
json_post_season = cfb_year + '_cfb_JSON-CSV/' + cfb_year + '-basicGameStatsCFB-JSON-postseason.json'

regular_season = pd.read_json(json_regular_season)
post_season = pd.read_json(json_post_season)

# The 15 game-result columns kept from each raw frame (the raw frames were noted as having 33).
# Defined once so the regular-season and postseason selections cannot drift apart.
game_columns = [
    'id', 'season', 'season_type', 'week', 'completed',
    'home_team', 'home_id', 'home_division', 'home_conference', 'home_points',
    'away_team', 'away_id', 'away_division', 'away_conference', 'away_points',
]

# New DataFrames restricted to the selected columns
regular_season_games = pd.DataFrame(regular_season, columns=game_columns)
postseason_games = pd.DataFrame(post_season, columns=game_columns)

# Stack the regular-season rows on top of the postseason rows; each frame keeps its own row labels
cfb_season_games = pd.concat([regular_season_games, postseason_games], axis=0)


Input college football year? 2015


In [3]:
# Show the last rows of the combined frame; postseason games come last because they were concatenated second
cfb_season_games.tail()


Unnamed: 0,id,season,season_type,week,completed,home_team,home_id,home_division,home_conference,home_points,away_team,away_id,away_division,away_conference,away_points
36,400852739,2015,postseason,1,True,Georgia,61,fbs,SEC,24,Penn State,213,fbs,Big Ten,17
37,400852740,2015,postseason,1,True,Arkansas,8,fbs,SEC,45,Kansas State,2306,fbs,Big 12,23
38,400852741,2015,postseason,1,True,TCU,2628,fbs,Big 12,47,Oregon,2483,fbs,Pac-12,41
39,400852742,2015,postseason,1,True,Arizona State,9,fbs,Pac-12,42,West Virginia,277,fbs,Big 12,43
40,400852743,2015,postseason,1,True,Clemson,228,fbs,ACC,40,Alabama,333,fbs,SEC,45


1. Read detailed postseason game stats JSON file into a viewable pandas object.
2. Split the detailed postseason pandas object into two pandas objects: [1] "id" and [2] "teams".
3. Split "teams" for each game into eight (8) lists, four per team...
   * [1] team_0, [2] team_0.id, [3] team_0.location, [4] team_0.points, and the same four lists for team_1

In [4]:
# Build the path to the detailed postseason game stats JSON and read it into a pandas object
json_detailed_postseason = cfb_year + '_cfb_JSON-CSV/' + cfb_year + '-postseason-1-detailedGamesCFB-JSON.json'
detailed_postseason = pd.read_json(json_detailed_postseason)

# id_only: the detailed frame without its 'teams' column (one row per game)
id_only = detailed_postseason.drop(columns=['teams'])
rows = len(id_only)
print(rows)

# teams_only: the detailed frame without its 'id' column (one row per game)
teams_only = detailed_postseason.drop(columns=['id'])

# Extract one field per game from the nested per-game teams data:
# teams[j][team_number][teams_key] for j = 0 .. all_games - 1 (N-games x 1)
def teams_column(all_games, team_number, teams_key, teams=None):
    """Collect one field of one team slot across the first ``all_games`` games.

    Parameters
    ----------
    all_games : int
        Number of games to read; games are looked up with the keys 0 .. all_games - 1.
    team_number : int
        Which team entry of each game to read (the notebook uses 0 and 1).
    teams_key : str
        Key to read from the selected team's record, e.g. 'school' or 'points'.
    teams : indexable, optional
        Per-game team records, indexable as ``teams[game][team_number][teams_key]``.
        When omitted, the notebook-level ``teams_only['teams']`` column is used,
        which matches the original behavior.

    Returns
    -------
    list
        One value per game, in game order.
    """
    if teams is None:
        # Fall back to the notebook global so existing three-argument calls keep working
        teams = teams_only['teams']
    return [teams[game][team_number][teams_key] for game in range(all_games)]

# Pull each per-team field out of the nested data: one list per team slot (0 and 1)
team0_list, team1_list = [teams_column(rows, slot, 'school') for slot in (0, 1)]
team0_location_list, team1_location_list = [teams_column(rows, slot, 'homeAway') for slot in (0, 1)]
team0_points_list, team1_points_list = [teams_column(rows, slot, 'points') for slot in (0, 1)]
team0_school_id_list, team1_school_id_list = [teams_column(rows, slot, 'schoolId') for slot in (0, 1)]

# Assemble the eight lists into a DataFrame with one row per game and eight columns
teams_detail = pd.DataFrame({
    'team0': team0_list,
    'team0.id': team0_school_id_list,
    'team0.location': team0_location_list,
    'team0.points': team0_points_list,
    'team1': team1_list,
    'team1.id': team1_school_id_list,
    'team1.location': team1_location_list,
    'team1.points': team1_points_list,
})

# Attach the team0 / team1 columns to the game ids (joined on the row index)
id_teams_locations_df = id_only.join(teams_detail)
id_teams_locations_df


41


Unnamed: 0,id,team0,team0.id,team0.location,team0.points,team1,team1.id,team1.location,team1.points
0,400852723,Air Force,2005,home,36,California,25,away,55
1,400852725,Nevada,2440,away,28,Colorado State,36,home,23
2,400852730,USC,30,away,21,Wisconsin,275,home,23
3,400852727,Memphis,235,home,10,Auburn,2,away,31
4,400852728,NC State,152,away,28,Mississippi State,344,home,51
5,400852729,Texas A&M,245,away,21,Louisville,97,home,27
6,400852733,Clemson,228,home,37,Oklahoma,201,away,17
7,400852732,Michigan State,127,away,0,Alabama,333,home,38
8,400852731,Houston,248,away,38,Florida State,52,home,24
9,400852737,Iowa,2294,home,16,Stanford,24,away,45


In [5]:
# practicing reading JSON file with nested dictionaries
# read detailed category stats from game #1, team0 [game0 until last row][team0 or team1] (read values from a "stats" dictionary)
# detailed stats list for each team will vary

def team_stats_details(game_row, team_number, teams=None):
    """Return one team's detailed stats for one game as a single-row DataFrame.

    The team's 'stats' entry is loaded into a DataFrame, transposed, and the first
    transposed row is promoted to the column header. Each column name is then
    prefixed with 'team<team_number>.'.

    NOTE(review): this assumes each stats record lists its category name first and
    its value second, so that the first transposed row holds the category names.
    Confirm against the JSON schema.

    Parameters
    ----------
    game_row : int
        Key of the game to read. The previous version ignored this argument and
        always read game 0.
    team_number : int
        Which team entry of the game to read (the notebook uses 0 and 1).
    teams : indexable, optional
        Per-game team records, indexable as ``teams[game_row][team_number]['stats']``.
        When omitted, the notebook-level ``teams_only['teams']`` column is used.

    Returns
    -------
    pandas.DataFrame
        One row (row label 1, left over from dropping the header row at position 0)
        whose columns are the prefixed stat names.
    """
    if teams is None:
        # Fall back to the notebook global so existing two-argument calls keep working
        teams = teams_only['teams']

    # Bug fix: index by the requested game instead of the hard-coded game 0
    team_detailed_stats = teams[game_row][team_number]['stats']

    # Records -> DataFrame -> transposed array, so each stat becomes a column
    stats_matrix = pd.DataFrame(team_detailed_stats).to_numpy().T

    # Back to a DataFrame; promote the first row to the header and keep the rest
    stats_wide = pd.DataFrame(stats_matrix)
    new_header = stats_wide.iloc[0]
    stats_wide = stats_wide[1:]
    stats_wide.columns = new_header

    # Prefix every column with 'team0.' or 'team1.' so both teams can share one row
    team_int = 'team' + str(team_number) + '.'
    return stats_wide.add_prefix(team_int)

# Sample: detailed stats for both team slots, requested with game_row 0
team0_sample, team1_sample = [team_stats_details(0, slot) for slot in (0, 1)]

# Place the "team0." and "team1." detailed stats side by side (joined on the row index)
teams_stats = team0_sample.join(team1_sample)
teams_stats

# coming soon: loop through every game as a new row; join rows with game id (id_detailed_game_stats_df);
#              join "id_team_locations_df" with "id_detailed_game_stats_df" 


Unnamed: 0,team0.fumblesRecovered,team0.rushingTDs,team0.passingTDs,team0.kickReturnYards,team0.kickReturnTDs,team0.kickReturns,team0.kickingPoints,team0.firstDowns,team0.thirdDownEff,team0.fourthDownEff,...,team1.completionAttempts,team1.yardsPerPass,team1.rushingYards,team1.rushingAttempts,team1.yardsPerRushAttempt,team1.totalPenaltiesYards,team1.turnovers,team1.fumblesLost,team1.interceptions,team1.possessionTime
1,1,3,2,225,0,11,4,23,5-10,1-1,...,25-37,12.6,119,34,3.5,5-29,1,1,0,28:53


Display team0 & team1 totalYards for game0 (aka. row01) side by side in a single row. (Only totalYards is selected in the cell below; possessionTime is not included yet.)

In [6]:
# Keep only the total-yards column of each team, all rows
sample = teams_stats.loc[:, ["team0.totalYards", "team1.totalYards"]]
sample


Unnamed: 0,team0.totalYards,team1.totalYards
1,434,586
