In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import Data and Separate Games

### Notes

The end of each quarter is included as a row of mostly NaNs. In the 'desc' column, such a row is noted as "END QUARTER ...", e.g. "END QUARTER 1".

In [2]:
def separate_games(df: pd.DataFrame) -> list[pd.DataFrame]:
    """Split a play-by-play frame into one DataFrame per game.

    Rows are grouped by the 'GameID' column in a single pass. Games are
    returned in order of first appearance in ``df`` and rows keep their
    relative order within each game. Every returned frame gets a fresh
    0-based index, so plays are numbered starting with 0.

    Rows whose 'GameID' is missing are skipped, so no empty frame is
    produced for them.

    Args:
        df: Play-by-play data containing a 'GameID' column.

    Returns:
        A list with one DataFrame per distinct, non-missing 'GameID'.
    """
    # sort=False keeps first-appearance order (the order .unique() would give);
    # one groupby pass avoids re-scanning the whole frame once per game.
    games = [
        plays.reset_index(drop=True)
        for _, plays in df.groupby('GameID', sort=False)
    ]

    return games

def calculate_time_per_play(game: pd.DataFrame) -> pd.DataFrame:
    """Add a 'play_time' column: the drop in 'TimeSecs' since the previous row.

    For each row the value is ``TimeSecs[previous row] - TimeSecs[this row]``.
    The first row has no predecessor, so its play_time is NaN; the same holds
    for any row where either of the two 'TimeSecs' values is missing. Rows
    with NaN play_time are therefore easy to filter out afterwards.

    The frame is modified in place and the same object is returned.
    """
    # diff() is a backward difference (this row minus the previous one), so it
    # is negated to express the change as a positive amount when TimeSecs falls.
    clock_delta = game['TimeSecs'].diff(periods=1)
    game['play_time'] = -clock_delta

    return game


def drop_unnecessary_rows(game: pd.DataFrame) -> pd.DataFrame:
    """Drop rows that have no 'posteam' or no 'play_time', then reindex.

    The end of each quarter is its own row. Same with timeouts
    and the end of the game. Other values in those rows are mostly NaNs.
    This removes all of those unhelpful rows and renumbers the remaining
    rows from 0.

    The input frame is left untouched; a new frame is returned.

    NOTE: plays must be indexed starting with their first play
    TODO: Might be able to just drop rows with missing posteam

    Args:
        game: Plays of a single game with 'posteam' and 'play_time' columns.

    Returns:
        A new DataFrame without the rows missing either value, with a
        fresh 0-based index.
    """
    # Non-inplace dropna: the caller's frame must not lose rows as a side effect.
    kept_plays = game.dropna(subset=['posteam', 'play_time'])

    return kept_plays.reset_index(drop=True)

def encode_teams(game: pd.DataFrame) -> pd.DataFrame:
    """Replace team names in 'posteam' and 'DefensiveTeam' with 0 or 1.

    The team that appears first in 'posteam' becomes 0 and the other one
    becomes 1. This won't be retraceable if you are looking for a game with
    a specific team playing. Any 'DefensiveTeam' value that does not occur
    in 'posteam' is mapped to NaN.

    The frame is modified in place and the same object is returned.

    Args:
        game: Plays of a single game with 'posteam' and 'DefensiveTeam' columns.

    Returns:
        The same DataFrame with both team columns encoded.

    Raises:
        ValueError: If 'posteam' does not contain exactly two distinct values.
    """
    teams = game['posteam'].unique()
    if len(teams) != 2:
        # The guard also fires for fewer than two teams, so report the actual
        # count and values instead of claiming there are "more than 2".
        raise ValueError(
            "Dataset has not been properly cleaned. "
            f"Expected exactly 2 values in posteam, found {len(teams)}: {list(teams)}"
        )

    team_map = {team: code for code, team in enumerate(teams)}
    game['posteam'] = game['posteam'].map(team_map)
    game['DefensiveTeam'] = game['DefensiveTeam'].map(team_map)

    return game

def create_team0_yardage(game: pd.DataFrame) -> pd.DataFrame:
    """Add a 'team0_yards' column: yards gained on the play from team 0's view.

    When team 0 has possession ('posteam' == 0) the value equals
    'Yards.Gained'; on every other row the sign is flipped, so yards gained
    by team 1 count as negative.

    The frame is modified in place and the same object is returned.
    """
    yards_gained = game['Yards.Gained']
    team0_has_ball = game['posteam'].eq(0)
    # Keep the value where team 0 has the ball, otherwise use the negated value.
    game['team0_yards'] = yards_gained.where(team0_has_ball, -yards_gained)

    return game


# Of note: Yards.Gained
# DONE (create_team0_yardage): column of yards gained for team 0; when team 1 gains yards, the value is negative
# TODO: keep NaNs in until time per play has been calculated,
#   then delete those rows like before and replace NaNs in time per play with
#   the average time per play in that game

### Read in games, separate into individual games

In [3]:
# Load the 2015 play-by-play export and drop the 'Unnamed: 0' and 'Season' columns.
# NOTE(review): the saved cell output suggests read_csv emitted a warning on
# this line -- confirm whether an explicit dtype / low_memory setting is wanted.
raw_plays_path = "NFLPlaybyPlay2015.csv"
df = pd.read_csv(raw_plays_path).drop(columns=['Unnamed: 0', 'Season'])

# One DataFrame per GameID.
games = separate_games(df)

  df = pd.read_csv("NFLPlaybyPlay2015.csv")


### Clean rows and add columns

In [4]:
# Rebuild the cleaned games from the raw frame every time this cell runs.
# Cleaning an already-cleaned game is not idempotent: the first row's play_time
# is always NaN, so one more row would be dropped on every re-run, and the 0/1
# team encoding could flip. Starting from a fresh split of `df` avoids both.
games = [
    raw_game
    .pipe(calculate_time_per_play)
    .pipe(drop_unnecessary_rows)
    .pipe(encode_teams)
    .pipe(create_team0_yardage)
    for raw_game in separate_games(df)
]

# The game at index 30 is the sample used for the spot checks that follow.
game = games[30]

# After encoding, only the values 0 and 1 should remain in posteam.
print(game['posteam'].unique())

[0 1]


### Take important columns

In [5]:
# Columns of interest for this dataset.
key_columns = ['team0_yards', 'play_time', 'posteam', 'DefensiveTeam']

# TODO: it may be easier to calculate team0 yards with 'PosTeamScore' and 'DefTeamScore'.
# TODO: we could also choose to always label the home team as team 0.
# TODO: check whether diff is forward or backward, and decide which one we want.

# Count the missing entries per key column for the sample game.
missing_values = game[key_columns].isna().sum()
print("Missing values in key columns:\n", missing_values)

Missing values in key columns:
 team0_yards      0
play_time        0
posteam          0
DefensiveTeam    0
dtype: int64


## Train-Test Split
* Each element in games represents a whole game
* Combine all testing games into one df, same for training. We can split each by game id later
* Should keep sequential order of plays within each game

In [9]:
# Split the list of games into training and testing sets.
# A fixed seed keeps the split (and the CSV files written from it) the same
# across kernel restarts; without it every run assigns games differently.
SPLIT_SEED = 42
train_games, test_games = train_test_split(games, test_size=0.2, random_state=SPLIT_SEED)

print(f"Number of training games: {len(train_games)}")
print(f"Number of testing games: {len(test_games)}")

Number of training games: 204
Number of testing games: 52


In [10]:
# Stack the games of each split into one frame; ignore_index renumbers the
# rows 0..n-1 across games while the play order inside each game is kept.
train_df, test_df = (
    pd.concat(split_games, ignore_index=True)
    for split_games in (train_games, test_games)
)

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)

Training data shape: (33903, 66)
Testing data shape: (8719, 66)


In [11]:
# Write both splits to disk without the row index.
for split_frame, csv_path in ((train_df, "NFLTrain2015.csv"), (test_df, "NFLTest2015.csv")):
    split_frame.to_csv(csv_path, index=False)