In [17]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Data Collection: Load the Dataset

In [27]:
# Load the 2015 NFL play-by-play data from the notebook's working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell ends with a warning pointing at
# the read_csv line -- presumably a DtypeWarning about mixed types; confirm.
df = pd.read_csv('NFLPlaybyPlay2015.csv', low_memory=False)

# Quick sanity check of what was loaded: dimensions, column names, first rows.
print("DataFrame shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())

DataFrame shape: (46129, 66)
Columns: ['Unnamed: 0', 'Date', 'GameID', 'Drive', 'qtr', 'down', 'time', 'TimeUnder', 'TimeSecs', 'PlayTimeDiff', 'SideofField', 'yrdln', 'yrdline100', 'ydstogo', 'ydsnet', 'GoalToGo', 'FirstDown', 'posteam', 'DefensiveTeam', 'desc', 'PlayAttempted', 'Yards.Gained', 'sp', 'Touchdown', 'ExPointResult', 'TwoPointConv', 'DefTwoPoint', 'Safety', 'PuntResult', 'PlayType', 'Passer', 'PassAttempt', 'PassOutcome', 'PassLength', 'PassLocation', 'InterceptionThrown', 'Interceptor', 'Rusher', 'RushAttempt', 'RunLocation', 'RunGap', 'Receiver', 'Reception', 'ReturnResult', 'Returner', 'BlockingPlayer', 'Tackler1', 'Tackler2', 'FieldGoalResult', 'FieldGoalDistance', 'Fumble', 'RecFumbTeam', 'RecFumbPlayer', 'Sack', 'Challenge.Replay', 'ChalReplayResult', 'Accepted.Penalty', 'PenalizedTeam', 'PenaltyType', 'PenalizedPlayer', 'Penalty.Yards', 'PosTeamScore', 'DefTeamScore', 'ScoreDiff', 'AbsScoreDiff', 'Season']
   Unnamed: 0        Date      GameID  Drive  qtr  down   t

  df = pd.read_csv('NFLPlaybyPlay2015.csv')


Data Cleaning: Missing Values and Inconsistencies

Key columns:
1. Yards.Gained: Yards gained by team CURRENTLY in offensive possession
2. time (or TimeSecs): Time remaining on the clock (e.g. 14:21). Alternatively, TimeSecs gives the seconds remaining (e.g. 3561)
3. PosTeamScore: Score of team CURRENTLY in offensive possession
4. DefTeamScore: Score of team CURRENTLY on defense
5. posteam: Team currently in offensive possession
6. DefensiveTeam: Team currently on defense 
7. GameID: Indicates which game

In [28]:
# Columns the analysis relies on; later cells reuse this list by name.
key_columns = [
    'Yards.Gained',
    'TimeSecs',
    'PosTeamScore',
    'DefTeamScore',
    'posteam',
    'DefensiveTeam',
    'GameID',
]

# Per-column count of NaN entries, restricted to the key columns.
missing_values = df.loc[:, key_columns].isna().sum()
print("Missing values in key columns:\n", missing_values)

Missing values in key columns:
 Yards.Gained        0
TimeSecs           27
PosTeamScore     3251
DefTeamScore     3251
posteam          3251
DefensiveTeam    3251
GameID              0
dtype: int64


In [29]:
# Drop every row that is missing a value in any key column.
# The subset comes from `key_columns` (defined in an earlier cell) rather than a
# hand-picked three-column list, so 'posteam' and 'DefensiveTeam' are covered
# explicitly instead of only because they are null on the same rows as the
# score columns in this particular file.
df_clean = df.dropna(subset=key_columns)
print("Shape after dropping rows with missing key-column values:", df_clean.shape)

Shape after dropping rows with missing 'TimeSecs', 'PosTeamScore', or 'DefTeamScore': (42878, 66)


In [30]:
# Re-count NaNs in the key columns on the cleaned frame to confirm the drop
# worked; each count is expected to be 0.
missing_after = df_clean.loc[:, key_columns].isna().sum()
print("\nMissing values in key columns after cleaning:\n", missing_after)


Missing values in key columns after cleaning:
 Yards.Gained     0
TimeSecs         0
PosTeamScore     0
DefTeamScore     0
posteam          0
DefensiveTeam    0
GameID           0
dtype: int64


In [31]:
# Keep only plays whose yardage lies in the closed range [-100, 100]; anything
# outside that range is treated as unrealistic and discarded.
# Series.between is inclusive on both ends by default, matching >= / <=.
df_clean = df_clean.loc[df_clean['Yards.Gained'].between(-100, 100)]
print("Shape after filtering unrealistic yardage values:", df_clean.shape)

Shape after filtering unrealistic yardage values: (42878, 66)


Train-Test Split

In [None]:
# Since we're working with sequential data (each game is a sequence), plan a split on the game level to avoid data leakage