In [1]:
pip install nfl_data_py


Note: you may need to restart the kernel to use updated packages.


In [46]:
import nfl_data_py as nfl
import pandas as pd
import os
# Fetch play-by-play rows for every season from 2021 through 2024 (inclusive).
seasons = list(range(2021, 2025))
pbp = nfl.import_pbp_data(seasons)

# Show (rows, columns) of the combined download.
pbp.shape


2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


(198513, 397)

In [48]:
# Save the untouched play-by-play download as CSV.
# NOTE(review): the recorded run of this cell ended in PermissionError [Errno 13].
# The cause is not visible here (possibly the target file being open in another
# program, or a sync lock on the folder) — confirm before re-running.
raw_path = r"C:/Users/visha/OneDrive/Desktop/Data Projects/NFL Strategy Analysis Project/Data/Raw/nfl_raw_2021_2024.csv"
# Create the target folder first so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
pbp.to_csv(raw_path, index=False)



PermissionError: [Errno 13] Permission denied

In [5]:
# Keep only plays whose possession team (`posteam`) is "SF"; copy so later
# edits never write back into `pbp`.
sf = pbp.loc[pbp["posteam"].eq("SF")].copy()
sf.shape

(6127, 397)

In [6]:
# Tally the SF plays by play type, most frequent first.
sf.loc[:, "play_type"].value_counts()


pass           2447
run            2157
kickoff         359
no_play         334
punt            252
extra_point     219
field_goal      154
qb_kneel         68
qb_spike         15
Name: play_type, dtype: int64

In [11]:
# Keep only the rows whose play_type is "run" or "pass".
# (This filters rows; the column selection happens in a later cell.)
sf_off = sf[sf["play_type"].isin(["run", "pass"])].copy()
# Confirm that only the two kept play types remain.
sf_off["play_type"].value_counts()


pass    2447
run     2157
Name: play_type, dtype: int64

In [17]:
# List the column labels of the SF frame (axes[1] is a DataFrame's column axis).
sf.axes[1]

Index(['play_id', 'game_id', 'old_game_id_x', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'was_pressure', 'route', 'defense_man_zone_type',
       'defense_coverage_type', 'offense_names', 'defense_names',
       'offense_positions', 'defense_positions', 'offense_numbers',
       'defense_numbers'],
      dtype='object', length=397)

In [25]:
# Columns kept for the analysis, grouped by theme and concatenated in order.

# Game info
game_info_cols = [
    "game_id", "game_date", "season", "week",
    "posteam", "defteam", "home_team", "away_team",
]

# Situation
situation_cols = [
    "qtr", "down", "ydstogo", "yardline_100",
    "goal_to_go", "score_differential",
    "game_seconds_remaining", "half_seconds_remaining",
    "shotgun", "no_huddle", "side_of_field",
]

# Play type + players
play_and_player_cols = [
    "play_type", "passer", "rusher", "receiver",
    "run_location", "run_gap",
    "pass_location", "pass_length",
]

# EPA / WPA / success and expected-value columns
value_cols = [
    "epa", "wpa", "success",
    "air_yards", "yards_after_catch",
    "xyac_mean_yardage", "xyac_success",
    "xpass",
]

# Results
result_cols = [
    "yards_gained", "first_down", "touchdown",
    "passing_yards", "rushing_yards",
    "pass_touchdown", "rush_touchdown",
    "interception", "sack", "fumble", "fumble_lost",
]

# Description + identifiers
description_and_id_cols = [
    "desc", "play_clock", "game_half",
    "play_id", "drive", "drive_play_id_started", "drive_play_id_ended",
]

cols = (
    game_info_cols
    + situation_cols
    + play_and_player_cols
    + value_cols
    + result_cols
    + description_and_id_cols
)


In [26]:
# Narrow the run/pass frame to the curated column list; copy so the fills in
# later cells modify an independent frame.
sf_clean = sf_off.loc[:, cols].copy()
sf_clean.head(5)

Unnamed: 0,game_id,game_date,season,week,posteam,defteam,home_team,away_team,qtr,down,...,sack,fumble,fumble_lost,desc,play_clock,game_half,play_id,drive,drive_play_id_started,drive_play_id_ended
2694,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,0.0,1.0,1.0,(12:01) 10-J.Garoppolo FUMBLES (Aborted) at SF...,0,Half1,208.0,2.0,208.0,208.0
2701,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,0.0,0.0,0.0,(9:36) 31-R.Mostert left end pushed ob at DET ...,0,Half1,370.0,4.0,370.0,545.0
2702,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,0.0,0.0,0.0,(9:02) 31-R.Mostert right end to DET 39 for 9 ...,0,Half1,396.0,4.0,370.0,545.0
2703,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,2.0,...,0.0,0.0,0.0,(8:26) 10-J.Garoppolo pass short right to 85-G...,0,Half1,417.0,4.0,370.0,545.0
2704,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,0.0,0.0,0.0,(7:39) (Shotgun) 5-T.Lance up the middle to DE...,0,Half1,441.0,4.0,370.0,545.0


In [27]:
# Names from `cols` that are absent from sf_off (an empty list means all exist).
known_columns = set(sf_off.columns)
missing = [name for name in cols if name not in known_columns]
missing


[]

In [28]:
# Missing-value count per column, 20 highest first.
null_counts = sf_clean.isna().sum()
null_counts.sort_values(ascending=False).head(20)


run_gap                  3136
yards_after_catch        3080
passing_yards            3080
rusher                   2558
xyac_mean_yardage        2499
xyac_success             2499
run_location             2464
rushing_yards            2449
receiver                 2370
pass_length              2309
pass_location            2309
air_yards                2309
passer                   2047
xpass                       6
down                        6
drive_play_id_started       0
drive                       0
play_id                     0
game_half                   0
play_clock                  0
dtype: int64

In [29]:
# Fill the rush-only columns, touching run plays only so pass rows keep NaN.
run_rows = sf_clean["play_type"].eq("run")
run_fill_values = {
    "rusher": "Unknown",
    "run_location": "middle",
    "run_gap": "unknown",
    "rushing_yards": 0,
}
for column, fill_value in run_fill_values.items():
    sf_clean.loc[run_rows, column] = sf_clean.loc[run_rows, column].fillna(fill_value)


In [30]:
# Fill the pass-only columns, touching pass plays only so run rows keep NaN.
# NOTE(review): passing_yards is not filled here, whereas rushing_yards is
# filled with 0 on run plays — confirm that difference is intended.
pass_rows = sf_clean["play_type"].eq("pass")
pass_fill_values = {
    "passer": "Unknown",
    "receiver": "Unknown",
    "air_yards": 0,
    "pass_length": "unknown",
    "pass_location": "unknown",
    "yards_after_catch": 0,
}
for column, fill_value in pass_fill_values.items():
    sf_clean.loc[pass_rows, column] = sf_clean.loc[pass_rows, column].fillna(fill_value)


In [31]:
# Replace missing values in the two xyac_* columns with 0 on every row.
for xyac_column in ("xyac_mean_yardage", "xyac_success"):
    sf_clean[xyac_column] = sf_clean[xyac_column].fillna(0)


In [32]:
# Re-check missing-value counts after the run/pass/xyac fills, 20 highest first.
null_counts = sf_clean.isna().sum()
null_counts.sort_values(ascending=False).head(20)


passing_yards            3080
rushing_yards            2447
run_gap                  2447
run_location             2447
rusher                   2447
air_yards                2157
pass_location            2157
receiver                 2157
pass_length              2157
yards_after_catch        2157
passer                   2047
xpass                       6
down                        6
rush_touchdown              0
game_half                   0
xyac_mean_yardage           0
xyac_success                0
sack                        0
drive_play_id_started       0
drive                       0
dtype: int64

In [33]:
# Impute the missing xpass values with the column median.
xpass_median = sf_clean["xpass"].median()
sf_clean["xpass"] = sf_clean["xpass"].fillna(xpass_median)


In [35]:
# Re-check missing-value counts after the xpass fill, 20 highest first.
null_counts = sf_clean.isna().sum()
null_counts.sort_values(ascending=False).head(20)


passing_yards            3080
rushing_yards            2447
run_gap                  2447
run_location             2447
rusher                   2447
air_yards                2157
pass_location            2157
receiver                 2157
pass_length              2157
yards_after_catch        2157
passer                   2047
down                        6
interception                0
rush_touchdown              0
xyac_mean_yardage           0
xyac_success                0
xpass                       0
drive_play_id_started       0
drive                       0
play_id                     0
dtype: int64

In [41]:
# Feature engineering: add analytical features.

# Late down: 3rd or 4th down.
sf_clean["late_down"] = sf_clean["down"].isin([3, 4])

# Red zone: at or inside the opponent's 20-yard line (yardline_100 <= 20).
sf_clean["red_zone"] = sf_clean["yardline_100"].le(20)

# Goal-to-go flag as a boolean.
sf_clean["goal_to_go_flag"] = sf_clean["goal_to_go"].eq(1)


# Score state derived from the score differential.
def _score_state(diff):
    """Label a score differential as "leading", "trailing" or "tied"."""
    if diff > 0:
        return "leading"
    if diff < 0:
        return "trailing"
    return "tied"


sf_clean["score_state"] = sf_clean["score_differential"].apply(_score_state)

# Explosive play: a run gaining 12+ yards or a pass gaining 16+ yards.
run_play_mask = sf_clean["play_type"].eq("run")
pass_play_mask = sf_clean["play_type"].eq("pass")
gained = sf_clean["yards_gained"]
sf_clean["explosive"] = (run_play_mask & gained.ge(12)) | (pass_play_mask & gained.ge(16))

# Success as a boolean.
sf_clean["success_boolean"] = sf_clean["success"].eq(1)

# Pass depth label; rows with no pass_length are tagged "run_play".
sf_clean["pass_detail"] = sf_clean["pass_length"].fillna(value="run_play")
# Distance-to-go bucket (short / medium / long).
def distance_bucket(x):
    """Classify `x` as "short" (<= 2), "medium" (<= 6) or "long" (anything else)."""
    for upper_bound, label in ((2, "short"), (6, "medium")):
        if x <= upper_bound:
            return label
    return "long"

# Bucket each play's yards-to-go with distance_bucket.
yards_to_go = sf_clean["ydstogo"]
sf_clean["distance_bucket"] = yards_to_go.apply(distance_bucket)

# Field-position bucket keyed on yardline_100.
def field_bucket(x):
    """Classify a yardline_100 value into a field-position label.

    Returns "deep_in_own" for x >= 80, "own_side" for 50 <= x < 80,
    "midfield" for 20 < x < 50, and "red_zone" otherwise.

    The "midfield" bound is strict (> 20) so that exactly 20 lands in
    "red_zone", matching the notebook's `red_zone` flag, which is defined as
    `yardline_100 <= 20`.
    """
    if x >= 80:
        return "deep_in_own"
    elif x >= 50:
        return "own_side"
    elif x > 20:
        return "midfield"
    else:
        return "red_zone"

# Bucket each play's field position with field_bucket.
yardline = sf_clean["yardline_100"]
sf_clean["field_bucket"] = yardline.apply(field_bucket)
# Game-phase bucket (early / mid / late) from the seconds remaining in the game.
def game_phase(x):
    """Return "early" when `x` > 1800, "mid" when `x` > 900, otherwise "late".

    Presumably `x` counts down from 3600 in regulation, making these the first
    half, third quarter and fourth quarter — confirm against the data source.
    """
    return "early" if x > 1800 else ("mid" if x > 900 else "late")

# Label each play's game phase from the seconds left in the game.
seconds_left = sf_clean["game_seconds_remaining"]
sf_clean["game_phase"] = seconds_left.apply(game_phase)

# Pass direction label; rows with no pass_location are tagged "run_play".
sf_clean["pass_side"] = sf_clean["pass_location"].fillna(value="run_play")


In [42]:
# Preview the first ten rows, including the engineered feature columns.
sf_clean.iloc[:10]


Unnamed: 0,game_id,game_date,season,week,posteam,defteam,home_team,away_team,qtr,down,...,red_zone,goal_to_go_flag,score_state,explosive,success_boolean,pass_detail,distance_bucket,field_bucket,game_phase,pass_side
2694,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,False,False,tied,False,False,run_play,long,own_side,early,run_play
2701,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,False,False,tied,False,True,run_play,long,own_side,early,run_play
2702,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,False,False,tied,False,True,run_play,long,midfield,early,run_play
2703,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,2.0,...,False,False,tied,True,True,short,short,midfield,early,right
2704,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,True,False,tied,False,False,run_play,long,red_zone,early,run_play
2705,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,2.0,...,True,False,tied,False,False,run_play,long,red_zone,early,run_play
2706,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,3.0,...,True,False,tied,False,True,short,medium,red_zone,early,left
2707,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,1.0,1.0,...,True,True,tied,False,True,short,medium,red_zone,early,left
2727,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,2.0,1.0,...,False,False,tied,False,False,run_play,long,own_side,early,run_play
2728,2021_01_SF_DET,2021-09-12,2021,1,SF,DET,DET,SF,2.0,2.0,...,False,False,tied,True,True,deep,long,own_side,early,middle


In [50]:
# Save the cleaned, feature-enriched run/pass frame as CSV.
clean_path = r"C:/Users/visha/OneDrive/Desktop/Data Projects/NFL Strategy Analysis Project/Data/Processed/49ers_clean_2021_2024.csv"
# Create the target folder first so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(clean_path), exist_ok=True)
sf_clean.to_csv(clean_path, index=False)