In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nfl-pbp-2021-2023/pbp_2021_2023.csv


['play_id',
 'game_id',
 'old_game_id',
 'home_team',
 'away_team',
 'season_type',
 'week',
 'posteam',
 'posteam_type',
 'defteam',
 'side_of_field',
 'yardline_100',
 'game_date',
 'quarter_seconds_remaining',
 'half_seconds_remaining',
 'game_seconds_remaining',
 'game_half',
 'quarter_end',
 'drive',
 'sp',
 'qtr',
 'down',
 'goal_to_go',
 'time',
 'yrdln',
 'ydstogo',
 'ydsnet',
 'desc',
 'play_type',
 'yards_gained',
 'shotgun',
 'no_huddle',
 'qb_dropback',
 'qb_kneel',
 'qb_spike',
 'qb_scramble',
 'pass_length',
 'pass_location',
 'air_yards',
 'yards_after_catch',
 'run_location',
 'run_gap',
 'field_goal_result',
 'kick_distance',
 'extra_point_result',
 'two_point_conv_result',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'td_team',
 'td_player_name',
 'td_player_id',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'total_home_score',
 'total_away_score',
 'posteam_score',
 'defteam_score',
 'score_differential',
 'po

In [16]:
import pandas as pd

# ---------- 1. Helper: seconds -> "M:SS" ----------
def sec_to_mmss(x):
    if pd.isna(x):
        return None
    x = int(x)
    m, s = divmod(x, 60)
    return f"{m}:{s:02d}"   # e.g. 90 -> "1:30"


# ---------- 2. Filter to 2023 season ----------
df_2023 = df[df["season"] == 2023].copy()

# Add formatted quarter time (M:SS) from quarter_seconds_remaining
df_2023["qtr_time"] = df_2023["quarter_seconds_remaining"].apply(sec_to_mmss)


# ---------- 3. Define run vs pass ----------
runs = df_2023[df_2023["rush_attempt"] == 1]
passes = df_2023[df_2023["pass_attempt"] == 1]


# ---------- 4. Situations ----------

# Run – 3rd and short (1–3 yards)
runs_3rd_short = runs[
    (runs["down"] == 3) &
    (runs["ydstogo"].between(1, 3))
]

# Run – goal line
runs_goal_line = runs[
    (runs["goal_to_go"] == 1)
]

# Run – 1st and 10
runs_1st_10 = runs[
    (runs["down"] == 1) &
    (runs["ydstogo"] == 10)
]

# Pass – 3rd and long (7+ yards)
passes_3rd_long = passes[
    (passes["down"] == 3) &
    (passes["ydstogo"] >= 7)
]


# ---------- 5. Combine situations and sample 10 plays ----------

situations = pd.concat(
    [runs_3rd_short, runs_goal_line, runs_1st_10, passes_3rd_long],
    axis=0
)

situations = situations[situations["yards_gained"] >= 15]

# In case there are fewer than 10 rows, be safe:
sample_n = min(10, len(situations))
sample_10 = situations.sample(n=sample_n, random_state=42)

# ---------- 6. Keep / rename columns to what you described ----------

subset = sample_10[
    [
        "game_id",
        "play_id",
        "home_team",
        "away_team",
        "week",
        "qtr",
        "quarter_seconds_remaining",
        "qtr_time",     # new formatted time "M:SS"
        "down",
        "ydstogo",
        "yrdln",
        "yardline_100",
        "goal_to_go",
    ]
].rename(
    columns={
        "quarter_seconds_remaining": "qtr_seconds_remaining",
        "ydstogo": "yrds_to_go",
        "yrdln": "yrdline",
        "yardline_100": "yardline100",
    }
)

print(subset)

subset.to_csv("sample_plays_2023.csv", index=False)

                game_id  play_id home_team away_team  week  qtr  \
123884  2023_10_DEN_BUF     3625       BUF       DEN    10    4   
147590   2023_19_PHI_TB     3737        TB       PHI    19    4   
135311   2023_14_SEA_SF       56        SF       SEA    14    1   
144590   2023_18_CHI_GB     3147        GB       CHI    18    4   
116649   2023_07_GB_DEN     2499       DEN        GB     7    3   
132879  2023_13_SEA_DAL     1995       DAL       SEA    13    2   
147462   2023_19_PHI_TB      603        TB       PHI    19    1   
105587   2023_03_CHI_KC      676        KC       CHI     3    1   
130215  2023_12_PIT_CIN      759       CIN       PIT    12    1   
117277  2023_07_MIA_PHI      723       PHI       MIA     7    1   

        qtr_seconds_remaining qtr_time  down  yrds_to_go yrdline  yardline100  \
123884                  702.0    11:42   3.0          10  DEN 46         54.0   
147590                  347.0     5:47   3.0           7  PHI 23         23.0   
135311             

In [14]:
# Cell 1: Imports and load nflfastR-style play-by-play data from Kaggle

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load your uploaded play-by-play data
data_path = "/kaggle/input/nfl-pbp-2021-2023/pbp_2021_2023.csv"

df = pd.read_csv(data_path, low_memory=False)

# Quick sanity check on key columns
df[["season", "week", "game_id", "play_id", "play_type", "down", "ydstogo"]].head()


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,season,week,game_id,play_id,play_type,down,ydstogo
0,2021,1,2021_01_ARI_TEN,1,,,0
1,2021,1,2021_01_ARI_TEN,40,kickoff,,0
2,2021,1,2021_01_ARI_TEN,55,run,1.0,10
3,2021,1,2021_01_ARI_TEN,76,pass,2.0,13
4,2021,1,2021_01_ARI_TEN,100,pass,3.0,10


In [9]:
import pandas as pd

# 1. Filter to 2023 season
df_2023 = df[df["season"] == 2023]

# 2. Define run vs pass
runs = df_2023[df_2023["rush_attempt"] == 1]
passes = df_2023[df_2023["pass_attempt"] == 1]

# 3. Situations

# Run – 3rd and short (1–3 yards)
runs_3rd_short = runs[
    (runs["down"] == 3) &
    (runs["ydstogo"].between(1, 3))
]

# Run – goal line
runs_goal_line = runs[
    (runs["goal_to_go"] == 1)
]

# Run – 1st and 10
runs_1st_10 = runs[
    (runs["down"] == 1) &
    (runs["ydstogo"] == 10)
]

# Pass – 3rd and long (7+ yards)
passes_3rd_long = passes[
    (passes["down"] == 3) &
    (passes["ydstogo"] >= 7)
]

# 4. Combine all these situations
situations = pd.concat(
    [runs_3rd_short, runs_goal_line, runs_1st_10, passes_3rd_long],
    axis=0
)

# 5. Sample 10 random plays
sample_10 = situations.sample(n=10, random_state=42)

# 6. Keep only the columns you care about
cols = [
    "game_id",
    "play_id",
    "home_team",
    "away_team",
    "week",
    "qtr",
    "quarter_seconds_remaining",  # your qtr_seconds_reamining
    "down",
    "ydstogo",                    # yrds_to_go
    "yrdln",                      # yrdline
    "yardline_100",               # yardline100
    "goal_to_go",
]


sample_10 = sample_10[cols]

print(sample_10)


                game_id  play_id home_team away_team  week  qtr  \
108343  2023_04_BAL_CLE     3160       CLE       BAL     4    3   
131150  2023_13_ATL_NYJ     3361       NYJ       ATL    13    3   
127603   2023_11_SEA_LA       97        LA       SEA    11    1   
133424   2023_14_CAR_NO     1603        NO       CAR    14    2   
120773  2023_09_ARI_CLE      575       CLE       ARI     9    1   
133795  2023_14_DET_CHI     2377       CHI       DET    14    3   
123714  2023_10_CLE_BAL     3848       BAL       CLE    10    4   
127479  2023_11_PIT_CLE     1486       CLE       PIT    11    2   
117655   2023_07_SF_MIN     1962       MIN        SF     7    3   
124584   2023_10_IND_NE     2878        NE       IND    10    4   

        quarter_seconds_remaining  down  ydstogo   yrdln  yardline_100  \
108343                      161.0   3.0       11  BAL 35          65.0   
131150                       26.0   1.0       10  ATL 32          68.0   
127603                      835.0   3.0 

In [None]:
print(len(df.columns), "columns total:\n")
print(df.columns.tolist())

In [None]:
# Cell 2: Filter to run/pass plays and create binary target is_pass

# See what play_type values look like
print("Unique play types:", df["play_type"].dropna().unique()[:20])

# Keep only plays that are clearly runs or passes
# (Most nflfastR-style data uses "run" and "pass")
df_model = df[df["play_type"].isin(["run", "pass"])].copy()

# Create target: 1 = pass, 0 = run
df_model["is_pass"] = (df_model["play_type"] == "pass").astype(int)

# Quick check of class balance
print("\nClass balance (0 = run, 1 = pass):")
print(df_model["is_pass"].value_counts(normalize=True))

df_model[["season", "week", "play_type", "down", "ydstogo", "is_pass"]].head()


In [None]:
# New Cell 3 (no leakage): feature engineering + build X and y

df_fe = df_model.copy()

# ---- Red zone feature (known pre-snap from ball position) ----
if "yardline_100" in df_fe.columns:
    df_fe["is_red_zone"] = (df_fe["yardline_100"] <= 20).astype(int)
else:
    print("Warning: yardline_100 not found, no red zone feature")

# ---- Goal-to-go feature ----
if "goal_to_go" in df_fe.columns:
    df_fe["is_goal_to_go"] = df_fe["goal_to_go"].fillna(0).astype(int)

# ---- Distance (short/medium/long) features ----
if "ydstogo" in df_fe.columns:
    df_fe["short_ydstogo"] = (df_fe["ydstogo"] <= 3).astype(int)
    df_fe["medium_ydstogo"] = df_fe["ydstogo"].between(4, 7).astype(int)
    df_fe["long_ydstogo"] = (df_fe["ydstogo"] >= 8).astype(int)

# ---- Offensive "formation-ish" features that are pre-snap ----
for col in ["shotgun", "no_huddle"]:
    if col in df_fe.columns:
        # Usually 0/1 or NaN; force to 0/1 ints
        df_fe[col] = df_fe[col].fillna(0).astype(int)

# IMPORTANT: do NOT use qb_dropback, pass_attempt, rush_attempt, etc. -> leakage

# ---- Score and score state features ----
if "score_differential" in df_fe.columns:
    df_fe["score_differential"] = df_fe["score_differential"].fillna(0)
    df_fe["is_trailing"] = (df_fe["score_differential"] < 0).astype(int)
    df_fe["is_tied"] = (df_fe["score_differential"] == 0).astype(int)
    df_fe["is_leading"] = (df_fe["score_differential"] > 0).astype(int)

# ---- Time/quarter context ----
if "qtr" in df_fe.columns:
    df_fe["is_fourth_qtr"] = (df_fe["qtr"] == 4).astype(int)

if "half_seconds_remaining" in df_fe.columns:
    # Inside 2-minute warning of the half
    df_fe["late_half"] = (df_fe["half_seconds_remaining"] < 120).astype(int)

# ---- Home vs away offense ----
if {"posteam", "home_team"}.issubset(df_fe.columns):
    df_fe["is_home_offense"] = (df_fe["posteam"] == df_fe["home_team"]).astype(int)

# ---- Base numeric features we still want (all pre-snap) ----
base_numeric_cols = []
for col in ["down", "ydstogo", "yardline_100", "game_seconds_remaining"]:
    if col in df_fe.columns:
        base_numeric_cols.append(col)

# ---- Collect engineered feature columns that exist ----
engineered_candidates = [
    "is_red_zone",
    "is_goal_to_go",
    "short_ydstogo",
    "medium_ydstogo",
    "long_ydstogo",
    "shotgun",
    "no_huddle",
    "score_differential",
    "is_trailing",
    "is_tied",
    "is_leading",
    "is_fourth_qtr",
    "late_half",
    "is_home_offense",
]

engineered_cols = [c for c in engineered_candidates if c in df_fe.columns]

feature_cols = base_numeric_cols + engineered_cols

print("Base numeric features:", base_numeric_cols)
print("Engineered features:", engineered_cols)
print("Total features used:", len(feature_cols))

# ---- Drop rows with missing values in our features or target ----
df_fe = df_fe.dropna(subset=feature_cols + ["is_pass"])

# ---- Build X and y ----
X = df_fe[feature_cols]
y = df_fe["is_pass"]

print("X shape:", X.shape)
print("y shape:", y.shape)

X.head()


In [None]:
# Cell 4: Train/test split, scaling, and logistic regression training

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,    # keep run/pass balance similar in train and test
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# Scale features (fit on train, apply to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

print("Model trained!")


In [None]:
# Cell 4b: Hyperparameter tuning for Logistic Regression

from sklearn.model_selection import GridSearchCV

# Base model (no need to set C here, we'll tune it)
base_log_reg = LogisticRegression(max_iter=1000)

# Hyperparameter grid:
# - C controls regularization strength (lower = stronger regularization)
# - class_weight "balanced" can help if classes are imbalanced
param_grid = {
    "C": [0.01, 0.1, 1.0, 10.0],
    "class_weight": [None, "balanced"],
}

grid_search = GridSearchCV(
    estimator=base_log_reg,
    param_grid=param_grid,
    cv=3,                 # 3-fold cross-validation on the training set
    scoring="f1",         # optimize F1 for class 1 (pass) by default
    n_jobs=-1,            # use all available cores
    verbose=1
)

grid_search.fit(X_train_scaled, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV score (F1):", grid_search.best_score_)

# This will be our final logistic regression model
log_reg = grid_search.best_estimator_


In [None]:
# Cell 5: Evaluation – confusion matrix and classification report

from sklearn.metrics import classification_report, confusion_matrix

# Predictions
y_pred = log_reg.predict(X_test_scaled)

print("Confusion matrix (rows = true, cols = predicted):")
print(confusion_matrix(y_test, y_pred))

print("\nClassification report (0 = run, 1 = pass):")
print(classification_report(y_test, y_pred, target_names=["run", "pass"]))


In [None]:
# Cell: Save model, scaler, and feature columns for use in Streamlit

import joblib
import json
import os

os.makedirs("artifacts", exist_ok=True)

# 1. Save the trained logistic regression model
joblib.dump(log_reg, "artifacts/log_reg_model.pkl")

# 2. Save the scaler
joblib.dump(scaler, "artifacts/scaler.pkl")

# 3. Save the feature column order (this is IMPORTANT)
feature_cols_list = list(X.columns)
with open("artifacts/feature_cols.json", "w") as f:
    json.dump(feature_cols_list, f)

print("Saved artifacts:", os.listdir("artifacts"))
