# Blitz Data Acquisition

## Objective
Extract NFL play-by-play data from NFLfastR and prepare blitz prediction dataset.

## Data Pipeline
1. Load raw PBP data from NFLfastR
2. Extract required columns for blitz model
3. Clean data (handle missing values, remove invalid rows)
4. Save to processed directory

In [11]:
import sys
import logging
from pathlib import Path

import pandas as pd
import numpy as np

# Setup paths
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Import from src
from src.utils.config import (
    BLITZ_COLUMNS,
    RAW_DATA_PATH,
    PROCESSED_DATA_PATH,
    BLITZ_TARGET,
)
from src.data.load_data import load_nfl_pbp, extract_blitz_features
from src.data.clean_data import clean_blitz_data, validate_blitz_data, get_class_distribution

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create data directories
RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Raw data path: {RAW_DATA_PATH}")
print(f"Processed data path: {PROCESSED_DATA_PATH}")

Project root: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor
Raw data path: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\raw
Processed data path: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed


In [14]:
# Install missing dependencies for nfl_data_py
import subprocess
import sys

# Install required dependencies
subprocess.check_call([sys.executable, "-m", "pip", "install", "appdirs", "requests", "-q"])
print("✓ Dependencies installed")

# Verify nfl_data_py is installed
try:
    import nfl_data_py as nfl
    print("✓ nfl_data_py is available")
except ImportError as e:
    print(f"✗ nfl_data_py import failed: {e}")


✓ Dependencies installed
✓ nfl_data_py is available


In [9]:
import subprocess
import sys

# Install nfl_data_py without rebuilding dependencies
subprocess.check_call([sys.executable, "-m", "pip", "install", "nfl_data_py", "--no-deps", "-q"])
print("nfl_data_py installed successfully!")


nfl_data_py installed successfully!


## Step 1: Load NFL Play-by-Play Data from NFLfastR

Loading 3 seasons (2021-2023) to get sufficient data for model training.

In [70]:
# Reload module
import importlib
import sys
if 'src.data.load_data' in sys.modules:
    importlib.reload(sys.modules['src.data.load_data'])
    from src.data.load_data import load_nfl_pbp

# Select seasons to load
seasons = [2021, 2022, 2023]

try:
    # Load raw PBP data
    pbp_raw = load_nfl_pbp(
        seasons=seasons,
        columns=BLITZ_COLUMNS,
    )
    
    print(f"\nLoaded {len(pbp_raw)} total plays")
    print(f"Sample of columns: {list(pbp_raw.columns[:20])}")
    print(f"'number_of_pass_rushers' in pbp_raw: {'number_of_pass_rushers' in pbp_raw.columns}")
    print(f"'blitz' in pbp_raw: {'blitz' in pbp_raw.columns}")
    
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
    pbp_raw = None


INFO:src.data.load_data:Loading NFL PBP data for seasons: [2021, 2022, 2023]


2021 done.
2022 done.
2023 done.
Downcasting floats.


INFO:src.data.load_data:Loaded 149021 plays
INFO:src.data.load_data:Filtered to 106796 offensive plays
INFO:src.data.load_data:Created blitz feature from number_of_pass_rushers



Loaded 106796 total plays
Sample of columns: ['play_id', 'game_id', 'old_game_id_x', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp']
'number_of_pass_rushers' in pbp_raw: True
'blitz' in pbp_raw: True


In [71]:
# Generate sample coverage data with actual play_ids from pbp_raw
if pbp_raw is not None:
    import numpy as np
    
    np.random.seed(42)
    
    # Get sample of actual play_ids
    play_id_sample = pbp_raw['play_id'].dropna().unique()
    print(f"Available {len(play_id_sample)} unique play_ids from PBP data")
    
    coverage_types = ['Cover 0 Man', 'Cover 1 Man', 'Cover 2 Zone', 'Cover 2 Man', 'Cover 3 Zone', 'Cover 4 Zone']
    
    # Generate 800 coverage records
    game_ids = []
    play_ids_list = []
    coverages = []
    
    for _ in range(800):
        game_ids.append(np.random.randint(202100000, 202399999))
        play_ids_list.append(np.random.choice(play_id_sample))
        coverages.append(np.random.choice(coverage_types))
    
    coverage_df = pd.DataFrame({
        'gameId': game_ids,
        'playId': play_ids_list,
        'coverage': coverages
    })
    
    # Save coverage data
    coverage_path = PROCESSED_DATA_PATH / "coverages_week1.csv"
    coverage_df.to_csv(coverage_path, index=False)
    
    print(f"\n✓ Created sample coverage dataset: {coverage_path}")
    print(f"  Records: {len(coverage_df)}")
    print(f"  Coverage types: {coverage_df['coverage'].unique().tolist()}")
    print(f"\nFirst 5 rows:")
    print(coverage_df.head())


Available 4844 unique play_ids from PBP data

✓ Created sample coverage dataset: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\coverages_week1.csv
  Records: 800
  Coverage types: [np.str_('Cover 2 Zone'), np.str_('Cover 3 Zone'), np.str_('Cover 2 Man'), np.str_('Cover 4 Zone'), np.str_('Cover 0 Man'), np.str_('Cover 1 Man')]

First 5 rows:
      gameId  playId      coverage
0  202221958  2325.0  Cover 2 Zone
1  202219879   968.0  Cover 3 Zone
2  202154886  3286.0  Cover 2 Zone
3  202187498  2579.0   Cover 2 Man
4  202291335  4204.0  Cover 4 Zone


## Step 2: Extract Blitz Features

Select only the columns needed for the blitz model.

In [72]:
# Extract blitz features
if pbp_raw is not None:
    pbp_features = extract_blitz_features(pbp_raw, BLITZ_COLUMNS)
    
    print(f"\nFeatures extracted shape: {pbp_features.shape}")
    print(f"Columns: {list(pbp_features.columns)}")
    print(f"\nMissing values before cleaning:")
    print(pbp_features.isnull().sum())
    print(f"\nData types:")
    print(pbp_features.dtypes)
else:
    print("Error: pbp_raw is not defined. Please run the previous cell first.")
    pbp_features = None


INFO:src.data.load_data:Extracting blitz features...
INFO:src.data.load_data:Extracted features shape: (106796, 12)



Features extracted shape: (106796, 12)
Columns: ['down', 'ydstogo', 'yardline_100', 'quarter', 'game_seconds_remaining', 'score_differential', 'offense_personnel', 'defense_personnel', 'formation', 'shotgun', 'motion', 'blitz']

Missing values before cleaning:
down                        410
ydstogo                       0
yardline_100                  0
quarter                       0
game_seconds_remaining        0
score_differential            0
offense_personnel             0
defense_personnel             0
formation                 49092
shotgun                       0
motion                        0
blitz                         0
dtype: int64

Data types:
down                      float32
ydstogo                   float32
yardline_100              float32
quarter                   float32
game_seconds_remaining    float32
score_differential        float32
offense_personnel          object
defense_personnel          object
formation                  object
shotgun               

## Step 3: Clean Data

Handle missing values and ensure data quality.

In [73]:
# Clean blitz data
pbp_cleaned = clean_blitz_data(pbp_features, target_col=BLITZ_TARGET)

print(f"\nCleaned data shape: {pbp_cleaned.shape}")
print(f"\nRemaining missing values:")
print(pbp_cleaned.isnull().sum())

# Get class distribution
class_dist = get_class_distribution(pbp_cleaned, target_col=BLITZ_TARGET)
print(f"\nClass distribution: {class_dist}")

INFO:src.data.clean_data:Starting data cleaning. Shape: (106796, 12)
INFO:src.data.clean_data:Removed 0 rows with missing target
INFO:src.data.clean_data:Removed 0 rows with all null features
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clean_df[col].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object


Cleaned data shape: (106796, 12)

Remaining missing values:
down                      410
ydstogo                     0
yardline_100                0
quarter                     0
game_seconds_remaining      0
score_differential          0
offense_personnel           0
defense_personnel           0
formation                   0
shotgun                     0
motion                      0
blitz                       0
dtype: int64

Class distribution: {'counts': {0: 89747, 1: 17049}, 'percentages': {0: 84.03591894827521, 1: 15.964081051724785}}


## Step 4: Validate Data

Ensure all required columns are present and valid.

In [24]:
# Validate cleaned data
try:
    validate_blitz_data(pbp_cleaned, BLITZ_COLUMNS)
    print("\n✓ Data validation passed!")
except ValueError as e:
    print(f"\n✗ Validation error: {e}")

down    410
dtype: int64
INFO:src.data.clean_data:Data validation passed



✓ Data validation passed!


## Step 5: Save Cleaned Data

Save the cleaned dataset to the processed directory for next phase.

In [26]:
# Save cleaned data
output_file = PROCESSED_DATA_PATH / "blitz_data_cleaned.csv"
pbp_cleaned.to_csv(output_file, index=False)

print(f"\n✓ Saved cleaned data to: {output_file}")
print(f"  Shape: {pbp_cleaned.shape}")
print(f"  Size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")

# Save data info
info_file = PROCESSED_DATA_PATH / "blitz_data_info.txt"
with open(info_file, "w") as f:
    f.write("Blitz Model Dataset Info\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"Total plays: {len(pbp_cleaned)}\n")
    f.write(f"Features: {len(pbp_cleaned.columns) - 1}\n")
    f.write(f"Blitz plays: {(pbp_cleaned[BLITZ_TARGET] == 1).sum()}\n")
    f.write(f"No blitz plays: {(pbp_cleaned[BLITZ_TARGET] == 0).sum()}\n")
    f.write(f"\nColumns: {', '.join(pbp_cleaned.columns)}\n")

print(f"✓ Saved info to: {info_file}")


✓ Saved cleaned data to: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\blitz_data_cleaned.csv
  Shape: (106796, 12)
  Size: 10.14 MB
✓ Saved info to: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\blitz_data_info.txt


In [74]:
# --- Integrate Coverage Labels from coverages_week1.csv ---
print("=" * 70)
print("COVERAGE DATA INTEGRATION")
print("=" * 70)

# Path to coverage file
coverage_file = project_root / "data" / "processed" / "coverages_week1.csv"
coverage_labels = pd.read_csv(coverage_file)
print(f"\nLoaded {len(coverage_labels)} coverage labels")

# Add game_id and play_id from pbp_raw (which still has them)
if pbp_raw is not None and len(pbp_raw) == len(pbp_cleaned):
    print(f"✓ Adding game_id and play_id from pbp_raw...")
    pbp_cleaned['game_id'] = pbp_raw['game_id'].values
    pbp_cleaned['play_id'] = pbp_raw['play_id'].values
    print(f"✓ pbp_cleaned now has {len(pbp_cleaned.columns)} columns")
    print(f"  Columns: {list(pbp_cleaned.columns)}")

# Standardize coverage data
coverage_labels.columns = [c.strip().lower() for c in coverage_labels.columns]
coverage_labels = coverage_labels.rename(columns={
    'gameid': 'game_id',
    'playid': 'play_id',
    'coverage': 'coverage_shell'
})

# Simplify coverage names
def simplify_coverage(cov):
    cov = str(cov).strip().title()
    if '0' in cov: return 'Cover 0'
    if '1' in cov: return 'Cover 1'
    if '2' in cov: return 'Cover 2'
    if '3' in cov: return 'Cover 3'
    if '4' in cov: return 'Cover 4'
    if '6' in cov: return 'Cover 4'
    return cov

coverage_labels['coverage_shell'] = coverage_labels['coverage_shell'].apply(simplify_coverage)

# Merge coverage data (join on play_id - simple approach for sample data)
print(f"\nMerging coverage data...")
print(f"  pbp_cleaned: {pbp_cleaned.shape}")
print(f"  coverage_labels: {coverage_labels.shape}")

pbp_cleaned = pbp_cleaned.merge(
    coverage_labels[['play_id', 'coverage_shell']],
    on='play_id',
    how='left',
    suffixes=('', '_coverage')
)

print(f"\n✓ Merge complete!")
print(f"  pbp_cleaned shape after merge: {pbp_cleaned.shape}")
print(f"  coverage_shell rows with data: {pbp_cleaned['coverage_shell'].notna().sum()}")

if pbp_cleaned['coverage_shell'].notna().sum() > 0:
    print(f"\n  Coverage shell distribution:")
    print(pbp_cleaned['coverage_shell'].value_counts(dropna=False).to_string())
else:
    print(f"\n  ⚠ No coverage data merged (may need better key matching)")

COVERAGE DATA INTEGRATION

Loaded 800 coverage labels
✓ Adding game_id and play_id from pbp_raw...
✓ pbp_cleaned now has 14 columns
  Columns: ['down', 'ydstogo', 'yardline_100', 'quarter', 'game_seconds_remaining', 'score_differential', 'offense_personnel', 'defense_personnel', 'formation', 'shotgun', 'motion', 'blitz', 'game_id', 'play_id']

Merging coverage data...
  pbp_cleaned: (106796, 14)
  coverage_labels: (800, 3)

✓ Merge complete!
  pbp_cleaned shape after merge: (108433, 15)
  coverage_shell rows with data: 17574

  Coverage shell distribution:
coverage_shell
NaN        90859
Cover 2     5717
Cover 3     3156
Cover 0     3061
Cover 1     3029
Cover 4     2611


In [75]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Define feature groups
num_features = [
    "down",
    "ydstogo",
    "yardline_100",
    "quarter",
    "game_seconds_remaining",
    "score_differential"
]

cat_features = [
    "offense_personnel",
    "defense_personnel",
    "formation"
]

binary_features = [
    "shotgun",
    "motion"
]

print("Feature Pipeline Configuration:")
print(f"\nNumerical features ({len(num_features)}): {num_features}")
print(f"\nCategorical features ({len(cat_features)}): {cat_features}")
print(f"\nBinary features ({len(binary_features)}): {binary_features}")

# Create the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
        ('binary', 'passthrough', binary_features)
    ],
    remainder='drop'  # Drop any other columns
)

print("\n✓ ColumnTransformer pipeline created")
print("  - Numerical: StandardScaler")
print("  - Categorical: OneHotEncoder(handle_unknown='ignore')")
print("  - Binary: Passed through unchanged")

Feature Pipeline Configuration:

Numerical features (6): ['down', 'ydstogo', 'yardline_100', 'quarter', 'game_seconds_remaining', 'score_differential']

Categorical features (3): ['offense_personnel', 'defense_personnel', 'formation']

Binary features (2): ['shotgun', 'motion']

✓ ColumnTransformer pipeline created
  - Numerical: StandardScaler
  - Categorical: OneHotEncoder(handle_unknown='ignore')
  - Binary: Passed through unchanged


In [76]:
# Fit the preprocessor on cleaned data
print("\nFitting preprocessor on cleaned data...")

# Prepare features and target
X = pbp_cleaned[num_features + cat_features + binary_features].copy()
y = pbp_cleaned[BLITZ_TARGET].copy()

print(f"  Input shape: {X.shape}")
print(f"  Target shape: {y.shape}")

# Fit the preprocessor
X_transformed = preprocessor.fit_transform(X)

print(f"\n✓ Preprocessor fitted successfully")
print(f"  Output shape after transformation: {X_transformed.shape}")
print(f"  Features expanded from {X.shape[1]} to {X_transformed.shape[1]} (due to one-hot encoding)")

# Display feature names after transformation
feature_names = []

# Numerical features
feature_names.extend(num_features)

# Categorical features (one-hot encoded)
for cat_col in cat_features:
    unique_vals = pbp_cleaned[cat_col].unique()
    for val in sorted(unique_vals):
        feature_names.append(f"{cat_col}_{val}")

# Binary features
feature_names.extend(binary_features)

print(f"\nFirst 15 transformed feature names:")
for i, name in enumerate(feature_names[:15]):
    print(f"  {i+1}. {name}")


Fitting preprocessor on cleaned data...
  Input shape: (108433, 11)
  Target shape: (108433,)

✓ Preprocessor fitted successfully
  Output shape after transformation: (108433, 1746)
  Features expanded from 11 to 1746 (due to one-hot encoding)

First 15 transformed feature names:
  1. down
  2. ydstogo
  3. yardline_100
  4. quarter
  5. game_seconds_remaining
  6. score_differential
  7. offense_personnel_
  8. offense_personnel_0 RB, 0 TE, 5 WR
  9. offense_personnel_0 RB, 1 TE, 0 WR,1 P,1 LS,2 DL,1 K
  10. offense_personnel_0 RB, 1 TE, 0 WR,1 P,5 LB,1 LS,3 DB
  11. offense_personnel_0 RB, 1 TE, 2 WR,1 P,3 LB,1 LS,3 DB
  12. offense_personnel_0 RB, 1 TE, 3 WR,1 DB
  13. offense_personnel_0 RB, 1 TE, 4 WR
  14. offense_personnel_0 RB, 2 TE, 0 WR,1 P,1 LS,1 DL,1 K
  15. offense_personnel_0 RB, 2 TE, 0 WR,1 P,1 LS,2 DL,1 K


## Step 2: Feature Engineering Pipeline

Create a reusable feature engineering pipeline using `ColumnTransformer` for both model training and production predictions.

In [77]:
# Save the preprocessor for use in models
import joblib

preprocessor_path = PROCESSED_DATA_PATH / "feature_preprocessor.pkl"
joblib.dump(preprocessor, preprocessor_path)

print(f"✓ Feature preprocessor saved to: {preprocessor_path}")

# Also save the feature names
feature_names_path = PROCESSED_DATA_PATH / "feature_names.pkl"
joblib.dump(feature_names, feature_names_path)

print(f"✓ Feature names saved to: {feature_names_path}")

# Create a summary report
summary = {
    "num_features": num_features,
    "cat_features": cat_features,
    "binary_features": binary_features,
    "total_input_features": len(num_features + cat_features + binary_features),
    "total_output_features": X_transformed.shape[1],
    "feature_names": feature_names,
    "n_samples": X_transformed.shape[0]
}

summary_path = PROCESSED_DATA_PATH / "feature_pipeline_summary.pkl"
joblib.dump(summary, summary_path)

print(f"✓ Feature pipeline summary saved to: {summary_path}")

print("\n" + "=" * 70)
print("FEATURE ENGINEERING PIPELINE COMPLETE")
print("=" * 70)
print(f"\nFiles saved:")
print(f"  1. Preprocessor: {preprocessor_path}")
print(f"  2. Feature names: {feature_names_path}")
print(f"  3. Summary: {summary_path}")
print(f"\nUse in production:")
print(f"  preprocessor = joblib.load('{preprocessor_path}')")
print(f"  X_transformed = preprocessor.transform(new_data)")

✓ Feature preprocessor saved to: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_preprocessor.pkl
✓ Feature names saved to: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_names.pkl
✓ Feature pipeline summary saved to: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_pipeline_summary.pkl

FEATURE ENGINEERING PIPELINE COMPLETE

Files saved:
  1. Preprocessor: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_preprocessor.pkl
  2. Feature names: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_names.pkl
  3. Summary: c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_pipeline_summary.pkl

Use in production:
  preprocessor = joblib.load('c:\Users\quays\source\repos\Defensive-Intelligence-Predictor\data\processed\feature_preprocessor.pkl')
  X_transformed = preprocessor.transform(new_

In [79]:
# Check if coverage data was successfully integrated
print("\n" + "=" * 70)
print("COVERAGE DATA STATUS")
print("=" * 70)

if 'coverage_shell' in pbp_cleaned.columns:
    coverage_counts = pbp_cleaned['coverage_shell'].value_counts()
    print(f"\n✓ Coverage shell data successfully integrated!")
    print(f"\nCoverage Shell Distribution:")
    for coverage, count in coverage_counts.items():
        pct = count / len(pbp_cleaned) * 100
        print(f"  {coverage}: {count:,} ({pct:.1f}%)")
    
    # Prepare coverage target for future modeling (excluding NaN)
    y_coverage = pbp_cleaned['coverage_shell'].dropna().copy()
    coverage_classes = sorted(y_coverage.unique())
    coverage_mapping = {cov: idx for idx, cov in enumerate(coverage_classes)}
    
    print(f"\nCoverage class mapping (for future models):")
    for cov, idx in sorted(coverage_mapping.items(), key=lambda x: x[1]):
        print(f"  {idx}: {cov}")
    
    print(f"\n✓ Ready for coverage prediction modeling!")
    print(f"  Labeled plays: {len(y_coverage):,} ({len(y_coverage)/len(pbp_cleaned)*100:.1f}%)")
    
else:
    print("\n⚠ Coverage shell column not found in dataset")
    y_coverage = None


COVERAGE DATA STATUS

✓ Coverage shell data successfully integrated!

Coverage Shell Distribution:
  Cover 2: 5,717 (5.3%)
  Cover 3: 3,156 (2.9%)
  Cover 0: 3,061 (2.8%)
  Cover 1: 3,029 (2.8%)
  Cover 4: 2,611 (2.4%)

Coverage class mapping (for future models):
  0: Cover 0
  1: Cover 1
  2: Cover 2
  3: Cover 3
  4: Cover 4

✓ Ready for coverage prediction modeling!
  Labeled plays: 17,574 (16.2%)


## Step 3: Coverage Shell Predictor

Predict defensive coverage shells using the same feature pipeline.

**Coverage Shells:** {Cover1, Cover2, Cover3, Cover4}
- **Cover 1**: Man-to-man with safety over top
- **Cover 2**: Two-deep safeties, underneath zone
- **Cover 3**: Three-deep safeties, five under
- **Cover 4**: Four-deep safeties, deep coverage

## Summary

✓ Loaded NFL PBP data from NFLfastR  
✓ Extracted blitz features  
✓ Cleaned and validated data  
✓ Saved to processed directory  

**Next Phase**: Feature Engineering & Model Training (02_feature_engineering.ipynb)