# NFL Big Data Bowl 2026 - Exploratory Data Analysis

This notebook explores the NFL player tracking data to understand:
- Data structure and format
- Player tracking patterns
- Spatial distributions
- Temporal dynamics
- Target variable characteristics

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
%matplotlib inline

## Load Data

In [None]:
# Locate the raw competition files: one directory above the current working
# directory, under data/raw/<competition folders>.
data_dir = Path.cwd().parent.joinpath(
    "data",
    "raw",
    "nfl-big-data-bowl-2026-analytics",
    "114239_nfl_competition_files_published_analytics_final",
)
print(f"Loading data from {data_dir}")

# Read the supplementary table, then report its dimensions and column names.
supplementary_csv = data_dir / "supplementary_data.csv"
supplementary = pd.read_csv(supplementary_csv)
print(f"Supplementary data shape: {supplementary.shape}")
print(f"Columns: {supplementary.columns.tolist()}")

# Last expression of the cell: rendered as a rich table preview.
supplementary.head(5)

## Data Structure Overview

In [None]:
# High-level summary of the loaded table: size, dtype mix and memory footprint.
n_rows, n_cols = supplementary.shape
# deep=True also counts the memory held by the objects inside object columns.
bytes_used = supplementary.memory_usage(deep=True).sum()

print("Dataset Overview:")
print("=" * 80)
print(f"Total rows: {n_rows:,}")
print(f"Total columns: {n_cols}")
print("\nColumn types:")
print(supplementary.dtypes.value_counts())
print(f"\nMemory usage: {bytes_used / 1024**2:.2f} MB")

In [None]:
# Check for missing values.
# `missing` is the per-column null count; `missing_pct` keeps only the columns
# that contain at least one null, as a percentage sorted from highest to lowest.
# `missing_pct` is read again by the summary cell at the end of the notebook.
missing = supplementary.isnull().sum()
missing_pct = (missing / len(supplementary) * 100).sort_values(ascending=False)
missing_pct = missing_pct[missing_pct > 0]

if missing_pct.empty:
    # The check mark was previously stored as mojibake (UTF-8 bytes decoded
    # with a single-byte codec); it is written here as the intended character.
    print("✓ No missing values in the dataset!")
else:
    print(f"Columns with missing values: {len(missing_pct)}")
    print("\nTop 10 by missing percentage:")
    print(missing_pct.head(10))

    # Bar chart of the 20 columns with the highest share of missing values.
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_pct.head(20).plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
    ax.set_title('Missing Value Percentage by Column (Top 20)')
    ax.set_xlabel('Column')
    ax.set_ylabel('Missing %')
    # Rotate the column names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()

## Target Variable Analysis

In [None]:
# Identify target column (typically 'yards' or similar).
# NOTE(review): this is a name-based heuristic -- confirm the real target column
# against the competition description and adjust if needed.
#
# Only numeric columns are considered. The statistics and plots below (median,
# histogram, box plot) require numbers, and a text column whose name merely
# contains "yard" or "return" would otherwise be selected and make them fail.
numeric_candidates = supplementary.select_dtypes(include=[np.number]).columns
target_candidates = [
    col for col in numeric_candidates
    if 'yard' in col.lower() or 'return' in col.lower()
]
print(f"Potential target columns: {target_candidates}")

if target_candidates:
    # The first match (in column order) is used as the target everywhere below.
    target_col = target_candidates[0]
    print(f"\nUsing '{target_col}' as target variable")

    # Compute the non-null values and the median once and reuse them.
    target_values = supplementary[target_col].dropna()
    target_median = target_values.median()

    # Statistics
    print("\nTarget Statistics:")
    print(supplementary[target_col].describe())

    # Distribution: histogram on the left, box plot on the right.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, box_ax = axes

    # Histogram with the median marked as a dashed red line.
    hist_ax.hist(target_values, bins=50, edgecolor='black', alpha=0.7)
    hist_ax.set_title(f'{target_col} Distribution')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Count')
    hist_ax.axvline(target_median, color='red', linestyle='--',
                    label=f'Median: {target_median:.2f}')
    hist_ax.legend()

    # Box plot
    box_ax.boxplot(target_values)
    box_ax.set_title(f'{target_col} Box Plot')
    box_ax.set_ylabel('Value')
    box_ax.grid(axis='y', alpha=0.3)

    fig.tight_layout()
    plt.show()
else:
    print("\nNo numeric column name contains 'yard' or 'return'; skipping target analysis.")

## Feature Analysis

In [None]:
# Split the column names by dtype. Both lists are reused by later cells.

# Numeric columns: everything that is a NumPy number dtype.
numeric_frame = supplementary.select_dtypes(include=[np.number])
numeric_cols = list(numeric_frame.columns)
print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols[:10]}...")

# Categorical columns: everything stored with the object dtype.
object_frame = supplementary.select_dtypes(include=['object'])
categorical_cols = list(object_frame.columns)
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols[:10]}...")

In [None]:
# Correlation analysis (if target exists).
# Ranks every numeric feature by the absolute value of its correlation with the
# target column, prints the top 15 and plots the top 20.
if target_candidates and len(numeric_cols) > 1:
    target_col = target_candidates[0]

    if target_col not in numeric_cols:
        # A non-numeric target cannot be correlated; indexing the correlation
        # result by its name would raise a KeyError.
        print(f"'{target_col}' is not a numeric column; skipping correlation analysis.")
    else:
        # Correlate each feature with the target directly instead of building
        # the full feature-by-feature matrix and keeping a single column of it.
        feature_cols = [col for col in numeric_cols if col != target_col]
        correlations = (
            supplementary[feature_cols]
            .corrwith(supplementary[target_col])
            .abs()
            .dropna()  # undefined correlations (e.g. constant columns) carry no ranking information
            .sort_values(ascending=False)
        )

        if correlations.empty:
            print(f"No numeric feature has a defined correlation with {target_col}.")
        else:
            print(f"Top 15 Features by Correlation with {target_col}:")
            print(correlations.head(15))

            # Plot
            fig, ax = plt.subplots(figsize=(10, 6))
            correlations.head(20).plot(kind='barh', ax=ax, color='purple', edgecolor='black')
            # barh draws the first row at the bottom; flip the axis so the
            # strongest feature is shown at the top.
            ax.invert_yaxis()
            ax.set_title(f'Top 20 Features by Absolute Correlation with {target_col}')
            ax.set_xlabel('|Correlation|')
            ax.grid(axis='x', alpha=0.3)
            fig.tight_layout()
            plt.show()

## Categorical Feature Analysis

In [None]:
# Cardinality report for the first ten categorical columns. Columns with at
# most ten distinct values also get their value counts printed.
if categorical_cols:
    print("Categorical Feature Cardinality:")
    for column in categorical_cols[:10]:
        series = supplementary[column]
        cardinality = series.nunique()
        print(f"  {column:30s}: {cardinality:5d} unique values")

        # High-cardinality columns: the count alone is enough.
        if cardinality > 10:
            continue
        print(f"    Values: {series.value_counts().to_dict()}")

## Summary Statistics

In [None]:
# Final recap: dataset size, feature-type counts, target statistics and the
# number of columns with missing data. Relies on variables set by earlier cells
# (numeric_cols, categorical_cols, target_candidates, missing_pct).
banner = "=" * 80
total_rows, total_cols = supplementary.shape

print(banner)
print("NFL Big Data Bowl 2026 - Dataset Summary")
print(banner)
print(f"Total samples: {total_rows:,}")
print(f"Total features: {total_cols}")
print(f"  Numeric: {len(numeric_cols)}")
print(f"  Categorical: {len(categorical_cols)}")
if target_candidates:
    target_col = target_candidates[0]
    target_series = supplementary[target_col]
    print(f"\nTarget: {target_col}")
    # Labels are padded so the values line up in one column.
    for label, stat_name in (("Mean:", "mean"), ("Std: ", "std"), ("Min: ", "min"), ("Max: ", "max")):
        stat_value = getattr(target_series, stat_name)()
        print(f"  {label} {stat_value:.2f}")
print(f"\nMissing values: {len(missing_pct)} columns with missing data")
print(banner)