In [12]:
# ===============================
# 1️⃣ IMPORT LIBRARIES
# ===============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning (later for modeling)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Advanced modeling (XGBoost)
import xgboost as xgb

# NFL data
import nfl_data_py as nfl


In [19]:

# ===============================
# 2️⃣ LOAD 2024–2025 PLAY-BY-PLAY
# ===============================
pbp = nfl.import_pbp_data([2024, 2025])

# Quick check of column names
print(pbp.columns)

# ===============================
# 3️⃣ CALCULATE METRICS AND FANTASY POINTS SAFELY
# ===============================
pbp['epa'] = pbp.get('epa', 0).fillna(0)
pbp['success'] = ((pbp.get('epa', 0) > 0) | (pbp.get('ydstogo', 0)*0.5 <= pbp.get('yards_gained', 0))).astype(int)
pbp['red_zone'] = (pbp.get('yardline_100', 100) <= 20).astype(int)

# Fantasy points formula
pbp['fantasy_points'] = (
    pbp.get('rush_yd', 0)*0.1 + pbp.get('rush_td', 0)*6 +
    pbp.get('rec_yd', 0)*0.1 + pbp.get('rec_td', 0)*6 +
    pbp.get('pass_yd', 0)*0.04 + pbp.get('pass_td', 0)*4 -
    pbp.get('interception', 0)*2 - pbp.get('fumbles_lost', 0)*2
)

pbp[['player_name','epa','success','fantasy_points']].head()

# ===============================
# 4️⃣ AGGREGATE PLAYER STATS
# ===============================
player_stats = pbp.groupby('player_name').agg(
    total_epa=('epa','sum'),
    success_rate=('success','mean'),
    total_fp=('fantasy_points','sum'),
    plays=('play_type','count')
).reset_index().sort_values('total_fp', ascending=False)

player_stats.head(10)

# ===============================
# 5️⃣ AGGREGATE TEAM STATS
# ===============================
team_stats = pbp.groupby('posteam').agg(
    total_epa=('epa','sum'),
    success_rate=('success','mean'),
    avg_fp_per_play=('fantasy_points','mean'),
    total_fp=('fantasy_points','sum')
).reset_index().sort_values('total_fp', ascending=False)

team_stats

# ===============================
# 6️⃣ CORRELATION BETWEEN METRICS AND FANTASY POINTS
# ===============================
corr_df = player_stats[['total_fp','total_epa','success_rate','plays']].corr()
sns.heatmap(corr_df, annot=True, cmap='coolwarm')
plt.title("Correlation of Fantasy Points with Metrics")
plt.show()

# ===============================
# 7️⃣ VISUALIZATIONS
# ===============================
# Top 10 players by fantasy points
top10 = player_stats.head(10)
sns.barplot(x='total_fp', y='player_name', data=top10, palette='viridis')
plt.title("Top 10 Players by Fantasy Points (2024–2025)")
plt.xlabel("Total Fantasy Points")
plt.ylabel("Player")
plt.show()

# Team EPA vs Success Rate
sns.scatterplot(x='success_rate', y='total_epa', data=team_stats)
plt.title("Team Success Rate vs Total EPA")
plt.xlabel("Success Rate")
plt.ylabel("Total EPA")
plt.show()


2024 done.
2025 done.
Downcasting floats.
Index(['play_id', 'game_id', 'old_game_id_x', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'route', 'defense_man_zone_type', 'defense_coverage_type',
       'offense_names', 'defense_names', 'offense_positions',
       'defense_positions', 'offense_numbers', 'defense_numbers',
       'old_game_id'],
      dtype='object', length=398)


  pbp['red_zone'] = (pbp.get('yardline_100', 100) <= 20).astype(int)
  pbp['fantasy_points'] = (


KeyError: "['player_name'] not in index"