# NFL penalties statistics

#### Load Python tools and Jupyter config

In [1]:
%load_ext lab_black

In [2]:
import json
import requests
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange
import re
import numpy as np
from tqdm import tqdm

tqdm.pandas()

In [3]:
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = None

## Read data

#### Loop over seasons and collect lists of teams

In [4]:
teams_list = []

for season in tqdm(range(1970, 2024)):
    afc = pd.read_html(
        f"https://www.pro-football-reference.com/years/{season}", extract_links="body"
    )[0][["Tm"]]
    nfc = pd.read_html(
        f"https://www.pro-football-reference.com/years/{season}", extract_links="body"
    )[1][["Tm"]]
    teams_src = pd.concat([afc, nfc]).reset_index(drop=True)
    teams_src["team"] = teams_src["Tm"].apply(lambda x: x[0] if x else None)
    teams_src["link"] = teams_src["Tm"].apply(lambda x: x[1] if x else None)
    teams_df = (
        teams_src[~teams_src["team"].str.contains("AFC|NFC")].drop("Tm", axis=1).copy()
    )
    teams_df["team"] = (
        teams_df["team"]
        .str.replace("*", "", regex=False)
        .str.replace("+", "", regex=False)
    )
    teams_df["season"] = teams_df["link"].str[11:15]
    teams_list.append(teams_df)

 13%|████████████▌                                                                                    | 7/54 [00:10<01:09,  1.47s/it]


HTTPError: HTTP Error 429: Too Many Requests

#### Concatenate all the dataframes into one large dataframe

In [None]:
teams = pd.concat(teams_list).reset_index(drop=True)

#### How many teams in each season? 

In [None]:
teams_count = dict(teams.groupby("season").size())

#### Apply that to our dataframe

In [None]:
teams["teams_count"] = teams["season"].map(teams_count)

In [None]:
teams["team_abbr"] = teams["link"].str[7:10]

In [None]:
teams.head()

---

## Penalties

#### Loop over seasons and collect penalty states by team

In [None]:
teams_list = []

def process_row(row):
    url = f"https://www.pro-football-reference.com/teams/{row['team_abbr']}/{row['season']}.htm"
    df = (
        pd.read_html(url, header=1)[0][["Player", "Pen", "Yds.3", "1stPy"]]
        .drop([2, 3])
        .rename(
            columns={
                "Player": "type",
                "Pen": "penalty_count",
                "Yds.3": "penalty_yards",
                "1stPy": "firsts_by_pen",
            }
        )
    )
    df["season"] = row["season"]
    df["team_name"] = row["team"]
    df["team_abbr"] = row["team_abbr"]
    df["games"] = row["teams_count"]
    teams_list.append(df)


# Use progress_apply instead of iterrows
teams.progress_apply(process_row, axis=1)

#### Concatenate

In [None]:
df = pd.concat(teams_list).reset_index(drop=True)

In [None]:
df["type"] = (
    df["type"]
    .str.replace("Team Stats", "Team", regex=False)
    .str.replace("Opp. Stats", "Opponent", regex=False)
)

In [None]:
df.head()

In [None]:
df.tail()

---

In [None]:
# games_list = []

# def process_row(row):
#     url = f"https://www.pro-football-reference.com/teams/{row['team_abbr']}/{row['season']}.htm"
#     games_df = (
#         pd.read_html(url, header=1)[1][
#             ["Week", "Date", "Opp", "Unnamed: 8", "Unnamed: 5", "Tm", "Opp.1"]
#         ]
#         .rename(
#             columns={
#                 "Unnamed: 8": "home_away",
#                 "Unnamed: 5": "win_loss",
#                 "Opp": "opponent",
#                 "Date": "date",
#                 "Week": "week",
#                 "Tm": "points",
#                 "Opp.1": "opp_points",
#             }
#         )
#         .fillna("Home")
#     )
#     games_df["season"] = row["season"]
#     games_df["team_name"] = row["team_name"]
#     games_df["team_abbr"] = row["team_abbr"]
#     games_df["games"] = row["teams_count"]
#     games_list.append(games_df)

# # Use progress_apply instead of iterrows
# teams.progress_apply(process_row, axis=1)