# NFL penalties statistics

#### Load Python tools and Jupyter config

In [1]:
%load_ext lab_black

In [41]:
import re
import os
import requests
import pandas as pd
import jupyter_black
from io import StringIO
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from random import randint
from time import sleep
from tqdm import tqdm

In [25]:
# Show wide/long dataframes in full instead of truncating them
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", None)

# Turn on black formatting for notebook cells
jupyter_black.load()

In [26]:
# The proxy API key is read from the environment so it is never stored in the notebook
SCRAPE_PROXY_KEY = os.environ.get("SCRAPE_PROXY_KEY")

## Fetch

#### Loop over seasons and collect lists of teams

In [27]:
# One dataframe of teams per season; they are combined after the loop.
teams_list = []

for season in tqdm(range(1970, 2024)):
    # Season index page on Pro Football Reference
    url = f"https://www.pro-football-reference.com/years/{season}"

    # Fetch the page through the ScrapeOps proxy. The timeout keeps a stalled
    # connection from hanging the loop, and raise_for_status() stops on an HTTP
    # error instead of handing an error page to the HTML parser.
    response = requests.get(
        url="https://proxy.scrapeops.io/v1/",
        params={
            "api_key": SCRAPE_PROXY_KEY,
            "url": url,
        },
        timeout=120,
    )
    response.raise_for_status()
    html_content = response.content

    # Parse the page once and reuse the result for both conferences.
    # extract_links="body" makes each body cell a (text, href) tuple.
    season_tables = pd.read_html(
        StringIO(html_content.decode("utf-8")), extract_links="body"
    )
    # The first two tables are taken to be the AFC and NFC standings
    afc = season_tables[0][["Tm"]]
    nfc = season_tables[1][["Tm"]]

    # Combine them
    teams_src = pd.concat([afc, nfc]).reset_index(drop=True)

    # Split the (text, href) tuple into team name and team page link
    teams_src["team"] = teams_src["Tm"].apply(lambda x: x[0] if x else None)
    teams_src["link"] = teams_src["Tm"].apply(lambda x: x[1] if x else None)

    # Drop rows whose label contains "AFC" or "NFC" (presumably the division
    # header rows, which carry no team information)
    teams_df = (
        teams_src[~teams_src["team"].str.contains("AFC|NFC")].drop("Tm", axis=1).copy()
    )

    # Remove the stray "*" and "+" markers from team names
    teams_df["team"] = (
        teams_df["team"]
        .str.replace("*", "", regex=False)
        .str.replace("+", "", regex=False)
    )

    # Add season to the dataframe, and append to list
    teams_df["season"] = season
    teams_list.append(teams_df)

    # Be kind to the server
    sleep(randint(2, 6))

  0%|          | 0/54 [00:00<?, ?it/s]

#### Concatenate all the dataframes into one large dataframe

In [None]:
# Combine all seasons' data
# Stack the per-season frames and renumber the rows from zero
all_teams_df = pd.concat(teams_list, ignore_index=True)

#### How many teams in each season? 

In [33]:
# Number of rows (teams) per season, as a {season: count} lookup
season_sizes = all_teams_df.groupby("season").size()
teams_count = dict(season_sizes)

#### Apply that to our dataframe

In [34]:
# Attach each season's team count to every row of that season
season_column = all_teams_df["season"]
all_teams_df["teams_count"] = season_column.map(teams_count)

In [35]:
# Characters 7-9 of a link such as "/teams/clt/1970.htm" are the team abbreviation
all_teams_df["team_abbr"] = all_teams_df["link"].str.slice(7, 10)

In [36]:
# Preview the first five rows
all_teams_df.head(n=5)

Unnamed: 0,team,link,season,teams_count,team_abbr
0,Baltimore Colts,/teams/clt/1970.htm,1970,26,clt
1,Miami Dolphins,/teams/mia/1970.htm,1970,26,mia
2,New York Jets,/teams/nyj/1970.htm,1970,26,nyj
3,Buffalo Bills,/teams/buf/1970.htm,1970,26,buf
4,Boston Patriots,/teams/nwe/1970.htm,1970,26,nwe


---

## Penalties

#### Loop over seasons and collect penalty stats by team

In [None]:
import os
import requests
import pandas as pd
from io import StringIO
from tqdm import tqdm

# The proxy API key is read from the environment so it is never stored in the notebook
SCRAPE_PROXY_KEY = os.environ.get("SCRAPE_PROXY_KEY")

# Filled by process_row with one penalty dataframe per team-season
team_penalties_list = []


def process_row(row):
    """Fetch one team-season page and append its penalty totals to ``team_penalties_list``.

    Args:
        row: A row of ``all_teams_df``. The keys ``team``, ``team_abbr``,
            ``season`` and ``teams_count`` are read.

    Returns:
        None. The parsed dataframe is appended to the module-level
        ``team_penalties_list`` as a side effect.

    Raises:
        requests.HTTPError: If the proxy answers with an HTTP error status.
        requests.Timeout: If the proxy does not answer within the timeout.
    """
    url = f"https://www.pro-football-reference.com/teams/{row['team_abbr']}/{row['season']}.htm"

    # Fetch the page through the ScrapeOps proxy. The timeout keeps a stalled
    # connection from hanging the whole run, and raise_for_status() stops on an
    # HTTP error instead of handing an error page to the HTML parser.
    response = requests.get(
        url="https://proxy.scrapeops.io/v1/",
        params={
            "api_key": SCRAPE_PROXY_KEY,
            "url": url,
        },
        timeout=120,
    )
    response.raise_for_status()
    html_content = response.content

    # First table on the page, using the second header row as column names
    stats_table = pd.read_html(StringIO(html_content.decode("utf-8")), header=1)[0]

    penalty_df = (
        stats_table[["Player", "Pen", "Yds.3", "1stPy"]]
        # Rows labelled 2 and 3 are discarded - presumably the league-rank
        # rows; verify against the page layout
        .drop([2, 3])
        .rename(
            columns={
                "Player": "type",
                "Pen": "penalty_count",
                "Yds.3": "penalty_yards",
                "1stPy": "firsts_by_pen",
            }
        )
    )

    # Tag the rows with the team-season they came from
    penalty_df["season"] = row["season"]
    penalty_df["team_name"] = row["team"]
    penalty_df["team_abbr"] = row["team_abbr"]
    penalty_df["teams"] = row["teams_count"]
    team_penalties_list.append(penalty_df)


# Enable tqdm for pandas so that progress_apply is available on dataframes
tqdm.pandas()

# Call process_row once per team-season row, with a progress bar. process_row
# returns nothing; its results accumulate in team_penalties_list.
all_teams_df.progress_apply(process_row, axis=1)

In [61]:
# Display the team/season table
all_teams_df

Unnamed: 0,team,link,season,teams_count,team_abbr
0,Baltimore Colts,/teams/clt/1970.htm,1970,26,clt
1,Miami Dolphins,/teams/mia/1970.htm,1970,26,mia
2,New York Jets,/teams/nyj/1970.htm,1970,26,nyj
3,Buffalo Bills,/teams/buf/1970.htm,1970,26,buf
4,Boston Patriots,/teams/nwe/1970.htm,1970,26,nwe
...,...,...,...,...,...
1600,Carolina Panthers,/teams/car/2023.htm,2023,32,car
1601,San Francisco 49ers,/teams/sfo/2023.htm,2023,32,sfo
1602,Los Angeles Rams,/teams/ram/2023.htm,2023,32,ram
1603,Seattle Seahawks,/teams/sea/2023.htm,2023,32,sea


#### Concatenate

In [44]:
# Stack the per-team penalty frames and renumber the rows from zero
df = pd.concat(team_penalties_list, ignore_index=True)

In [45]:
# Shorten the row labels taken from the source table
for raw_label, clean_label in (("Team Stats", "Team"), ("Opp. Stats", "Opponent")):
    df["type"] = df["type"].str.replace(raw_label, clean_label, regex=False)

In [67]:
# Keep only the rows labelled "Team" (drops the "Opponent" rows)
is_team_row = df["type"] == "Team"
team_penalties_df = df[is_team_row].reset_index(drop=True).copy()

In [86]:
# Rank teams within each season by penalty yards: ascending, so rank 1 has
# the fewest yards; ties share the lowest rank
yards_by_season = team_penalties_df.groupby("season")["penalty_yards"]
team_penalties_df["rank"] = yards_by_season.rank(method="min").astype(int)

In [89]:
# Dallas Cowboys rows only, most recent season first
cowboys_mask = team_penalties_df["team_name"] == "Dallas Cowboys"
team_penalties_df[cowboys_mask].sort_values(by="season", ascending=False)

Unnamed: 0,type,penalty_count,penalty_yards,firsts_by_pen,season,team_name,team_abbr,games,rank
1589,Team,115.0,964.0,43.0,2023,Dallas Cowboys,dal,32,32
1558,Team,104.0,842.0,24.0,2022,Dallas Cowboys,dal,32,23
1525,Team,127.0,1103.0,31.0,2021,Dallas Cowboys,dal,32,31
1495,Team,96.0,849.0,25.0,2020,Dallas Cowboys,dal,32,20
1462,Team,109.0,1008.0,30.0,2019,Dallas Cowboys,dal,32,25
1429,Team,104.0,885.0,25.0,2018,Dallas Cowboys,dal,32,11
1398,Team,97.0,939.0,25.0,2017,Dallas Cowboys,dal,32,19
1365,Team,105.0,879.0,25.0,2016,Dallas Cowboys,dal,32,14
1336,Team,112.0,882.0,38.0,2015,Dallas Cowboys,dal,32,13
1301,Team,105.0,839.0,31.0,2014,Dallas Cowboys,dal,32,12
