In [221]:
import requests
import json
from pprint import pprint
import pandas as pd

In [222]:
# The df that we are going to pull is nfl score data from espn

In [223]:
# The question we are looking to answer is if we can predict the total points 
# that will be scored during this weekend's superbowl game

In [224]:
# Step 1
# We began by collecting raw NFL game data from ESPN's public API
# without filtering variables, so we could first understand what 
# information was available

In [225]:
# Fetch the current scoreboard. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP error
# instead of parsing an error payload as if it were game data.
response = requests.get(
    "https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard",
    timeout=30,
)
print(response.status_code)
response.raise_for_status()

raw_data = response.json()

200


In [226]:
print(response.json())

{'leagues': [{'id': '28', 'uid': 's:20~l:28', 'name': 'National Football League', 'abbreviation': 'NFL', 'slug': 'nfl', 'season': {'year': 2025, 'startDate': '2025-07-31T07:00Z', 'endDate': '2026-02-12T07:59Z', 'displayName': '2025', 'type': {'id': '3', 'type': 3, 'name': 'Postseason', 'abbreviation': 'post'}}, 'logos': [{'href': 'https://a.espncdn.com/i/teamlogos/leagues/500/nfl.png', 'width': 500, 'height': 500, 'alt': '', 'rel': ['full', 'default'], 'lastUpdated': '2018-06-05T12:07Z'}, {'href': 'https://a.espncdn.com/i/teamlogos/leagues/500-dark/nfl.png', 'width': 500, 'height': 500, 'alt': '', 'rel': ['full', 'dark'], 'lastUpdated': '2024-07-22T16:53Z'}], 'calendarType': 'list', 'calendarIsWhitelist': True, 'calendarStartDate': '2025-07-31T07:00Z', 'calendarEndDate': '2026-02-12T07:59Z', 'calendar': [{'label': 'Preseason', 'value': '1', 'startDate': '2025-07-31T07:00Z', 'endDate': '2025-09-04T06:59Z', 'entries': [{'label': 'Hall of Fame Weekend', 'alternateLabel': 'HOF', 'detail': 

In [227]:
pprint(response.json())

{'events': [{'competitions': [{'attendance': 0,
                               'broadcast': 'NBC/Peacock',
                               'broadcasts': [{'market': 'national',
                                               'names': ['NBC', 'Peacock']}],
                               'competitors': [{'homeAway': 'home',
                                                'id': '17',
                                                'leaders': [{'abbreviation': 'PYDS',
                                                             'displayName': 'Passing '
                                                                            'Leader',
                                                             'leaders': [{'athlete': {'active': True,
                                                                                      'displayName': 'Drake '
                                                                                                     'Maye',
                                       

In [228]:
def extract_game_fields(event):
    """Flatten one ESPN scoreboard event into a dict of game-level fields.

    Only the first entry of ``event["competitions"]`` is read.

    Args:
        event: One element of the scoreboard payload's ``"events"`` list (a dict).

    Returns:
        A dict with the keys ``game_id``, ``season``, ``season_type``, ``week``,
        ``date``, ``home_team``, ``away_team``, ``home_score``, ``away_score``,
        ``total_points``, ``indoor``, ``temperature`` and ``weather``.
        Optional sections (season, week, venue, weather) yield ``None`` values
        when absent. A missing, ``None`` or empty score counts as 0, so a game
        that has not been played reports ``total_points == 0``.

    Raises:
        KeyError, IndexError: if the event has no competitions/competitors, or a
            competitor lacks ``homeAway`` or ``team``/``displayName``.
        StopIteration: if there is no home or no away competitor.
        ValueError: if a score is present but not an integer string.
    """
    comp = event["competitions"][0]
    competitors = comp["competitors"]

    home = next(t for t in competitors if t["homeAway"] == "home")
    away = next(t for t in competitors if t["homeAway"] == "away")

    # `or {}` covers both a missing key and a key explicitly set to None.
    season = event.get("season") or {}
    week = event.get("week") or {}
    venue = comp.get("venue") or {}
    weather = event.get("weather") or {}

    def _score(team):
        # Absent / None / "" scores fall back to 0, matching the old default.
        value = team.get("score")
        return 0 if value in (None, "") else int(value)

    # Parse each score once and reuse it for the total.
    home_score = _score(home)
    away_score = _score(away)

    return {
        "game_id": event.get("id"),
        "season": season.get("year"),
        "season_type": season.get("type"),
        "week": week.get("number"),
        "date": event.get("date"),
        "home_team": home["team"]["displayName"],
        "away_team": away["team"]["displayName"],
        "home_score": home_score,
        "away_score": away_score,
        "total_points": home_score + away_score,
        "indoor": venue.get("indoor"),
        "temperature": weather.get("temperature"),
        "weather": weather.get("displayValue")
    }

In [229]:
# We isolated the most relevant columns from our API and pulled a random game to
# assess the return

In [230]:
import random

# A dedicated, seeded generator so that re-running the notebook on the same payload
# picks the same sample (the global `random` state is left untouched).
sample_rng = random.Random(42)

events = raw_data["events"]
# Sample at most 5 events; min() guards against payloads with fewer than 5.
sample_events = sample_rng.sample(events, min(5, len(events)))

sample_rows = [extract_game_fields(e) for e in sample_events]

sample_raw_nfl_game_data_df = pd.DataFrame(sample_rows)
sample_raw_nfl_game_data_df

Unnamed: 0,game_id,season,season_type,week,date,home_team,away_team,home_score,away_score,total_points,indoor,temperature,weather
0,401772988,2025,3,5,2026-02-08T23:30Z,New England Patriots,Seattle Seahawks,0,0,0,False,67,Partly sunny


In [231]:
# The return gave us relevant information, but the sampled game has not been played
# yet, which is why its total score is 0. We will therefore have to isolate a date
# range to identify which games we want to look at

In [280]:
# Our goal is to predict the total score for the superbowl, so we only want games
# that are similar to the superbowl. Based on our research, the NFL significantly
# changed scoring in 1994, but we will clean that data later.

In [233]:
sample_raw_nfl_game_data_df["date"].dtype

dtype('O')

In [234]:
# We noticed that date is a string, so we are converting it into a real datetime object
# so we can filter by year

In [235]:
sample_raw_nfl_game_data_df["date"] = pd.to_datetime(
    sample_raw_nfl_game_data_df["date"],
    errors="coerce"
)

In [236]:
sample_raw_nfl_game_data_df["date"].dtype

datetime64[ns, UTC]

In [237]:
# Now we will pull data from 1994 until now to start creating a df

In [238]:
BASE_URL = "https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard"

def get_scoreboard(start_date, end_date, timeout=30):
    """Fetch the ESPN NFL scoreboard payload for a date range.

    Args:
        start_date: First day of the range; anything with ``strftime`` (e.g. ``datetime.date``).
        end_date: Last day of the range; same type as ``start_date``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within ``timeout`` seconds.
    """
    # The API takes the range as a single "YYYYMMDD-YYYYMMDD" query value.
    dates_param = start_date.strftime("%Y%m%d") + "-" + end_date.strftime("%Y%m%d")
    response = requests.get(BASE_URL, params={"dates": dates_param}, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [239]:
# NOTE: `response` is still the object from the initial scoreboard request earlier in
# the notebook. get_scoreboard() keeps its own response local, so this line does not
# exercise or validate that helper.
print(response.status_code)

200


In [240]:
import datetime as dt

# Date range to crawl: 1 Jan 1994 through the day this cell is run.
start = dt.date(1994, 1, 1)
end = dt.date.today()

# Raw event dicts from every window, in request order.
all_events = []

# Walk the range in consecutive, non-overlapping windows of up to 7 days
# (current .. current + 6), issuing one get_scoreboard() request per window.
# get_scoreboard() raises on an HTTP error, so a single failed request aborts the loop.
current = start
while current <= end:
    # Clamp the last window so it never runs past `end`.
    window_end = min(current + dt.timedelta(days=6), end)

    data = get_scoreboard(current, window_end)
    # A payload without an "events" key contributes nothing.
    all_events.extend(data.get("events", []))

    # The next window starts the day after this one ended.
    current = window_end + dt.timedelta(days=1)

In [241]:
print("DONE. Total events:", len(all_events))

DONE. Total events: 10110


In [242]:
def extract_game_fields(event):
    """Flatten one ESPN scoreboard event into a dict of game-level fields.

    Only the first entry of ``event["competitions"]`` is read. Every nested
    section is looked up defensively, so a key that is missing *or* explicitly
    ``None`` yields ``None`` in the result instead of raising.

    Args:
        event: One element of the scoreboard payload's ``"events"`` list (a dict).

    Returns:
        ``None`` when the event has no competitions, no competitors, or lacks
        either a home or an away competitor. Otherwise a dict with the keys
        ``game_id``, ``season``, ``season_type``, ``week``, ``date``,
        ``home_team``, ``away_team``, ``home_score``, ``away_score``,
        ``total_points``, ``indoor``, ``temperature``, ``weather`` and ``status``.
        Scores that are missing or unparseable are reported as 0, so callers
        should use ``status`` to tell unplayed games from real 0-point results.
    """
    comps = event.get("competitions", [])
    if not comps:
        return None

    comp = comps[0]

    competitors = comp.get("competitors", [])
    if not competitors:
        return None

    home = next((t for t in competitors if t.get("homeAway") == "home"), None)
    away = next((t for t in competitors if t.get("homeAway") == "away"), None)
    if home is None or away is None:
        return None

    venue = comp.get("venue", {}) or {}
    weather = event.get("weather", {}) or {}

    def safe_int(x):
        """Parse a score; missing (None) or non-numeric values count as 0."""
        try:
            return int(x)
        except (TypeError, ValueError):
            return 0

    home_score = safe_int(home.get("score"))
    away_score = safe_int(away.get("score"))

    season = event.get("season", {}) or {}
    week = event.get("week", {}) or {}
    # Guard each level: "status" or "type" may be present but None.
    status = (event.get("status") or {}).get("type") or {}
    # Same guard for a competitor whose "team" is present but None.
    home_team = home.get("team") or {}
    away_team = away.get("team") or {}

    return {
        "game_id": event.get("id"),
        "season": season.get("year"),
        "season_type": season.get("type"),
        "week": week.get("number"),
        "date": event.get("date"),

        "home_team": home_team.get("displayName"),
        "away_team": away_team.get("displayName"),

        "home_score": home_score,
        "away_score": away_score,
        "total_points": home_score + away_score,

        "indoor": venue.get("indoor"),
        "temperature": weather.get("temperature"),
        "weather": weather.get("displayValue"),

        "status": status.get("name"),
    }

In [243]:
# Flatten every raw event in one pass; events the extractor rejects
# (it returns None for incomplete ones) are left out.
rows = [
    fields
    for fields in map(extract_game_fields, all_events)
    if fields is not None
]

In [244]:
raw_nfl_game_data_df = pd.DataFrame(rows)
raw_nfl_game_data_df.head()

Unnamed: 0,game_id,season,season_type,week,date,home_team,away_team,home_score,away_score,total_points,indoor,temperature,weather,status
0,140102001,1993,2,,1994-01-02T05:00Z,Atlanta Falcons,Phoenix Cardinals,10,27,37,,,,STATUS_FINAL
1,140102008,1993,2,,1994-01-02T05:00Z,Detroit Lions,Green Bay Packers,30,20,50,,,,STATUS_FINAL
2,140102010,1993,2,,1994-01-02T05:00Z,Houston Oilers,New York Jets,24,0,24,,,,STATUS_FINAL
3,140102011,1993,2,,1994-01-02T05:00Z,Indianapolis Colts,Buffalo Bills,10,30,40,,,,STATUS_FINAL
4,140102012,1993,2,,1994-01-02T05:00Z,Kansas City Chiefs,Seattle Seahawks,34,24,58,,,,STATUS_FINAL


In [245]:
raw_nfl_game_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10109 entries, 0 to 10108
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       10109 non-null  object 
 1   season        10109 non-null  int64  
 2   season_type   10109 non-null  int64  
 3   week          8815 non-null   float64
 4   date          10109 non-null  object 
 5   home_team     10109 non-null  object 
 6   away_team     10109 non-null  object 
 7   home_score    10109 non-null  int64  
 8   away_score    10109 non-null  int64  
 9   total_points  10109 non-null  int64  
 10  indoor        6890 non-null   object 
 11  temperature   0 non-null      object 
 12  weather       0 non-null      object 
 13  status        10109 non-null  object 
dtypes: float64(1), int64(5), object(8)
memory usage: 1.1+ MB


In [246]:
# We pulled data from 1994 until now and are assessing the kind of data available.
# We have 10,109 games, which is in line with expectations: each game involves two
# teams, so a 32-team league playing ~17 games each produces about 32 x 17 / 2 = 272
# regular-season games per season (not 32 x 17), i.e. roughly 8,000 over ~31 seasons,
# plus preseason and postseason games.
# We also see that some data is strings and weather data is missing
# Now we will assess what types of games are included

In [247]:
raw_nfl_game_data_df["season_type"].value_counts(dropna=False)

season_type
2    8168
1    1543
3     398
Name: count, dtype: int64

In [248]:
raw_nfl_game_data_df["status"].value_counts(dropna=False)

status
STATUS_FINAL        10106
STATUS_POSTPONED        3
Name: count, dtype: int64

In [249]:
raw_nfl_game_data_df["indoor"].value_counts(dropna=False)

indoor
False    5039
None     3219
True     1851
Name: count, dtype: int64

In [250]:
# Season type shows preseason (1), regular season (2), and post season (3)
# The vast majority of games are final, which is good
# There is a lot of missing data on if the game is indoor or not
# We will now assess if the data becomes more reliable over time

In [251]:
raw_nfl_game_data_df.groupby("season")["indoor"].apply(lambda x: x.notna().mean())

season
1986    0.000000
1988    0.000000
1992    0.000000
1993    0.041667
1994    0.076596
1995    0.071713
1996    0.071713
1997    0.108000
1998    0.103586
1999    0.131274
2000    0.114198
2001    0.114551
2002    0.116766
2003    0.110778
2004    0.116766
2005    0.113772
2006    0.882883
2007    0.882883
2008    0.996997
2009    0.996997
2010    1.000000
2011    1.000000
2012    1.000000
2013    0.996997
2014    1.000000
2015    1.000000
2016    1.000000
2017    1.000000
2018    1.000000
2019    1.000000
2020    1.000000
2021    1.000000
2022    1.000000
2023    1.000000
2024    1.000000
2025    1.000000
Name: indoor, dtype: float64

In [252]:
# Based on this information, indoor game information is very unreliable until 2006,
# when it is populated 88% of the time, and after 2010 it is reliable

In [253]:
# analysis revealed that venue-level indoor/outdoor data is largely unavailable
# prior to the mid-2000s, with coverage increasing sharply around 2006 and reaching 
# full completeness by 2010. This indicates that indoor status is a modern feature in 
# the ESPN dataset and must be handled carefully when analyzing earlier seasons

In [254]:
raw_nfl_game_data_df.groupby("season")["total_points"].mean()

season
1986    24.000000
1988    28.000000
1992    51.000000
1993    42.750000
1994    37.642553
1995    40.701195
1996    38.597610
1997    38.340000
1998    39.788845
1999    39.019305
2000    38.108025
2001    37.551084
2002    42.464072
2003    40.925150
2004    41.637725
2005    40.491018
2006    40.219219
2007    42.186186
2008    42.381381
2009    42.276276
2010    43.708709
2011    43.256024
2012    44.759760
2013    45.762763
2014    44.140719
2015    44.135135
2016    44.021084
2017    42.183183
2018    45.174174
2019    44.099099
2020    49.490706
2021    44.688623
2022    43.437126
2023    43.438806
2024    44.674627
2025    45.461078
Name: total_points, dtype: float64

In [255]:
# Based on this information, we will exclude games before the 2006 season because
# indoor venue data is much more complete from 2006 onward and total point variation
# looks reasonable. We will create a new df with only the data we want

In [256]:
analysis_df = raw_nfl_game_data_df[
    raw_nfl_game_data_df["season"] >= 2006
].copy()

In [257]:
analysis_df["season"].min(), analysis_df["season"].max()

(np.int64(2006), np.int64(2025))

In [258]:
# We confirmed that only data from seasons 2006 to 2025 are present
# Now we remember from earlier that not all games are final scores, so we will also
# exclude games that are not final

In [259]:
analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6602 entries, 3507 to 10108
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       6602 non-null   object 
 1   season        6602 non-null   int64  
 2   season_type   6602 non-null   int64  
 3   week          6600 non-null   float64
 4   date          6602 non-null   object 
 5   home_team     6602 non-null   object 
 6   away_team     6602 non-null   object 
 7   home_score    6602 non-null   int64  
 8   away_score    6602 non-null   int64  
 9   total_points  6602 non-null   int64  
 10  indoor        6521 non-null   object 
 11  temperature   0 non-null      object 
 12  weather       0 non-null      object 
 13  status        6602 non-null   object 
dtypes: float64(1), int64(5), object(8)
memory usage: 773.7+ KB


In [260]:
analysis_df = analysis_df[
    analysis_df["status"] == "STATUS_FINAL"
]

In [261]:
analysis_df["status"].value_counts()

status
STATUS_FINAL    6600
Name: count, dtype: int64

In [262]:
# Now we will also exclude preseason games because they are normally played by backups
# and are therefore not an accurate predictor of scoring

In [263]:
# Keep regular-season (2) and postseason (3) games only. `.copy()` makes the result
# an independent frame, so the column assignments that follow do not operate on a
# slice of the previous frame.
analysis_df = analysis_df[
    analysis_df["season_type"].isin([2, 3])
].copy()

In [264]:
analysis_df["season_type"].value_counts()

season_type
2    5199
3     250
Name: count, dtype: int64

In [265]:
analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5449 entries, 3572 to 10108
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       5449 non-null   object 
 1   season        5449 non-null   int64  
 2   season_type   5449 non-null   int64  
 3   week          5449 non-null   float64
 4   date          5449 non-null   object 
 5   home_team     5449 non-null   object 
 6   away_team     5449 non-null   object 
 7   home_score    5449 non-null   int64  
 8   away_score    5449 non-null   int64  
 9   total_points  5449 non-null   int64  
 10  indoor        5386 non-null   object 
 11  temperature   0 non-null      object 
 12  weather       0 non-null      object 
 13  status        5449 non-null   object 
dtypes: float64(1), int64(5), object(8)
memory usage: 638.6+ KB


In [266]:
# Next we see that some information is a string instead of data types that
# we can do analysis. So we will start converting the data types.

In [267]:
# starting with date, we will convert that data into datetime

In [268]:
analysis_df["date"] = pd.to_datetime(
    analysis_df["date"],
    errors="coerce",
    utc=True
)

In [269]:
analysis_df["date"].dtype

datetime64[ns, UTC]

In [270]:
analysis_df["date"].isna().sum()

np.int64(0)

In [271]:
analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5449 entries, 3572 to 10108
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   game_id       5449 non-null   object             
 1   season        5449 non-null   int64              
 2   season_type   5449 non-null   int64              
 3   week          5449 non-null   float64            
 4   date          5449 non-null   datetime64[ns, UTC]
 5   home_team     5449 non-null   object             
 6   away_team     5449 non-null   object             
 7   home_score    5449 non-null   int64              
 8   away_score    5449 non-null   int64              
 9   total_points  5449 non-null   int64              
 10  indoor        5386 non-null   object             
 11  temperature   0 non-null      object             
 12  weather       0 non-null      object             
 13  status        5449 non-null   object             
dtypes: dateti

In [272]:
analysis_df["indoor"].value_counts(dropna=False)

indoor
False    3899
True     1487
None       63
Name: count, dtype: int64

In [273]:
# We know that there are 63 games with missing information on whether it was indoor or
# outdoor. From previous analysis, we can know the missing information is from earlier
# seasons, but we will double check which seasons have that info missing

In [274]:
analysis_df.groupby("season")["indoor"].apply(lambda x: x.isna().mean())

season
2006    0.111940
2007    0.115672
2008    0.003731
2009    0.000000
2010    0.000000
2011    0.000000
2012    0.000000
2013    0.003731
2014    0.000000
2015    0.000000
2016    0.000000
2017    0.000000
2018    0.000000
2019    0.000000
2020    0.000000
2021    0.000000
2022    0.000000
2023    0.000000
2024    0.000000
2025    0.000000
Name: indoor, dtype: float64

In [275]:
analysis_df.loc[
    analysis_df["indoor"].isna()].sample(10)

Unnamed: 0,game_id,season,season_type,week,date,home_team,away_team,home_score,away_score,total_points,indoor,temperature,weather,status
3959,270930016,2007,2,4.0,2007-09-30 17:00:00+00:00,Minnesota Vikings,Green Bay Packers,16,23,39,,,,STATUS_FINAL
3962,270930011,2007,2,4.0,2007-09-30 20:15:00+00:00,Indianapolis Colts,Denver Broncos,38,20,58,,,,STATUS_FINAL
3930,270916015,2007,2,2.0,2007-09-16 20:05:00+00:00,Miami Dolphins,Dallas Cowboys,20,37,57,,,,STATUS_FINAL
3817,261231016,2006,2,17.0,2006-12-31 18:00:00+00:00,Minnesota Vikings,St. Louis Rams,21,41,62,,,,STATUS_FINAL
4167,280113011,2007,3,2.0,2008-01-13 18:00:00+00:00,Indianapolis Colts,San Diego Chargers,24,28,52,,,,STATUS_FINAL
3704,261112011,2006,2,10.0,2006-11-12 18:00:00+00:00,Indianapolis Colts,Buffalo Bills,17,16,33,,,,STATUS_FINAL
3669,261022013,2006,2,7.0,2006-10-22 20:15:00+00:00,Oakland Raiders,Arizona Cardinals,22,9,31,,,,STATUS_FINAL
3634,261008011,2006,2,5.0,2006-10-08 17:00:00+00:00,Indianapolis Colts,Tennessee Titans,14,13,27,,,,STATUS_FINAL
3823,261231011,2006,2,17.0,2006-12-31 21:15:00+00:00,Indianapolis Colts,Miami Dolphins,27,22,49,,,,STATUS_FINAL
4091,271202013,2007,2,13.0,2007-12-02 21:05:00+00:00,Oakland Raiders,Denver Broncos,34,20,54,,,,STATUS_FINAL


In [276]:
# It looks like about 11-12% of indoor data is missing from 2006 and 2007 and only
# about 0.4% is missing from 2008 and 2013. Since we are going to assess total score
# data in relation to weather conditions, it is critical that we know if the games were
# indoor or outdoor. We will now look at the types of games with missing venue data

In [277]:
analysis_df.groupby("home_team")["indoor"].apply(lambda x: x.notna().mean()).sort_values()

home_team
Team Rice                0.000000
Afc                      0.800000
Oakland Raiders          0.866071
Indianapolis Colts       0.895349
Miami Dolphins           0.914634
Minnesota Vikings        0.916168
Team Carter              1.000000
NFC All-Stars            1.000000
New England Patriots     1.000000
New Orleans Saints       1.000000
New York Giants          1.000000
New York Jets            1.000000
Washington               1.000000
Nfc                      1.000000
Tennessee Titans         1.000000
Philadelphia Eagles      1.000000
Pittsburgh Steelers      1.000000
NFC                      1.000000
San Francisco 49ers      1.000000
Seattle Seahawks         1.000000
St. Louis Rams           1.000000
Tampa Bay Buccaneers     1.000000
Team Irvin               1.000000
San Diego Chargers       1.000000
AFC                      1.000000
Los Angeles Rams         1.000000
AFC All-Stars            1.000000
Arizona Cardinals        1.000000
Atlanta Falcons          1.000000
Balt

In [278]:
# Based on this information, it looks like some of the games with missing venue data
# are non-standard games (e.g. AFC, AFC All-Stars, NFC, etc.). We will exclude
# non-standard games and reassess which games are missing venue data

In [279]:
# Team names used by exhibition games (Pro Bowl style matchups), in every
# spelling/casing that appears in the data. "Team Sanders" is listed explicitly so
# the filter does not rely on its opponent being in the list.
non_standard_teams = [
    "AFC", "NFC", "Afc", "Nfc",
    "AFC All-Stars", "NFC All-Stars",
    "Team Rice", "Team Sanders", "Team Carter", "Team Irvin"
]

In [283]:
non_standard_games = analysis_df[
    (analysis_df["home_team"].isin(non_standard_teams)) |
    (analysis_df["away_team"].isin(non_standard_teams))
]

In [284]:
non_standard_games.shape

(19, 14)

In [287]:
non_standard_games[["season", "season_type", "home_team", "away_team", "status"]].head(19)

Unnamed: 0,season,season_type,home_team,away_team,status
3839,2006,3,Afc,Nfc,STATUS_FINAL
4172,2007,3,Nfc,Afc,STATUS_FINAL
4505,2008,3,Afc,Nfc,STATUS_FINAL
4837,2009,3,Afc,Nfc,STATUS_FINAL
5170,2010,3,Nfc,Afc,STATUS_FINAL
5502,2011,3,Afc,Nfc,STATUS_FINAL
5835,2012,3,Nfc,Afc,STATUS_FINAL
6168,2013,3,Team Rice,Team Sanders,STATUS_FINAL
6502,2014,3,Team Carter,Team Irvin,STATUS_FINAL
6835,2015,3,Team Irvin,Team Rice,STATUS_FINAL


In [288]:
# Drop every game in which either side is a non-standard (exhibition) team.
# `.copy()` detaches the result from the previous frame because a later cell
# fills in "indoor" values on it with `.loc`.
analysis_df = analysis_df[
    ~(
        (analysis_df["home_team"].isin(non_standard_teams)) |
        (analysis_df["away_team"].isin(non_standard_teams))
    )
].copy()

In [292]:
analysis_df["home_team"].isin(non_standard_teams).sum()

np.int64(0)

In [293]:
analysis_df["away_team"].isin(non_standard_teams).sum()

np.int64(0)

In [294]:
analysis_df.groupby("home_team")["indoor"].apply(lambda x: x.notna().mean()).sort_values()

home_team
Oakland Raiders          0.866071
Indianapolis Colts       0.895349
Miami Dolphins           0.914634
Minnesota Vikings        0.916168
New England Patriots     1.000000
New Orleans Saints       1.000000
New York Giants          1.000000
New York Jets            1.000000
Philadelphia Eagles      1.000000
Arizona Cardinals        1.000000
San Francisco 49ers      1.000000
Seattle Seahawks         1.000000
St. Louis Rams           1.000000
Tampa Bay Buccaneers     1.000000
Tennessee Titans         1.000000
Washington               1.000000
San Diego Chargers       1.000000
Pittsburgh Steelers      1.000000
Los Angeles Rams         1.000000
Los Angeles Chargers     1.000000
Atlanta Falcons          1.000000
Baltimore Ravens         1.000000
Buffalo Bills            1.000000
Carolina Panthers        1.000000
Chicago Bears            1.000000
Cincinnati Bengals       1.000000
Cleveland Browns         1.000000
Dallas Cowboys           1.000000
Denver Broncos           1.000000
Detr

In [295]:
analysis_df["indoor"].value_counts(dropna=False)

indoor
False    3886
True     1483
None       61
Name: count, dtype: int64

In [297]:
# We still have a lot of indoor information missing from standard regular-season
# and post-season games. Next we will look at characteristics of that missing data
# to assess if there is a trend

In [298]:
analysis_df[analysis_df["indoor"].isna()].groupby("season").size()

season
2006    30
2007    31
dtype: int64

In [299]:
# It looks like the missing data is only from 2006 and 2007 and it is only from
# four teams. Next we will look up those home team stadiums to assess if they are
# indoor or outdoor to fill that information manually

In [301]:
# Based on a Google search, we found that during the 2006 and 2007 seasons the home
# stadiums of the Oakland Raiders and the Miami Dolphins were outdoor and the home
# stadiums of the Indianapolis Colts and Minnesota Vikings were indoor. Based on that
# information, we can fill the missing venue data and use the score data

In [302]:
stadium_indoor_lookup = {
    "Oakland Raiders": False,        # Oakland Coliseum
    "Miami Dolphins": False,         # Dolphin Stadium
    "Indianapolis Colts": True,      # RCA Dome
    "Minnesota Vikings": True        # Metrodome
}

In [303]:
mask_missing_indoor = (
    analysis_df["indoor"].isna() &
    analysis_df["season"].isin([2006, 2007]) &
    analysis_df["home_team"].isin(stadium_indoor_lookup.keys())
)

In [304]:
analysis_df.loc[
    mask_missing_indoor,
    ["season", "home_team", "away_team", "indoor"]
].value_counts("home_team")

home_team
Indianapolis Colts    18
Oakland Raiders       15
Miami Dolphins        14
Minnesota Vikings     14
Name: count, dtype: int64

In [305]:
analysis_df.loc[mask_missing_indoor, "indoor"] = (
    analysis_df.loc[mask_missing_indoor, "home_team"]
    .map(stadium_indoor_lookup)
)

In [306]:
analysis_df["indoor"].isna().sum()

np.int64(0)

In [307]:
analysis_df.groupby("home_team")["indoor"].apply(lambda x: x.notna().mean()).sort_values()

home_team
Arizona Cardinals        1.0
Minnesota Vikings        1.0
New England Patriots     1.0
New Orleans Saints       1.0
New York Giants          1.0
New York Jets            1.0
Oakland Raiders          1.0
Philadelphia Eagles      1.0
Pittsburgh Steelers      1.0
San Diego Chargers       1.0
San Francisco 49ers      1.0
Seattle Seahawks         1.0
St. Louis Rams           1.0
Tampa Bay Buccaneers     1.0
Tennessee Titans         1.0
Washington               1.0
Miami Dolphins           1.0
Washington Commanders    1.0
Los Angeles Rams         1.0
Las Vegas Raiders        1.0
Atlanta Falcons          1.0
Baltimore Ravens         1.0
Buffalo Bills            1.0
Carolina Panthers        1.0
Chicago Bears            1.0
Cincinnati Bengals       1.0
Cleveland Browns         1.0
Dallas Cowboys           1.0
Denver Broncos           1.0
Detroit Lions            1.0
Green Bay Packers        1.0
Houston Texans           1.0
Indianapolis Colts       1.0
Jacksonville Jaguars     1.0
Kans

In [308]:
analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5430 entries, 3572 to 10107
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   game_id       5430 non-null   object             
 1   season        5430 non-null   int64              
 2   season_type   5430 non-null   int64              
 3   week          5430 non-null   float64            
 4   date          5430 non-null   datetime64[ns, UTC]
 5   home_team     5430 non-null   object             
 6   away_team     5430 non-null   object             
 7   home_score    5430 non-null   int64              
 8   away_score    5430 non-null   int64              
 9   total_points  5430 non-null   int64              
 10  indoor        5430 non-null   object             
 11  temperature   0 non-null      object             
 12  weather       0 non-null      object             
 13  status        5430 non-null   object             
dtypes: dateti

In [310]:
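# Now we are looking at the data we have. It looks like we have captured essentially
# all the NFL games from 2006 to 2025 (note that the postseason grew from 11 to 13
# games in the 2020 season and the regular season grew from 256 to 272 games in the
# 2021 season; 2022 shows 271 regular-season games, one short of 272)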

In [311]:
analysis_df.groupby(["season", "season_type"]).size()

season  season_type
2006    2              256
        3               11
2007    2              256
        3               11
2008    2              256
        3               11
2009    2              256
        3               11
2010    2              256
        3               11
2011    2              256
        3               11
2012    2              256
        3               11
2013    2              256
        3               11
2014    2              256
        3               11
2015    2              256
        3               11
2016    2              256
        3               11
2017    2              256
        3               11
2018    2              256
        3               11
2019    2              256
        3               11
2020    2              256
        3               13
2021    2              272
        3               13
2022    2              271
        3               13
2023    2              272
        3               13
2024    

In [314]:
# Now that we know all of our games are official and final, we will take out the
# "status" column because it is no longer needed

In [315]:
analysis_df = analysis_df.drop(columns=["status"])

In [316]:
analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5430 entries, 3572 to 10107
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   game_id       5430 non-null   object             
 1   season        5430 non-null   int64              
 2   season_type   5430 non-null   int64              
 3   week          5430 non-null   float64            
 4   date          5430 non-null   datetime64[ns, UTC]
 5   home_team     5430 non-null   object             
 6   away_team     5430 non-null   object             
 7   home_score    5430 non-null   int64              
 8   away_score    5430 non-null   int64              
 9   total_points  5430 non-null   int64              
 10  indoor        5430 non-null   object             
 11  temperature   0 non-null      object             
 12  weather       0 non-null      object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(5), object(6)
memor

In [313]:
# We also see that we have missing information for the temperature and weather columns
# So we will take those columns out and import another dataset with weather info