<a href="https://colab.research.google.com/github/samarthsarda9/Projects/blob/main/Football_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

years = [2023, 2022, 2021, 2020]  # List of years to scrape
all_matches = []

for year in years:
    standings_url = f"https://www.pro-football-reference.com/years/{year}/"
    print(f"Fetching data for the {year} season...")
    time.sleep(2)  # Respectful delay
    data = requests.get(standings_url)

    if data.status_code != 200:
        print(f"Failed to fetch data for {year}. Status code: {data.status_code}")
        continue

    soup = BeautifulSoup(data.text, "html.parser")
    afc_table = soup.find("table", {"id": "AFC"})
    nfc_table = soup.find("table", {"id": "NFC"})

    if not afc_table or not nfc_table:
        print(f"Could not find AFC or NFC tables for {year}. Skipping...")
        continue

    afc_links = afc_table.find_all('a')
    nfc_links = nfc_table.find_all('a')
    links = afc_links + nfc_links
    links = [l.get("href") for l in links]
    links = [l for l in links if "/teams/" in l]
    team_urls = [f"https://www.pro-football-reference.com{l}" for l in links]

    for team_url in team_urls:
        team_name = team_url.split("/")[4].upper()
        print(f"Scraping data for {team_name} ({year})")
        time.sleep(2)  # Delay to prevent hitting rate limits

        team_data = requests.get(team_url)
        if team_data.status_code != 200:
            print(f"Failed to fetch team data for {team_name}. Status code: {team_data.status_code}")
            continue

        try:
            matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]
            matches.columns = matches.columns.droplevel()
        except (ValueError, KeyError):
            print(f"Could not find schedule table for {team_name} in {year}. Skipping...")
            continue

        # Columns to drop, excluding 'Date'
        columns_to_drop = ['Day', 'Unnamed: 3_level_1', 'Unnamed: 4_level_1', '1stD', 'PassY', 'RushY', 'Offense', 'Defense', 'Sp. Tms']
        for column in columns_to_drop:
            try:
                matches = matches.drop(columns=[column])
            except KeyError:
                print(f"Column '{column}' not found in DataFrame. Skipping...")

        # Rename columns for clarity
        matches.columns.values[6] = 'Opponent'
        matches = matches.rename(columns={
            "Unnamed: 5_level_1": "Result",
            "Unnamed: 8_level_1": "Home/Away",
            "Tm": "PF",
            "Opp": "PA"
        })
        matches.columns.values[9] = 'Off: TotYd'
        matches.columns.values[10] = 'Off: TO'
        matches.columns.values[11] = 'Def: TotYd'
        matches.columns.values[12] = 'Def: TO'

        # Replace values in the 'Home/Away' column
        matches['Home/Away'] = matches['Home/Away'].replace("@", "Away").fillna("Home")

        # Add zeros for zero turnovers
        matches['Off: TO'] = matches['Off: TO'].fillna(0)
        matches['Def: TO'] = matches['Def: TO'].fillna(0)

        # Split team name and year into separate columns
        matches.insert(0, "Team", team_name)
        matches.insert(1, "Year", year)

        matches['Date'] = matches['Date'] + ' ' + str(year)
        matches['Date'] = pd.to_datetime(matches['Date'], format='%B %d %Y', errors='coerce')
        matches['Date'] = matches['Date'].dt.strftime('%m/%d/%Y')  # Optional: format as MM/DD/YYYY

        # Check for rows marked as canceled and filter them out based on a broader set of criteria.
        matches = matches[~matches.apply(lambda row: 'Canceled' in row.values, axis=1)]

        # Append to the all_matches list
        all_matches.append(matches)

# Combine all collected data into a single DataFrame and save to CSV
match_df = pd.concat(all_matches, ignore_index=True)
match_df.to_csv("nfl_matches_2020_2023.csv", index=False)
print("Data collection complete. Saved to 'nfl_matches_2020_2023.csv'.")


Fetching data for the 2023 season...
Scraping data for BUF (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIA (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYJ (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NWE (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAV (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLE (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PIT (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CIN (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for HTX (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for JAX (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLT (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for OTI (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for KAN (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAI (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DEN (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SDG (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DAL (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PHI (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYG (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for WAS (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DET (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for GNB (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIN (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CHI (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for TAM (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NOR (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for ATL (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CAR (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SFO (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAM (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SEA (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CRD (2023)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Fetching data for the 2022 season...
Scraping data for BUF (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIA (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NWE (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYJ (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CIN (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAV (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PIT (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLE (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for JAX (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for OTI (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLT (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for HTX (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for KAN (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SDG (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAI (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DEN (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PHI (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DAL (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYG (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for WAS (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIN (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DET (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for GNB (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CHI (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for TAM (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CAR (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NOR (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for ATL (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SFO (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SEA (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAM (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CRD (2022)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Fetching data for the 2021 season...
Scraping data for BUF (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NWE (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIA (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYJ (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CIN (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PIT (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLE (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAV (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for OTI (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLT (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for HTX (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for JAX (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for KAN (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAI (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SDG (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DEN (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DAL (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PHI (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for WAS (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYG (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for GNB (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIN (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CHI (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DET (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for TAM (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NOR (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for ATL (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CAR (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAM (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CRD (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SFO (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SEA (2021)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Fetching data for the 2020 season...
Scraping data for BUF (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIA (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NWE (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYJ (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PIT (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAV (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLE (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CIN (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for OTI (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CLT (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for HTX (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for JAX (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for KAN (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAI (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SDG (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DEN (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for WAS (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NYG (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DAL (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for PHI (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for GNB (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CHI (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for MIN (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for DET (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for NOR (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for TAM (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CAR (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for ATL (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SEA (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for RAM (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for CRD (2020)


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


Scraping data for SFO (2020)
Data collection complete. Saved to 'nfl_matches_2020_2023.csv'.


  matches = pd.read_html(team_data.text, match="Schedule & Game Results Table")[0]


In [6]:
import pandas as pd
matches = pd.read_csv('nfl_matches_2020_2023.csv')
matches["H/A_code"] = matches["Home/Away"].astype("category").cat.codes
matches["Opp_code"] = matches["Opponent"].astype("category").cat.codes
matches["Result_code"] = (matches["Result"] == "W").astype("int")

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
train = matches[matches["Year"] <= 2022]
test = matches[matches["Year"] == 2023]
predictors = ["H/A_code", "Opp_code"]
rf.fit(train[predictors], train["Result_code"])
RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)
preds = rf.predict(test[predictors])
from sklearn.metrics import accuracy_score
acc = accuracy_score(test["Result_code"], preds)
combined = pd.DataFrame(dict(actual=test["Result_code"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])
from sklearn.metrics import precision_score
precision_score(test["Result_code"], preds)

def rolling_averages(group, cols, new_cols):
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["PF", "PA", "Off: TotYd", "Off: TO", "Def: TotYd", "Def: TO"]
new_cols = [f"{c}_rolling" for c in cols]

matches_rolling = matches.groupby(["Team", "Year"]).apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('Team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling.to_csv("rolling_avg_data.csv", index=False)

def make_predictions(data, predictors):
    train = data[data["Year"] <= 2022]
    test = data[data["Year"] == 2023]
    rf.fit(train[predictors], train["Result_code"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["Result_code"], predicted=preds), index=test.index)
    precision = precision_score(test["Result_code"], preds)
    return combined, precision

combined, precision = make_predictions(matches_rolling, predictors + new_cols)
combined = combined.merge(matches_rolling[["Team", "Opponent", "Year", "Date", "Result"]], left_index=True, right_index=True)


class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "CRD": "Arizona Cardinals",
    "ATL": "Atlanta Falcons",
    "BAL": "Baltimore Ravens",
    "BUF": "Buffalo Bills",
    "CAR": "Carolina Panthers",
    "CHI": "Chicago Bears",
    "CIN": "Cincinnati Bengals",
    "CLE": "Cleveland Browns",
    "DAL": "Dallas Cowboys",
    "DEN": "Denver Broncos",
    "DET": "Detroit Lions",
    "GNB": "Green Bay Packers",
    "HTX": "Houston Texans",
    "IND": "Indianapolis Colts",
    "JAX": "Jacksonville Jaguars",
    "KAN": "Kansas City Chiefs",
    "LAC": "Los Angeles Chargers",
    "LAR": "Los Angeles Rams",
    "MIA": "Miami Dolphins",
    "MIN": "Minnesota Vikings",
    "NWE": "New England Patriots",
    "NOR": "New Orleans Saints",
    "NYG": "New York Giants",
    "NYJ": "New York Jets",
    "OAK": "Oakland Raiders",
    "PHI": "Philadelphia Eagles",
    "PIT": "Pittsburgh Steelers",
    "SFO": "San Francisco 49ers",
    "SEA": "Seattle Seahawks",
    "TAM": "Tampa Bay Buccaneers",
    "TEN": "Tennessee Titans",
    "WAS": "Washington Football Team"
}
mapping = MissingDict(**map_values)
combined["New_team"] = combined["Team"].map(mapping)
merged = combined.merge(combined, left_on=["Date", "New_team"], right_on=["Date", "Opponent"])
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()


  matches_rolling = matches.groupby(["Team", "Year"]).apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0_level_0,count
actual_x,Unnamed: 1_level_1
1,28
0,27
