<a href="https://colab.research.google.com/github/samarthsarda9/Projects/blob/main/FootballPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Scrape every team's "Schedule & Game Results" table from
# pro-football-reference.com for each season in `years`, normalise the
# columns, and write the combined result to nfl_matches_2019_2024.csv.
years = [2024, 2023, 2022, 2021, 2020, 2019]  # List of years to scrape
all_matches = []
request_timeout = 30  # Seconds; without a timeout a stalled connection hangs forever

for year in years:
    standings_url = f"https://www.pro-football-reference.com/years/{year}/"
    print(f"Fetching data for the {year} season...")
    time.sleep(2)  # Respectful delay
    try:
        data = requests.get(standings_url, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {year}: {exc}")
        continue

    if data.status_code != 200:
        print(f"Failed to fetch data for {year}. Status code: {data.status_code}")
        continue

    soup = BeautifulSoup(data.text, "html.parser")
    afc_table = soup.find("table", {"id": "AFC"})
    nfc_table = soup.find("table", {"id": "NFC"})

    if not afc_table or not nfc_table:
        print(f"Could not find AFC or NFC tables for {year}. Skipping...")
        continue

    hrefs = [a.get("href") for a in afc_table.find_all('a') + nfc_table.find_all('a')]
    # Anchors without an href yield None; skip them instead of raising TypeError.
    team_urls = [
        f"https://www.pro-football-reference.com{href}"
        for href in hrefs
        if href and "/teams/" in href
    ]

    for team_url in team_urls:
        # The fifth path component of the URL is the team identifier.
        team_name = team_url.split("/")[4].upper()
        print(f"Scraping data for {team_name} ({year})")
        time.sleep(2)  # Delay to prevent hitting rate limits

        try:
            team_data = requests.get(team_url, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch team data for {team_name}: {exc}")
            continue

        if team_data.status_code != 200:
            print(f"Failed to fetch team data for {team_name}. Status code: {team_data.status_code}")
            continue

        try:
            # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
            matches = pd.read_html(StringIO(team_data.text), match="Schedule & Game Results Table")[0]
            matches.columns = matches.columns.droplevel()
        except (ValueError, KeyError):
            print(f"Could not find schedule table for {team_name} in {year}. Skipping...")
            continue

        # Columns to drop, excluding 'Date'
        columns_to_drop = ['Day', 'Unnamed: 3_level_1', 'Unnamed: 4_level_1', '1stD', 'PassY', 'RushY', 'Offense', 'Defense', 'Sp. Tms']
        for column in columns_to_drop:
            try:
                matches = matches.drop(columns=[column])
            except KeyError:
                print(f"Column '{column}' not found in DataFrame. Skipping...")

        # Rename columns for clarity. Positional renames go through a fresh
        # label list instead of mutating `columns.values` in place, which
        # bypasses pandas' Index bookkeeping.
        labels = list(matches.columns)
        labels[6] = 'Opponent'  # Must happen before the rename below so "Opp" -> "PA" only hits the score column
        matches.columns = labels
        matches = matches.rename(columns={
            "Unnamed: 5_level_1": "Result",
            "Unnamed: 8_level_1": "Home/Away",
            "Tm": "PF",
            "Opp": "PA"
        })
        labels = list(matches.columns)
        labels[9] = 'Off: TotYd'
        labels[10] = 'Off: TO'
        labels[11] = 'Def: TotYd'
        labels[12] = 'Def: TO'
        matches.columns = labels

        # Replace values in the 'Home/Away' column
        matches['Home/Away'] = matches['Home/Away'].replace("@", "Away").fillna("Home")

        # Add zeros for zero turnovers
        matches['Off: TO'] = matches['Off: TO'].fillna(0)
        matches['Def: TO'] = matches['Def: TO'].fillna(0)

        # Split team name and year into separate columns
        matches.insert(0, "Team", team_name)
        matches.insert(1, "Year", year)

        # The table lists only month and day. Games played in January or
        # February belong to the next calendar year, not the season year.
        month_name = matches['Date'].str.split().str[0]
        calendar_year = month_name.isin(['January', 'February']).map(
            {True: str(year + 1), False: str(year)}
        )
        matches['Date'] = matches['Date'] + ' ' + calendar_year
        matches['Date'] = pd.to_datetime(matches['Date'], format='%B %d %Y', errors='coerce')
        matches['Date'] = matches['Date'].dt.strftime('%m/%d/%Y')  # Optional: format as MM/DD/YYYY

        # Drop any row in which some cell is exactly 'Canceled'.
        matches = matches[~matches.apply(lambda row: 'Canceled' in row.values, axis=1)]

        # Append to the all_matches list
        all_matches.append(matches)

# Combine all collected data into a single DataFrame and save to CSV.
# pd.concat raises on an empty list, so report that case instead.
if all_matches:
    match_df = pd.concat(all_matches, ignore_index=True)
    match_df.to_csv("nfl_matches_2019_2024.csv", index=False)
    print("Data collection complete. Saved to 'nfl_matches_2019_2024.csv'.")
else:
    print("No match data was collected; nothing was saved.")


In [None]:
import requests
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Step 1: Load the Data
# Read the per-team game log CSV that the scraping cell writes.
matches = pd.read_csv(filepath_or_buffer='nfl_matches_2019_2024.csv')

def update_elo(winner_elo, loser_elo, margin, home_advantage=55, k=20, home_team=None):
    """Return the post-game Elo ratings as ``(new_winner_elo, new_loser_elo)``.

    Args:
        winner_elo: Pre-game rating of the winning side.
        loser_elo: Pre-game rating of the losing side.
        margin: Point margin of the game; scales the update linearly
            (``1 + margin / 10``).
        home_advantage: Rating bonus given to the home side when computing
            the expected win probability only.
        k: Base update factor.
        home_team: ``"home"`` if the winner played at home, ``"away"`` if the
            loser played at home; any other value applies no bonus.

    The home bonus is applied to temporary values rather than added to and
    then subtracted from the ratings, so no floating-point rounding error
    leaks into the returned ratings.
    """
    rated_winner = winner_elo
    rated_loser = loser_elo
    if home_team == "home":
        rated_winner = winner_elo + home_advantage
    elif home_team == "away":
        rated_loser = loser_elo + home_advantage

    expected_win_prob = 1 / (1 + 10 ** ((rated_loser - rated_winner) / 400))
    # An unexpected win (low expected probability) moves the ratings more.
    elo_change = k * (1 - expected_win_prob) * (1 + (margin / 10))

    return winner_elo + elo_change, loser_elo - elo_change

# Walk the game log and attach each row's post-game Elo for `Team` as an
# "Elo" column on `matches`.
team_elos = {team: 1500 for team in matches["Team"].unique()}
elo_ratings = []
elo_by_row = {}

# Elo is path dependent, so rows are processed oldest season first and, within
# a season, by each team's n-th schedule row. This does not rely on the file's
# row order across seasons, which would otherwise let later seasons' results
# leak into earlier seasons' ratings.
game_number = matches.groupby(["Team", "Year"]).cumcount()
processing_order = (
    matches.assign(_game_number=game_number)
    .sort_values(["Year", "_game_number"])
    .index
)

# NOTE(review): "Team" and "Opponent" are used as keys of the same dict. If the
# two columns use different naming schemes (e.g. abbreviation vs. full name),
# a club's own rating and its rating as an opponent are tracked separately.
# Verify against the CSV.
for row_label, row in matches.loc[processing_order].iterrows():
    team = row["Team"]
    opponent = row["Opponent"]
    home_status = "home" if row["Home/Away"] == "Home" else "away"

    team_elo = team_elos.get(team, 1500)

    # Only decided games with both scores update the ratings. Rows without a
    # result or score would otherwise be scored as losses with a NaN margin
    # and turn every later rating into NaN. Ties leave ratings unchanged.
    if row["Result"] in ("W", "L") and pd.notna(row["PF"]) and pd.notna(row["PA"]):
        opp_elo = team_elos.get(opponent, 1500)

        if row["Result"] == "W":
            team_elo, opp_elo = update_elo(team_elo, opp_elo, row["PF"] - row["PA"], home_team=home_status)
        else:
            # The opponent won, so the home flag is flipped to the winner's perspective.
            opp_elo, team_elo = update_elo(opp_elo, team_elo, row["PA"] - row["PF"], home_team="away" if home_status == "home" else "home")

        team_elos[team] = team_elo
        team_elos[opponent] = opp_elo

    elo_ratings.append({"Date": row["Date"], "Team": team, "Elo": team_elo})
    elo_by_row[row_label] = team_elo

elo_df = pd.DataFrame(elo_ratings)
# Assign by row label instead of merging on (Date, Team): a merge pairs rows
# whose Date is NaN with each other and duplicates them.
matches["Elo"] = pd.Series(elo_by_row, dtype="float64")

# Step 2: Rolling Averages

def rolling_averages(group, cols, new_cols, window=3):
    """Add trailing rolling means of ``cols`` to a copy of ``group``.

    Args:
        group: DataFrame of one team's games, already in chronological order.
        cols: Columns to average.
        new_cols: Names for the resulting columns, in the same order as ``cols``.
        window: Number of preceding rows to average (default 3).

    Returns:
        A new DataFrame with ``new_cols`` added, without the rows that lack a
        full window of prior values. ``group`` itself is not modified.
    """
    # closed='left' excludes the current row, so each value only uses earlier games.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    # Work on a copy so the caller's frame (e.g. a groupby chunk) is not mutated.
    enriched = group.copy()
    enriched[new_cols] = rolling_stats
    return enriched.dropna(subset=new_cols)

# Add new columns for offensive and defensive stats
cols = ["PF", "PA", "Off: TotYd", "Off: TO", "Def: TotYd", "Def: TO"]
new_cols = [f"{c}_rolling" for c in cols]

# Compute the rolling stats per (Team, Year) so a season never borrows games
# from another season or team. Iterating the groups keeps the "Team" and
# "Year" columns in every chunk; groupby.apply handing the grouping columns
# to the function is deprecated in recent pandas releases. Each chunk is
# copied so adding columns never writes into a view of `matches`.
matches_rolling = pd.concat(
    [
        rolling_averages(season_games.copy(), cols, new_cols)
        for _, season_games in matches.groupby(["Team", "Year"])
    ],
    ignore_index=True,  # Same 0..n-1 index the original reset produced
)

# Step 3: Model Training

# Encode the categorical predictors and the target as integers.
matches_rolling["H/A_code"] = matches_rolling["Home/Away"].astype("category").cat.codes
matches_rolling["Opp_code"] = matches_rolling["Opponent"].astype("category").cat.codes
matches_rolling["Result_code"] = (matches_rolling["Result"] == "W").astype(int)

predictors = ["H/A_code", "Opp_code", "Elo"] + new_cols
rf = RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=1)

# Rows with no recorded result get Result_code 0 above, which would teach the
# model that they are losses. Train and evaluate only on rows with a result;
# `matches_rolling` itself is left intact.
played = matches_rolling[matches_rolling["Result"].notna()]

# Time-based split: fit on seasons up to 2022, evaluate on 2023.
train = played[played["Year"] <= 2022]
test = played[played["Year"] == 2023]

rf.fit(train[predictors], train["Result_code"])
preds = rf.predict(test[predictors])

# Evaluation
acc = accuracy_score(test["Result_code"], preds)
precision = precision_score(test["Result_code"], preds)

print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")

# Step 4: Predict Winner and Odds Function

def _opponent_label_for(matches_df, team):
    """Return the label under which ``team`` appears in "Opponent", or None."""
    if team in matches_df["Opponent"].values:
        return team

    # "Team" and "Opponent" may use different naming schemes. Recover the
    # label from mirrored rows: the other side of one of `team`'s games has
    # the same date with the two scores swapped, and its "Opponent" is `team`.
    if not {"Date", "PF", "PA"}.issubset(matches_df.columns):
        return None
    own_games = matches_df.loc[matches_df["Team"] == team, ["Date", "PF", "PA"]].dropna()
    mirrored_keys = set(zip(own_games["Date"], own_games["PA"], own_games["PF"]))
    other_rows = matches_df[matches_df["Team"] != team]
    is_mirror = [
        key in mirrored_keys
        for key in zip(other_rows["Date"], other_rows["PF"], other_rows["PA"])
    ]
    candidates = other_rows.loc[is_mirror, "Opponent"].dropna()
    if candidates.empty:
        return None
    # The most frequent label wins, so a coincidental score match on one
    # date cannot override the real pairing.
    return candidates.mode().iloc[0]


def _opponent_code(matches_df, label):
    """Return the integer code the training data uses for opponent ``label``."""
    if "Opp_code" in matches_df.columns:
        codes = matches_df.loc[matches_df["Opponent"] == label, "Opp_code"]
        if not codes.empty:
            return int(codes.iloc[0])
    # Same encoding as `astype("category").cat.codes` over the Opponent column.
    categories = matches_df["Opponent"].astype("category").cat.categories
    return int(categories.get_loc(label))


def _home_venue_code(matches_df):
    """Return the integer code the training data uses for a home game."""
    if "H/A_code" in matches_df.columns:
        codes = matches_df.loc[matches_df["Home/Away"] == "Home", "H/A_code"]
        if not codes.empty:
            return int(codes.iloc[0])
    return 1  # Fallback: the value the original code used for "home"


def predict_winner(home_team, away_team, rf_model, matches_df, home_advantage=55):
    """Predict the winner of ``home_team`` hosting ``away_team``.

    The feature row is built from the home team's most recent row in
    ``matches_df`` (Elo and rolling stats), the code for a home game, and the
    away team's opponent code, so it is encoded the same way as the training
    rows.

    Args:
        home_team: Value of the "Team" column identifying the host.
        away_team: Value of the "Team" column, or of the "Opponent" column,
            identifying the visitor.
        rf_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        matches_df: Feature table with "Team", "Opponent", "Home/Away", "Elo"
            and the ``*_rolling`` columns; "H/A_code" / "Opp_code" are used
            when present.
        home_advantage: Unused; kept for backward compatibility.

    Returns:
        ``(winner, probability)`` where probability is the model's probability
        for the predicted winner.

    Raises:
        ValueError: If a team is unknown or the away team cannot be mapped to
            an opponent code.
    """
    known_teams = matches_df["Team"].values
    away_known = away_team in known_teams or away_team in matches_df["Opponent"].values
    if home_team not in known_teams or not away_known:
        raise ValueError(f"One or both of the teams '{home_team}' or '{away_team}' are not in the dataset.")

    # Most recent row for the home team (rows are assumed to be in chronological order).
    home_data = matches_df[matches_df["Team"] == home_team].iloc[-1]

    opponent_label = _opponent_label_for(matches_df, away_team)
    if opponent_label is None:
        raise ValueError(f"Not enough data for one or both teams '{home_team}' or '{away_team}'.")

    # Prepare the input features for the model, in the training column order.
    input_features = {
        "H/A_code": _home_venue_code(matches_df),  # The home team is at home by definition
        "Opp_code": _opponent_code(matches_df, opponent_label),  # Encoded away team
        "Elo": home_data["Elo"],  # Home team Elo
        "PF_rolling": home_data["PF_rolling"],  # Home team rolling points for
        "PA_rolling": home_data["PA_rolling"],  # Home team rolling points against
        "Off: TotYd_rolling": home_data["Off: TotYd_rolling"],  # Home team offensive yards rolling
        "Off: TO_rolling": home_data["Off: TO_rolling"],  # Home team turnovers rolling
        "Def: TotYd_rolling": home_data["Def: TotYd_rolling"],  # Home team defensive yards rolling
        "Def: TO_rolling": home_data["Def: TO_rolling"],  # Home team defensive turnovers rolling
    }

    # Convert to DataFrame for prediction
    input_df = pd.DataFrame([input_features])

    # Predict the outcome
    preds = rf_model.predict(input_df)

    # Probability of the positive class, i.e. the home team winning
    prob = rf_model.predict_proba(input_df)[:, 1]

    if preds[0] == 1:
        winner = home_team
        odds = prob[0]
    else:
        winner = away_team
        odds = 1 - prob[0]  # Probability of away team winning

    return winner, odds

# Interactive demo: ask for both teams, then print the predicted winner.
home_team = input("Enter the home team: ")
away_team = input("Enter the away team: ")

try:
    prediction = predict_winner(home_team, away_team, rf, matches_rolling)
    winner, odds = prediction
    print(f"The winner is {winner} with a probability of {odds:.4f}")
except ValueError as err:
    # predict_winner signals unknown or unusable teams with ValueError.
    print(err)
