In [2]:
from datetime import datetime
from typing import Dict, Tuple
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
import pandas as pd


current_year = datetime.now().year
# The three most recent seasons (current year included), oldest first.
seasons = list(range(current_year - 2, current_year + 1))


def get_html(page_url: str) -> bytes:
    '''
    Open a web page and return its source code

    The request is sent with a browser-like ``User-Agent`` header
    (presumably because the target site rejects urllib's default agent).

    :param page_url: URL of the page to download.
    :return: The raw, undecoded response body.
    :raises urllib.error.URLError: propagated from ``urlopen`` when the
        page cannot be fetched.
    '''

    request = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if read() raises.
    with urlopen(request) as response:
        return response.read()


def extract_win_loss_data(html_content: str, year: int) -> pd.DataFrame:
    '''
    Extract the win/loss data from the html content into a Pandas DataFrame

    Every ``<tr>`` with class ``darkcolor`` or ``lightcolor`` that contains at
    least three links is treated as a match row: the first link is read as
    the home team, the second as the away team and the third as a
    ``"<home>-<away>"`` score. The date comes from the row's first ``<td>``,
    whose second and third whitespace-separated tokens are parsed as day and
    abbreviated month and combined with ``year``.

    :param html_content: Source of a match-list page.
    :param year: Season year to attach to each parsed match date.
    :return: DataFrame with columns ``date``, ``home_team``, ``away_team``,
        ``home_score`` and ``away_score``; a DataFrame with no columns at all
        when the page contains no candidate rows. Rows that fail to parse are
        reported on stdout and skipped.
    '''

    columns = ['date', 'home_team', 'away_team', 'home_score', 'away_score']

    soup = BeautifulSoup(html_content, 'html.parser')

    all_rows = soup.find_all('tr', class_=['darkcolor', 'lightcolor'])
    if not all_rows:
        print("Error: Data not found in page.")
        return pd.DataFrame()

    # Collect plain records and build the DataFrame once at the end; growing
    # a DataFrame one row at a time reallocates on every insertion.
    records = []

    for row in all_rows:
        links = row.find_all('a')

        # Rows without team/team/score links are headers, byes or separators.
        if len(links) < 3:
            continue

        try:
            # split() without an argument collapses runs of whitespace, so
            # the day/month positions stay stable if the cell is padded.
            date_parts = row.find('td').get_text().split()
            match_date = datetime.strptime(
                f"{date_parts[1]} {date_parts[2]} {year}", '%d %b %Y')

            home = links[0].get_text().strip()
            away = links[1].get_text().strip()
            scores = links[2].get_text().strip().split('-')

            records.append([
                match_date,
                home,
                away,
                pd.to_numeric(scores[0]),
                pd.to_numeric(scores[1]),
            ])
        except AttributeError as e:
            # Handle cases where find('td') returns None or text parsing fails
            print(f"Error parsing specific elements in row: {e}. Skipping row.")
            continue
        except ValueError as e:
            # Handle conversion errors (e.g., date format, int conversion)
            print(f"Data conversion error in row: {e}. Skipping row.")
            continue
        except IndexError as e:
            print(f"Index error (missing column) in row: {e}. Skipping row.")
            continue

    return pd.DataFrame(records, columns=columns)


def get_season_results() -> Dict[int, pd.DataFrame]:
    '''
    Loop over each season and extract the win/loss data for that season

    For every year in the module-level ``seasons`` list the match-list page
    is downloaded and parsed. A ``year`` column is added to each season's
    DataFrame.

    :return: Mapping of season year to that season's match DataFrame.
        Seasons for which no data could be parsed are omitted from the
        mapping.
    '''

    results: Dict[int, pd.DataFrame] = {}

    for season in seasons:
        print("Loading data for season {}....".format(season))

        url = f"https://www.footywire.com/afl/footy/ft_match_list?year={season}"
        html = get_html(url)
        season_data = extract_win_loss_data(html, season)

        if season_data.empty:
            print(f"No data parsed for season {season}.")
            continue

        season_data['year'] = season
        # Key by the season being processed. Pairing the full season list
        # with the collected frames afterwards would shift every later year
        # onto the wrong data as soon as one season is skipped above.
        results[season] = season_data

        print("Finished parsing data for season {}\n".format(season))

    return results


def create_stats_table(data: pd.DataFrame) -> pd.DataFrame:
    '''
    Function to create a table listing each team and their stats including ranking

    :param data: Match data with at least ``home_team``, ``away_team`` and
        ``date`` columns.
    :return: One row per team with every counter column (``ranking``,
        ``points``, ``w``, ``d``, ``l``, ``points_for``, ``points_against``,
        ``points_difference``, ``games_played``) set to 0 and ``last_date``
        set to the latest date found in ``data``.
    '''

    # Take teams from both columns: part-way through a season a team may have
    # played only away games, and it still needs a row in the table.
    # Home teams come first, in order of first appearance.
    team_names = pd.concat([data['home_team'], data['away_team']]).unique()

    df = pd.DataFrame({'team': team_names})

    counter_columns = [
        'ranking', 'points', 'w', 'd', 'l',
        'points_for', 'points_against', 'points_difference', 'games_played',
    ]
    for column in counter_columns:
        df[column] = 0

    # Series.max() copes with an empty frame, where builtin max() would raise.
    df['last_date'] = data['date'].max()

    return df


def update_stats_table(
    stats_table: pd.DataFrame,
    season_data: pd.DataFrame,
    win_points: int = 3,
    draw_points: int = 1,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''
    Replay a season match by match and build the resulting league table

    Neither input frame is modified; the function works on copies.

    :param stats_table: Table with one row per team, as produced by
        ``create_stats_table``.
    :param season_data: Matches to replay, in row order, with ``date``,
        ``home_team``, ``away_team``, ``home_score`` and ``away_score``.
    :param win_points: Ladder points awarded to the winner of a match.
    :param draw_points: Ladder points awarded to each team in a draw.
        NOTE(review): the defaults keep the 3/1 scheme this script has always
        used; the official AFL ladder is believed to award 4/2 - confirm
        before changing.
    :return: ``(final_table, history)``. ``final_table`` is the table after
        the last match, sorted best team first. ``history`` starts with a
        copy of ``stats_table`` and gains, after every match, the freshly
        ranked rows of the two teams that played.
    '''

    sort_columns = ['points', 'points_difference', 'points_for', 'points_against']

    stats_table_all = stats_table.copy()
    stats_table_new = stats_table.copy()

    for _, match in season_data.iterrows():
        home_team = match['home_team']
        away_team = match['away_team']
        home_score = match['home_score']
        away_score = match['away_score']

        # Row selectors for the two teams. They are only valid until the
        # table is re-sorted further down, so they are rebuilt per match.
        is_home = stats_table_new['team'] == home_team
        is_away = stats_table_new['team'] == away_team

        # Update games played column
        stats_table_new.loc[is_home, 'games_played'] += 1
        stats_table_new.loc[is_away, 'games_played'] += 1

        # Update points for / points against
        stats_table_new.loc[is_home, 'points_for'] += home_score
        stats_table_new.loc[is_home, 'points_against'] += away_score
        stats_table_new.loc[is_away, 'points_for'] += away_score
        stats_table_new.loc[is_away, 'points_against'] += home_score

        # Both teams played on this date, so both get the new 'last_date'.
        stats_table_new.loc[is_home | is_away, 'last_date'] = match['date']

        # update 'points' and 'w', 'd', 'l' columns based on the result
        if home_score == away_score:
            stats_table_new.loc[is_home, 'points'] += draw_points
            stats_table_new.loc[is_away, 'points'] += draw_points
            stats_table_new.loc[is_home, 'd'] += 1
            stats_table_new.loc[is_away, 'd'] += 1
        elif home_score > away_score:
            stats_table_new.loc[is_home, 'points'] += win_points
            stats_table_new.loc[is_home, 'w'] += 1
            stats_table_new.loc[is_away, 'l'] += 1
        else:
            stats_table_new.loc[is_away, 'points'] += win_points
            stats_table_new.loc[is_home, 'l'] += 1
            stats_table_new.loc[is_away, 'w'] += 1

        # update 'points_difference' column:
        stats_table_new['points_difference'] = (
            stats_table_new['points_for'] - stats_table_new['points_against'])

        # create ranking based on points, points difference, points for and
        # points against
        stats_table_new = stats_table_new.sort_values(by=sort_columns, ascending=False)
        stats_table_new = stats_table_new.reset_index(drop=True)
        stats_table_new['ranking'] = stats_table_new.index + 1

        # record the post-match state of the two teams that just played
        played = stats_table_new['team'].isin([home_team, away_team])
        stats_table_all = pd.concat(
            [stats_table_all, stats_table_new[played]], ignore_index=True)

    # Final ordering; this is the only sort that runs when season_data is
    # empty (in which case 'ranking' keeps its incoming values).
    stats_table_new = stats_table_new.sort_values(by=sort_columns, ascending=False)
    stats_table_new = stats_table_new.reset_index(drop=True)

    return stats_table_new, stats_table_all

if __name__ == '__main__':
    seasons_data = get_season_results()

    if not seasons_data:
        print("No season data found across all years.")
    else:
        # Use the earliest season that actually loaded rather than a fixed
        # year: the season window moves with the current date, so a
        # hard-coded key eventually raises KeyError.
        season = min(seasons_data)
        season_matches = seasons_data[season]

        season_stats = create_stats_table(season_matches)
        season_stats, league_table_all = update_stats_table(season_stats, season_matches)
        print(season_stats)






Loading data for season 2023....
Finished parsing data for season 2023

Loading data for season 2024....
Finished parsing data for season 2024

Loading data for season 2025....
Finished parsing data for season 2025

                team  ranking  points   w  d   l  points_for  points_against  \
0        Collingwood        1      63  21  0   5        2350            1883   
1           Brisbane        2      57  19  0   7        2468            1999   
2      Port Adelaide        3      51  17  0   8        2294            2122   
3          Melbourne        4      48  16  0   9        2203            1793   
4            Carlton        5      46  15  1  10        2132            1915   
5                GWS        6      45  15  0  11        2269            2090   
6           St Kilda        7      39  13  0  11        1852            1748   
7             Sydney        8      37  12  1  11        2118            1937   
8   Western Bulldogs        9      36  12  0  11        1919    