In [199]:

import pandas as pd
ladder_df = pd.read_csv("../data/ladder_2020_to_2025.csv")
team_abbr_to_full = {
    "GW":"GWS",
    "SY":"Sydney",
    "NM":"North Melbourne",
    "WB":"Western Bulldogs",
    "GC":"Gold Coast",
    "PA":"Port Adelaide",
    "HW":"Hawthorn",
    "SK":"St Kilda",
    "GE":"Geelong",
    "CW":"Collingwood",
    "ES":"Essendon",
    "CA":"Carlton",
    "RI":"Richmond",
    "AD":"Adelaide",
    "FR":"Fremantle",
    "WC":"West Coast",
    "BL":"Brisbane",
    "ME":"Melbourne",
}
ladder_df["Team"] = ladder_df["Team"].map(team_abbr_to_full)
ladder_df.drop(columns=["Unnamed: 0"], inplace=True)
print(ladder_df.head(20))

    Round  Position              Team  Played  Premiership_Points  Percentage
0       1         1               GWS       1                   4       200.0
1       1         2          Hawthorn       1                   4       126.3
2       1         3            Sydney       1                   0        79.2
3       1         4       Collingwood       1                   0        50.0
4       2         1               GWS       2                   8       143.7
5       2         2          Hawthorn       2                   8       128.6
6       2         3        Gold Coast       1                   4       277.6
7       2         4           Geelong       1                   4       213.0
8       2         5          Adelaide       1                   4       187.5
9       2         6       Collingwood       2                   4       126.2
10      2         7          Richmond       1                   4       118.8
11      2         8  Western Bulldogs       1                   

In [200]:
# Create binary label: 1 if home team won, else 0
def did_home_win(score):
    try:
        home_score, away_score = map(int, score.split("-"))
        return int(home_score > away_score)
    except:
        return None  # For BYEs or missing data

In [201]:

df_2020 = pd.read_csv("../data/parsed_real_afl_attendance_2020.csv")
df_2021 = pd.read_csv("../data/parsed_real_afl_attendance_2021.csv")
df_2022 = pd.read_csv("../data/parsed_real_afl_attendance_2022.csv")
df_2023 = pd.read_csv("../data/parsed_real_afl_attendance_2023.csv")
df_2024 = pd.read_csv("../data/parsed_real_afl_attendance_2024.csv")
df_2025 = pd.read_csv("../data/parsed_real_afl_attendance_2025.csv")


def format_season(df, year: int):
    df = df.dropna(subset=["Home v Away Teams"])
    teams_split = df["Home v Away Teams"].str.split(r"\s+v\s+", expand=True)
    df["home_team"] = teams_split[0]
    df["away_team"] = teams_split[1]
    df["Crowd"] = pd.to_numeric(df["Crowd"]).fillna(0).astype(int)
    df["Year"] = year
    df["home_win"] = df["Result"].apply(did_home_win)
    return df

df_2020 = format_season(df_2020, 2020)
df_2021 = format_season(df_2021, 2021)
df_2022 = format_season(df_2022, 2022)
df_2023 = format_season(df_2023, 2023)
df_2024 = format_season(df_2024, 2024)
df_2025 = format_season(df_2025, 2025)

In [202]:
df_all = pd.concat([df_2020, df_2021, df_2022, df_2023, df_2024, df_2025], ignore_index=True)
df_all["Round"] = pd.to_numeric(df_all["Round"], errors="coerce").fillna(0).astype(int)
df_all = df_all.sort_values(["Year", "Round"]).dropna(subset=["home_win"])


In [203]:

y = df_all["home_win"]

In [204]:
X = df_all[["home_team", "away_team", "Venue", "Crowd"]]

In [205]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define column transformer
categorical_cols = ["home_team", "away_team", "Venue"]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder="passthrough")


In [206]:
# Pipeline with Random Forest
model = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('randomforestclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [207]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")


Accuracy: 0.64


In [208]:
# Get feature names after one-hot encoding
encoded_feature_names = model.named_steps["columntransformer"].get_feature_names_out()


In [209]:
rf_model = model.named_steps["randomforestclassifier"]


In [210]:
import pandas as pd

# Pair names with importance values
feature_importance = pd.DataFrame({
    "feature": encoded_feature_names,
    "importance": rf_model.feature_importances_
})

# Sort from most to least important
feature_importance = feature_importance.sort_values("importance", ascending=False)
feature_importance.head(10)


Unnamed: 0,feature,importance
57,remainder__Crowd,0.349118
11,cat__home_team_North Melbourne,0.029016
29,cat__away_team_North Melbourne,0.026392
34,cat__away_team_West Coast,0.02232
21,cat__away_team_Collingwood,0.018391
24,cat__away_team_GWS,0.01822
35,cat__away_team_Western Bulldogs,0.01775
2,cat__home_team_Carlton,0.017228
45,cat__Venue_MCG,0.016978
31,cat__away_team_Richmond,0.016761


In [211]:
ladder_df = pd.read_csv("../data/ladder_2020_to_2025.csv")

# Rename to clarify when merging
ladder_df = ladder_df.rename(columns={
    "Position": "ladder_position",
    "Premiership_Points": "premiership_points",
    "Percentage": "percentage"
})


In [212]:
def load_ladder(file, year):
    df = pd.read_csv("../data/" + file)
    df["Year"] = year
    return df

ladder_2020 = load_ladder("ladder_2020.csv", 2020)
ladder_2021 = load_ladder("ladder_2021.csv", 2021)
ladder_2022 = load_ladder("ladder_2022.csv", 2022)
ladder_2023 = load_ladder("ladder_2023.csv", 2023)
ladder_2024 = load_ladder("ladder_2024.csv", 2024)
ladder_2025 = load_ladder("ladder_2025.csv", 2025)

ladder_all = pd.concat([ladder_2020, ladder_2021, ladder_2022, ladder_2023, ladder_2024, ladder_2025], ignore_index=True)


In [213]:
# Merge home team ladder info
df_all = df_all.merge(
    ladder_all.rename(columns={
        "Team": "home_team",
        "Position": "home_position",
        "Premiership_Points": "home_points",
        "Percentage": "home_percentage"
    }),
    on=["Year", "Round", "home_team"],
    how="left"
)

# Merge away team ladder info
df_all = df_all.merge(
    ladder_all.rename(columns={
        "Team": "away_team",
        "Position": "away_position",
        "Premiership_Points": "away_points",
        "Percentage": "away_percentage"
    }),
    on=["Year", "Round", "away_team"],
    how="left"
)


In [214]:
df_all["position_diff"] = df_all["away_position"] - df_all["home_position"]
df_all["points_diff"] = df_all["away_points"] - df_all["home_points"]
df_all["percentage_diff"] = df_all["away_percentage"] - df_all["home_percentage"]


In [215]:
df_all = df_all.dropna(subset=["home_position", "away_position"])

In [216]:

y = df_all["home_win"]

In [217]:
X = df_all[["position_diff", "points_diff", "percentage_diff", "Crowd", "home_team", "away_team", "Venue", "Crowd"]]

In [218]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define column transformer
categorical_cols = ["home_team", "away_team", "Venue"]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder="passthrough")


In [219]:
# Pipeline with Random Forest
model = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('randomforestclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [220]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")


Accuracy: 0.66


In [254]:
# Sort to get proper rolling windows
ladder_all = ladder_all.sort_values(["Team", "Year", "Round"])

# Calculate rolling average percentage (3 rounds)
ladder_all["rolling_pct_3"] = (
    ladder_all.groupby(["Team", "Year"])["Percentage"]
    .transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).mean())
)


In [255]:
# Merge rolling percentage for home team
# Drop any leftover rolling_pct columns to avoid merge conflicts
print("Rows before merge:", df_all.shape[0])

df_all = df_all.drop(columns=[
    col for col in df_all.columns if "rolling_pct" in col
], errors="ignore")

df_all = df_all.merge(
    ladder_all[["Year", "Round", "Team", "rolling_pct_3"]].rename(columns={
        "Team": "home_team",
        "rolling_pct_3": "home_rolling_pct_3"
    }),
    on=["Year", "Round", "home_team"],
    how="left"
)

# Merge rolling percentage for away team
df_all = df_all.merge(
    ladder_all[["Year", "Round", "Team", "rolling_pct_3"]].rename(columns={
        "Team": "away_team",
        "rolling_pct_3": "away_rolling_pct_3"
    }),
    on=["Year", "Round", "away_team"],
    how="left"
)
df_all.head(10)
# Optional: Add difference feature
df_all["rolling_pct_diff"] = df_all["away_rolling_pct_3"] - df_all["home_rolling_pct_3"]
print("Missing rolling_pct values:")
print(df_all[["home_rolling_pct_3", "away_rolling_pct_3"]].isna().sum())




Rows before merge: 195
Missing rolling_pct values:
home_rolling_pct_3    0
away_rolling_pct_3    0
dtype: int64


In [256]:
df_all = df_all.dropna(subset=["home_rolling_pct_3", "away_rolling_pct_3"])
df_all.head(10)

Unnamed: 0,Round,Date,Home v Away Teams,Venue,Crowd,Result,Disposals,Goals,home_team,away_team,...,away_position,Played_y,away_points,away_percentage,position_diff,points_diff,percentage_diff,home_rolling_pct_3,away_rolling_pct_3,rolling_pct_diff
0,3,Thu 28 Mar 6:30pm,Brisbane v Collingwood,Gabba,34022,72-92,L. Neale 35,J. Elliott 4,Brisbane,Collingwood,...,15.0,3.0,0.0,74.2,3.0,0.0,-12.4,92.7,70.9,-21.8
1,3,Fri 29 Mar 4:20pm,North Melbourne v Carlton,Marvel Stadium,47565,81-137,H. Sheezel 32,H. McKay 5,North Melbourne,Carlton,...,7.0,2.0,8.0,103.6,-9.0,8.0,32.7,67.8,102.4,34.6
2,3,Fri 29 Mar 4:30pm,Fremantle v Adelaide,Optus Stadium,51037,69-34,H. Young 32,M. Walters 2; M. Taberner 2; J. Amiss 2,Fremantle,Adelaide,...,13.0,2.0,0.0,84.0,8.0,-8.0,-49.6,132.9,90.0,-42.9
3,3,Sat 30 Mar 4:20pm,Essendon v St Kilda,Marvel Stadium,44412,71-67,N. Martin 44,J. Stringer 3; K. Langford 3; J. Higgins 3,Essendon,St Kilda,...,9.0,2.0,4.0,104.5,-2.0,0.0,7.3,128.9,89.5,-39.4
4,3,Sat 30 Mar 7:00pm,Port Adelaide v Melbourne,Adelaide Oval,38105,89-96,T. Rivers 27,B. Brown 3,Port Adelaide,Melbourne,...,4.0,3.0,8.0,141.5,1.0,0.0,-7.9,171.4,94.85,-76.55
5,3,Sun 31 Mar 1:00pm,Western Bulldogs v West Coast,Marvel Stadium,22991,106-30,A. Treloar 35,M. Bontempelli 3,Western Bulldogs,West Coast,...,18.0,2.0,0.0,49.6,8.0,-4.0,-52.1,58.7,58.3,-0.4
6,3,Sun 31 Mar 4:00pm,Richmond v Sydney,MCG,45112,82-77,T. Taranto 35,T. Lynch 3; L. McDonald 3,Richmond,Sydney,...,2.0,3.0,12.0,136.3,-12.0,12.0,60.4,68.4,137.9,69.5
7,3,Mon 1 Apr 3:20pm,Hawthorn v Geelong,MCG,67020,70-106,J. Worpel 36,T. Hawkins 4; O. Henry 4; Date\tHome v Away Te...,Hawthorn,Geelong,...,6.0,2.0,8.0,118.6,-11.0,8.0,58.1,77.6,111.8,34.2
8,4,Thu 4 Apr 7:10pm,Adelaide v Melbourne,Adelaide Oval,48020,63-78,M. Crouch 30,B. Fritsch 3,Adelaide,Melbourne,...,3.0,4.0,12.0,130.7,-12.0,12.0,57.4,87.0,110.4,23.4
9,4,Fri 5 Apr 4:40pm,Brisbane v North Melbourne,Norwood Oval,9037,112-42,H. McCluggage 36,J. Daniher 5,Brisbane,North Melbourne,...,16.0,3.0,0.0,66.4,2.0,0.0,-17.4,90.666667,69.35,-21.316667


In [257]:

y = df_all["home_win"]

In [258]:
X = df_all[["points_diff", "rolling_pct_diff", "percentage_diff"]]

In [259]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define column transformer
#categorical_cols = ["home_team", "away_team", "Venue"]
#preprocessor = ColumnTransformer([
#    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
#], remainder="passthrough")


In [260]:
# Pipeline with Random Forest
model = make_pipeline(RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)

0,1,2
,steps,"[('randomforestclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [261]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.2f}")


Accuracy: 0.64
