In [None]:
# Create binary label: 1 if home team won, else 0
def did_home_win(score):
    """Turn a ``"<home>-<away>"`` score string into a binary home-win label.

    Parameters
    ----------
    score : str
        Final score with two integer parts separated by a single ``-``,
        e.g. ``"95-80"``.

    Returns
    -------
    int or None
        ``1`` when the home score is strictly greater than the away score,
        ``0`` otherwise (so a draw yields ``0``), and ``None`` when ``score``
        cannot be parsed (BYEs or missing data).
    """
    try:
        home_score, away_score = map(int, score.split("-"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None / NaN).
        # TypeError: .split rejected the str separator (e.g. bytes input).
        # ValueError: non-integer parts, or not exactly two parts.
        # Anything else (KeyboardInterrupt, real bugs) is allowed to propagate.
        return None  # For BYEs or missing data
    return int(home_score > away_score)
%store -r df_regular
df_regular["home_win"] = df_regular["Result"].apply(did_home_win)
df_model = df_regular.dropna(subset=["home_win"])


In [None]:

# Target vector: the home-win label column of the modelling frame.
y = df_model.loc[:, "home_win"]

In [None]:
# Baseline feature matrix: both teams, the venue and the crowd column.
X = df_model.loc[:, ["home_team", "away_team", "Venue", "Crowd"]]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at predict time); every other column is passed through untouched.
categorical_cols = ["home_team", "away_team", "Venue"]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder="passthrough",
)


In [None]:
# Preprocessing and a seeded random forest chained into one estimator and
# fitted on the training split. make_pipeline names the steps after their
# lowercased class names, which later cells use to look them up.
model = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=42),
).fit(X_train, y_train)
model

In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.2f}")


In [None]:
import pandas as pd
# First ten held-out labels next to the model's predictions
pd.DataFrame(data={"Actual": y_test, "Predicted": y_pred}).iloc[:10]


In [None]:
# X_test holds the held-out features before encoding; y_test holds the
# matching home-win labels.
# Build a copy of the features with the true label and the model's
# prediction attached, so each match can be read on one row.
test_results = X_test.assign(
    actual=y_test,
    predicted=model.predict(X_test),
)


In [None]:
# Just the key columns for readability. The frame is left as the cell's last
# expression so Jupyter renders it as a table rather than plain printed text.
test_results[["home_team", "away_team", "Crowd", "actual", "predicted"]].head(10)


In [None]:
# Output column names produced by the fitted preprocessing step
# (one-hot columns plus the passthrough columns).
encoded_feature_names = model["columntransformer"].get_feature_names_out()


In [None]:
import pandas as pd

# The importances live on the forest, which is the final step of the fitted
# pipeline. make_pipeline names each step after its lowercased class name, so
# the forest is looked up here instead of relying on a variable that is never
# defined in this notebook.
rf_model = model.named_steps["randomforestclassifier"]

# Pair names with importance values
feature_importance = pd.DataFrame({
    "feature": encoded_feature_names,
    "importance": rf_model.feature_importances_
})

# Sort from most to least important
feature_importance = feature_importance.sort_values("importance", ascending=False)
feature_importance.head(10)


In [None]:
# Load the ladder table and rename its standings columns to snake_case so
# they are easy to tell apart once merged into the match data.
ladder_df = pd.read_csv("../data/ladder_2024.csv").rename(
    columns={
        "Position": "ladder_position",
        "Premiership_Points": "premiership_points",
        "Percentage": "percentage",
    }
)


In [None]:
# Quick look at the renamed ladder table
ladder_df.head(n=10)

In [None]:
# Attach ladder standings to every match with two left joins on Round + team:
# first for the home side, then for the away side, renaming the ladder columns
# with the matching prefix each time.
# NOTE(review): confirm each ladder row reflects standings *before* that
# round's games are played; otherwise these columns leak the match result.
df_model = (
    df_model
    .merge(
        ladder_df.rename(columns={
            "Team": "home_team",
            "ladder_position": "home_position",
            "premiership_points": "home_points",
            "percentage": "home_percentage",
            "Played": "home_played",
        }),
        how="left",
        on=["Round", "home_team"],
    )
    .merge(
        ladder_df.rename(columns={
            "Team": "away_team",
            "ladder_position": "away_position",
            "premiership_points": "away_points",
            "percentage": "away_percentage",
            "Played": "away_played",
        }),
        how="left",
        on=["Round", "away_team"],
    )
)


In [None]:
# Spot-check a handful of merged rows. The seed (same value used for the
# splits and forests in this notebook) makes the displayed rows repeatable
# across Restart & Run All.
df_model[[
    "Round", "home_team", "home_position", "away_team", "away_position",
    "home_points", "away_points", "home_percentage", "away_percentage"
]].sample(5, random_state=42)



In [None]:
# Home-minus-away differences of the merged ladder statistics.
df_model = df_model.assign(
    position_diff=df_model["home_position"] - df_model["away_position"],
    percentage_diff=df_model["home_percentage"] - df_model["away_percentage"],
    points_diff=df_model["home_points"] - df_model["away_points"],
)
# Keep only matches from round 3 onwards.
df_model = df_model.loc[df_model["Round"] >= 3]


In [None]:
# Discard matches where either side has no ladder position after the merge.
df_model = df_model.dropna(how="any", subset=["home_position", "away_position"])

In [None]:
# Extended feature matrix: the baseline columns plus the three ladder differences.
X = df_model.loc[
    :,
    [
        "home_team", "away_team", "Venue", "Crowd",
        "position_diff", "percentage_diff",
        "points_diff",
    ],
]


In [None]:
# Target vector taken from the ladder-enriched modelling frame.
y = df_model.loc[:, "home_win"]

In [None]:
# Drop rows with any missing feature, then keep only the labels whose index
# survived so X and y stay aligned row for row.
X = X.dropna(how="any")
y = y.loc[X.index]


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the ladder-enriched rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fresh preprocessing step: one-hot encode the categorical columns (categories
# unseen during fit are ignored at predict time); the remaining columns,
# including the ladder differences, are passed through untouched.
categorical_cols = ["home_team", "away_team", "Venue"]
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder="passthrough",
)


In [None]:
# Sanity check: (rows, columns) of the training split after the dropna above.
print(X_train.shape)


In [None]:
# Same pipeline shape as before (preprocessing + seeded random forest), now
# fitted on the split that includes the ladder-difference features.
model = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=42),
).fit(X_train, y_train)
model

In [None]:
# Score the ladder-enriched pipeline on its held-out split.
y_pred = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.2f}")

In [None]:
# Output column names of the refitted preprocessing step (one-hot columns
# plus the passthrough columns).
encoded_feature_names = model["columntransformer"].get_feature_names_out()

In [None]:
# Rebuild the importance table from the pipeline fitted just above. The
# `feature_importance` frame created earlier in the notebook belongs to the
# first model, so re-sorting it would show stale values and none of the
# ladder-difference features.
feature_importance = pd.DataFrame({
    "feature": encoded_feature_names,
    "importance": model.named_steps["randomforestclassifier"].feature_importances_,
}).sort_values("importance", ascending=False)
feature_importance


In [None]:
# Ladder-only feature set: just the three home-minus-away differences,
# restricted to rows where all of them are present.
X_ladder = df_model.loc[
    :, ["position_diff", "points_diff", "percentage_diff"]
].dropna()

# Labels for exactly the rows that survived the dropna.
y_ladder = df_model["home_win"].loc[X_ladder.index]


In [None]:
from sklearn.model_selection import train_test_split

# Seeded 80/20 split of the ladder-only data (overwrites the earlier split variables).
X_train, X_test, y_train, y_test = train_test_split(X_ladder, y_ladder, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Bare random forest with no encoding step; the ladder features are plain
# arithmetic differences. Note that `model` is rebound here from the earlier
# pipeline to a standalone forest.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out ladder-only split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Ladder-only model accuracy: {accuracy:.2f}")


In [None]:
import pandas as pd

# Importance of each ladder-difference feature in the ladder-only forest,
# most important first.
importance = pd.DataFrame({
    "feature": X_train.columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)

# Bare last expression so Jupyter renders the frame as a table, not printed text.
importance
