# Predicting Australian Football Games

How much of a determining factor does home advantage play in AFL? below is a simple model I used to support a hypothesis/debate i had with a friend.

** include relevant libraries 

In [3]:
import urllib3  
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from io import StringIO
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
#fetch latest fixture data from fanfooty 
url = 'http://www.fanfooty.com.au/resource/draw.php'

#fetch and close connection 
connection_pool = urllib3.PoolManager()
resp = connection_pool.request('GET',url )
f = open('draw.php', 'wb')
f.write(resp.data)
f.close()
resp.release_conn()

#headers
columnames = ['draw ID', 'year', 'competition', 'round', 'gametime', 'day', 'home team', 'away team', 'ground', 'timeslot', 'TV coverage', 'home supergoals', 'home goals', 'home behinds', 'home points', 'away supergoals', 'away goals', 'away behinds', 'away points', 'match status'] 

#convert into pandas dataframe
games_df = pd.read_csv(StringIO(resp.data.decode("utf-8")), sep=",", names=columnames)


In [17]:
#define winner criteria 1 (home team), 0 (away team),2 (draw)
def determine_winner(row):
    if row['home points'] > row['away points']:
        winner = 1
    else:
        if row['home points'] == row['away points']:
            winner = 2
        else: 
            winner = 0
    return winner 

#filter data
games_df["Winner"] = games_df.apply(determine_winner,axis=1)
games_df = games_df[(games_df['match status'] != np.nan) & (games_df['match status'].str.contains('Full Time'))]
ames_df.head()

Unnamed: 0,draw ID,year,competition,round,gametime,day,home team,away team,ground,timeslot,...,home supergoals,home goals,home behinds,home points,away supergoals,away goals,away behinds,away points,match status,Winner
0,1006,1993,HA,1,1993-03-26 20:08:00,Friday,Western Bulldogs,Collingwood,MCG,N,...,,13.0,17.0,95.0,,17.0,13.0,115.0,Full Time,0
1,1000,1993,HA,1,1993-03-27 14:00:00,Saturday,North Melbourne,Brisbane Bears,MCG,D,...,,24.0,22.0,166.0,,22.0,11.0,143.0,Full Time,1
2,1001,1993,HA,1,1993-03-27 14:00:00,Saturday,Carlton,Fitzroy,Princes Park,D,...,,17.0,10.0,112.0,,17.0,16.0,118.0,Full Time,0
3,1002,1993,HA,1,1993-03-27 14:00:00,Saturday,Hawthorn,Melbourne,Waverley,D,...,,13.0,15.0,93.0,,11.0,4.0,70.0,Full Time,1
4,1005,1993,HA,1,1993-03-27 14:08:00,Saturday,Geelong,St Kilda,Kardinia,D,...,,20.0,16.0,136.0,,16.0,16.0,112.0,Full Time,1


In [18]:
#feature select 
games_df = games_df[['home team', 'away team','ground', 'day', 'Winner']]

In [19]:
# check if there are  null fields
np.where(games_df.isnull())

(array([], dtype=int64), array([], dtype=int64))

In [21]:
games_df.info()
games_df.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5093 entries, 0 to 5102
Data columns (total 5 columns):
home team    5093 non-null object
away team    5093 non-null object
ground       5093 non-null object
day          5093 non-null object
Winner       5093 non-null int64
dtypes: int64(1), object(4)
memory usage: 238.7+ KB


Index(['home team', 'away team', 'ground', 'day', 'Winner'], dtype='object')

In [22]:
y = games_df['Winner']

#remove target and set all remaining fields as features
games_df.pop('Winner')
print('Set target variable')

Set target variable


In [23]:
# one hot encode categortical data using pandas built-in dummies method
all_categorical_fields = games_df.select_dtypes('object').columns
all_categorical_fields_indicator = []

for categorical_field in all_categorical_fields:
        all_categorical_fields_indicator.append(pd.get_dummies(games_df[categorical_field], prefix=categorical_field))
        
# create a new dataframe with one hot encoded features and continous variables
# now add remaining continous variables
all_categorical_fields_indicator.append(games_df.select_dtypes(['int64']))
X = pd.concat([categorical_field for categorical_field in all_categorical_fields_indicator], axis=1)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.7,
                                                    random_state=42)

In [28]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))    

# GBM + Grid search to optimise hyperparameters

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
gbc_clf = GradientBoostingClassifier(random_state=42)

In [31]:
params_grid = {"max_depth": [3, 5, None],
               "min_samples_split": [2, 3, 10],
               "min_samples_leaf": [1, 3, 10],
               "learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
               "n_estimators": [100, 200, 300]}

In [32]:
grid_search = GridSearchCV(gbc_clf, params_grid,
                           n_jobs=-1, cv=5,
                           verbose=1, scoring='accuracy')

In [33]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 48.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 65.8min
[Parallel(n_jobs=-1)]: Done 2025 out of 2025 | elapsed: 78.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5, None], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5], 'n_estimators': [100, 200, 300]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [34]:
grid_search.best_score_
'''
0.6042884990253411
'''

'\n0.6042884990253411\n'

In [35]:
grid_search.best_estimator_.get_params()
'''
{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.2,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': 42,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}
'''

"\n{'criterion': 'friedman_mse',\n 'init': None,\n 'learning_rate': 0.2,\n 'loss': 'deviance',\n 'max_depth': 3,\n 'max_features': None,\n 'max_leaf_nodes': None,\n 'min_impurity_decrease': 0.0,\n 'min_impurity_split': None,\n 'min_samples_leaf': 10,\n 'min_samples_split': 2,\n 'min_weight_fraction_leaf': 0.0,\n 'n_estimators': 100,\n 'presort': 'auto',\n 'random_state': 42,\n 'subsample': 1.0,\n 'verbose': 0,\n 'warm_start': False}\n"

In [36]:
gbc_clf = GradientBoostingClassifier(
        criterion = 'friedman_mse',
        init = None,
        learning_rate = 0.2,
        loss = 'deviance',
        max_depth = 3,
        max_features = None,
        max_leaf_nodes = None,
        min_impurity_decrease = 0.0,
        min_impurity_split = None,
        min_samples_leaf = 10,
        min_samples_split = 2,
        min_weight_fraction_leaf = 0.0,
        n_estimators = 100,
        presort = 'auto',
        random_state = 42,
        subsample =  1.0,
        verbose =  0,
        warm_start = False)

In [37]:
gbc_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

In [38]:
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.6660

Classification Report: 
              precision    recall  f1-score   support

          0       0.66      0.48      0.56       659
          1       0.67      0.81      0.73       856
          2       1.00      0.17      0.29        12

avg / total       0.67      0.67      0.65      1527


Confusion Matrix: 
 [[318 341   0]
 [159 697   0]
 [  5   5   2]]

Average Accuracy: 	 0.5665
Accuracy SD: 		 0.0345


In [39]:
print_score(gbc_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.5774

Classification Report: 
              precision    recall  f1-score   support

          0       0.47      0.39      0.43      1413
          1       0.63      0.71      0.67      2115
          2       0.00      0.00      0.00        38

avg / total       0.56      0.58      0.57      3566


Confusion Matrix: 
 [[ 556  856    1]
 [ 608 1503    4]
 [  15   23    0]]

