### Creation of a baseline model for AFL results prediction

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sqlalchemy import create_engine

Let's create an engine to connect to the database.

In [2]:
# Connection settings are read from the environment so that real credentials
# never have to be saved in the notebook. The fallbacks are the local
# development values, so the notebook still runs unchanged without any setup.
POSTGRES_ADDRESS = os.environ.get('POSTGRES_ADDRESS', '127.0.0.1')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_USERNAME = os.environ.get('POSTGRES_USERNAME', 'analyst')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'analyst')
POSTGRES_DBNAME = os.environ.get('POSTGRES_DBNAME', 'afl-db')

# Username and password are percent-encoded because characters such as '@',
# ':' or '/' would otherwise be read as delimiters of the connection URL.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=quote_plus(POSTGRES_USERNAME),
                        password=quote_plus(POSTGRES_PASSWORD),
                        ipaddress=POSTGRES_ADDRESS,
                        port=POSTGRES_PORT,
                        dbname=POSTGRES_DBNAME))

# Engine handed to pandas as `con` for every query in this notebook.
afl_conn = create_engine(postgres_str)

Next, let's read the training data.

In [3]:
# Load every row and column of the training mart through the shared engine.
train_query = "select * from afl.ml_mart_train"
train_df = pd.read_sql_query(sql=train_query, con=afl_conn)

Shuffle the data

In [4]:
# Shuffle into a new frame (train_df itself is left as loaded). The seed is
# fixed so the row order, and everything derived from it, is the same on
# every Restart & Run All.
df = shuffle(train_df, random_state=42)

Let's map team names to integer codes to encode the categorical team columns.

In [5]:
# Team name -> integer code. `d` is reused further down to encode the
# prediction data and, inverted, to turn the codes back into names.
d = {
    'Gold Coast': 0, 'Western Bulldogs': 1, 'GWS': 2, 'Richmond': 3,
    'Geelong': 4, 'Adelaide': 5, 'Essendon': 6, 'Fremantle': 7,
    'Collingwood': 8, 'Hawthorn': 9, 'West Coast': 10, 'Carlton': 11,
    'St Kilda': 12, 'Brisbane': 13, 'Sydney': 14, 'Port Adelaide': 15,
    'Melbourne': 16, 'North Melbourne': 17,
}

# Encode both team columns. `.map` silently turns any value missing from `d`
# into NaN (an unknown team, or codes left over from running this cell twice),
# so check first and fail with the offending values instead.
for team_col in ('home_team', 'away_team'):
    encoded = df[team_col].map(d)
    unmapped = df.loc[encoded.isna(), team_col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            'Values in {!r} missing from the team mapping: {}'
            .format(team_col, list(unmapped)))
    df[team_col] = encoded

In [6]:
# Columns kept out of the feature matrix: the target itself, plus
# 'score_total' and 'venue'.
# NOTE(review): 'score_total' is presumably only known after the match and
# 'venue' is an un-encoded string column -- confirm against the mart schema.
non_feature_cols = ['is_home_win', 'score_total', 'venue']

y = df['is_home_win']
X = df.drop(columns=non_feature_cols)

# 80% of the rows go to training; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

## Logistic regression baseline model

In [7]:
# Baseline classifier: logistic regression with the iteration cap raised
# to 5000, fitted on the training split.
model_base = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Accuracy on the held-out split; the bare last expression is the cell output.
predictions = model_base.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6572769953051644

## Gradient boosting baseline model (scikit-learn `GradientBoostingClassifier`, labelled "XGB" below)

In [8]:
# Second baseline: scikit-learn's gradient-boosted trees (not the xgboost
# package), fitted on the same training split as the logistic regression.
gbc_params = dict(n_estimators=100, learning_rate=0.01, max_depth=5, random_state=42)
clf = GradientBoostingClassifier(**gbc_params).fit(X_train, y_train)

# Score on the held-out split; the bare last expression is the cell output.
clf.score(X_test, y_test)

0.6056338028169014

Load the data for the matches we want to predict

In [9]:
# Fetch the rows to predict on through the shared engine.
predict_df = pd.read_sql_query("select * from afl.data_for_prediction", con=afl_conn)

# Encode the team names with the same mapping `d` used for the training data.
# `.map` silently turns any value missing from `d` into NaN, so check first
# and fail with the offending values instead.
for team_col in ('home_team', 'away_team'):
    encoded = predict_df[team_col].map(d)
    unmapped = predict_df.loc[encoded.isna(), team_col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            'Values in {!r} missing from the team mapping: {}'
            .format(team_col, list(unmapped)))
    predict_df[team_col] = encoded

# Feature matrix for prediction: every column except 'venue'. `drop` returns
# a new frame, so later edits to predict_df do not affect X_.
X_ = predict_df.drop(['venue'], axis=1)

In [10]:
# Inverse lookup (integer code -> team name) built from the encoding dict.
d2 = dict(zip(d.values(), d.keys()))

# Put readable team names back into predict_df; X_ keeps the integer codes.
for team_col in ('home_team', 'away_team'):
    predict_df[team_col] = predict_df[team_col].map(d2)

# One prediction column per fitted model, computed from the encoded features.
for column_label, estimator in (('logreg predict', model_base),
                                ('XGB predict', clf)):
    predict_df[column_label] = estimator.predict(X_)

In [11]:
# Show only the fixture and the two models' predictions.
predict_df.loc[:, ['home_team', 'away_team', 'logreg predict', 'XGB predict']]

Unnamed: 0,home_team,away_team,logreg predict,XGB predict
0,Adelaide,Collingwood,1,1
1,Carlton,Geelong,0,1
2,Essendon,Gold Coast,1,1
3,Fremantle,Sydney,0,1
4,GWS,Brisbane,0,1
5,Hawthorn,West Coast,0,1
6,Melbourne,Port Adelaide,1,1
7,North Melbourne,Richmond,0,0
8,Western Bulldogs,St Kilda,1,1
