# Match Outcome Predictor

Script to generate the outcome of a given match using the parameters available of a started match (e.g. a match with a NULL score), using machine learning. Algorithm is applied to the dbo.matches table in the AOE2 SQL database

In [4]:
import pandas as pd
import numpy as np
import pyodbc
from sklearn.preprocessing import OneHotEncoder
import gc

In [5]:
# connect to database
sql_conn = pyodbc.connect(DRIVER="{SQL Server Native Client 11.0}", 
                          SERVER="localhost\SQLEXPRESS", 
                          DATABASE="AOE2", 
                          Trusted_Connection="yes") 

cursor = sql_conn.cursor()

In [6]:
# retrieve data from database
cursor.execute("SELECT TOP 100000 * FROM matchPredictions")
data = cursor.fetchall()

In [7]:
# insert into dataframe
raw = pd.DataFrame([[j for j in i] for i in data],columns=['ratingDifference','playerMatchup','civMatchup','winner'])

In [8]:
# create columns for onehot encoding
civNum = 37 # number of civs in the game

matchString=[]

for i in range(civNum):
    for j in range(civNum):
        
        matchString.append(str(i)+':'+str(j))

len(matchString)

1369

In [9]:
# perform onehot encoding on matchups
onehot = OneHotEncoder(dtype=np.int, sparse=True)

X = pd.DataFrame(
                onehot.fit_transform(raw[['civMatchup']]).toarray(),
                columns=matchString)

In [10]:
X['ratingDifference'] = raw.ratingDifference
y = raw.winner

# free up memory
del raw
gc.collect()

20

In [14]:
# split into training and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

# normalise the ratingDifference feature
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=False)

# split like this to avoid creating another memory-intensive variable
ratingDiffTrain = X_train['ratingDifference']
ratingDiffTest  = X_test['ratingDifference']

ratingDiffTrain.is_copy = None
ratingDiffTest.is_copy = None

X_train.drop('ratingDifference',axis=1,inplace=True)
X_test.drop( 'ratingDifference',axis=1,inplace=True)

X_train['ratingDifference'] = scaler.fit_transform(ratingDiffTrain[:,None])
X_test['ratingDifference']  = scaler.transform(ratingDiffTest[:,None])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['ratingDifference'] = scaler.fit_transform(ratingDiffTrain[:,None])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['ratingDifference']  = scaler.transform(ratingDiffTest[:,None])


In [15]:
## wait until access to more powerful computer before refinements + improvements to accuracy

# train a model
from sklearn.svm import SVC
linRidge = SVC(kernel='linear',C=1.0).fit(X_train,y_train)

In [17]:
linRidge.score(X_test,y_test)

0.58932

In [18]:
coeff_rating = list(zip(X.columns,linRidge.coef_[0]))
most_important_features = sorted(coeff_rating, key=lambda x: x[-1], reverse=True)

print(most_important_features[:4])

[('6:26', 1.2525532518598261), ('15:26', 1.1694300114628022), ('4:36', 1.1692891777100445), ('35:36', 1.127782783091056)]
