# Creating a baseline for classification


Notebook attempting to predict the result (Home win, away win, draw) of any fixture given the teams that are playing it based on their performance in the previous season. We use multiclass classification to predict the results of the matches. More feature engineering on the data might lead us to better results.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as scipy

### Load the data

In [2]:
df = pd.read_csv("./Data/E0_13.csv")
df_14 = pd.read_csv("./Data/E0_14.csv")

In [3]:
df.columns

Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',
       u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',
       u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',
       u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',
       u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',
       u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',
       u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',
       u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',
       u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],
      dtype='object')

### Cleaning

We do not need information about division, data, referee and the betting odds from various companies for this method. 

In [71]:
res_13 = df.ix[:,:23]
res_13 = res_13.drop(['Div','Date','Referee'],axis=1)
res_14 = df_14.ix[:,:23]
res_14 = res_14.drop(['Div','Date','Referee'],axis=1)
table_features = df.ix[:,:7]
table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)
bet_13 = df.ix[:,23:]


In [96]:
res_13.groupby('FTR').count()

Unnamed: 0_level_0,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
FTR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
A,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123
D,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78
H,179,179,179,179,179,179,179,179,179,179,179,179,179,179,179,179,179,179,179


In [98]:
from math import log

In [104]:
entropy = -((0.32 * log(0.32,3)) + (0.20 * log(0.20,3)) + (0.47 * log(0.47,3)))

In [105]:
entropy

0.947893245378005

In [72]:
res_13.head()
feature_table = df.ix[:,:23]

In [73]:
#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength
table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))

In [74]:
avg_home_scored_13 = res_13.FTHG.sum() / 380.0
avg_away_scored_13 = res_13.FTAG.sum() / 380.0
avg_home_conceded_13 = avg_away_scored_13
avg_away_conceded_13 = avg_home_scored_13
print "Average number of goals at home",avg_home_scored_13
print "Average number of goals away", avg_away_scored_13
print "Average number of goals conceded at home",avg_away_conceded_13
print "Average number of goals conceded away",avg_home_conceded_13


Average number of goals at home 1.57368421053
Average number of goals away 1.19473684211
Average number of goals conceded at home 1.57368421053
Average number of goals conceded away 1.19473684211


In [75]:
res_home = res_13.groupby('HomeTeam')
res_away = res_13.groupby('AwayTeam')

In [76]:
table_13.Team = res_home.HomeTeam.all().values
table_13.HGS = res_home.FTHG.sum().values
table_13.HGC = res_home.FTAG.sum().values
table_13.AGS = res_away.FTAG.sum().values
table_13.AGC = res_away.FTHG.sum().values
table_13.head()

Unnamed: 0,Team,HGS,AGS,HAS,AAS,HGC,AGC,HDS,ADS
0,Arsenal,36,32,,,11,30,,
1,Aston Villa,22,17,,,29,32,,
2,Cardiff,20,12,,,35,39,,
3,Chelsea,43,28,,,11,16,,
4,Crystal Palace,18,15,,,23,25,,


In [77]:
table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13
table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13
table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13
table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13
table_13.head()

Unnamed: 0,Team,HGS,AGS,HAS,AAS,HGC,AGC,HDS,ADS
0,Arsenal,36,32,1.204013,1.409692,11,30,0.484581,1.003344
1,Aston Villa,22,17,0.735786,0.748899,29,32,1.277533,1.070234
2,Cardiff,20,12,0.668896,0.528634,35,39,1.54185,1.304348
3,Chelsea,43,28,1.438127,1.23348,11,16,0.484581,0.535117
4,Crystal Palace,18,15,0.602007,0.660793,23,25,1.013216,0.83612


In [78]:
feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]
f_HAS = []
f_HDS = []
f_AAS = []
f_ADS = []
for index,row in feature_table.iterrows():
    f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])
    f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])
    f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])
    f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])
    
feature_table['HAS'] = f_HAS
feature_table['HDS'] = f_HDS
feature_table['AAS'] = f_AAS
feature_table['ADS'] = f_ADS

In [79]:
feature_table.head()

Unnamed: 0,HomeTeam,AwayTeam,FTR,HAS,HDS,AAS,ADS
0,Arsenal,Aston Villa,A,1.204013,0.484581,1.409692,1.003344
1,Liverpool,Stoke,H,1.772575,0.792952,2.114537,1.070234
2,Norwich,Everton,D,0.568562,0.792952,0.484581,1.471572
3,Sunderland,Fulham,A,0.702341,1.189427,0.881057,1.103679
4,Swansea,Man United,A,1.103679,1.145374,0.92511,0.936455


In [82]:
def transformResult(row):
    if(row.FTR == 'H'):
        return 1
    elif(row.FTR == 'A'):
        return -1
    else:
        return 0

In [83]:
feature_table["Result"] = feature_table.apply(lambda row: transformResult(row),axis=1)

In [84]:
feature_table.head()

Unnamed: 0,HomeTeam,AwayTeam,FTR,HAS,HDS,AAS,ADS,Result
0,Arsenal,Aston Villa,A,1.204013,0.484581,1.409692,1.003344,-1
1,Liverpool,Stoke,H,1.772575,0.792952,2.114537,1.070234,1
2,Norwich,Everton,D,0.568562,0.792952,0.484581,1.471572,0
3,Sunderland,Fulham,A,0.702341,1.189427,0.881057,1.103679,-1
4,Swansea,Man United,A,1.103679,1.145374,0.92511,0.936455,-1


In [85]:
X_train = feature_table[['HAS','HDS','AAS','ADS']]
y_train = feature_table['Result']

In [86]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [87]:
clf1 = DecisionTreeClassifier()
clf2 = XGBClassifier()
clf3 = KNeighborsClassifier(n_neighbors=15)
clf3.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [88]:
y_pred = clf3.predict(X_train)
accuracy_score(y_pred,y_train)

0.54736842105263162