In [218]:
library(tidyverse)
library(rpart)
library(pROC)
library(rpart.plot)
library(caret)
library(nnet)
library(randomForest)
library(Matrix)
library(xgboost)

In [219]:
df <- read.csv('epl_data_w_features.csv')

In [220]:
df_play <- df %>%
    filter(data_type == 'hist') %>%
    na.omit %>%
    select(-data_type, -team_1_score, -team_2_score, -id)

### Use the last 5% of rows for 'hold-out'

In [221]:
holdout <- df_play[round(df_play %>% nrow * 0.95):(df_play %>% nrow),]
df_model <- df_play[1:round(df_play %>% nrow * 0.95),]

In [222]:
index <- caret::createDataPartition(y = df_model$result, p = 0.8, list = F)

In [223]:
train <- df_model[index,]
test <- df_model[-index,]

# Classification Models

## Linear Models

### Multinom

In [224]:
fit.mult <- train %>%
                multinom(formula = result ~ .)

# weights:  342 (226 variable)
initial  value 3856.129133 
iter  10 value 3434.523244
iter  20 value 3375.618731
iter  30 value 3340.619217
iter  40 value 3299.878072
iter  50 value 3289.056870
iter  60 value 3280.858340
iter  70 value 3277.195925
iter  80 value 3275.667038
iter  90 value 3274.637179
iter 100 value 3274.061109
final  value 3274.061109 
stopped after 100 iterations


In [225]:
fit.mult %>% summary

"NaNs produced"

Call:
multinom(formula = result ~ ., data = .)

Coefficients:
           (Intercept) is_february is_november c_ability_3 d_ability_1
team_2_win    7.255912 -0.05951545 -0.03437371   0.4070949   -7.154766
tie           4.185965 -0.01906930  0.06766479  -0.1186425   -2.608994
           d_ability_3 d_ability_4     d_form_4    d_h2h_2
team_2_win    5.702480   0.3273958 -0.005919646 -0.4512597
tie           6.336302  -4.0571990 -0.148796377 -0.1584988
           team_1_nameAston Villa team_1_nameBirmingham team_1_nameBlackburn
team_2_win               2.296608             1.5326872            1.9727453
tie                      1.274676             0.8604254            0.9949871
           team_1_nameBlackpool team_1_nameBolton team_1_nameBournemouth
team_2_win             2.758605         1.8003330               1.502174
tie                    2.067055         0.5493466               1.092561
           team_1_nameBrighton team_1_nameBurnley team_1_nameCardiff
team_2_win                   

In [226]:
preds <- predict(fit.mult, newdata = test, type="probs")

In [227]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_mult_nom <- (preds %>% as.data.frame)$tie
test$team_1_prob_mult_nom <- (preds %>% as.data.frame)$team_1_win
test$team_2_prob_mult_nom <- (preds %>% as.data.frame)$team_2_win

In [228]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_mult_nom)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_mult_nom)

Data: test$team_tie_prob_mult_nom in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5474

In [229]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_mult_nom)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_mult_nom)

Data: test$team_1_prob_mult_nom in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.7144

In [230]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_mult_nom)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_mult_nom)

Data: test$team_2_prob_mult_nom in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.7421

### pairwise logreg

In [231]:
train$team_tie <- as.numeric(train$result == 'tie')
train$team_1_win <- as.numeric(train$result == 'team_1_win')
train$team_2_win <- as.numeric(train$result == 'team_2_win')


fit.log_tie <- train %>%
                select(-team_1_win,-team_2_win, -result) %>%
                glm(formula = team_tie ~ ., family="binomial")

fit.log_team_1 <- train %>%
                select(-team_tie,-team_2_win, -result) %>%
                glm(formula = team_1_win ~ ., family="binomial")

fit.log_team_2 <- train %>%
                select(-team_1_win,-team_tie, -result) %>%
                glm(formula = team_2_win ~ ., family="binomial")

In [232]:
fit.log_tie %>% summary


Call:
glm(formula = team_tie ~ ., family = "binomial", data = .)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.331  -0.802  -0.666   1.167   2.433  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)   
(Intercept)                   1.464e+00  2.390e+00   0.613  0.54006   
is_february                   5.687e-03  4.221e-02   0.135  0.89283   
is_november                   7.869e-02  4.250e-02   1.851  0.06411 . 
c_ability_3                  -3.052e-01  7.054e-01  -0.433  0.66530   
d_ability_1                   1.860e-01  1.356e+00   0.137  0.89090   
d_ability_3                   5.768e+00  4.255e+00   1.355  0.17526   
d_ability_4                  -5.812e+00  3.802e+00  -1.529  0.12633   
d_form_4                     -1.524e-01  1.482e-01  -1.029  0.30351   
d_h2h_2                       2.121e-02  6.656e-02   0.319  0.74999   
team_1_nameAston Villa        5.530e-01  3.750e-01   1.474  0.14037   
team_1_nameBirmingham         5.31

In [233]:
pred_tie_log <- predict(fit.log_tie, newdata = test, type="response")
pred_team_1_log <- predict(fit.log_team_1, newdata = test, type="response")
pred_team_2_log <- predict(fit.log_team_2, newdata = test, type="response")

In [234]:
test$pred_tie_log <- pred_tie_log
test$pred_team_1_log <- pred_team_1_log
test$pred_team_2_log <- pred_team_2_log

In [235]:
pROC::roc(response = test$team_tie, predictor = test$pred_tie_log)


Call:
roc.default(response = test$team_tie, predictor = test$pred_tie_log)

Data: test$pred_tie_log in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5412

In [236]:
pROC::roc(response = test$team_1_win, predictor = test$pred_team_1_log)


Call:
roc.default(response = test$team_1_win, predictor = test$pred_team_1_log)

Data: test$pred_team_1_log in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.716

In [237]:
pROC::roc(response = test$team_2_win, predictor = test$pred_team_2_log)


Call:
roc.default(response = test$team_2_win, predictor = test$pred_team_2_log)

Data: test$pred_team_2_log in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.7411

## Non Linear Models

### D Tree (excl. team name factors)

In [238]:
train <- df_model[index,]
test <- df_model[-index,]

fit.tree <- train %>%
                select(-team_1_name, -team_2_name) %>% # rpart isn't very efficient with 'large' factors
                rpart(formula = result ~ .
                      , parms =  list(split = "information")
                      , control = rpart.control(
                                minsplit = 20
                              , cp = 0.001
                              #, maxcompete = 4
                              #, maxsurrogate = 5
                              #, usesurrogate = 2
                              #, xval = 10
                              #, surrogatestyle = 0
                              #, maxdepth = 30
                      )
                     )

In [239]:
#fit.tree %>% summary

In [240]:
#rpart.plot(fit.tree)

In [241]:
preds <- predict(fit.tree, newdata = test, type="prob") %>% as.data.frame

In [242]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_dtree <- (preds %>% as.data.frame)$tie
test$team_1_prob_dtree <- (preds %>% as.data.frame)$team_1_win
test$team_2_prob_dtree <- (preds %>% as.data.frame)$team_2_win

In [243]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_dtree)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_dtree)

Data: test$team_tie_prob_dtree in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5188

In [244]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_dtree)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_dtree)

Data: test$team_1_prob_dtree in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.6445

In [245]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_dtree)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_dtree)

Data: test$team_2_prob_dtree in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.6428

### Random Forest

In [246]:
# train <- df_model[index,]
# test <- df_model[-index,]

# fit.forest <- train %>%
#                 select(-team_1_name, -team_2_name) %>% # rpart isn't very efficient with 'large' factors
#                 randomForest(formula = result ~ .
#                      )

In [247]:
# fit.forest %>% summary

### Xgboost

In [248]:
sparse_matrix <- sparse.model.matrix(result ~ .-1, data = df_model)

sparse_matrix_train <- sparse_matrix[as.vector(index), ,]
sparse_matrix_test <- sparse_matrix[-as.vector(index), ,]

In [249]:
label <- as.numeric(df_model$result)-1

In [250]:
df_model$result[1:15]

In [251]:
label[1:15]

In [252]:
train_label <- label[index]
test_label <- label[-index]

In [253]:
xgb <- xgboost(data = data.matrix(sparse_matrix_train), label = train_label,
            booster = 'gbtree',
             eta = 0.1,
             gamma = 0.01,
             max_depth = 10, 
             subsample = 0.5,
             colsample_bytree = 0.5,
             seed = 1,
             eval_metric = "mlogloss",
             #num_parallel_tree,
             num_class = 3,
             nthread = 8,
             nround=50,
             #feval
              objective = "multi:softprob")

[1]	train-mlogloss:1.058370 
[2]	train-mlogloss:1.025150 
[3]	train-mlogloss:0.992755 
[4]	train-mlogloss:0.960848 
[5]	train-mlogloss:0.934877 
[6]	train-mlogloss:0.909743 
[7]	train-mlogloss:0.885279 
[8]	train-mlogloss:0.859590 
[9]	train-mlogloss:0.838350 
[10]	train-mlogloss:0.819809 
[11]	train-mlogloss:0.800861 
[12]	train-mlogloss:0.781410 
[13]	train-mlogloss:0.764735 
[14]	train-mlogloss:0.747724 
[15]	train-mlogloss:0.731224 
[16]	train-mlogloss:0.714098 
[17]	train-mlogloss:0.696908 
[18]	train-mlogloss:0.685256 
[19]	train-mlogloss:0.674023 
[20]	train-mlogloss:0.660260 
[21]	train-mlogloss:0.651496 
[22]	train-mlogloss:0.639115 
[23]	train-mlogloss:0.626778 
[24]	train-mlogloss:0.613572 
[25]	train-mlogloss:0.604649 
[26]	train-mlogloss:0.594764 
[27]	train-mlogloss:0.585980 
[28]	train-mlogloss:0.576342 
[29]	train-mlogloss:0.565001 
[30]	train-mlogloss:0.555570 
[31]	train-mlogloss:0.548421 
[32]	train-mlogloss:0.541207 
[33]	train-mlogloss:0.532858 
[34]	train-mlogloss

In [254]:
pred <- predict(xgb, data.matrix(sparse_matrix_test), type="probs")

In [255]:
pred_team_tie <- pred[seq(3,length(pred)+2,3)]
pred_team_1 <- pred[seq(1,length(pred),3)]
pred_team_2 <- pred[seq(2,length(pred)+1,3)]

In [256]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_xgb_tree <- pred_team_tie
test$team_1_prob_xgb_tree <- pred_team_1
test$team_2_prob_xgb_tree <- pred_team_2

In [257]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_xgb_tree)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_xgb_tree)

Data: test$team_tie_prob_xgb_tree in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5607

In [258]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_xgb_tree)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_xgb_tree)

Data: test$team_1_prob_xgb_tree in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.6972

In [259]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_xgb_tree)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_xgb_tree)

Data: test$team_2_prob_xgb_tree in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.7173

### Results are surprisingly good. Try on hold out

In [260]:
sparse_matrix_holdout <- sparse.model.matrix(result ~ .-1, data = holdout)

In [261]:
label <- as.numeric(holdout$result)-1

In [262]:
holdout$result[1:15]

In [263]:
label[1:15]

In [264]:
pred_holdout <- predict(xgb, data.matrix(sparse_matrix_holdout), type="probs")

In [265]:
holdout_pred_team_tie <- pred_holdout[seq(3,length(pred_holdout)+2,3)]
holdout_pred_team_1 <- pred_holdout[seq(1,length(pred_holdout),3)]
holdout_pred_team_2 <- pred_holdout[seq(2,length(pred_holdout)+1,3)]

In [266]:
holdout$team_tie <- as.numeric(holdout$result == 'tie')
holdout$team_1_win <- as.numeric(holdout$result == 'team_1_win')
holdout$team_2_win <- as.numeric(holdout$result == 'team_2_win')

holdout$team_tie_prob_xgb_tree <- holdout_pred_team_tie
holdout$team_1_prob_xgb_tree <- holdout_pred_team_1
holdout$team_2_prob_xgb_tree <- holdout_pred_team_2

In [267]:
pROC::roc(response = holdout$team_tie, predictor = holdout$team_tie_prob_xgb_tree)


Call:
roc.default(response = holdout$team_tie, predictor = holdout$team_tie_prob_xgb_tree)

Data: holdout$team_tie_prob_xgb_tree in 180 controls (holdout$team_tie 0) < 52 cases (holdout$team_tie 1).
Area under the curve: 0.6025

In [268]:
pROC::roc(response = holdout$team_1_win, predictor = holdout$team_1_prob_xgb_tree)


Call:
roc.default(response = holdout$team_1_win, predictor = holdout$team_1_prob_xgb_tree)

Data: holdout$team_1_prob_xgb_tree in 126 controls (holdout$team_1_win 0) < 106 cases (holdout$team_1_win 1).
Area under the curve: 0.7681

In [269]:
pROC::roc(response = holdout$team_2_win, predictor = holdout$team_2_prob_xgb_tree)


Call:
roc.default(response = holdout$team_2_win, predictor = holdout$team_2_prob_xgb_tree)

Data: holdout$team_2_prob_xgb_tree in 158 controls (holdout$team_2_win 0) < 74 cases (holdout$team_2_win 1).
Area under the curve: 0.7905

In [270]:
df_comp <- df %>%
    filter(data_type == 'comp') %>%
    #na.omit %>%
    select(-data_type, -team_1_score, -team_2_score, -id)

In [271]:
df_comp

result,is_february,is_november,c_ability_3,d_ability_1,d_ability_3,d_ability_4,d_form_4,d_h2h_2,team_1_name,...,team_2_team_score_ma_10,team_2_team_win_index,team_2_team_loss_index,team_2_tie_index,x_year,y_year,x_week,y_week,x_day,y_day
tie,-0.3178843,-0.3385736,,-2.20500635,-2.2454484,-2.2445428,-0.94887132,-1.525527,West Ham,...,1.7,0.8953355,1.459592,0.9748441,0.9291414,-0.3697245,0.2225209,-0.9749279,-1.0,1.224647e-16
tie,-0.3178843,-0.3385736,-0.89467958,-0.88925776,-0.8380912,-0.8429574,0.03982277,-1.525527,Huddersfield,...,1.0,1.0714286,1.428571,0.4285714,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,0.02459464,-0.02423344,-0.1066833,-0.1148918,0.7435476,-1.525527,Swansea,...,0.8,0.6683673,1.121498,1.1080827,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,0.30058679,0.3648399,0.3435022,0.3538497,0.71709936,-1.525527,Burnley,...,1.8,0.8892463,1.235383,0.671875,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,,-0.33941503,-0.427606,-0.4371238,0.28919047,-1.525527,Crystal Palace,...,1.0,0.8238683,1.107407,1.0111111,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,2.04973789,2.02210898,2.0842651,2.0996903,0.79409032,-1.525527,Tottenham,...,1.2,0.6071429,1.314685,0.990099,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,-0.31882605,-0.55800277,-0.4606444,-0.441627,0.21156075,-1.525527,Newcastle Utd,...,1.3,0.7169811,1.32,0.974359,0.9291414,-0.3697245,0.399892,-0.9165623,-1.0,1.224647e-16
tie,-0.3178843,-0.3385736,,-1.16595526,-1.1310483,-1.1045425,-0.73148374,-1.525527,Southampton,...,2.2,0.8211312,1.45675,1.0573135,0.9353679,-0.3536761,0.9009689,-0.4338837,1.0,0.0
tie,-0.3178843,-0.3385736,1.14170119,1.17455248,1.2793569,1.2666328,-0.29539431,-1.525527,Liverpool,...,1.7,0.6824645,1.235669,1.1948052,0.9353679,-0.3536761,0.9308737,-0.365341,6.123234000000001e-17,1.0
tie,-0.3178843,-0.3385736,,-0.56000922,-0.3529263,-0.3716552,-1.48296742,-1.525527,Manchester United,...,3.0,0.7537313,1.323944,1.1785714,0.9353679,-0.3536761,0.9555728,-0.2947552,-1.0,1.224647e-16
