In [136]:
library(tidyverse)
library(rpart)
library(pROC)
library(rpart.plot)
library(caret)
library(nnet)
library(randomForest)
library(Matrix)
library(xgboost)

In [5]:
df <- read.csv('epl_data_w_features.csv')

In [31]:
df_play <- df %>%
    filter(data_type == 'hist') %>%
    na.omit %>%
    select(-data_type, -team_1_score, -team_2_score)

### Use the last 5% of rows for 'hold-out'

In [32]:
holdout <- df_play[round(df_play %>% nrow * 0.95):(df_play %>% nrow),]
df_model <- df_play[1:round(df_play %>% nrow * 0.95),]

In [33]:
index <- caret::createDataPartition(y = df_model$result, p = 0.8, list = F)

In [34]:
train <- df_model[index,]
test <- df_model[-index,]

# Classification Models

## Linear Models

### Multinom

In [35]:
fit.mult <- train %>%
                multinom(formula = result ~ .)

# weights:  342 (226 variable)
initial  value 15776.072465 
iter  10 value 14050.961694
iter  20 value 13837.163131
iter  30 value 13765.606720
iter  40 value 13667.293979
iter  50 value 13589.281797
iter  60 value 13546.854082
iter  70 value 13530.828685
iter  80 value 13526.440738
iter  90 value 13523.261664
iter 100 value 13520.637527
final  value 13520.637527 
stopped after 100 iterations


In [36]:
fit.mult %>% summary

"NaNs produced"

Call:
multinom(formula = result ~ ., data = .)

Coefficients:
           (Intercept) is_february is_november c_ability_3 d_ability_1
team_2_win   0.3676769 -0.05770273 0.004129931 -0.06303336   -5.394345
tie          0.5893085  0.01593427 0.059187991 -1.13638565   -2.804251
           d_ability_3 d_ability_4  d_form_4     d_h2h_2 team_1_nameAston Villa
team_2_win    6.031571   -1.097106 0.2924610 -0.30101180               1.634776
tie           7.283479   -3.762729 0.1275206 -0.03864114               0.761339
           team_1_nameBirmingham team_1_nameBlackburn team_1_nameBlackpool
team_2_win             0.8388550            1.0348717             1.978206
tie                    0.5609212            0.0999535             0.329718
           team_1_nameBolton team_1_nameBournemouth team_1_nameBrighton
team_2_win         1.0960967              0.8033679                   0
tie                0.1539115              0.1033317                   0
           team_1_nameBurnley team_1_nameCar

In [38]:
preds <- predict(fit.mult, newdata = test, type="probs")

In [50]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_mult_nom <- (preds %>% as.data.frame)$tie
test$team_1_prob_mult_nom <- (preds %>% as.data.frame)$team_1_win
test$team_2_prob_mult_nom <- (preds %>% as.data.frame)$team_2_win

In [49]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_mult_nom)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_mult_nom)

Data: test$team_tie_prob_mult_nom in 2676 controls (test$team_tie 0) < 912 cases (test$team_tie 1).
Area under the curve: 0.5919

In [51]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_mult_nom)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_mult_nom)

Data: test$team_1_prob_mult_nom in 1915 controls (test$team_1_win 0) < 1673 cases (test$team_1_win 1).
Area under the curve: 0.7252

In [52]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_mult_nom)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_mult_nom)

Data: test$team_2_prob_mult_nom in 2585 controls (test$team_2_win 0) < 1003 cases (test$team_2_win 1).
Area under the curve: 0.7388

### pairwise logreg

In [61]:
train$team_tie <- as.numeric(train$result == 'tie')
train$team_1_win <- as.numeric(train$result == 'team_1_win')
train$team_2_win <- as.numeric(train$result == 'team_2_win')


fit.log_tie <- train %>%
                select(-team_1_win,-team_2_win, -result) %>%
                glm(formula = team_tie ~ ., family="binomial")

fit.log_team_1 <- train %>%
                select(-team_tie,-team_2_win, -result) %>%
                glm(formula = team_1_win ~ ., family="binomial")

fit.log_team_2 <- train %>%
                select(-team_1_win,-team_tie, -result) %>%
                glm(formula = team_2_win ~ ., family="binomial")

In [74]:
fit.log_tie %>% summary


Call:
glm(formula = team_tie ~ ., family = "binomial", data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4096  -0.8020  -0.6794   1.2471   2.4286  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  -3.498e-01  4.951e-01  -0.707 0.479822    
is_february                   3.920e-02  2.046e-02   1.916 0.055417 .  
is_november                   5.772e-02  2.110e-02   2.736 0.006219 ** 
c_ability_3                  -1.160e+00  3.264e-01  -3.554 0.000380 ***
d_ability_1                  -7.951e-01  5.114e-01  -1.555 0.120003    
d_ability_3                   6.581e+00  1.931e+00   3.408 0.000655 ***
d_ability_4                  -4.835e+00  1.725e+00  -2.802 0.005077 ** 
d_form_4                      1.781e-02  5.375e-02   0.331 0.740359    
d_h2h_2                       7.496e-02  2.865e-02   2.616 0.008895 ** 
team_1_nameAston Villa        2.555e-01  1.497e-01   1.707 0.087832 .  
team_1_nameBi

In [68]:
pred_tie_log <- predict(fit.log_tie, newdata = test, type="response")
pred_team_1_log <- predict(fit.log_team_1, newdata = test, type="response")
pred_team_2_log <- predict(fit.log_team_2, newdata = test, type="response")

In [69]:
test$pred_tie_log <- pred_tie_log
test$pred_team_1_log <- pred_team_1_log
test$pred_team_2_log <- pred_team_2_log

In [70]:
pROC::roc(response = test$team_tie, predictor = test$pred_tie_log)


Call:
roc.default(response = test$team_tie, predictor = test$pred_tie_log)

Data: test$pred_tie_log in 2676 controls (test$team_tie 0) < 912 cases (test$team_tie 1).
Area under the curve: 0.5785

In [71]:
pROC::roc(response = test$team_1_win, predictor = test$pred_team_1_log)


Call:
roc.default(response = test$team_1_win, predictor = test$pred_team_1_log)

Data: test$pred_team_1_log in 1915 controls (test$team_1_win 0) < 1673 cases (test$team_1_win 1).
Area under the curve: 0.7258

In [72]:
pROC::roc(response = test$team_2_win, predictor = test$pred_team_2_log)


Call:
roc.default(response = test$team_2_win, predictor = test$pred_team_2_log)

Data: test$pred_team_2_log in 2585 controls (test$team_2_win 0) < 1003 cases (test$team_2_win 1).
Area under the curve: 0.7391

## Non Linear Models

### D Tree (excl. team name factors)

In [98]:
train <- df_model[index,]
test <- df_model[-index,]

fit.tree <- train %>%
                select(-team_1_name, -team_2_name) %>% # rpart isn't very efficient with 'large' factors
                rpart(formula = result ~ .
                      , parms =  list(split = "information")
                      , control = rpart.control(
                                minsplit = 20
                              , cp = 0.001
                              #, maxcompete = 4
                              #, maxsurrogate = 5
                              #, usesurrogate = 2
                              #, xval = 10
                              #, surrogatestyle = 0
                              #, maxdepth = 30
                      )
                     )

In [102]:
#fit.tree %>% summary

In [101]:
#rpart.plot(fit.tree)

In [117]:
preds <- predict(fit.tree, newdata = test, type="prob") %>% as.data.frame

In [120]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_dtree <- (preds %>% as.data.frame)$tie
test$team_1_prob_dtree <- (preds %>% as.data.frame)$team_1_win
test$team_2_prob_dtree <- (preds %>% as.data.frame)$team_2_win

In [121]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_dtree)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_dtree)

Data: test$team_tie_prob_dtree in 2676 controls (test$team_tie 0) < 912 cases (test$team_tie 1).
Area under the curve: 0.6416

In [122]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_dtree)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_dtree)

Data: test$team_1_prob_dtree in 1915 controls (test$team_1_win 0) < 1673 cases (test$team_1_win 1).
Area under the curve: 0.7377

In [123]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_dtree)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_dtree)

Data: test$team_2_prob_dtree in 2585 controls (test$team_2_win 0) < 1003 cases (test$team_2_win 1).
Area under the curve: 0.7555

### Random Forest

In [127]:
# train <- df_model[index,]
# test <- df_model[-index,]

# fit.forest <- train %>%
#                 select(-team_1_name, -team_2_name) %>% # rpart isn't very efficient with 'large' factors
#                 randomForest(formula = result ~ .
#                      )

In [131]:
# fit.forest %>% summary

### Xgboost

In [169]:
sparse_matrix <- sparse.model.matrix(result ~ .-1, data = df_model)

sparse_matrix_train <- sparse_matrix[as.vector(index), ,]
sparse_matrix_test <- sparse_matrix[-as.vector(index), ,]

In [173]:
label <- as.numeric(df_model$result)-1

In [187]:
df_model$result[1:15]

In [188]:
label[1:15]

In [176]:
train_label <- label[index]
test_label <- label[-index]

In [178]:
xgb <- xgboost(data = data.matrix(sparse_matrix_train), label = train_label,
            booster = 'gbtree',
             eta = 0.1,
             gamma = 0.01,
             max_depth = 10, 
             subsample = 0.5,
             colsample_bytree = 0.5,
             seed = 1,
             eval_metric = "mlogloss",
             #num_parallel_tree,
             num_class = 3,
             nthread = 8,
             nround=50,
             #feval
              objective = "multi:softprob")

[1]	train-mlogloss:1.059615 
[2]	train-mlogloss:1.023653 
[3]	train-mlogloss:0.993390 
[4]	train-mlogloss:0.968390 
[5]	train-mlogloss:0.943642 
[6]	train-mlogloss:0.917093 
[7]	train-mlogloss:0.894182 
[8]	train-mlogloss:0.873009 
[9]	train-mlogloss:0.852926 
[10]	train-mlogloss:0.837225 
[11]	train-mlogloss:0.823549 
[12]	train-mlogloss:0.810765 
[13]	train-mlogloss:0.793902 
[14]	train-mlogloss:0.776414 
[15]	train-mlogloss:0.764874 
[16]	train-mlogloss:0.752564 
[17]	train-mlogloss:0.739220 
[18]	train-mlogloss:0.728563 
[19]	train-mlogloss:0.717959 
[20]	train-mlogloss:0.709159 
[21]	train-mlogloss:0.700949 
[22]	train-mlogloss:0.691296 
[23]	train-mlogloss:0.682693 
[24]	train-mlogloss:0.674318 
[25]	train-mlogloss:0.665528 
[26]	train-mlogloss:0.657604 
[27]	train-mlogloss:0.649091 
[28]	train-mlogloss:0.640728 
[29]	train-mlogloss:0.634544 
[30]	train-mlogloss:0.626369 
[31]	train-mlogloss:0.617973 
[32]	train-mlogloss:0.609972 
[33]	train-mlogloss:0.603002 
[34]	train-mlogloss

In [179]:
pred <- predict(xgb, data.matrix(sparse_matrix_test), type="probs")

In [194]:
pred_team_tie <- pred[seq(3,length(pred)+2,3)]
pred_team_1 <- pred[seq(1,length(pred),3)]
pred_team_2 <- pred[seq(2,length(pred)+1,3)]

In [195]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_xgb_tree <- pred_team_tie
test$team_1_prob_xgb_tree <- pred_team_1
test$team_2_prob_xgb_tree <- pred_team_2

In [196]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_xgb_tree)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_xgb_tree)

Data: test$team_tie_prob_xgb_tree in 2676 controls (test$team_tie 0) < 912 cases (test$team_tie 1).
Area under the curve: 0.9302

In [197]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_xgb_tree)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_xgb_tree)

Data: test$team_1_prob_xgb_tree in 1915 controls (test$team_1_win 0) < 1673 cases (test$team_1_win 1).
Area under the curve: 0.9097

In [198]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_xgb_tree)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_xgb_tree)

Data: test$team_2_prob_xgb_tree in 2585 controls (test$team_2_win 0) < 1003 cases (test$team_2_win 1).
Area under the curve: 0.9181

### Results are surprisingly good. Try on hold out

In [200]:
sparse_matrix_holdout <- sparse.model.matrix(result ~ .-1, data = holdout)

In [201]:
label <- as.numeric(holdout$result)-1

In [204]:
holdout$result[1:15]

In [203]:
label[1:15]

In [205]:
pred_holdout <- predict(xgb, data.matrix(sparse_matrix_holdout), type="probs")

In [206]:
holdout_pred_team_tie <- pred_holdout[seq(3,length(pred_holdout)+2,3)]
holdout_pred_team_1 <- pred_holdout[seq(1,length(pred_holdout),3)]
holdout_pred_team_2 <- pred_holdout[seq(2,length(pred_holdout)+1,3)]

In [208]:
holdout$team_tie <- as.numeric(holdout$result == 'tie')
holdout$team_1_win <- as.numeric(holdout$result == 'team_1_win')
holdout$team_2_win <- as.numeric(holdout$result == 'team_2_win')

holdout$team_tie_prob_xgb_tree <- holdout_pred_team_tie
holdout$team_1_prob_xgb_tree <- holdout_pred_team_1
holdout$team_2_prob_xgb_tree <- holdout_pred_team_2

In [209]:
pROC::roc(response = holdout$team_tie, predictor = holdout$team_tie_prob_xgb_tree)


Call:
roc.default(response = holdout$team_tie, predictor = holdout$team_tie_prob_xgb_tree)

Data: holdout$team_tie_prob_xgb_tree in 732 controls (holdout$team_tie 0) < 214 cases (holdout$team_tie 1).
Area under the curve: 0.61

In [210]:
pROC::roc(response = holdout$team_1_win, predictor = holdout$team_1_prob_xgb_tree)


Call:
roc.default(response = holdout$team_1_win, predictor = holdout$team_1_prob_xgb_tree)

Data: holdout$team_1_prob_xgb_tree in 512 controls (holdout$team_1_win 0) < 434 cases (holdout$team_1_win 1).
Area under the curve: 0.7719

In [211]:
pROC::roc(response = holdout$team_2_win, predictor = holdout$team_2_prob_xgb_tree)


Call:
roc.default(response = holdout$team_2_win, predictor = holdout$team_2_prob_xgb_tree)

Data: holdout$team_2_prob_xgb_tree in 648 controls (holdout$team_2_win 0) < 298 cases (holdout$team_2_win 1).
Area under the curve: 0.8009