In [218]:
library(tidyverse)
library(rpart)
library(pROC)
library(rpart.plot)
library(caret)
library(nnet)
library(randomForest)
library(Matrix)
library(xgboost)
library(zoo)

In [219]:
df <- read.csv('epl_data_w_features.csv')

In [322]:
df <- df %>% select(-c_ability_3)

In [323]:
df_play <- df %>%
    filter(data_type == 'hist') %>%
    na.omit %>%
    select(-data_type, -team_1_score, -team_2_score, -id)

### Use the last 5% of rows for 'hold-out'

### Helper function for log loss

In [457]:
logloss <- function(result, team_1_prob, team_2_prob, team_tie_prob){
    
    team_tie <- as.numeric(result == 'tie')
    team_1_win <- as.numeric(result == 'team_1_win')
    team_2_win <- as.numeric(result == 'team_2_win')
    
    log_losses <- team_tie * log(team_tie_prob) + team_1_win * log(team_1_prob) + team_2_win * log(team_2_prob)
    
    log_losses[which(!is.finite(log_losses))] <- NA
    
    return(
         -1 * mean( log_losses, na.rm = T  )
        )
}

In [324]:
holdout <- df_play[round(df_play %>% nrow * 0.95):(df_play %>% nrow),]
df_model <- df_play[1:round(df_play %>% nrow * 0.95),]

In [325]:
index <- caret::createDataPartition(y = df_model$result, p = 0.8, list = F)

In [326]:
train <- df_model[index,]
test <- df_model[-index,]

# Classification Models

## Linear Models

### Multinom

In [327]:
fit.mult <- train %>%
                multinom(formula = result ~ .)

# weights:  339 (224 variable)
initial  value 3856.129133 
iter  10 value 3444.284188
iter  20 value 3385.442892
iter  30 value 3341.695076
iter  40 value 3294.117657
iter  50 value 3280.204725
iter  60 value 3273.122916
iter  70 value 3269.953816
iter  80 value 3268.858662
iter  90 value 3267.643583
iter 100 value 3266.948937
final  value 3266.948937 
stopped after 100 iterations


In [328]:
fit.mult %>% summary

"NaNs produced"

Call:
multinom(formula = result ~ ., data = .)

Coefficients:
           (Intercept) is_february is_november d_ability_1 d_ability_3
team_2_win    4.033818 -0.09617404 -0.04268949   -8.093651    7.687734
tie           1.857866 -0.01106152  0.03737004   -3.163904    8.462804
           d_ability_4    d_form_4    d_h2h_2 team_1_nameAston Villa
team_2_win  -0.2706663  0.01294139 -0.5023426              2.3453274
tie         -5.6950433 -0.08489720 -0.0890849              0.7844435
           team_1_nameBirmingham team_1_nameBlackburn team_1_nameBlackpool
team_2_win             1.3158453            1.5676319             3.501682
tie                    0.2237564            0.2079519             1.570668
           team_1_nameBolton team_1_nameBournemouth team_1_nameBrighton
team_2_win         1.4147930              2.2734065                   0
tie                0.1265607              0.8492698                   0
           team_1_nameBurnley team_1_nameCardiff team_1_nameCharlton
team_2_w

In [329]:
preds <- predict(fit.mult, newdata = test, type="probs")

In [434]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_mult_nom <- (preds %>% as.data.frame)$tie
test$team_1_prob_mult_nom <- (preds %>% as.data.frame)$team_1_win
test$team_2_prob_mult_nom <- (preds %>% as.data.frame)$team_2_win

In [435]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_mult_nom)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_mult_nom)

Data: test$team_tie_prob_mult_nom in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5607

In [436]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_mult_nom)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_mult_nom)

Data: test$team_1_prob_mult_nom in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.6434

In [437]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_mult_nom)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_mult_nom)

Data: test$team_2_prob_mult_nom in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.6061

In [456]:
logloss(test$result, test$team_tie_prob_mult_nom, test$team_1_prob_mult_nom, test$team_2_prob_mult_nom)

### pairwise logreg (forced 'hidden' layer Neural Net)

In [334]:
train$team_tie <- as.numeric(train$result == 'tie')
train$team_1_win <- as.numeric(train$result == 'team_1_win')
train$team_2_win <- as.numeric(train$result == 'team_2_win')


fit.log_tie <- train %>%
                select(-team_1_win,-team_2_win, -result) %>%
                glm(formula = team_tie ~ ., family="binomial")

fit.log_team_1 <- train %>%
                select(-team_tie,-team_2_win, -result) %>%
                glm(formula = team_1_win ~ ., family="binomial")

fit.log_team_2 <- train %>%
                select(-team_1_win,-team_tie, -result) %>%
                glm(formula = team_2_win ~ ., family="binomial")

In [335]:
fit.log_tie %>% summary


Call:
glm(formula = team_tie ~ ., family = "binomial", data = .)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4356  -0.8018  -0.6726   1.1940   2.2434  

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   6.195e-01  2.391e+00   0.259 0.795540    
is_february                   2.617e-02  4.216e-02   0.621 0.534795    
is_november                   5.178e-02  4.326e-02   1.197 0.231361    
d_ability_1                   6.660e-04  1.336e+00   0.000 0.999602    
d_ability_3                   7.093e+00  4.053e+00   1.750 0.080133 .  
d_ability_4                  -7.233e+00  3.735e+00  -1.937 0.052771 .  
d_form_4                     -8.911e-02  1.478e-01  -0.603 0.546655    
d_h2h_2                       1.084e-01  6.581e-02   1.648 0.099424 .  
team_1_nameAston Villa        7.613e-02  3.871e-01   0.197 0.844087    
team_1_nameBirmingham         4.331e-02  4.695e-01   0.092 0.926507    
team_1_nameBl

In [477]:
## built multinom model ontop to select probabilities

train_tie_log <- predict(fit.log_tie, newdata = train, type="response")
train_team_1_log <- predict(fit.log_team_1, newdata = train, type="response")
train_team_2_log <- predict(fit.log_team_2, newdata = train, type="response")

tmp_train <- train$result %>% as.data.frame
names(tmp_train) <- 'result'
tmp_train$train_tie_log <- train_tie_log
tmp_train$train_team_1_log <- train_team_1_log
tmp_train$train_team_2_log <- train_team_2_log

fit.mult_log <- tmp_train %>%
                multinom(formula = result ~ .)

# weights:  15 (8 variable)
initial  value 3856.129133 
iter  10 value 3271.157696
final  value 3270.874533 
converged


In [480]:
pred_tie_log <- predict(fit.log_tie, newdata = test, type="response")
pred_team_1_log <- predict(fit.log_team_1, newdata = test, type="response")
pred_team_2_log <- predict(fit.log_team_2, newdata = test, type="response")

tmp_test <- test$result %>% as.data.frame
names(tmp_test) <- 'result'
tmp_test$train_tie_log <- pred_tie_log
tmp_test$train_team_1_log <- pred_team_1_log
tmp_test$train_team_2_log <- pred_team_2_log

In [482]:
preds_mult_log <- predict(fit.mult_log, newdata = tmp_test, type="probs")

In [483]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_log_mult <- (preds_mult_log %>% as.data.frame)$tie
test$team_1_prob_log_mult <- (preds_mult_log %>% as.data.frame)$team_1_win
test$team_2_prob_log_mult <- (preds_mult_log %>% as.data.frame)$team_2_win

In [337]:
test$pred_tie_log <- pred_tie_log
test$pred_team_1_log <- pred_team_1_log
test$pred_team_2_log <- pred_team_2_log

In [484]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_log_mult)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_log_mult)

Data: test$team_tie_prob_log_mult in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.6023

In [485]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_log_mult)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_log_mult)

Data: test$team_1_prob_log_mult in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.7019

In [486]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_log_mult)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_log_mult)

Data: test$team_2_prob_log_mult in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.7178

In [487]:
logloss(test$result, test$team_tie_prob_log_mult, test$team_1_prob_log_mult, test$team_2_prob_log_mult)

## Non Linear Models

### D Tree (excl. team name factors)

In [602]:
train <- df_model[index,]
test <- df_model[-index,]

fit.tree <- train %>%
                select(-team_1_name, -team_2_name) %>% # rpart isn't very efficient with 'large' factors
                rpart(formula = result ~ .
                      , parms =  list(split = "information")
                      , control = rpart.control(
                                minsplit = 20
                              , cp = 0.005
                              #, maxcompete = 4
                              #, maxsurrogate = 5
                              #, usesurrogate = 2
                              #, xval = 10
                              #, surrogatestyle = 0
                              #, maxdepth = 30
                      )
                     )

In [603]:
#fit.tree %>% summary

In [605]:
#rpart.plot(fit.tree)

In [606]:
preds <- predict(fit.tree, newdata = test, type="prob") %>% as.data.frame

In [607]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_dtree <- (preds %>% as.data.frame)$tie
test$team_1_prob_dtree <- (preds %>% as.data.frame)$team_1_win
test$team_2_prob_dtree <- (preds %>% as.data.frame)$team_2_win

In [608]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_dtree)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_dtree)

Data: test$team_tie_prob_dtree in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5489

In [609]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_dtree)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_dtree)

Data: test$team_1_prob_dtree in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.6783

In [610]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_dtree)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_dtree)

Data: test$team_2_prob_dtree in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.6877

In [611]:
logloss(test$result, test$team_tie_prob_dtree, test$team_1_prob_dtree, test$team_2_prob_dtree)

### Random Forest

In [349]:
# train <- df_model[index,]
# test <- df_model[-index,]

# fit.forest <- train %>%
#                 select(-team_1_name, -team_2_name) %>% # rpart isn't very efficient with 'large' factors
#                 randomForest(formula = result ~ .
#                      )

In [350]:
# fit.forest %>% summary

### Xgboost

In [520]:
sparse_matrix <- sparse.model.matrix(result ~ .-1, data = df_model)

sparse_matrix_train <- sparse_matrix[as.vector(index), ,]
sparse_matrix_test <- sparse_matrix[-as.vector(index), ,]

In [521]:
label <- as.numeric(df_model$result)-1

In [522]:
df_model$result[1:15]

In [523]:
label[1:15]

In [524]:
train_label <- label[index]
test_label <- label[-index]

In [731]:
xgb <- xgboost(data = data.matrix(sparse_matrix_train), label = train_label,
            booster = 'gbtree',
             eta = 0.005,
             gamma = 0.3,
             max_depth = 8, 
             subsample = 0.15,
             colsample_bytree = 1,
             #seed = 1,
             eval_metric = "mlogloss",
             #num_parallel_tree,
             num_class = 3,
             nthread = 8,
             nround=1000,
             #feval
              objective = "multi:softprob")

[1]	train-mlogloss:1.097242 
[2]	train-mlogloss:1.095849 
[3]	train-mlogloss:1.094501 
[4]	train-mlogloss:1.093122 
[5]	train-mlogloss:1.091952 
[6]	train-mlogloss:1.090551 
[7]	train-mlogloss:1.089353 
[8]	train-mlogloss:1.088055 
[9]	train-mlogloss:1.086713 
[10]	train-mlogloss:1.085436 
[11]	train-mlogloss:1.084222 
[12]	train-mlogloss:1.083008 
[13]	train-mlogloss:1.081765 
[14]	train-mlogloss:1.080406 
[15]	train-mlogloss:1.079062 
[16]	train-mlogloss:1.077817 
[17]	train-mlogloss:1.076688 
[18]	train-mlogloss:1.075506 
[19]	train-mlogloss:1.074251 
[20]	train-mlogloss:1.073109 
[21]	train-mlogloss:1.071813 
[22]	train-mlogloss:1.070481 
[23]	train-mlogloss:1.069269 
[24]	train-mlogloss:1.067956 
[25]	train-mlogloss:1.066840 
[26]	train-mlogloss:1.065625 
[27]	train-mlogloss:1.064566 
[28]	train-mlogloss:1.063440 
[29]	train-mlogloss:1.062351 
[30]	train-mlogloss:1.061367 
[31]	train-mlogloss:1.060171 
[32]	train-mlogloss:1.059132 
[33]	train-mlogloss:1.058060 
[34]	train-mlogloss

In [732]:
#? xgboost

In [733]:
pred <- predict(xgb, data.matrix(sparse_matrix_test), type="probs")

In [734]:
pred_team_tie <- pred[seq(3,length(pred)+2,3)]
pred_team_1 <- pred[seq(1,length(pred),3)]
pred_team_2 <- pred[seq(2,length(pred)+1,3)]

In [735]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_xgb_tree <- pred_team_tie
test$team_1_prob_xgb_tree <- pred_team_1
test$team_2_prob_xgb_tree <- pred_team_2

In [736]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_xgb_tree)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_xgb_tree)

Data: test$team_tie_prob_xgb_tree in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5823

In [737]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_xgb_tree)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_xgb_tree)

Data: test$team_1_prob_xgb_tree in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.6884

In [738]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_xgb_tree)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_xgb_tree)

Data: test$team_2_prob_xgb_tree in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.7055

In [739]:
logloss(test$result, test$team_tie_prob_xgb_tree, test$team_1_prob_xgb_tree, test$team_2_prob_xgb_tree)

### Results are within the norm. Try Linear booster

In [420]:
xgb_lin <- xgboost(data = data.matrix(sparse_matrix_train), label = train_label,
            booster = 'gblinear',
            lambda = 0.0,
            lambda_bias = 0.0,
            alpha = 0.0,
             seed = 1,
             eval_metric = "mlogloss",
             #num_parallel_tree,
             num_class = 3,
             nthread = 8,
             nround=50,
             #feval
              objective = "multi:softprob")

[1]	train-mlogloss:0.962866 
[2]	train-mlogloss:0.950674 
[3]	train-mlogloss:0.947586 
[4]	train-mlogloss:0.946364 
[5]	train-mlogloss:0.945688 
[6]	train-mlogloss:0.945296 
[7]	train-mlogloss:0.944963 
[8]	train-mlogloss:0.944709 
[9]	train-mlogloss:0.944428 
[10]	train-mlogloss:0.944176 
[11]	train-mlogloss:0.943973 
[12]	train-mlogloss:0.943789 
[13]	train-mlogloss:0.943597 
[14]	train-mlogloss:0.943437 
[15]	train-mlogloss:0.943266 
[16]	train-mlogloss:0.943087 
[17]	train-mlogloss:0.942942 
[18]	train-mlogloss:0.942759 
[19]	train-mlogloss:0.942604 
[20]	train-mlogloss:0.942468 
[21]	train-mlogloss:0.942340 
[22]	train-mlogloss:0.942231 
[23]	train-mlogloss:0.942115 
[24]	train-mlogloss:0.941989 
[25]	train-mlogloss:0.941863 
[26]	train-mlogloss:0.941739 
[27]	train-mlogloss:0.941628 
[28]	train-mlogloss:0.941500 
[29]	train-mlogloss:0.941378 
[30]	train-mlogloss:0.941265 
[31]	train-mlogloss:0.941145 
[32]	train-mlogloss:0.941035 
[33]	train-mlogloss:0.940919 
[34]	train-mlogloss

In [364]:
pred_lin <- predict(xgb_lin, data.matrix(sparse_matrix_test), type="probs")

In [365]:
pred_lin_team_tie <- pred_lin[seq(3,length(pred_lin)+2,3)]
pred_lin_team_1 <- pred_lin[seq(1,length(pred_lin),3)]
pred_lin_team_2 <- pred_lin[seq(2,length(pred_lin)+1,3)]

In [366]:
test$team_tie <- as.numeric(test$result == 'tie')
test$team_1_win <- as.numeric(test$result == 'team_1_win')
test$team_2_win <- as.numeric(test$result == 'team_2_win')

test$team_tie_prob_xgb_lin <- pred_lin_team_tie
test$team_1_prob_xgb_lin <- pred_lin_team_1
test$team_2_prob_xgb_lin <- pred_lin_team_2

In [367]:
pROC::roc(response = test$team_tie, predictor = test$team_tie_prob_xgb_lin)


Call:
roc.default(response = test$team_tie, predictor = test$team_tie_prob_xgb_lin)

Data: test$team_tie_prob_xgb_lin in 654 controls (test$team_tie 0) < 223 cases (test$team_tie 1).
Area under the curve: 0.5802

In [368]:
pROC::roc(response = test$team_1_win, predictor = test$team_1_prob_xgb_lin)


Call:
roc.default(response = test$team_1_win, predictor = test$team_1_prob_xgb_lin)

Data: test$team_1_prob_xgb_lin in 468 controls (test$team_1_win 0) < 409 cases (test$team_1_win 1).
Area under the curve: 0.701

In [369]:
pROC::roc(response = test$team_2_win, predictor = test$team_2_prob_xgb_lin)


Call:
roc.default(response = test$team_2_win, predictor = test$team_2_prob_xgb_lin)

Data: test$team_2_prob_xgb_lin in 632 controls (test$team_2_win 0) < 245 cases (test$team_2_win 1).
Area under the curve: 0.7153

# Test on holdout

In [740]:
sparse_matrix_holdout <- sparse.model.matrix(result ~ .-1, data = holdout)

In [741]:
label <- as.numeric(holdout$result)-1

In [742]:
holdout$result[1:15]

In [743]:
label[1:15]

In [744]:
pred_holdout <- predict(xgb, data.matrix(sparse_matrix_holdout), type="probs")

In [745]:
holdout_pred_team_tie <- pred_holdout[seq(3,length(pred_holdout)+2,3)]
holdout_pred_team_1 <- pred_holdout[seq(1,length(pred_holdout),3)]
holdout_pred_team_2 <- pred_holdout[seq(2,length(pred_holdout)+1,3)]

In [746]:
holdout$team_tie <- as.numeric(holdout$result == 'tie')
holdout$team_1_win <- as.numeric(holdout$result == 'team_1_win')
holdout$team_2_win <- as.numeric(holdout$result == 'team_2_win')

holdout$team_tie_prob_xgb_tree <- holdout_pred_team_tie
holdout$team_1_prob_xgb_tree <- holdout_pred_team_1
holdout$team_2_prob_xgb_tree <- holdout_pred_team_2

In [747]:
pROC::roc(response = holdout$team_tie, predictor = holdout$team_tie_prob_xgb_tree)


Call:
roc.default(response = holdout$team_tie, predictor = holdout$team_tie_prob_xgb_tree)

Data: holdout$team_tie_prob_xgb_tree in 180 controls (holdout$team_tie 0) < 52 cases (holdout$team_tie 1).
Area under the curve: 0.5949

In [748]:
pROC::roc(response = holdout$team_1_win, predictor = holdout$team_1_prob_xgb_tree)


Call:
roc.default(response = holdout$team_1_win, predictor = holdout$team_1_prob_xgb_tree)

Data: holdout$team_1_prob_xgb_tree in 126 controls (holdout$team_1_win 0) < 106 cases (holdout$team_1_win 1).
Area under the curve: 0.7671

In [749]:
pROC::roc(response = holdout$team_2_win, predictor = holdout$team_2_prob_xgb_tree)


Call:
roc.default(response = holdout$team_2_win, predictor = holdout$team_2_prob_xgb_tree)

Data: holdout$team_2_prob_xgb_tree in 158 controls (holdout$team_2_win 0) < 74 cases (holdout$team_2_win 1).
Area under the curve: 0.8023

In [750]:
logloss(holdout$result, holdout$team_tie_prob_xgb_tree, holdout$team_1_prob_xgb_tree, holdout$team_2_prob_xgb_tree)

In [751]:
df_comp <- df %>%
    filter(data_type == 'comp') %>%
    #na.omit %>%
    select(-data_type, -team_1_score, -team_2_score, -id)

In [752]:
df_comp

result,is_february,is_november,d_ability_1,d_ability_3,d_ability_4,d_form_4,d_h2h_2,team_1_name,team_2_name,...,team_2_team_score_ma_10,team_2_team_win_index,team_2_team_loss_index,team_2_tie_index,x_year,y_year,x_week,y_week,x_day,y_day
tie,-0.3178843,-0.3385736,-2.20500635,-2.2454484,-2.2445428,-0.94887132,-1.525527,West Ham,Chelsea,...,1.7,0.8953355,1.459592,0.9748441,0.9291414,-0.3697245,0.2225209,-0.9749279,-1.0,1.224647e-16
tie,-0.3178843,-0.3385736,-0.88925776,-0.8380912,-0.8429574,0.03982277,-1.525527,Huddersfield,Brighton,...,1.0,1.0714286,1.428571,0.4285714,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,-0.02423344,-0.1066833,-0.1148918,0.7435476,-1.525527,Swansea,West Brom,...,0.8,0.6683673,1.121498,1.1080827,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,0.3648399,0.3435022,0.3538497,0.71709936,-1.525527,Burnley,Watford,...,1.8,0.8892463,1.235383,0.671875,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,-0.33941503,-0.427606,-0.4371238,0.28919047,-1.525527,Crystal Palace,Bournemouth,...,1.0,0.8238683,1.107407,1.0111111,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,2.02210898,2.0842651,2.0996903,0.79409032,-1.525527,Tottenham,Stoke City,...,1.2,0.6071429,1.314685,0.990099,0.9291414,-0.3697245,0.3302791,-0.9438833,1.0,0.0
tie,-0.3178843,-0.3385736,-0.55800277,-0.4606444,-0.441627,0.21156075,-1.525527,Newcastle Utd,Leicester,...,1.3,0.7169811,1.32,0.974359,0.9291414,-0.3697245,0.399892,-0.9165623,-1.0,1.224647e-16
tie,-0.3178843,-0.3385736,-1.16595526,-1.1310483,-1.1045425,-0.73148374,-1.525527,Southampton,Arsenal,...,2.2,0.8211312,1.45675,1.0573135,0.9353679,-0.3536761,0.9009689,-0.4338837,1.0,0.0
tie,-0.3178843,-0.3385736,1.17455248,1.2793569,1.2666328,-0.29539431,-1.525527,Liverpool,Everton,...,1.7,0.6824645,1.235669,1.1948052,0.9353679,-0.3536761,0.9308737,-0.365341,6.123234000000001e-17,1.0
tie,-0.3178843,-0.3385736,-0.56000922,-0.3529263,-0.3716552,-1.48296742,-1.525527,Manchester United,Manchester City,...,3.0,0.7537313,1.323944,1.1785714,0.9353679,-0.3536761,0.9555728,-0.2947552,-1.0,1.224647e-16


# submit

In [753]:
sparse_matrix_comp <- sparse.model.matrix(result ~ .-1, data = df_comp)

In [754]:
pred_comp <- predict(xgb, data.matrix(sparse_matrix_comp), type="probs")

In [755]:
comp_pred_team_tie <- pred_comp[seq(3,length(pred_comp)+2,3)]
comp_pred_team_1 <- pred_comp[seq(1,length(pred_comp),3)]
comp_pred_team_2 <- pred_comp[seq(2,length(pred_comp)+1,3)]

In [756]:
df_comp$team_tie_prob <- comp_pred_team_tie
df_comp$team_1_prob <- comp_pred_team_1
df_comp$team_2_prob <- comp_pred_team_2

In [757]:
names(df)

In [758]:
cbind(df %>%
    filter(data_type == 'comp') %>% select(id, team_1_name, team_2_name), df_comp %>% select(team_1_prob, team_2_prob, team_tie_prob) ) %>% write.csv('submit.csv')

In [759]:
holdout

Unnamed: 0,result,is_february,is_november,d_ability_1,d_ability_3,d_ability_4,d_form_4,d_h2h_2,team_1_name,team_2_name,...,x_week,y_week,x_day,y_day,team_tie,team_1_win,team_2_win,team_tie_prob_xgb_tree,team_1_prob_xgb_tree,team_2_prob_xgb_tree
4959,tie,-0.3178843,-0.3385736,-0.470756469,-0.490722809,-0.473596593,1.55552530,-1.1639329,Sunderland,Burnley,...,3.302791e-01,-0.9438833,1.000000e+00,0.000000e+00,1,0,0,0.51627398,0.2967409,0.18698512
4960,team_2_win,-0.3178843,-0.3385736,-0.028217053,-0.062176715,-0.084058343,-0.03603988,-1.1532618,West Ham,Leicester,...,3.302791e-01,-0.9438833,1.000000e+00,0.000000e+00,0,0,1,0.26685303,0.4982227,0.23492430
4961,team_2_win,-0.3178843,-0.3385736,-1.575147450,-1.522828176,-1.525281323,-0.77514404,-1.1318592,Stoke City,Chelsea,...,3.302791e-01,-0.9438833,1.000000e+00,0.000000e+00,0,0,1,0.19060796,0.2597259,0.54966617
4962,team_1_win,-0.3178843,-0.3385736,0.026651534,-0.036350433,-0.026219782,0.20397129,-1.1179886,Bournemouth,Swansea,...,3.998920e-01,-0.9165623,-1.000000e+00,1.224647e-16,0,1,0,0.19385187,0.5032742,0.30287388
4963,team_2_win,-0.3178843,-0.3385736,-1.225435498,-1.183822254,-1.168835932,-0.91181741,-1.1852997,Middlesbrough,Manchester United,...,9.009689e-01,-0.4338837,1.000000e+00,0.000000e+00,0,0,1,0.21671821,0.2477370,0.53554481
4964,team_1_win,-0.3178843,-0.3385736,1.057301957,1.140058252,1.124049604,-0.00853235,-1.1676363,Tottenham,Southampton,...,9.308737e-01,-0.3653410,6.123234e-17,1.000000e+00,0,1,0,0.12927678,0.7759579,0.09476530
4965,tie,-0.3178843,-0.3385736,0.228954504,0.420244669,0.431922151,-1.52104564,-1.1782931,Manchester City,Liverpool,...,9.555728e-01,-0.2947552,-1.000000e+00,1.224647e-16,1,0,0,0.18776959,0.5487440,0.26348644
4966,team_1_win,-0.3178843,-0.3385736,0.205763270,0.342714352,0.358106917,-0.40180876,-1.1323344,Liverpool,Everton,...,1.859116e-01,-0.9825665,-1.000000e+00,1.224647e-16,0,1,0,0.34155887,0.4982745,0.16016667
4967,team_2_win,-0.3178843,-0.3385736,-1.559197665,-1.429278006,-1.427209821,-0.57418561,-1.1820384,Burnley,Tottenham,...,2.947552e-01,-0.9555728,1.000000e+00,0.000000e+00,0,0,1,0.30141944,0.2503015,0.44827908
4968,team_1_win,-0.3178843,-0.3385736,-0.735955135,-0.748060680,-0.735863023,-0.01813003,-1.1750100,Hull City,West Ham,...,2.947552e-01,-0.9555728,1.000000e+00,0.000000e+00,0,1,0,0.30060878,0.2639770,0.43541420
