In [1]:
import pandas as pd
import numpy as np
import re
import json

In [3]:
results = pd.read_csv('results-2018.csv')[['Round', 'Home', 'Away', 'Score']]
fivethirtyeight = pd.read_csv('538-2018.csv').rename(columns={
    'proba': '538_proba',
    'probx': '538_probx',
    'probb': '538_probb',
}).drop(['teama', 'teamb'], axis=1)
df = results.join(fivethirtyeight)
df

Unnamed: 0,Round,Home,Away,Score,538_proba,538_probx,538_probb
0,Group stage,Russia ru,sa Saudi Arabia,5–0,0.66,0.22,0.12
1,Group stage,Egypt eg,uy Uruguay,0–1,0.18,0.28,0.54
2,Group stage,Morocco ma,ir IR Iran,0–1,0.40,0.33,0.27
3,Group stage,Portugal pt,es Spain,3–3,0.25,0.27,0.48
4,Group stage,France fr,au Australia,2–1,0.77,0.16,0.07
...,...,...,...,...,...,...,...
59,Quarter-finals,Russia ru,hr Croatia,(3) 2–2 (4),0.39,0.00,0.61
60,Semi-finals,France fr,be Belgium,1–0,0.52,0.00,0.48
61,Semi-finals,Croatia hr,eng England,2–1,0.43,0.00,0.57
62,Third-place match,Belgium be,eng England,2–0,0.51,0.00,0.49


In [5]:
def winner(score):
    if isinstance(score, float) and np.isnan(score):
        return None
    scores = score.split(' ')
    if len(scores) == 1:
        a, b = int(score.split('–')[0]), int(score.split('–')[1])
        if a == b:
            return 'x'
        if a > b:
            return 'a'
        return 'b'
    segs = re.split(r'(\d+)', score)
    if int(segs[1]) < int(segs[7]):
        return 'b'
    return 'a'
df['winner'] = df['Score'].apply(winner)
df

Unnamed: 0,Round,Home,Away,Score,538_proba,538_probx,538_probb,winner
0,Group stage,Russia ru,sa Saudi Arabia,5–0,0.66,0.22,0.12,a
1,Group stage,Egypt eg,uy Uruguay,0–1,0.18,0.28,0.54,b
2,Group stage,Morocco ma,ir IR Iran,0–1,0.40,0.33,0.27,b
3,Group stage,Portugal pt,es Spain,3–3,0.25,0.27,0.48,x
4,Group stage,France fr,au Australia,2–1,0.77,0.16,0.07,a
...,...,...,...,...,...,...,...,...
59,Quarter-finals,Russia ru,hr Croatia,(3) 2–2 (4),0.39,0.00,0.61,b
60,Semi-finals,France fr,be Belgium,1–0,0.52,0.00,0.48,a
61,Semi-finals,Croatia hr,eng England,2–1,0.43,0.00,0.57,a
62,Third-place match,Belgium be,eng England,2–0,0.51,0.00,0.49,a


In [6]:
providers = ['538']
def brier_score_row(row, prefix):
    return (
        pow(row[f'{prefix}_proba'] - (1 if row['winner'] == 'a' else 0), 2) +
        pow(row[f'{prefix}_probb'] - (1 if row['winner'] == 'b' else 0), 2) +
        pow(row[f'{prefix}_probx'] - (1 if row['winner'] == 'x' else 0), 2)
    )
for p in providers:
    df[f'brier_score_{p}'] = df.apply(lambda row: brier_score_row(row, p), axis=1)
df

Unnamed: 0,Round,Home,Away,Score,538_proba,538_probx,538_probb,winner,brier_score_538
0,Group stage,Russia ru,sa Saudi Arabia,5–0,0.66,0.22,0.12,a,0.1784
1,Group stage,Egypt eg,uy Uruguay,0–1,0.18,0.28,0.54,b,0.3224
2,Group stage,Morocco ma,ir IR Iran,0–1,0.40,0.33,0.27,b,0.8018
3,Group stage,Portugal pt,es Spain,3–3,0.25,0.27,0.48,x,0.8258
4,Group stage,France fr,au Australia,2–1,0.77,0.16,0.07,a,0.0834
...,...,...,...,...,...,...,...,...,...
59,Quarter-finals,Russia ru,hr Croatia,(3) 2–2 (4),0.39,0.00,0.61,b,0.3042
60,Semi-finals,France fr,be Belgium,1–0,0.52,0.00,0.48,a,0.4608
61,Semi-finals,Croatia hr,eng England,2–1,0.43,0.00,0.57,a,0.6498
62,Third-place match,Belgium be,eng England,2–0,0.51,0.00,0.49,a,0.4802


In [7]:
{p: (1-df[f'brier_score_{p}'].sum()/40) for p in providers}

{'538': 0.16005499999999984}