In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import poisson,skellam,itemfreq
from datetime import datetime

In [26]:
# Getting the offensive and defensive scores for each team
wc_forecasts=pd.read_csv('https://projects.fivethirtyeight.com/soccer-api/international/2018/wc_forecasts.csv')

In [27]:
# Converting the time stamp into datetime objects
wc_forecasts['forecast_timestamp']=pd.to_datetime(wc_forecasts['forecast_timestamp'].str[:-3],format='%Y-%m-%d %H:%M:%S')

In [28]:
val=round((wc_forecasts['forecast_timestamp']-datetime.today()).dt.total_seconds()/(24*3600)).unique().max()

In [29]:
wc_forecasts['time_delta']=round((wc_forecasts['forecast_timestamp']-datetime.today()).dt.total_seconds()/(24*3600))

In [30]:
# Looking at the latest available values
stats=wc_forecasts[wc_forecasts['time_delta']==val][['team','global_o','global_d']]

In [31]:
# Looking at the values e.g. Brazil has 3.08 offensive score which means on average Brazil scores 3.08 goals versus an an average
# team and concedes 0.28 goals against an average team
stats.head()

Unnamed: 0,team,global_o,global_d
0,Brazil,3.07097,0.28134
1,Spain,3.33909,0.52313
2,Belgium,2.9195,0.61759
3,England,2.44525,0.49344
4,France,2.62942,0.50813


In [34]:
# We are going to generate goals scored by each team in a match using effective offensive score i.e. global_o minus global_d and simulate the end result by using poisson difference distribution 
# and then generate win probabilities
def generate_prob(lst):
    team1=stats[stats.team==lst[0]].loc[:,'global_o'].values[0]-stats[stats.team==lst[0]].loc[:,'global_d'].values[0]
    team2=stats[stats.team==lst[1]].loc[:,'global_o'].values[0]-stats[stats.team==lst[1]].loc[:,'global_d'].values[0]

    team1p=[skellam.pmf(i,team1,team2) for i in range(1,100)]
    team2p=[skellam.pmf(-i,team1,team2) for i in range(1,100)]
    
    tie_prob=skellam.pmf(0,team1,team2)
    team1p_tie=[skellam.pmf(i,team1/3,team2/3) for i in range(1,100)]
    team2p_tie=[skellam.pmf(-i,team1/3,team2/3) for i in range(1,100)]
    
    prob1=sum(team1p)+tie_prob*(sum(team1p_tie)+skellam.pmf(0,team1/3,team2/3)*0.5)
    prob2=sum(team2p)+tie_prob*(sum(team2p_tie)+skellam.pmf(0,team1/3,team2/3)*0.5)
    
    return [prob1/(prob1+prob2),prob2/(prob1+prob2)]

In [10]:
# Setting up the teams for pre quarterfinal matches and then running Monte Carlo simulations to generate expected teams who will
# the semi finals as well as making the finals
match1=['Uruguay','Portugal']
prob1=generate_prob(match1)

match2=['France','Argentina']
prob2=generate_prob(match2)

match3=['Brazil','Mexico']
prob3=generate_prob(match3)

match4=['Belgium','Japan']
prob4=generate_prob(match4)

match5=['Russia','Spain']
prob5=generate_prob(match5)

match6=['Denmark','Croatia']
prob6=generate_prob(match6)

match7=['Switzerland','Sweden']
prob7=generate_prob(match7)

match8=['England','Colombia']
prob8=generate_prob(match8)

In [11]:
lst=[(match1,prob1),(match2,prob2),(match3,prob3),(match4,prob4),(match5,prob5),(match6,prob6),(match7,prob7),(match8,prob8)]

In [12]:
# Expected win probabilities for the pre quarter-final matches
for match,prob in lst:
    print("Probability for {} win: {}".format(match[0],prob[0]))
    print("Probability for {} win: {}".format(match[1],prob[1]))
    print("----------------------------------------------")

Probability for Uruguay win: 0.480303009940984
Probability for Portugal win: 0.519696990059016
----------------------------------------------
Probability for France win: 0.5188555589796385
Probability for Argentina win: 0.4811444410203614
----------------------------------------------
Probability for Brazil win: 0.7703002682738505
Probability for Mexico win: 0.22969973172614952
----------------------------------------------
Probability for Belgium win: 0.7278631712049775
Probability for Japan win: 0.2721368287950226
----------------------------------------------
Probability for Russia win: 0.17472553581260475
Probability for Spain win: 0.8252744641873953
----------------------------------------------
Probability for Denmark win: 0.3623866098119792
Probability for Croatia win: 0.6376133901880209
----------------------------------------------
Probability for Switzerland win: 0.5363534763950117
Probability for Sweden win: 0.46364652360498837
----------------------------------------------


In [13]:
quarterfinal_simulations=[]
semifinal_simulations=[]
finals_simulations=[]
winner=[]
sims=50000

In [14]:
for sim in range(sims):
    quarter_finals=[]
    semi_finals=[]
    for i in range(4):
        quarter_finals.append([np.random.choice(lst[i*2][0],p=lst[i*2][1]),np.random.choice(lst[i*2+1][0],p=lst[i*2+1][1])])
    quarterfinal_simulations.append(quarter_finals)
    semi_finals=[[np.random.choice(quarter_finals[0],p=generate_prob(quarter_finals[0])),
                                     np.random.choice(quarter_finals[1],p=generate_prob(quarter_finals[1]))],
                                    [np.random.choice(quarter_finals[2],p=generate_prob(quarter_finals[2])),
                                     np.random.choice(quarter_finals[3],p=generate_prob(quarter_finals[3]))]]
    semifinal_simulations.append(semi_finals)
    finals=[np.random.choice(semi_finals[0],p=generate_prob(semi_finals[0])),
                                     np.random.choice(semi_finals[1],p=generate_prob(semi_finals[1]))]
    finals_simulations.append(finals)
    winner.append(np.random.choice(finals,p=generate_prob(finals)))
    if (sim%10000==0)&(sim!=0):
        print('Simulation # ',sim)

Simulation #  10000
Simulation #  20000
Simulation #  30000
Simulation #  40000


In [15]:
winner_freq=sorted(list(map(lambda x:[x[0],int(x[1])],itemfreq(winner))),key= lambda x:x[1],reverse=True)

`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  """Entry point for launching an IPython kernel.


In [16]:
# Expected Winner
print('Expected Winner:{} winning {} of {} simulations'.format(winner_freq[0][0],winner_freq[0][1],sims))

Expected Winner:Spain winning 12855 of 50000 simulations


In [17]:
# Expected Runnerup
print('Expected Runner Up Winner:{} winning {} of {} simulations'.format(winner_freq[1][0],winner_freq[1][1],sims))

Expected Runner Up Winner:Brazil winning 10399 of 50000 simulations


In [18]:
semifinal1=list(map(lambda x:x[0][0],semifinal_simulations))
semifinal2=list(map(lambda x:x[0][1],semifinal_simulations))
semifinal3=list(map(lambda x:x[1][0],semifinal_simulations))
semifinal4=list(map(lambda x:x[1][1],semifinal_simulations))

In [19]:
winner_freq1=sorted(map(lambda x:[x[0],int(x[1])],itemfreq(semifinal1)),key= lambda x:x[1],reverse=True)
winner_freq2=sorted(map(lambda x:[x[0],int(x[1])],itemfreq(semifinal2)),key= lambda x:x[1],reverse=True)
winner_freq3=sorted(map(lambda x:[x[0],int(x[1])],itemfreq(semifinal3)),key= lambda x:x[1],reverse=True)
winner_freq4=sorted(map(lambda x:[x[0],int(x[1])],itemfreq(semifinal4)),key= lambda x:x[1],reverse=True)

`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  """Entry point for launching an IPython kernel.
`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  
`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  This is separate from the ipykernel package so we can avoid doing imports until
`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  after removing the cwd from sys.path.


In [20]:
# Expected Semifinalists for Semi Final #1
winner_freq1

[['France', 14705],
 ['Argentina', 13144],
 ['Portugal', 11711],
 ['Uruguay', 10440]]

In [21]:
# Expected Semifinalist for Semi Final #2
winner_freq2

[['Brazil', 25092], ['Belgium', 17206], ['Mexico', 4181], ['Japan', 3521]]

In [22]:
# Expected Semifinalist for Semi Final #2
winner_freq3

[['Spain', 29990], ['Croatia', 12058], ['Denmark', 4660], ['Russia', 3292]]

In [23]:
# Expected Semifinalist for Semi Final #2
winner_freq4

[['England', 16650],
 ['Switzerland', 12277],
 ['Colombia', 11241],
 ['Sweden', 9832]]