In [1]:
import pandas as pd
from scipy.stats import poisson

In [2]:
df_historical_data = pd.read_csv('_world_cup_2002_onward_matches.csv')

In [3]:
# split the data teamwise
df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

In [4]:
# rename columns
df_home = df_home.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsFor', 'AwayGoals': 'GoalsAgainst'})
df_away = df_away.rename(columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsAgainst', 'AwayGoals': 'GoalsFor'})

In [5]:
# combine datasets, group by team and calculate mean
df_team_strength = pd.concat([df_home, df_away], ignore_index=True).groupby('Team').mean()
df_team_strength

Unnamed: 0_level_0,GoalsFor,GoalsAgainst
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,1.000000,1.285714
Angola,0.333333,0.666667
Argentina,1.586207,1.000000
Australia,1.000000,1.882353
Belgium,1.526316,0.947368
...,...,...
USA,0.750000,1.000000
Ukraine,1.000000,1.400000
United States,1.187500,1.500000
Uruguay,1.272727,1.090909


In [6]:
# Predict points for home and away team using Poisson distribution to calculate probability
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        # goals_for * goals_against
        lambda_home = df_team_strength.at[home,'GoalsFor'] * df_team_strength.at[away,'GoalsAgainst']
        lambda_away = df_team_strength.at[away,'GoalsFor'] * df_team_strength.at[home,'GoalsAgainst']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,9): # number of goals home team
            for y in range(0, 9): # number of goals away team
                p = poisson.pmf(x, lambda_home) * poisson.pmf(y, lambda_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p
        
        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)
    else:
        return (0, 0)

In [7]:
def get_winner(home_team, away_team):
    points = predict_points(home_team, away_team)
    if points[0] > points[1]:
        winner = home_team
    else:
        winner = away_team
    return winner

Predict the top 4 spots based on actual results upto 2022 QF stage matches (beginning from WC2002)

In [8]:
print('Semi #1 (ARG v CRO) winner:', get_winner('Argentina', 'Croatia'))
print('Semi #2 (FRA v MAR) winner:', get_winner('France', 'Morocco'))
print('3rd place (CRO v MAR) winner:', get_winner('Croatia', 'Morocco'))
print('Final (FRA v ARG) winner:', get_winner('France', 'Argentina'))

Semi #1 (ARG v CRO) winner: Argentina
Semi #2 (FRA v MAR) winner: France
3rd place (CRO v MAR) winner: Morocco
Final (FRA v ARG) winner: France
