In [48]:
import pandas as pd
import pickle
from scipy.stats import poisson

In [49]:
dict_table = pickle.load(open('dict_table', 'rb')) #rb stands for read byte

# 'to_csv' is used to import the dataframe into a csv file
# 'read_csv' is used to export the dataframe from the csv file
df_historical_data = pd.read_csv('Clean_FIFA_WC_Historical_Data.csv')
df_fixture = pd.read_csv('Clean_FIFA_WC_2022_Fixture.csv')

In [50]:
dict_table['Group H']

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts
0,1,Portugal,0,0,0,0,0,0,0,0
1,2,Ghana,0,0,0,0,0,0,0,0
2,3,Uruguay,0,0,0,0,0,0,0,0
3,4,South Korea,0,0,0,0,0,0,0,0


In [51]:
df_historical_data

Unnamed: 0,HomeTeam,AwayTeam,Year,HomeGoals,AwayGoals,TotalGoals
0,France,Mexico,1930,4,1,5
1,Uruguay,Argentina,1930,4,2,6
2,Uruguay,Yugoslavia,1930,6,1,7
3,Argentina,United States,1930,6,1,7
4,Paraguay,Belgium,1930,1,0,1
...,...,...,...,...,...,...
895,Serbia,Brazil,2018,0,2,2
896,Serbia,Switzerland,2018,1,2,3
897,Brazil,Costa Rica,2018,2,0,2
898,Costa Rica,Serbia,2018,0,1,1


In [52]:
df_fixture

Unnamed: 0,home,score,away,year
0,Qatar,Match 1,Ecuador,2022
1,Senegal,Match 2,Netherlands,2022
2,Qatar,Match 18,Senegal,2022
3,Netherlands,Match 19,Ecuador,2022
4,Ecuador,Match 35,Senegal,2022
...,...,...,...,...
59,Winners Match 51,Match 59,Winners Match 52,2022
60,Winners Match 57,Match 61,Winners Match 58,2022
61,Winners Match 59,Match 62,Winners Match 60,2022
62,Losers Match 61,Match 63,Losers Match 62,2022


In [53]:
# Splitting the df_historical_data dataframe into Home and Away dataframes

df_home = df_historical_data[['HomeTeam', 'HomeGoals', 'AwayGoals']]
df_away = df_historical_data[['AwayTeam', 'HomeGoals', 'AwayGoals']]

In [54]:
# Renaming the columns
df_home = df_home.rename(columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'})
df_away = df_away.rename(columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'})

In [55]:
# Concatenate df_home and df_away
# The 2 dataframes are passed as elements of a list
df_team_strength = pd.concat([df_home, df_away], ignore_index = True).groupby('Team').mean()
# GoalsScored consists of the mean of goals scored by the team in all WC matches they have played
# GoalsConceded consists of the mean of goals conceded by the team in all WC matches they have played
df_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,1.000000,1.461538
Angola,0.333333,0.666667
Argentina,1.691358,1.148148
Australia,0.812500,1.937500
Austria,1.482759,1.620690
...,...,...
Uruguay,1.553571,1.321429
Wales,0.800000,0.800000
West Germany,2.112903,1.241935
Yugoslavia,1.666667,1.272727


In [56]:
# We're gonna predict based on poisson distribution using python and 

# 'Team' column is the index of the dataframe
def predict_points(home,away):
    if home in df_team_strength.index and away in df_team_strength.index:
        # We define lambda as goals scored * goals conceded
        # .at is an accessor which is used to extract a specific value from the DataFrame. 
        #It is accessing the cell in the DataFrame located at the row labeled by the value stored in the variable home and the column labeled 'GoalsScored'.
        # Syntax
        # value = dataframe.at[row_label, column_label]
        lambda_home = df_team_strength.at[home, 'GoalsScored'] * df_team_strength.at[away, 'GoalsConceded']
        lambda_away = df_team_strength.at[away, 'GoalsScored'] * df_team_strength.at[home, 'GoalsConceded']
        # PMF P(X=x) is different for team A and team B
        # The loops are used to simulate results of matches between the two teams assuming no single team scores more than 10 goals
        # (0,0),(0,1)....(10,0),(10,1),...(10,10)
        # poisson.pmf(x,lambda_home) gives the probability of home team scoring x no. of goals in the given time interval (90 mins)
        # poisson.pmf(y,lambda_away) gives the probability of away team scoring y no. of goals in the given time interval (90 mins)
        # P(A and B) = P(A) * P(B)
        # So,p is the product of two probabilities and is equal to the joint probability of both events happening simultaneously.
        # Accumulating draw probabilities in prob_draw, win probabilities in prob_win and so on
        prob_draw, prob_home, prob_away = 0, 0, 0
        for x in range(0,11):
            for y in range(0,11):
                p = poisson.pmf(x,lambda_home) * poisson.pmf(y,lambda_away)
                if x == y: # Draw
                    prob_draw+= p
                elif x > y: # Home team wins
                    prob_home+= p
                else: # Away team wins
                    prob_away+= p
        # Allocating points
        points_home = 3*prob_home + prob_draw # The assumption here is that if prob_home is 1 then prob_draw would be zero and vice verse
        points_away = 3*prob_away + prob_draw # If prob_away is 1 then prob_draw would be zero
        return(points_home, points_away)
    # The 'else' part is executed when one of the teams is not found in the dataframe, i.e, if 2022 is their first WC
    else:
        return(0,0)

In [57]:
# Testing the function 

predict_points('Argentina', 'Mexico')

(2.3129151525530505, 0.5378377125059863)

In [58]:
# Qatar is playing in it's first ever WC 
predict_points('Qatar (H)', 'Ecuador')

(0, 0)

In [133]:
# Splitting fixtures into group, ro16, quarter, semi, finals

# copying the first 48 rows (or records) of the existing DataFrame
# Code uses dataframe slicing, start(inclusive) and stop(exclusive)
df_fixture_group = df_fixture[:48].copy() # copys from index 0 to 47
df_fixture_ro16 = df_fixture[48:56].copy() # copys from index 48 to 55
df_fixture_quarter = df_fixture[56:60].copy() # copys from index 56 to 59
df_fixture_semi = df_fixture[60:62].copy() # copys from index 60 to 61
df_fixture_final = df_fixture[62:].copy() # copying all rows starting from index 62 to the end of an existing DataFrame

In [134]:
df_fixture_group

Unnamed: 0,home,score,away,year
0,Qatar,Match 1,Ecuador,2022
1,Senegal,Match 2,Netherlands,2022
2,Qatar,Match 18,Senegal,2022
3,Netherlands,Match 19,Ecuador,2022
4,Ecuador,Match 35,Senegal,2022
5,Netherlands,Match 36,Qatar,2022
6,England,Match 3,Iran,2022
7,United States,Match 4,Wales,2022
8,Wales,Match 17,Iran,2022
9,England,Match 20,United States,2022


In [61]:
for group in dict_table: # group iterates through the keys of the dictionary - 'Group A', 'Group B'...
    # dict_table[group] represents a dataframe
    # teams_in_group is a list which consist of teams of a group as it's elements
    # As the group variable changes the elements of the list changes
    teams_in_group = dict_table[group]['Team'].values # The elements under the 'Team' column of each group gets stored in a list
    # df_fixture_group['home'] extracts the values from the 'home' column of the DataFrame
    # .isin() is a method which checks whether values in the 'home' column are present in the list 'teams_in_group'
    # df_fixture_group_6 is a dataframe of 6 matches played between the teams of a group
    df_fixture_group_6 = df_fixture_group[df_fixture_group['home'].isin(teams_in_group)]
    # Extracts index and row data while iterating through rows
    # The row variable holds the data from the current row as a Pandas Series and 
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away'] # home and away variables
        home_points, away_points = predict_points(home, away) # home_points and away_points are variable which recieve values returned from the function
        # .loc() is a method used to locate row corresponding to the 'home' team and increments its 'Pts' value
        dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
        dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += away_points
    # Sorting the rows of the dataframe in descending order
    # When you sort a DataFrame, the original index may no longer be in order. 
    dict_table[group] = dict_table[group].sort_values('Pts', ascending = False).reset_index()
    # Modifying the dataframes to have only 2 columns
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table[group] = dict_table[group].round(0)

  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points
  dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += home_points


In [135]:
dict_table['Group H']

Unnamed: 0,Team,Pts
0,Portugal,6.0
1,Uruguay,5.0
2,Ghana,4.0
3,South Korea,2.0


In [136]:
df_fixture_ro16

Unnamed: 0,home,score,away,year
48,Winners Group A,Match 49,Runners-up Group B,2022
49,Winners Group C,Match 50,Runners-up Group D,2022
50,Winners Group D,Match 52,Runners-up Group C,2022
51,Winners Group B,Match 51,Runners-up Group A,2022
52,Winners Group E,Match 53,Runners-up Group F,2022
53,Winners Group G,Match 54,Runners-up Group H,2022
54,Winners Group F,Match 55,Runners-up Group E,2022
55,Winners Group H,Match 56,Runners-up Group G,2022


In [137]:
# Updating RO16 ties with winners and runners of each group
# .loc() is used to locate cells with index no. and column name
# group variable traverses through the keys of the dictionary
for group in dict_table:
    group_winner = dict_table[group].loc[0, 'Team']
    group_runner_up = dict_table[group].loc[1, 'Team']
    # f string allows us to add a variable inside a string
    # .replace() is used to replace values within the DataFrame.
    df_fixture_ro16.replace({f'Winners {group}':group_winner, f'Runners-up {group}':group_runner_up}, inplace = True)
    df_fixture_ro16['Winner'] = '?'

In [138]:
df_fixture_ro16

Unnamed: 0,home,score,away,year,Winner
48,Netherlands,Match 49,Wales,2022,?
49,Argentina,Match 50,Denmark,2022,?
50,France,Match 52,Poland,2022,?
51,England,Match 51,Senegal,2022,?
52,Germany,Match 53,Belgium,2022,?
53,Brazil,Match 54,Uruguay,2022,?
54,Croatia,Match 55,Spain,2022,?
55,Portugal,Match 56,Switzerland,2022,?


In [139]:
# Function which returns the winner
# df_fixture_updated accepts argument of the function call statement
def get_winner(df_fixture_updated):
    #extracting index and row value from each row of the dataframe
    # row is of 'pandas Series' datatype
    for index, row in df_fixture_updated.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        if points_home > points_away:
            winner = home
        else:
            winner = away
        #.loc is used to identify the cell
        df_fixture_updated.loc[index,'Winner'] = winner
    return df_fixture_updated

In [140]:
get_winner(df_fixture_ro16)
# df_fixture_ro16 automatically gets updated by jsut passing it through the function ?

Unnamed: 0,home,score,away,year,Winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Argentina,Match 50,Denmark,2022,Argentina
50,France,Match 52,Poland,2022,France
51,England,Match 51,Senegal,2022,England
52,Germany,Match 53,Belgium,2022,Germany
53,Brazil,Match 54,Uruguay,2022,Brazil
54,Croatia,Match 55,Spain,2022,Spain
55,Portugal,Match 56,Switzerland,2022,Portugal


In [141]:
df_fixture_ro16

Unnamed: 0,home,score,away,year,Winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Argentina,Match 50,Denmark,2022,Argentina
50,France,Match 52,Poland,2022,France
51,England,Match 51,Senegal,2022,England
52,Germany,Match 53,Belgium,2022,Germany
53,Brazil,Match 54,Uruguay,2022,Brazil
54,Croatia,Match 55,Spain,2022,Spain
55,Portugal,Match 56,Switzerland,2022,Portugal


In [142]:
# Unupdated quarter-finals dataframe
df_fixture_quarter

Unnamed: 0,home,score,away,year
56,Winners Match 53,Match 58,Winners Match 54,2022
57,Winners Match 49,Match 57,Winners Match 50,2022
58,Winners Match 55,Match 60,Winners Match 56,2022
59,Winners Match 51,Match 59,Winners Match 52,2022


In [149]:
# Function to update the required round table with the winners of previous round
# The function accepts 2 argument (updated df of previous round, df of the round to be updated)
def update_table(df_fixture_round_1, df_fixture_round_2):
    for index, row in df_fixture_round_1.iterrows():
        winner = df_fixture_round_1.loc[index, 'Winner']
        match = df_fixture_round_1.loc[index, 'score']
        # print(winner)
        # print(match)
        df_fixture_round_2.replace({f'Winners {match}':winner}, inplace = True)
    df_fixture_round_2['Winner'] = '?'
    return df_fixture_round_2

In [150]:
update_table(df_fixture_ro16,df_fixture_quarter)
# df_fixture_quarter automatically gets updated by just passing it through the function ?

Unnamed: 0,home,score,away,year,Winner
56,Germany,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Argentina,2022,?
58,Spain,Match 60,Portugal,2022,?
59,England,Match 59,France,2022,?


In [151]:
df_fixture_quarter

Unnamed: 0,home,score,away,year,Winner
56,Germany,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Argentina,2022,?
58,Spain,Match 60,Portugal,2022,?
59,England,Match 59,France,2022,?


In [152]:
get_winner(df_fixture_quarter)
# df_fixture_quarter automatically gets updated by just passing it through the function ?

Unnamed: 0,home,score,away,year,Winner
56,Germany,Match 58,Brazil,2022,Brazil
57,Netherlands,Match 57,Argentina,2022,Netherlands
58,Spain,Match 60,Portugal,2022,Portugal
59,England,Match 59,France,2022,France


In [153]:
df_fixture_semi

Unnamed: 0,home,score,away,year
60,Winners Match 57,Match 61,Winners Match 58,2022
61,Winners Match 59,Match 62,Winners Match 60,2022


In [154]:
#Updating the semis dataframe
update_table(df_fixture_quarter, df_fixture_semi)
# df_fixture_semi automatically gets updated by just passing it through the function ?

Unnamed: 0,home,score,away,year,Winner
60,Netherlands,Match 61,Brazil,2022,?
61,France,Match 62,Portugal,2022,?


In [155]:
df_fixture_semi

Unnamed: 0,home,score,away,year,Winner
60,Netherlands,Match 61,Brazil,2022,?
61,France,Match 62,Portugal,2022,?


In [156]:
get_winner(df_fixture_semi)
# df_fixture_semi automatically gets updated by just passing it through the function ?

Unnamed: 0,home,score,away,year,Winner
60,Netherlands,Match 61,Brazil,2022,Brazil
61,France,Match 62,Portugal,2022,France


In [158]:
# Predicting the winner
update_table(df_fixture_semi, df_fixture_final)

Unnamed: 0,home,score,away,year,Winner
62,Losers Match 61,Match 63,Losers Match 62,2022,?
63,Brazil,Match 64,France,2022,?


In [159]:
get_winner(df_fixture_final)

Unnamed: 0,home,score,away,year,Winner
62,Losers Match 61,Match 63,Losers Match 62,2022,Losers Match 62
63,Brazil,Match 64,France,2022,Brazil
