# Imports

In [11]:
import numpy as np
import pandas as pd

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# Read Datasets

In [13]:
historical_results = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/historical-results.csv')
historical_results.head(5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [14]:
historical_results = historical_results.replace('United States','USA', regex=True) # changing for pattern

In [15]:
historical_win_loose_draw_ratios = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/historical_win-loose-draw_ratios.csv')
historical_win_loose_draw_ratios.head(5)

Unnamed: 0,country1,country2,games,wins,looses,draws
0,Argentina,Australia,7,0.714286,0.142857,0.142857
1,Australia,Argentina,7,0.142857,0.714286,0.142857
2,Argentina,Belgium,4,0.75,0.25,0.0
3,Belgium,Argentina,4,0.25,0.75,0.0
4,Argentina,Brazil,108,0.361111,0.398148,0.240741


In [16]:
ranking = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/ranking.csv')
ranking.head(5)

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,1,Germany,GER,57.0,0.0,0,UEFA,1992-12-31
1,96,Syria,SYR,11.0,0.0,0,AFC,1992-12-31
2,97,Burkina Faso,BFA,11.0,0.0,0,CAF,1992-12-31
3,99,Latvia,LVA,10.0,0.0,0,UEFA,1992-12-31
4,100,Burundi,BDI,10.0,0.0,0,CAF,1992-12-31


In [17]:
shootouts = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/shootouts.csv')
shootouts.head(5)

Unnamed: 0,date,home_team,away_team,winner
0,1967-08-22,India,Taiwan,Taiwan
1,1971-11-14,South Korea,Vietnam Republic,South Korea
2,1972-05-17,Thailand,South Korea,South Korea
3,1972-05-19,Thailand,Cambodia,Thailand
4,1973-04-21,Senegal,Ghana,Ghana


# Historical Results

In [18]:
historical_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [19]:
historical_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44060 entries, 0 to 44059
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        44060 non-null  object 
 1   home_team   44060 non-null  object 
 2   away_team   44060 non-null  object 
 3   home_score  44059 non-null  float64
 4   away_score  44059 non-null  float64
 5   tournament  44060 non-null  object 
 6   city        44060 non-null  object 
 7   country     44060 non-null  object 
 8   neutral     44060 non-null  bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 2.7+ MB


Top 10 tournaments on database:

In [20]:
historical_results.groupby('tournament').count().sort_values('date', ascending=False)['date'].head(10)

tournament
Friendly                                17425
FIFA World Cup qualification             7774
UEFA Euro qualification                  2593
African Cup of Nations qualification     1932
FIFA World Cup                            900
Copa América                              841
AFC Asian Cup qualification               764
African Cup of Nations                    742
CECAFA Cup                                620
CFU Caribbean Cup qualification           606
Name: date, dtype: int64

In [21]:
historical_results_world_cup = historical_results.query('tournament == "FIFA World Cup"').sort_values('date').reset_index(drop=True)
historical_results_world_cup.head(10)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1930-07-13,Belgium,USA,0.0,3.0,FIFA World Cup,Montevideo,Uruguay,True
1,1930-07-13,France,Mexico,4.0,1.0,FIFA World Cup,Montevideo,Uruguay,True
2,1930-07-14,Brazil,Yugoslavia,1.0,2.0,FIFA World Cup,Montevideo,Uruguay,True
3,1930-07-14,Peru,Romania,1.0,3.0,FIFA World Cup,Montevideo,Uruguay,True
4,1930-07-15,Argentina,France,1.0,0.0,FIFA World Cup,Montevideo,Uruguay,True
5,1930-07-16,Chile,Mexico,3.0,0.0,FIFA World Cup,Montevideo,Uruguay,True
6,1930-07-17,Bolivia,Yugoslavia,0.0,4.0,FIFA World Cup,Montevideo,Uruguay,True
7,1930-07-17,Paraguay,USA,0.0,3.0,FIFA World Cup,Montevideo,Uruguay,True
8,1930-07-18,Uruguay,Peru,1.0,0.0,FIFA World Cup,Montevideo,Uruguay,False
9,1930-07-19,Argentina,Mexico,6.0,3.0,FIFA World Cup,Montevideo,Uruguay,True


Identifying finals

In [22]:
historical_results_world_cup['is_final'] = ''

for i in range(len(historical_results_world_cup['date'])):
  if i < 899 and historical_results_world_cup[i:i+1]['date'].astype(str).str[:4][i] != historical_results_world_cup[i+1:i+2]['date'].astype(str).str[:4][i+1]:
    historical_results_world_cup['is_final'][i] = 'TRUE'
  elif i < 899:
    historical_results_world_cup['is_final'][i] = 'FALSE'
  elif i == 899:
    historical_results_world_cup['is_final'][i] = 'TRUE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_results_world_cup['is_final'][i] = 'FALSE'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_results_world_cup['is_final'][i] = 'TRUE'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_results_world_cup['is_final'][i] = 'TRUE'


In [23]:
historical_results_world_cup.query('is_final == "TRUE"')

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,is_final
17,1930-07-30,Uruguay,Argentina,4.0,2.0,FIFA World Cup,Montevideo,Uruguay,False,True
34,1934-06-10,Italy,Czechoslovakia,2.0,1.0,FIFA World Cup,Rome,Italy,False,True
52,1938-06-19,Hungary,Italy,2.0,4.0,FIFA World Cup,Colombes,France,True,True
74,1950-07-16,Spain,Sweden,1.0,3.0,FIFA World Cup,São Paulo,Brazil,True,True
100,1954-07-04,Germany,Hungary,3.0,2.0,FIFA World Cup,Berne,Switzerland,True,True
135,1958-06-29,Sweden,Brazil,2.0,5.0,FIFA World Cup,Solna,Sweden,False,True
167,1962-06-17,Brazil,Czechoslovakia,3.0,1.0,FIFA World Cup,Santiago,Chile,True,True
199,1966-07-30,England,Germany,4.0,2.0,FIFA World Cup,London,England,False,True
231,1970-06-21,Brazil,Italy,4.0,1.0,FIFA World Cup,Mexico City,Mexico,True,True
269,1974-07-07,Germany,Netherlands,2.0,1.0,FIFA World Cup,Munich,Germany,False,True


In [24]:
historical_results_world_cup['date'] = historical_results_world_cup['date'].astype('datetime64')

# Historical Results with possibily to win

Addying winner

In [25]:
historical_results_world_cup['winner'] = ''

for i in range(len(historical_results_world_cup['date'])):
  if historical_results_world_cup['home_score'][i] > historical_results_world_cup['away_score'][i]:
    historical_results_world_cup['winner'][i] = 'home_win'
  elif historical_results_world_cup['home_score'][i] < historical_results_world_cup['away_score'][i]:
    historical_results_world_cup['winner'][i] = 'away_win'
  else:
    historical_results_world_cup['winner'][i] = 'draw'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_results_world_cup['winner'][i] = 'away_win'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_results_world_cup['winner'][i] = 'home_win'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical_results_world_cup['winner'][i] = 'draw'


In [26]:
historical_results_world_cup

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,is_final,winner
0,1930-07-13,Belgium,USA,0.0,3.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,away_win
1,1930-07-13,France,Mexico,4.0,1.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,home_win
2,1930-07-14,Brazil,Yugoslavia,1.0,2.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,away_win
3,1930-07-14,Peru,Romania,1.0,3.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,away_win
4,1930-07-15,Argentina,France,1.0,0.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,home_win
...,...,...,...,...,...,...,...,...,...,...,...
895,2018-07-07,Russia,Croatia,2.0,2.0,FIFA World Cup,Sochi,Russia,False,FALSE,draw
896,2018-07-10,France,Belgium,1.0,0.0,FIFA World Cup,Saint Petersburg,Russia,True,FALSE,home_win
897,2018-07-11,Croatia,England,2.0,1.0,FIFA World Cup,Moscow,Russia,True,FALSE,home_win
898,2018-07-14,Belgium,England,2.0,0.0,FIFA World Cup,Saint Petersburg,Russia,True,FALSE,home_win


In [27]:
historical_results_world_cup = historical_results_world_cup.merge(historical_win_loose_draw_ratios, left_on=['home_team', 'away_team'], right_on=['country1', 'country2'], how='left').rename(columns={
    'wins' : 'home_wins',
    'looses' : 'home_looses'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral', 'is_final', 'winner', 'games', 'home_wins', 'home_looses', 'draws']]

In [28]:
historical_results_world_cup = historical_results_world_cup.replace(np.nan, 0) # adding for teams that never played before

In [29]:
historical_results_world_cup.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,is_final,winner,games,home_wins,home_looses,draws
0,1930-07-13,Belgium,USA,0.0,3.0,FIFA World Cup,Montevideo,Uruguay,True,False,away_win,0.0,0.0,0.0,0.0
1,1930-07-13,France,Mexico,4.0,1.0,FIFA World Cup,Montevideo,Uruguay,True,False,home_win,7.0,0.714286,0.142857,0.142857
2,1930-07-14,Brazil,Yugoslavia,1.0,2.0,FIFA World Cup,Montevideo,Uruguay,True,False,away_win,0.0,0.0,0.0,0.0
3,1930-07-14,Peru,Romania,1.0,3.0,FIFA World Cup,Montevideo,Uruguay,True,False,away_win,0.0,0.0,0.0,0.0
4,1930-07-15,Argentina,France,1.0,0.0,FIFA World Cup,Montevideo,Uruguay,True,False,home_win,12.0,0.5,0.25,0.25


# Historical result with ranking on date

Checking data

In [30]:
ranking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63916 entries, 0 to 63915
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rank             63916 non-null  int64  
 1   country_full     63916 non-null  object 
 2   country_abrv     63916 non-null  object 
 3   total_points     63916 non-null  float64
 4   previous_points  63916 non-null  float64
 5   rank_change      63916 non-null  int64  
 6   confederation    63916 non-null  object 
 7   rank_date        63916 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 3.9+ MB


In [31]:
ranking['country_full'].unique()

array(['Germany', 'Syria', 'Burkina Faso', 'Latvia', 'Burundi', 'Togo',
       'Angola', 'Suriname', 'Luxembourg',
       'St. Vincent and the Grenadines', 'Fiji', 'Mozambique',
       'Indonesia', 'Antigua and Barbuda', 'Jordan', 'Eswatini',
       'Faroe Islands', 'Lithuania', 'Uganda', 'Korea DPR', 'Peru',
       'Sierra Leone', 'Tanzania', 'Niger', 'Iraq', 'Guatemala',
       'Liberia', 'Ethiopia', 'Albania', 'Bolivia', 'Bahrain',
       'Singapore', 'Congo', 'Malaysia', 'Sudan', 'Croatia', 'Barbados',
       'Yemen', 'Cape Verde Islands', 'Solomon Islands', 'Libya',
       'Botswana', 'Brunei Darussalam', 'Dominican Republic', 'Lesotho',
       'India', 'Cuba', 'Pakistan', 'Belarus', 'Seychelles',
       'Central African Republic', 'Nepal', 'Gambia', 'Mauritania',
       'Kenya', 'Bangladesh', 'Slovenia', 'Oman', 'Guinea-Bissau',
       'St. Lucia', 'Hong Kong', 'Panama', 'Grenada', 'Puerto Rico',
       'Tahiti', 'South Africa', 'Venezuela', 'San Marino', 'Benin',
       'Guyana'

Adding home rank

In [32]:
historical_results_world_cup = historical_results_world_cup.merge(ranking, left_on=[historical_results_world_cup['date'].astype(str).str[:7], 'home_team'],
                                            right_on=[ranking['rank_date'].str[:7], 'country_full'],
                                            how='left').rename(columns={
                                                'rank' : 'home_rank',
                                                'total_points' : 'home_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'tournament', 'city', 'country', 'neutral', 'is_final', 'winner',
    'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

Adding away rank

In [33]:
historical_results_world_cup = historical_results_world_cup.merge(ranking, left_on=[historical_results_world_cup['date'].astype(str).str[:7], 'away_team'],
                                            right_on=[ranking['rank_date'].str[:7], 'country_full'],
                                            how='left').rename(columns={
                                                'rank' : 'away_rank',
                                                'total_points' : 'away_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'tournament', 'city', 'country', 'neutral', 'is_final', 'winner',
    'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points',
    'away_rank', 'away_ranking_points']]

# Final historical dataset

In [34]:
historical_results_world_cup = historical_results_world_cup.replace(np.nan, 0) # replacing null ranks
historical_results_world_cup

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,is_final,winner,games,home_wins,home_looses,draws,home_rank,home_ranking_points,away_rank,away_ranking_points
0,1930-07-13,Belgium,USA,0.0,3.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,away_win,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
1,1930-07-13,France,Mexico,4.0,1.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,home_win,7.0,0.714286,0.142857,0.142857,0.0,0.0,0.0,0.0
2,1930-07-14,Brazil,Yugoslavia,1.0,2.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,away_win,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
3,1930-07-14,Peru,Romania,1.0,3.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,away_win,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
4,1930-07-15,Argentina,France,1.0,0.0,FIFA World Cup,Montevideo,Uruguay,True,FALSE,home_win,12.0,0.500000,0.250000,0.250000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,2018-07-07,Russia,Croatia,2.0,2.0,FIFA World Cup,Sochi,Russia,False,FALSE,draw,0.0,0.000000,0.000000,0.000000,49.0,1758.0,12.0,2036.0
896,2018-07-10,France,Belgium,1.0,0.0,FIFA World Cup,Saint Petersburg,Russia,True,FALSE,home_win,75.0,0.346667,0.400000,0.253333,2.0,2164.0,5.0,2124.0
897,2018-07-11,Croatia,England,2.0,1.0,FIFA World Cup,Moscow,Russia,True,FALSE,home_win,11.0,0.272727,0.545455,0.181818,12.0,2036.0,7.0,2099.0
898,2018-07-14,Belgium,England,2.0,0.0,FIFA World Cup,Saint Petersburg,Russia,True,FALSE,home_win,26.0,0.153846,0.653846,0.192308,5.0,2124.0,7.0,2099.0


In [35]:
historical_results_world_cup['home_rank'] = historical_results_world_cup['home_rank'].astype('float64')
historical_results_world_cup['home_ranking_points'] = historical_results_world_cup['home_ranking_points'].astype('float64')
historical_results_world_cup['away_rank'] = historical_results_world_cup['away_rank'].astype('float64')
historical_results_world_cup['away_ranking_points'] = historical_results_world_cup['away_ranking_points'].astype('float64')

In [36]:
historical_results_world_cup.corr()

  historical_results_world_cup.corr()


Unnamed: 0,home_score,away_score,neutral,games,home_wins,home_looses,draws,home_rank,home_ranking_points,away_rank,away_ranking_points
home_score,1.0,-0.055842,-0.079319,0.022959,0.138142,-0.132751,-0.04601,-0.069305,-0.064221,0.034068,-0.102018
away_score,-0.055842,1.0,0.090099,-0.019433,-0.136194,0.094234,-0.058265,0.050285,0.004528,-0.050946,0.03785
neutral,-0.079319,0.090099,1.0,-0.071665,-0.040462,0.022078,-0.009058,-0.020909,0.033924,0.034224,0.02417
games,0.022959,-0.019433,-0.071665,1.0,0.36431,0.358666,0.40517,-0.05218,0.043702,-0.061593,0.045578
home_wins,0.138142,-0.136194,-0.040462,0.36431,1.0,0.235928,0.41411,-0.071595,0.123191,0.074035,0.083405
home_looses,-0.132751,0.094234,0.022078,0.358666,0.235928,1.0,0.412383,0.081047,0.13448,0.043277,0.199022
draws,-0.04601,-0.058265,-0.009058,0.40517,0.41411,0.412383,1.0,0.005018,0.181988,0.00526,0.164009
home_rank,-0.069305,0.050285,-0.020909,-0.05218,-0.071595,0.081047,0.005018,1.0,0.389807,0.475894,0.482736
home_ranking_points,-0.064221,0.004528,0.033924,0.043702,0.123191,0.13448,0.181988,0.389807,1.0,0.500292,0.910891
away_rank,0.034068,-0.050946,0.034224,-0.061593,0.074035,0.043277,0.00526,0.475894,0.500292,1.0,0.4143


# Algoritmo

One hot encoding

In [37]:
# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(historical_results_world_cup)
# Display the first 5 rows of the last 12 columns
features.iloc[:,5:].head(5)

Unnamed: 0,home_wins,home_looses,draws,home_rank,home_ranking_points,away_rank,away_ranking_points,home_team_Algeria,home_team_Angola,home_team_Argentina,...,country_Spain,country_Sweden,country_Switzerland,country_USA,country_Uruguay,is_final_FALSE,is_final_TRUE,winner_away_win,winner_draw,winner_home_win
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
1,0.714286,0.142857,0.142857,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
4,0.5,0.25,0.25,0.0,0.0,0.0,0.0,0,0,1,...,0,0,0,0,1,1,0,0,0,1


Assigning label (what we want to predict) and converting to numpy

In [38]:
# Labels are the values we want to predict
train_labels = np.array(features.query('date < 2018')['home_score'])

# Remove the labels from the features
# axis 1 refers to the columns
train_features = features.query('date < 2018').drop('home_score', axis = 1).drop('date', axis = 1)
# Convert to numpy array
train_features = np.array(train_features)

In [39]:
test_labels = np.array(features.query('date >= 2018')['home_score'])
test_features = features.query('date >= 2018').drop('home_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

Train and test split

In [40]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (836, 351)
Training Labels Shape: (836,)
Testing Features Shape: (64, 351)
Testing Labels Shape: (64,)


Establishing a baseline error

In [41]:
# # The baseline predictions are the historical averages
# baseline_preds = test_features[:, feature_list.index('average')]
# # Baseline errors, and display average baseline error
# baseline_errors = abs(baseline_preds - test_labels)
# print('Average baseline error: ', round(np.mean(baseline_errors), 2))

Predicting

In [42]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [43]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.54 degrees.


In [44]:
predictions.round(0)

array([3., 0., 0., 3., 1., 0., 1., 2., 0., 0., 1., 2., 1., 1., 1., 1., 2.,
       2., 3., 0., 1., 1., 3., 2., 1., 1., 3., 1., 2., 2., 2., 1., 1., 2.,
       2., 1., 1., 0., 1., 1., 2., 1., 1., 2., 1., 0., 0., 0., 6., 2., 1.,
       1., 3., 3., 2., 1., 1., 1., 1., 2., 2., 2., 2., 3.])

In [45]:
df_predict_test = pd.merge(pd.DataFrame(predictions.round(0)), pd.DataFrame(test_labels), left_index=True, right_index=True).rename(columns={'0_x' : 'predicts', '0_y' : 'reality'})
df_predict_test['is_correct'] = df_predict_test['predicts'] - df_predict_test['reality']

df_predict_test.head(10)

Unnamed: 0,predicts,reality,is_correct
0,3.0,5.0,-2.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,3.0,3.0,0.0
4,1.0,2.0,-1.0
5,0.0,0.0,0.0
6,1.0,1.0,0.0
7,2.0,2.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [46]:
predict_right = ((df_predict_test['reality'].count() - df_predict_test.query('is_correct != 0')['is_correct'].count()) / df_predict_test['reality'].count() * 100).round(2)

print(f"The algorithm predicted right: {predict_right}% of the values")

The algorithm predicted right: 62.5% of the values


# 2022 calendar predictions

Reading and treating data

In [135]:
matches_schedule = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/matches-schedule.csv')

matches_schedule['date'] = matches_schedule['date'].str[-4:] + '-' + matches_schedule['date'].str[3:5] + '-' + matches_schedule['date'].str[:2]

matches_schedule['home_score'] = ''
matches_schedule['away_score'] = ''

matches_schedule.rename(columns={'country1' : 'home_team', 'country2' : 'away_team'}, inplace=True)

matches_schedule = matches_schedule[['date', 'home_team', 'home_score', 'away_score', 'away_team', 'phase']]

matches_schedule['date'] = matches_schedule['date'].astype('datetime64')

matches_schedule.head(5)

Unnamed: 0,date,home_team,home_score,away_score,away_team,phase
0,2022-11-21,Qatar,,,Ecuador,group matches
1,2022-11-21,Senegal,,,Netherlands,group matches
2,2022-11-21,England,,,Iran,group matches
3,2022-11-21,USA,,,Wales,group matches
4,2022-11-22,France,,,Australia,group matches


Adding historical ratio and rank

In [136]:
matches_schedule = matches_schedule.merge(historical_win_loose_draw_ratios, left_on=['home_team', 'away_team'], right_on=['country1', 'country2'], how='left').rename(columns={
    'wins' : 'home_wins',
    'looses' : 'home_looses'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'phase', 'games', 'home_wins', 'home_looses', 'draws']]

In [137]:
matches_schedule = matches_schedule.merge(ranking.query('rank_date > 2021').groupby('country_full').mean().round(0),
                                          left_on='home_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'home_rank',
                                                                      'total_points' : 'home_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

  matches_schedule = matches_schedule.merge(ranking.query('rank_date > 2021').groupby('country_full').mean().round(0),


In [138]:
matches_schedule = matches_schedule.merge(ranking.query('rank_date > 2021').groupby('country_full').mean().round(0),
                                          left_on='away_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'away_rank',
                                                                      'total_points' : 'away_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
    'home_ranking_points', 'away_rank', 'away_ranking_points']]

  matches_schedule = matches_schedule.merge(ranking.query('rank_date > 2021').groupby('country_full').mean().round(0),


Final data for schedule

In [140]:
matches_schedule.head(10)

Unnamed: 0,date,home_team,away_team,home_score,away_score,phase,games,home_wins,home_looses,draws,home_rank,home_ranking_points,away_rank,away_ranking_points
0,2022-11-21,Qatar,Ecuador,,,group matches,3.0,0.333333,0.333333,0.333333,50.0,1431.0,49.0,1436.0
1,2022-11-21,Senegal,Netherlands,,,group matches,,,,,20.0,1567.0,11.0,1647.0
2,2022-11-21,England,Iran,,,group matches,,,,,4.0,1733.0,,
3,2022-11-21,USA,Wales,,,group matches,,,,,15.0,1620.0,19.0,1574.0
4,2022-11-22,France,Australia,,,group matches,5.0,0.6,0.2,0.2,3.0,1769.0,38.0,1478.0
5,2022-11-22,Denmark,Tunisia,,,group matches,1.0,1.0,0.0,0.0,10.0,1651.0,29.0,1510.0
6,2022-11-22,Mexico,Poland,,,group matches,9.0,0.333333,0.333333,0.333333,11.0,1647.0,25.0,1540.0
7,2022-11-22,Argentina,Saudi Arabia,,,group matches,4.0,0.5,0.0,0.5,5.0,1727.0,55.0,1413.0
8,2022-11-23,Belgium,Canada,,,group matches,1.0,1.0,0.0,0.0,1.0,1816.0,50.0,1430.0
9,2022-11-23,Spain,Costa Rica,,,group matches,3.0,0.666667,0.0,0.333333,7.0,1689.0,43.0,1457.0


## Running algorithm