# Imports

Removing warnings from deprecated functions

In [431]:
import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

Importing libraries

In [432]:
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'

In [433]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# Defining Functions

In [434]:
def add_rank(first_data, second_data, left_on, right_on, way_or_home):
    first_data = first_data.merge(second_data,
                                  left_on=left_on,
                                  right_on=right_on,
                                  how='left').rename(columns={
                                    'rank' : f'{way_or_home}_rank',
                                    'total_points' : f'{way_or_home}_ranking_points'
                                  })
    return first_data

In [435]:
def train_test(dataset, train_feature, test_feature, away_or_home):
    # one-hot encode the data using pandas get_dummies
    features = pd.get_dummies(dataset)

    # labels are the values we want to predict
    train_labels = np.array(features.query(train_feature)[away_or_home])

    # remove labels from feature, axis 1 refers to the columns
    train_features = features.query(train_feature).drop(away_or_home, axis=1).drop('date', axis=1)
    # convert to numpy array
    train_features = np.array(train_features)

    # creating labels and features
    test_labels = np.array(features.query(test_feature)[away_or_home])
    test_features = features.query(test_feature).drop(away_or_home).drop('date', axis=1)
    test_features = np.array(test_features)

    # checking labels and features
    print('Training Features Shape:', train_features.shape)
    print('Training Labels Shape:', train_labels.shape)
    print('Testing Features Shape:', test_features.shape)
    print('Testing Labels Shape:', test_labels.shape)

    # instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators=1000, random_state=42)

    # train the model on training data
    rf.fit(train_features, train_labels)

    # use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    # calculate the absolute errors
    errors = abs(predictions - test_labels)

    # print out the mean absolute error (mae)
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

    # checking how many gor right
    # merging test data and predicts data
    df_predict_test = pd.merge(pd.DataFrame(predictions.round(0)), pd.DataFrame(test_labels), left_index=True, right_index=True).rename(columns={'0_x' : 'predicts', '0_y' : 'reality'})
    df_predict_test['is_correct'] = df_predict_test['predicts'] - df_predict_test['reality']
    predict_right = ((df_predict_test['reality'].count() - df_predict_test.query('is_correct != 0')['is_correct'].count()) / df_predict_test['reality'].count() * 100).round(2)

    print(f"The algorithm predicted right: {predict_right}% of the values")

In [436]:
def run_predict(teams_to_query, features_df, labels_df):
    teams_to_query = labels_df[f'{teams_to_query}']

    # one-hot encode the data using pandas get_dummies
    features = pd.get_dummies(
        features_df.query('home_team in @teams_to_query & away_team in @teams_to_query'
        )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
           'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
           'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )

    features_to_predict = pd.get_dummies(
        labels_df[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                   'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                   'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )

    # labels are the values we want to predict
    train_labels = np.array(features['away_score'])

    # remove the labels from the features, axis 1 refers to the columns
    train_features = features.drop('away_score', axis = 1).drop('date', axis = 1)
    # convert to numpy array
    train_features = np.array(train_features)

    test_labels = np.array(features_to_predict['away_score'])
    test_features = features_to_predict.drop('away_score', axis = 1).drop('date', axis = 1)
    test_features = np.array(test_features)

    # instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

    # train the model on training data
    rf.fit(train_features, train_labels)

    # use the forest's predict method on the test data
    predictions = rf.predict(test_features)

    return predictions

# Read Datasets

Historical results dataset

In [437]:
historical_results = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/historical-results.csv')
historical_results.head(5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [438]:
historical_results = historical_results.replace('United States','USA', regex=True) # changing for pattern

Win, loose and draw ratio dataset

In [439]:
historical_win_loose_draw_ratios = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/historical_win-loose-draw_ratios.csv')
historical_win_loose_draw_ratios.head(5)

Unnamed: 0,country1,country2,games,wins,looses,draws
0,Argentina,Australia,7,0.714286,0.142857,0.142857
1,Australia,Argentina,7,0.142857,0.714286,0.142857
2,Argentina,Belgium,4,0.75,0.25,0.0
3,Belgium,Argentina,4,0.25,0.75,0.0
4,Argentina,Brazil,108,0.361111,0.398148,0.240741


FIFA ranking dataset

In [440]:
ranking = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/ranking.csv')
ranking.head(5)

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,1,Germany,GER,57.0,0.0,0,UEFA,1992-12-31
1,96,Syria,SYR,11.0,0.0,0,AFC,1992-12-31
2,97,Burkina Faso,BFA,11.0,0.0,0,CAF,1992-12-31
3,99,Latvia,LVA,10.0,0.0,0,UEFA,1992-12-31
4,100,Burundi,BDI,10.0,0.0,0,CAF,1992-12-31


Game shootouts historic dataset

In [441]:
shootouts = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/shootouts.csv')
shootouts.head(5)

Unnamed: 0,date,home_team,away_team,winner
0,1967-08-22,India,Taiwan,Taiwan
1,1971-11-14,South Korea,Vietnam Republic,South Korea
2,1972-05-17,Thailand,South Korea,South Korea
3,1972-05-19,Thailand,Cambodia,Thailand
4,1973-04-21,Senegal,Ghana,Ghana


# Treating Historical Results Dataset

Checking data

In [442]:
historical_results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [443]:
historical_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44060 entries, 0 to 44059
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        44060 non-null  object 
 1   home_team   44060 non-null  object 
 2   away_team   44060 non-null  object 
 3   home_score  44059 non-null  float64
 4   away_score  44059 non-null  float64
 5   tournament  44060 non-null  object 
 6   city        44060 non-null  object 
 7   country     44060 non-null  object 
 8   neutral     44060 non-null  bool   
dtypes: bool(1), float64(2), object(6)
memory usage: 2.7+ MB


Top 10 tournaments on database:

In [444]:
historical_results.groupby('tournament').count().sort_values('date', ascending=False)['date'].head(10)

tournament
Friendly                                17425
FIFA World Cup qualification             7774
UEFA Euro qualification                  2593
African Cup of Nations qualification     1932
FIFA World Cup                            900
Copa América                              841
AFC Asian Cup qualification               764
African Cup of Nations                    742
CECAFA Cup                                620
CFU Caribbean Cup qualification           606
Name: date, dtype: int64

## Aggregations

Filtering to only world cup data, adding Qatar and Wales data since there's nothing about them on world cup data

In [445]:
historical_results_world_cup = historical_results.query('tournament == "FIFA World Cup" \
                                                        | home_team == "Qatar" \
                                                        | away_team == "Qatar" \
                                                        | home_team == "Wales" \
                                                        | away_team == "Wales" \
                                                        | home_team == "Croatia" \
                                                        | away_team == "Croatia" \
                                                        | away_team == "Senegal" \
                                                        | home_team == "Senegal"').sort_values('date').reset_index(drop=True)

historical_results_world_cup['date'] = historical_results_world_cup['date'].astype('datetime64') # fixing date format

historical_results_world_cup.head(10)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,Scotland,False
1,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,Wales,False
2,1878-03-23,Scotland,Wales,9.0,0.0,Friendly,Glasgow,Scotland,False
3,1879-01-18,England,Wales,2.0,1.0,Friendly,London,England,False
4,1879-04-07,Wales,Scotland,0.0,3.0,Friendly,Wrexham,Wales,False
5,1880-03-15,Wales,England,2.0,3.0,Friendly,Wrexham,Wales,False
6,1880-03-27,Scotland,Wales,5.0,1.0,Friendly,Glasgow,Scotland,False
7,1881-02-26,England,Wales,0.0,1.0,Friendly,Blackburn,England,False
8,1881-03-14,Wales,Scotland,1.0,5.0,Friendly,Wrexham,Wales,False
9,1882-02-25,Wales,Northern Ireland,7.0,1.0,Friendly,Wrexham,Wales,False


Identifying finals

In [446]:
historical_results_world_cup['is_final'] = ''

for i in range(len(historical_results_world_cup['date'])):
  if i < 899 and historical_results_world_cup[i:i+1]['date'].astype(str).str[:4][i] != historical_results_world_cup[i+1:i+2]['date'].astype(str).str[:4][i+1]:
    historical_results_world_cup['is_final'][i] = 'TRUE'
  elif i < 899:
    historical_results_world_cup['is_final'][i] = 'FALSE'
  elif i == 899:
    historical_results_world_cup['is_final'][i] = 'TRUE'

# Historical Results with possibily to win

Addying winner and loosers

In [447]:
historical_results_world_cup['winner'] = ''

for i in range(len(historical_results_world_cup['date'])):
  if historical_results_world_cup['home_score'][i] > historical_results_world_cup['away_score'][i]:
    historical_results_world_cup['winner'][i] = 'home_win'
  elif historical_results_world_cup['home_score'][i] < historical_results_world_cup['away_score'][i]:
    historical_results_world_cup['winner'][i] = 'away_win'
  else:
    historical_results_world_cup['winner'][i] = 'draw'

In [448]:
# adding winners and loosers
historical_results_world_cup = historical_results_world_cup.merge(historical_win_loose_draw_ratios, left_on=['home_team', 'away_team'], right_on=['country1', 'country2'], how='left').rename(columns={
    'wins' : 'home_wins',
    'looses' : 'home_looses'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral', 'is_final', 'winner', 'games', 'home_wins', 'home_looses', 'draws']]

historical_results_world_cup = historical_results_world_cup.replace(np.nan, 0) # adding for teams that never played before

# Historical result with ranking on date

Checking data

In [449]:
ranking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63916 entries, 0 to 63915
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rank             63916 non-null  int64  
 1   country_full     63916 non-null  object 
 2   country_abrv     63916 non-null  object 
 3   total_points     63916 non-null  float64
 4   previous_points  63916 non-null  float64
 5   rank_change      63916 non-null  int64  
 6   confederation    63916 non-null  object 
 7   rank_date        63916 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 3.9+ MB


Adding home and away rank

In [450]:
# home rank
# historical_results_world_cup = historical_results_world_cup.merge(ranking, left_on=[historical_results_world_cup['date'].astype(str).str[:7], 'home_team'],
#                                             right_on=[ranking['rank_date'].str[:7], 'country_full'],
#                                             how='left').rename(columns={
#                                                 'rank' : 'home_rank',
#                                                 'total_points' : 'home_ranking_points'
# })[['date', 'home_team', 'away_team', 'home_score', 'away_score',
#     'tournament', 'city', 'country', 'neutral', 'is_final', 'winner',
#     'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

historical_results_world_cup = add_rank(historical_results_world_cup,
                                        ranking,
                                        [historical_results_world_cup['date'].astype(str).str[:7], 'home_team'],
                                        [ranking['rank_date'].str[:7], 'country_full'],
                                        'home')

historical_results_world_cup = historical_results_world_cup[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                                 'tournament', 'city', 'country', 'neutral', 'is_final', 'winner',
                                 'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

# away rank
# historical_results_world_cup = historical_results_world_cup.merge(ranking, left_on=[historical_results_world_cup['date'].astype(str).str[:7], 'away_team'],
#                                             right_on=[ranking['rank_date'].str[:7], 'country_full'],
#                                             how='left').rename(columns={
#                                                 'rank' : 'away_rank',
#                                                 'total_points' : 'away_ranking_points'
# })[['date', 'home_team', 'away_team', 'home_score', 'away_score',
#     'tournament', 'city', 'country', 'neutral', 'is_final', 'winner',
#     'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points',
#     'away_rank', 'away_ranking_points']]

historical_results_world_cup = add_rank(historical_results_world_cup,
                                        ranking,
                                        [historical_results_world_cup['date'].astype(str).str[:7], 'away_team'],
                                        [ranking['rank_date'].str[:7], 'country_full'],
                                        'away')

historical_results_world_cup = historical_results_world_cup[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                                                             'tournament', 'city', 'country', 'neutral', 'is_final', 'winner',
                                                             'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points',
                                                             'away_rank', 'away_ranking_points']]

In [451]:
historical_results_world_cup

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,is_final,winner,games,home_wins,home_looses,draws,home_rank,home_ranking_points,away_rank,away_ranking_points
0,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,Scotland,False,TRUE,home_win,0.0,0.000000,0.000000,0.000000,,,,
1,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,Wales,False,TRUE,away_win,0.0,0.000000,0.000000,0.000000,,,,
2,1878-03-23,Scotland,Wales,9.0,0.0,Friendly,Glasgow,Scotland,False,TRUE,home_win,0.0,0.000000,0.000000,0.000000,,,,
3,1879-01-18,England,Wales,2.0,1.0,Friendly,London,England,False,FALSE,home_win,104.0,0.653846,0.144231,0.201923,,,,
4,1879-04-07,Wales,Scotland,0.0,3.0,Friendly,Wrexham,Wales,False,TRUE,away_win,0.0,0.000000,0.000000,0.000000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,2022-09-24,Bolivia,Senegal,0.0,2.0,Friendly,Orléans,France,True,,away_win,0.0,0.000000,0.000000,0.000000,,,,
3046,2022-09-25,Austria,Croatia,1.0,3.0,UEFA Nations League,Vienna,Austria,False,,away_win,0.0,0.000000,0.000000,0.000000,,,,
3047,2022-09-25,Wales,Poland,0.0,1.0,UEFA Nations League,Cardiff,Wales,False,,away_win,9.0,0.111111,0.666667,0.222222,,,,
3048,2022-09-27,Iran,Senegal,1.0,1.0,Friendly,Maria Enzersdorf,Austria,True,,draw,1.0,0.000000,0.000000,1.000000,,,,


# Final historical dataset

In [452]:
historical_results_world_cup = historical_results_world_cup.replace(np.nan, 0) # replacing null ranks

# fixing data types
historical_results_world_cup['home_rank'] = historical_results_world_cup['home_rank'].astype('float64')
historical_results_world_cup['home_ranking_points'] = historical_results_world_cup['home_ranking_points'].astype('float64')
historical_results_world_cup['away_rank'] = historical_results_world_cup['away_rank'].astype('float64')
historical_results_world_cup['away_ranking_points'] = historical_results_world_cup['away_ranking_points'].astype('float64')

# Algoritmo

Feature engineering

In [453]:
train_test(historical_results_world_cup, 'date < 2018', 'date >= 2018', 'home_score')

KeyError: "['home_score'] not found in axis"

In [275]:
# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(historical_results_world_cup)

# labels are the values we want to predict
train_labels = np.array(features.query('date < 2018')['home_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.query('date < 2018').drop('home_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

# creating labels and features
test_labels = np.array(features.query('date >= 2018')['home_score'])
test_features = features.query('date >= 2018').drop('home_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# checking labels and features
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (2767, 995)
Training Labels Shape: (2767,)
Testing Features Shape: (283, 995)
Testing Labels Shape: (283,)


Running train and test

In [276]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.55 degrees.


In [277]:
# merging test data and predicts data
df_predict_test = pd.merge(pd.DataFrame(predictions.round(0)), pd.DataFrame(test_labels), left_index=True, right_index=True).rename(columns={'0_x' : 'predicts', '0_y' : 'reality'})
df_predict_test['is_correct'] = df_predict_test['predicts'] - df_predict_test['reality']
predict_right = ((df_predict_test['reality'].count() - df_predict_test.query('is_correct != 0')['is_correct'].count()) / df_predict_test['reality'].count() * 100).round(2)

print(f"The algorithm predicted right: {predict_right}% of the values")

The algorithm predicted right: 54.77% of the values


# 2022 calendar predictions

Reading and treating data

In [278]:
matches_schedule = pd.read_csv('https://raw.githubusercontent.com/rafabandoni/world-cup-22-predict/main/data/matches-schedule.csv')

# fixing date format
matches_schedule['date'] = matches_schedule['date'].str[-4:] + '-' + matches_schedule['date'].str[3:5] + '-' + matches_schedule['date'].str[:2]

# adding score columns
matches_schedule['home_score'] = 0
matches_schedule['away_score'] = 0

# renaming for pattern
matches_schedule.rename(columns={'country1' : 'home_team', 'country2' : 'away_team'}, inplace=True)
matches_schedule = matches_schedule[['date', 'home_team', 'home_score', 'away_score', 'away_team', 'phase']]

# fixing datatype for date
matches_schedule['date'] = matches_schedule['date'].astype('datetime64')

Adding historical ratio and rank

In [279]:
# adding outcome ratio
matches_schedule = matches_schedule.merge(historical_win_loose_draw_ratios, left_on=['home_team', 'away_team'], right_on=['country1', 'country2'], how='left').rename(columns={
    'wins' : 'home_wins',
    'looses' : 'home_looses'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'phase', 'games', 'home_wins', 'home_looses', 'draws']]

# adding home rank
matches_schedule = matches_schedule.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
                                          left_on='home_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'home_rank',
                                                                      'total_points' : 'home_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

# adding away rank
matches_schedule = matches_schedule.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
                                          left_on='away_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'away_rank',
                                                                      'total_points' : 'away_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
    'home_ranking_points', 'away_rank', 'away_ranking_points']]

  matches_schedule = matches_schedule.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
  matches_schedule = matches_schedule.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),


In [280]:
matches_schedule = matches_schedule.fillna(0) # to fix countries with no historical data

## Running algorithm

### Group Stage

In [281]:
matches_schedule

Unnamed: 0,date,home_team,away_team,home_score,away_score,phase,games,home_wins,home_looses,draws,home_rank,home_ranking_points,away_rank,away_ranking_points
0,2022-11-21,Qatar,Ecuador,0,0,group matches,3.0,0.333333,0.333333,0.333333,50.0,1431.0,49.0,1436.0
1,2022-11-21,Senegal,Netherlands,0,0,group matches,0.0,0.0,0.0,0.0,20.0,1567.0,11.0,1647.0
2,2022-11-21,England,Iran,0,0,group matches,0.0,0.0,0.0,0.0,4.0,1733.0,0.0,0.0
3,2022-11-21,USA,Wales,0,0,group matches,0.0,0.0,0.0,0.0,15.0,1620.0,19.0,1574.0
4,2022-11-22,France,Australia,0,0,group matches,5.0,0.6,0.2,0.2,3.0,1769.0,38.0,1478.0
5,2022-11-22,Denmark,Tunisia,0,0,group matches,1.0,1.0,0.0,0.0,10.0,1651.0,29.0,1510.0
6,2022-11-22,Mexico,Poland,0,0,group matches,9.0,0.333333,0.333333,0.333333,11.0,1647.0,25.0,1540.0
7,2022-11-22,Argentina,Saudi Arabia,0,0,group matches,4.0,0.5,0.0,0.5,5.0,1727.0,55.0,1413.0
8,2022-11-23,Belgium,Canada,0,0,group matches,1.0,1.0,0.0,0.0,1.0,1816.0,50.0,1430.0
9,2022-11-23,Spain,Costa Rica,0,0,group matches,3.0,0.666667,0.0,0.333333,7.0,1689.0,43.0,1457.0


Home prediction:

In [282]:
run_predict('home_team', historical_results_world_cup, matches_schedule)

array([1.51925238, 1.50783333, 1.09683333, 0.757     , 0.80365   ,
       0.62666667, 1.21616667, 0.697     , 0.52538095, 0.99      ,
       0.706     , 0.8668    , 0.783     , 1.10863333, 0.48838095,
       0.665     , 0.65108333, 0.9818    , 0.5705    , 0.861     ,
       0.669     , 0.46466667, 0.91541667, 0.94992421, 0.90446667,
       0.76413095, 0.692     , 2.357     , 2.95      , 1.15376667,
       1.036     , 1.17368333, 1.48524808, 1.164     , 2.279     ,
       0.655     , 1.824     , 1.564     , 1.37916667, 2.09895455,
       1.25333333, 2.20456667, 1.944     , 2.23      , 1.112     ,
       0.95983929, 1.144     , 2.50025758])

In [283]:
matches_schedule_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule, left_index=True, right_index=True)

matches_schedule_predicted.rename(columns={0 : "home_score_predicted"}, inplace=True)

matches_schedule_predicted = matches_schedule_predicted[['date', 'home_team', 'home_score_predicted', 'away_score', 'away_team']]

In [284]:
matches_schedule_predicted

Unnamed: 0,date,home_team,home_score_predicted,away_score,away_team
0,2022-11-21,Qatar,1.0,0,Ecuador
1,2022-11-21,Senegal,1.0,0,Netherlands
2,2022-11-21,England,2.0,0,Iran
3,2022-11-21,USA,1.0,0,Wales
4,2022-11-22,France,2.0,0,Australia
5,2022-11-22,Denmark,0.0,0,Tunisia
6,2022-11-22,Mexico,0.0,0,Poland
7,2022-11-22,Argentina,0.0,0,Saudi Arabia
8,2022-11-23,Belgium,0.0,0,Canada
9,2022-11-23,Spain,0.0,0,Costa Rica


In [237]:
teams_to_query = matches_schedule['home_team']

# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(
    historical_results_world_cup.query('home_team in @teams_to_query').query(
        'away_team in @teams_to_query'
    )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
       'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )
    
features_to_predict = pd.get_dummies(
    matches_schedule[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                      'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                      'home_ranking_points', 'away_rank', 'away_ranking_points']])

# labels are the values we want to predict
train_labels = np.array(features['home_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.drop('home_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

test_labels = np.array(features_to_predict['home_score'])
test_features = features_to_predict.drop('home_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
rf.fit(train_features, train_labels)

# use the forest's predict method on the test data
predictions = rf.predict(test_features)

Merging data predicted with match table

In [238]:
matches_schedule_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule, left_index=True, right_index=True)

matches_schedule_predicted.rename(columns={0 : "home_score_predicted"}, inplace=True)

matches_schedule_predicted = matches_schedule_predicted[['date', 'home_team', 'home_score_predicted', 'away_score', 'away_team']]

Away prediction:

In [239]:
teams_to_query = matches_schedule['away_team']

# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(
    historical_results_world_cup.query('home_team in @teams_to_query').query(
        'away_team in @teams_to_query'
    )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
       'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )
    
features_to_predict = pd.get_dummies(
    matches_schedule[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                      'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                      'home_ranking_points', 'away_rank', 'away_ranking_points']])

# labels are the values we want to predict
train_labels = np.array(features['away_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.drop('away_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

test_labels = np.array(features_to_predict['away_score'])
test_features = features_to_predict.drop('away_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
rf.fit(train_features, train_labels)

# use the forest's predict method on the test data
predictions = rf.predict(test_features)

Mergind data predicted with match data

In [240]:
matches_schedule_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule_predicted, left_index=True, right_index=True)

matches_schedule_predicted.rename(columns={0 : "away_score_predicted"}, inplace=True)

matches_schedule_predicted = matches_schedule_predicted[['date', 'home_team', 'home_score_predicted', 'away_score_predicted', 'away_team']]

In [241]:
matches_schedule_predicted

Unnamed: 0,date,home_team,home_score_predicted,away_score_predicted,away_team
0,2022-11-21,Qatar,2.0,2.0,Ecuador
1,2022-11-21,Senegal,1.0,2.0,Netherlands
2,2022-11-21,England,1.0,1.0,Iran
3,2022-11-21,USA,1.0,1.0,Wales
4,2022-11-22,France,2.0,1.0,Australia
5,2022-11-22,Denmark,4.0,1.0,Tunisia
6,2022-11-22,Mexico,1.0,1.0,Poland
7,2022-11-22,Argentina,1.0,1.0,Saudi Arabia
8,2022-11-23,Belgium,2.0,1.0,Canada
9,2022-11-23,Spain,2.0,1.0,Costa Rica


Exporting CSV

In [242]:
matches_schedule_predicted.to_csv('matches_schedule_predicted.csv', index=False)

### Playoffs

Phase of 16

In [243]:
# ingesting data
data = [
    ['2022-12-03', 'Netherlands', 0, 0, 'USA', 'stage of 16'],
    ['2022-12-03', 'Argentina', 0, 0, 'Australia', 'stage of 16'],
    ['2022-12-04', 'France', 0, 0, 'Poland', 'stage of 16'],
    ['2022-12-04', 'England', 0, 0, 'Senegal', 'stage of 16'],
    ['2022-12-05', 'Japan', 0, 0, 'Croatia', 'stage of 16'],
    ['2022-12-05', 'Brazil', 0, 0, 'South Korea', 'stage of 16'],
    ['2022-12-06', 'Morocco', 0, 0, 'Spain', 'stage of 16'],
    ['2022-12-06', 'Portugal', 0, 0, 'Switzerland', 'stage of 16']
    ]
 
# creating pandas DataFrame
matches_schedule_phase16 = pd.DataFrame(data, columns=['date', 'home_team', 'home_score', 'away_score', 'away_team', 'phase'])
 
# checking data
matches_schedule_phase16

Unnamed: 0,date,home_team,home_score,away_score,away_team,phase
0,2022-12-03,Netherlands,0,0,USA,stage of 16
1,2022-12-03,Argentina,0,0,Australia,stage of 16
2,2022-12-04,France,0,0,Poland,stage of 16
3,2022-12-04,England,0,0,Senegal,stage of 16
4,2022-12-05,Japan,0,0,Croatia,stage of 16
5,2022-12-05,Brazil,0,0,South Korea,stage of 16
6,2022-12-06,Morocco,0,0,Spain,stage of 16
7,2022-12-06,Portugal,0,0,Switzerland,stage of 16


Enriching data

In [244]:
matches_schedule_phase16['date'] = matches_schedule_phase16['date'].astype('datetime64')

# adding outcome ratio
matches_schedule_phase16 = matches_schedule_phase16.merge(historical_win_loose_draw_ratios, left_on=['home_team', 'away_team'], right_on=['country1', 'country2'], how='left').rename(columns={
    'wins' : 'home_wins',
    'looses' : 'home_looses'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'phase', 'games', 'home_wins', 'home_looses', 'draws']]

# adding home rank
matches_schedule_phase16 = matches_schedule_phase16.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
                                          left_on='home_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'home_rank',
                                                                      'total_points' : 'home_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

# adding away rank
matches_schedule_phase16 = matches_schedule_phase16.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
                                          left_on='away_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'away_rank',
                                                                      'total_points' : 'away_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
    'home_ranking_points', 'away_rank', 'away_ranking_points']]

matches_schedule_phase16 = matches_schedule_phase16.fillna(0)

  matches_schedule_phase16 = matches_schedule_phase16.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
  matches_schedule_phase16 = matches_schedule_phase16.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),


Home prediction

In [245]:
teams_to_query = matches_schedule_phase16['home_team']

# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(
    historical_results_world_cup.query('home_team in @teams_to_query').query(
        'away_team in @teams_to_query'
    )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
       'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )
    
features_to_predict = pd.get_dummies(
    matches_schedule_phase16[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                      'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                      'home_ranking_points', 'away_rank', 'away_ranking_points']])

# labels are the values we want to predict
train_labels = np.array(features['home_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.drop('home_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

test_labels = np.array(features_to_predict['home_score'])
test_features = features_to_predict.drop('home_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
rf.fit(train_features, train_labels)

# use the forest's predict method on the test data
predictions = rf.predict(test_features)

# adding prediction to df
matches_schedule_phase16_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule_phase16, left_index=True, right_index=True)
matches_schedule_phase16_predicted.rename(columns={0 : "home_score_predicted"}, inplace=True)
matches_schedule_phase16_predicted = matches_schedule_phase16_predicted[['date', 'home_team', 'home_score_predicted', 'away_score', 'away_team']]

Away prediction

In [246]:
teams_to_query = matches_schedule_phase16.query('away_team != "Senegal"')['away_team']

# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(
    historical_results_world_cup.query('home_team in @teams_to_query & away_team in @teams_to_query'
    )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
       'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )
    
features_to_predict = pd.get_dummies(
    matches_schedule_phase16.query('away_team != "Senegal"')[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                      'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                      'home_ranking_points', 'away_rank', 'away_ranking_points']])

# labels are the values we want to predict
train_labels = np.array(features['away_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.drop('away_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

test_labels = np.array(features_to_predict['away_score'])
test_features = features_to_predict.drop('away_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
rf.fit(train_features, train_labels)

# use the forest's predict method on the test data
predictions = rf.predict(test_features)

# adding predicts to df
# matches_schedule_phase16_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule_phase16_predicted, left_index=True, right_index=True)
# matches_schedule_phase16_predicted.rename(columns={0 : "away_score_predicted"}, inplace=True)
# matches_schedule_phase16_predicted = matches_schedule_phase16_predicted[['date', 'home_team', 'home_score_predicted', 'away_score_predicted', 'away_team']]

In [247]:
predictions

array([1.984, 2.196, 2.116, 1.661, 0.959, 1.656, 1.844])

In [None]:
predictions = np.array([1.984, 2.196, 2.116, 0, 1.661, 0.959, 1.656, 1.844]) # workaround to add 0 to Senegal

In [None]:
matches_schedule_phase16_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule_phase16_predicted, left_index=True, right_index=True)
matches_schedule_phase16_predicted.rename(columns={0 : "away_score_predicted"}, inplace=True)
matches_schedule_phase16_predicted = matches_schedule_phase16_predicted[['date', 'home_team', 'home_score_predicted', 'away_score_predicted', 'away_team']]

Checking predictions

In [None]:
matches_schedule_phase16_predicted

Unnamed: 0,date,home_team,home_score_predicted,away_score_predicted,away_team
0,2022-12-03,Netherlands,1.0,2.0,USA
1,2022-12-03,Argentina,1.0,2.0,Australia
2,2022-12-04,France,2.0,2.0,Poland
3,2022-12-04,England,1.0,0.0,Senegal
4,2022-12-05,Japan,1.0,2.0,Croatia
5,2022-12-05,Brazil,1.0,1.0,South Korea
6,2022-12-06,Morocco,1.0,2.0,Spain
7,2022-12-06,Portugal,0.0,2.0,Switzerland


In [None]:
matches_schedule_phase16_predicted.to_csv('phase_of_16_predictions.csv')

Phase of 8

In [None]:
# ingesting data
data = [
    ['2022-12-09', 'Netherlands', 0, 0, 'Argentina', 'stage of 8'],
    ['2022-12-09', 'Croatia', 0, 0, 'Brazil', 'stage of 8'],
    ['2022-12-10', 'England', 0, 0, 'France', 'stage of 8'],
    ['2022-12-10', 'Morocco', 0, 0, 'Portugal', 'stage of 8']
    ]
 
# creating pandas DataFrame
matches_schedule_phase8 = pd.DataFrame(data, columns=['date', 'home_team', 'home_score', 'away_score', 'away_team', 'phase'])
 
# checking data
matches_schedule_phase8

Unnamed: 0,date,home_team,home_score,away_score,away_team,phase
0,2022-12-09,Netherlands,0,0,Argentina,stage of 8
1,2022-12-09,Croatia,0,0,Brazil,stage of 8
2,2022-12-10,England,0,0,France,stage of 8
3,2022-12-10,Morocco,0,0,Portugal,stage of 8


Enriching data

In [None]:
matches_schedule_phase8['date'] = matches_schedule_phase8['date'].astype('datetime64')

# adding outcome ratio
matches_schedule_phase8 = matches_schedule_phase8.merge(historical_win_loose_draw_ratios, left_on=['home_team', 'away_team'], right_on=['country1', 'country2'], how='left').rename(columns={
    'wins' : 'home_wins',
    'looses' : 'home_looses'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score', 'phase', 'games', 'home_wins', 'home_looses', 'draws']]

# adding home rank
matches_schedule_phase8 = matches_schedule_phase8.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
                                          left_on='home_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'home_rank',
                                                                      'total_points' : 'home_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank', 'home_ranking_points']]

# adding away rank
matches_schedule_phase8 = matches_schedule_phase8.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
                                          left_on='away_team',
                                          right_on='country_full',
                                          how='left').rename(columns={
                                                                      'rank' : 'away_rank',
                                                                      'total_points' : 'away_ranking_points'
})[['date', 'home_team', 'away_team', 'home_score', 'away_score',
    'phase', 'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
    'home_ranking_points', 'away_rank', 'away_ranking_points']]

matches_schedule_phase8 = matches_schedule_phase8.fillna(0)

  matches_schedule_phase8 = matches_schedule_phase8.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),
  matches_schedule_phase8 = matches_schedule_phase8.merge(ranking.query('rank_date.astype("datetime64") > 2021').groupby('country_full').mean().round(0),


Home prediction

In [None]:
teams_to_query = matches_schedule_phase8['home_team']

# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(
    historical_results_world_cup.query('home_team in @teams_to_query').query(
        'away_team in @teams_to_query'
    )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
       'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )
    
features_to_predict = pd.get_dummies(
    matches_schedule_phase8[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                      'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                      'home_ranking_points', 'away_rank', 'away_ranking_points']])

# labels are the values we want to predict
train_labels = np.array(features['home_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.drop('home_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

test_labels = np.array(features_to_predict['home_score'])
test_features = features_to_predict.drop('home_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
rf.fit(train_features, train_labels)

# use the forest's predict method on the test data
predictions = rf.predict(test_features)

# adding prediction to df
matches_schedule_phase8_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule_phase8, left_index=True, right_index=True)
matches_schedule_phase8_predicted.rename(columns={0 : "home_score_predicted"}, inplace=True)
matches_schedule_phase8_predicted = matches_schedule_phase8_predicted[['date', 'home_team', 'home_score_predicted', 'away_score', 'away_team']]

Away prediction

In [None]:
teams_to_query = matches_schedule_phase8.query('away_team != "Senegal"')['away_team']

# one-hot encode the data using pandas get_dummies
features = pd.get_dummies(
    historical_results_world_cup.query('home_team in @teams_to_query').query(
        'away_team in @teams_to_query'
    )[['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
       'home_ranking_points', 'away_rank', 'away_ranking_points']]
    )
    
features_to_predict = pd.get_dummies(
    matches_schedule_phase8[['date', 'home_team', 'away_team', 'home_score', 'away_score',
                      'games', 'home_wins', 'home_looses', 'draws', 'home_rank',
                      'home_ranking_points', 'away_rank', 'away_ranking_points']])

# labels are the values we want to predict
train_labels = np.array(features['away_score'])

# remove the labels from the features
# axis 1 refers to the columns
train_features = features.drop('away_score', axis = 1).drop('date', axis = 1)
# convert to numpy array
train_features = np.array(train_features)

test_labels = np.array(features_to_predict['away_score'])
test_features = features_to_predict.drop('away_score', axis = 1).drop('date', axis = 1)
test_features = np.array(test_features)

# instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# train the model on training data
rf.fit(train_features, train_labels)

# use the forest's predict method on the test data
predictions = rf.predict(test_features)

# adding predicts to df
matches_schedule_phase8_predicted = pd.merge(pd.DataFrame(predictions.round(0)), matches_schedule_phase8_predicted, left_index=True, right_index=True)
matches_schedule_phase8_predicted.rename(columns={0 : "away_score_predicted"}, inplace=True)
matches_schedule_phase8_predicted = matches_schedule_phase8_predicted[['date', 'home_team', 'home_score_predicted', 'away_score_predicted', 'away_team']]

Checking predictions

In [None]:
matches_schedule_phase8_predicted

Unnamed: 0,date,home_team,home_score_predicted,away_score_predicted,away_team
0,2022-12-09,Netherlands,2.0,1.0,Argentina
1,2022-12-09,Croatia,1.0,0.0,Brazil
2,2022-12-10,England,1.0,1.0,France
3,2022-12-10,Morocco,1.0,1.0,Portugal


In [None]:
matches_schedule_phase8_predicted.to_csv('phase_of_8_predicts.csv')