In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler


In [49]:
data = pd.read_csv('FIFA_World_Cup_Tidy.csv')

In [50]:
data_train = data[data['season'] != 2022]  # Exclude 2022 data

In [51]:
data_continent = pd.get_dummies(data, columns = ['continent'])

In [52]:
data_predict = data_continent[['players_used', 'age', 'matches_played', 'goals', 'yellow_cards', 'red_cards', 'world_cup_winner', 'continent_Africa', 'continent_Americas', 'continent_Asia', 'continent_Europe', 'continent_Oceania']]

In [53]:
X = data_predict.drop(columns=['world_cup_winner'])
y = data_predict[['world_cup_winner']]
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [54]:
y = y.values.ravel()
y_train = y_train.values.ravel()
y_valid = y_valid.values.ravel()

In [55]:
model_dct = model_dct = DecisionTreeClassifier(max_depth=5, max_features=5, max_leaf_nodes=5, random_state=42, class_weight='balanced')
model_dct.fit(X_train, y_train)

In [56]:
y_pred = model_dct.predict(X_valid)
y_prob = model_dct.predict_proba(X_valid)[:, 1]

In [57]:
print("Classification Report:\n", classification_report(y_valid, y_pred), "\n")
print("Model prediction score:", model_dct.score(X_valid, y_valid))
print("Accuracy score:", accuracy_score(y_valid, y_pred))
print("Baseline accuracy score:", len(y[y == 0]) / len(y))
print("Cohen’s Kappa score", cohen_kappa_score(y_valid, y_pred))
print("ROC-AUC score:", roc_auc_score(y_valid, y_prob))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.81      0.90       112
           1       0.16      1.00      0.28         4

    accuracy                           0.82       116
   macro avg       0.58      0.91      0.59       116
weighted avg       0.97      0.82      0.88       116
 

Model prediction score: 0.8189655172413793
Accuracy score: 0.8189655172413793
Baseline accuracy score: 0.9525862068965517
Cohen’s Kappa score 0.23008849557522115
ROC-AUC score: 0.9375


In [58]:
data_predict['win_probability'] = model_dct.predict_proba(X)[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_predict['win_probability'] = model_dct.predict_proba(X)[:, 1]


## Simulation

In [59]:
# using 2022 World Cup

data_2022 = data[data['season'] == 2022]

data_2022_continent = pd.get_dummies(data_2022, columns=['continent'])
data_2022_predict = data_2022_continent[['players_used', 'age', 'matches_played', 'goals', 'yellow_cards', 'red_cards', 
                                         'continent_Africa', 'continent_Americas', 'continent_Asia', 'continent_Europe', 'continent_Oceania']]

X_2022 = scaler.transform(data_2022_predict)

data_2022['win_probability'] = model_dct.predict_proba(X_2022)[:, 1]
data_2022['predicted_winner'] = model_dct.predict(X_2022)

predicted_winner = data_2022.loc[data_2022['win_probability'].idxmax(), 'team']

data_2022_predictions = data_2022[['team', 'win_probability']].sort_values(by='win_probability', ascending=False)

print("Predicted Winner:", predicted_winner)
print("2022 World Cup Predictions:")
print(data_2022_predictions.head(10))

Predicted Winner: Argentina
2022 World Cup Predictions:
              team  win_probability
433      Argentina         0.898876
443        England         0.898876
444         France         0.898876
454       Portugal         0.898876
451        Morocco         0.859375
452    Netherlands         0.567010
450         Mexico         0.000000
462  United States         0.000000
461        Tunisia         0.000000
460    Switzerland         0.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022['win_probability'] = model_dct.predict_proba(X_2022)[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022['predicted_winner'] = model_dct.predict(X_2022)


Countries with highest probability to win the 2026 FIFA World Cup given historical data

In [60]:
data_predict['team'] = data['team'] 
data_predict['season'] = data['season']

# Group by 'team' and calculate the mean probability
grouped_predictions = data_predict.groupby('team', as_index=False)['win_probability'].mean()

# Sort the grouped data by win probability in descending order
grouped_predictions = grouped_predictions.sort_values(by='win_probability', ascending=False)

# Display the top 10 teams
print(grouped_predictions.head(10))

           team  win_probability
27      Germany         0.567676
69      Türkiye         0.567010
8        Brazil         0.524962
45  Netherlands         0.460127
26       France         0.337079
16      Croatia         0.299625
65       Sweden         0.299625
32      Hungary         0.299625
2     Argentina         0.262751
37        Italy         0.260557


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_predict['team'] = data['team']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_predict['season'] = data['season']
