In [1]:
import pandas as pd
import numpy as np
import re
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split, KFold, cross_validate, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, mean_absolute_error, f1_score, mean_squared_error, recall_score, accuracy_score, log_loss, precision_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from collections import Counter
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
import dtale

# Processing Past International Results

In [2]:
international_results = pd.read_csv("./international_results.csv")

In [3]:
# Teams that qualified for the Euros 2021
teams = ['Turkey', 'Wales', 'Denmark', 'Belgium', 'England', 'Austria',
       'Netherlands', 'Scotland', 'Poland', 'Spain', 'Hungary', 'France',
       'Finland', 'Italy', 'Ukraine', 'Sweden', 'Croatia', 'Portugal',
       'Switzerland', 'North Macedonia', 'Russia', 'Czech Republic',
       'Slovakia', 'Germany']

In [4]:
# Current Fifa ranking for each team
fifa_uefa_rankings = {'Turkey':29, 'Wales':17, 'Denmark':10, 'Belgium':1, 'England':4, 'Austria':23,
       'Netherlands':16, 'Scotland':44, 'Poland':21, 'Spain':6, 'Hungary':37, 'France':2,
       'Finland':54, 'Italy':7, 'Ukraine':24, 'Sweden':18, 'Croatia':14, 'Portugal':5,
       'Switzerland':13, 'North Macedonia':62, 'Russia':38, 'Czech Republic':40,
       'Slovakia':36, 'Germany':12}

In [5]:
# Remove irrelevant features from Results dataset
remove_feats = ["city", "country", "neutral", "tournament", "date"]
international_results = international_results.drop(remove_feats, 1)

# Taking all results from after the end of the 2018 world cup (start of a "new" international football cycle)
international_results = international_results[36794:]

# Selecting games with at least one Euro2021 team
international_results = international_results[(international_results.home_team.isin([x for x in teams])) & international_results.away_team.isin([x for x in teams])]

In [6]:
# Adding new results column to replace the home/away score 
results = []
for x, y in zip(international_results["home_score"], international_results["away_score"]):
    if x > y:
        results.append("H")
    elif y > x:
        results.append("A")
    elif x == y:
        results.append("D")

international_results["result"] = results

international_results = international_results.drop(["home_score", "away_score"], 1)

In [7]:
# Adding Fifa rankings feature to the results dataset
for key, value in fifa_uefa_rankings.items():
    international_results.loc[international_results["home_team"] == key, "home_team_ranking"]= value
    international_results.loc[international_results["away_team"] == key, "away_team_ranking"]= value

In [8]:
# Final dataset structure
international_results.head()

Unnamed: 0,home_team,away_team,result,home_team_ranking,away_team_ranking
36800,Spain,Ukraine,H,6.0,24.0
36816,Netherlands,Turkey,D,16.0,29.0
36823,France,Denmark,H,2.0,10.0
36853,Italy,England,D,7.0,4.0
36860,Netherlands,Spain,H,16.0,6.0


In [9]:
# Encoding Full time result: 0 for away team win, 2 for home team win, and 1 for draw
encoder = OrdinalEncoder(categories=[["A", "D", "H"]])
international_results[["result"]] = encoder.fit_transform(international_results[["result"]])

# Encoding home/away teams
encoder = OrdinalEncoder()
international_results[["home_team"]] = encoder.fit_transform(international_results[["home_team"]])
international_results[["away_team"]] = encoder.fit_transform(international_results[["away_team"]])

# Dividing labels and features
labels = international_results["result"]
labels.replace({0.00:0, 1.00:1, 2.00:2}, inplace = True)
X = international_results.drop(["result"], axis=1)

# Train test split
x_train, x_test, y_train, y_test = train_test_split(X, labels, test_size = 0.3, random_state=42)

In [10]:
# Randomised Search to tuna hyperparameters
model = LogisticRegression()

param_grid = {
    # Parameters for tuning.
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'penalty': ['none','l2'],
    'C':[100, 10, 1.0, 0.1, 0.01],
    'max_iter':[500]
    
}


scoring = {'f1': 'f1_weighted', 'accuracy':'accuracy', "log_loss":"neg_log_loss"}

rnd_search = RandomizedSearchCV(model, 
                                param_grid,
                                n_iter =20,
                                scoring=scoring,
                                n_jobs=-1,
                                cv=10,
                                refit='log_loss')

rnd_search.fit(x_train, y_train)
  
print("F1 ", rnd_search.cv_results_['mean_test_f1'][rnd_search.best_index_])
print("Accuracy ", rnd_search.cv_results_['mean_test_accuracy'][rnd_search.best_index_])
print("Log Loss ", -rnd_search.cv_results_['mean_test_log_loss'][rnd_search.best_index_])
print(f"Best Params: {rnd_search.best_params_}")

F1  0.4387596056543985
Accuracy  0.5207977207977208
Log Loss  0.982639765005054
Best Params: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 500, 'C': 0.01}


In [11]:
# Fitting the best model and assessing F1 and Accuracy scores
model = LogisticRegression(solver="saga", penalty="l2", C=0.01, max_iter=500)
# model = XGBClassifier(use_label_encoder=False)

model.fit(x_train, y_train)


pred = model.predict(x_train)

print("Train Set: ")
print(f"F1:  {f1_score(y_train, pred, average='weighted'):.3f}")
print (f'Accuracy: {accuracy_score(y_train, pred):.3f}')

Train Set: 
F1:  0.451
Accuracy: 0.529


In [12]:
# Test/validation Set scoring 
pred = model.predict(x_test)

print("Test Set: ")
print(f"F1:  {f1_score(y_test, pred, average='weighted'):.3f}")
print (f'Accuracy: {accuracy_score(y_test, pred):.3f}')

Test Set: 
F1:  0.423
Accuracy: 0.513


# Processing Euro2021 Fixtures

In [13]:
fixtures = pd.read_csv("./uefa-euro-2020-GMTStandardTime.csv")

In [14]:
# Remove Irrelevant Features
remove_feats = ["Location", "Group", "Match Number", "Round Number", "Date", "Result"]
fixtures = fixtures.drop(remove_feats, 1)

# Rename Features
fixtures.columns = ["home_team", "away_team"]

# For initial predictions use only the group stage matches
fixtures = fixtures[:36]

In [15]:
fixtures

Unnamed: 0,home_team,away_team
0,Turkey,Italy
1,Wales,Switzerland
2,Denmark,Finland
3,Belgium,Russia
4,England,Croatia
5,Austria,North Macedonia
6,Netherlands,Ukraine
7,Scotland,Czech Republic
8,Poland,Slovakia
9,Spain,Sweden


**A class for processing a dictionary representing teams remaining in the competition, adding the appropriate fifa rankings and outputting model predictions**

In [16]:
class MyProcess():
    def __init__(self):
        self.wt = []
        self.lt = []
        self.dt = []

    def processing(self, group_stage):
        # Adding Fifa rankings feature to the group stages
        for key, value in fifa_uefa_rankings.items():
            group_stage.loc[group_stage["home_team"] == key, "home_team_ranking"]= value
            group_stage.loc[group_stage["away_team"] == key, "away_team_ranking"]= value

        # Replace the teams with their originally encoded values
        group_stage['home_team'] = group_stage['home_team'].map(encoded_values)
        group_stage['away_team'] = group_stage['away_team'].map(encoded_values)

        #Applying the model and recording the win,loss and draws for future schedules
        predictions = model.predict(group_stage)
        for i in range(len(group_stage)):
            print(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 0])], " and ",  list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 1])])
            if predictions[i] == 2:
                print("Winner: ", list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 0])])
                self.wt.append(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 0])])
                self.lt.append(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 1])])
            elif predictions[i] == 1:
                print("Draw")
                self.dt.append(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 0])])
                self.dt.append(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 1])])
            elif predictions[i] == 0:
                print("Winner: ", list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 1])])
                self.wt.append(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 1])])
                self.lt.append(list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 0])])

            print('Probability of ', list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 0])], ' winning: ', '%.3f'%(model.predict_proba(group_stage)[i][2]))
            print('Probability of Draw: ', '%.3f'%(model.predict_proba(group_stage)[i][1]))
            print('Probability of ', list(encoded_values.keys())[list(encoded_values.values()).index(group_stage.iloc[i, 1])], ' winning: ', '%.3f'%(model.predict_proba(group_stage)[i][0]))
            print("")

In [17]:
# Values that each team is replaced with after encoding - will be used as a reference after predictions
encoded_values = {'Turkey':21, 'Wales':23, 'Denmark':4, 'Belgium':1, 'England':5, 'Austria':0,
       'Netherlands':11, 'Scotland':16, 'Poland':13, 'Spain':18, 'Hungary':9, 'France':7,
       'Finland':6, 'Italy':10, 'Ukraine':22, 'Sweden':19, 'Croatia':2, 'Portugal':14,
       'Switzerland':20, 'North Macedonia':12, 'Russia':15, 'Czech Republic':3,
       'Slovakia':17, 'Germany':8}

# Predictions Using Fifa Ranking Feature

# **Group Stages**

In [18]:
# Initialise the processing class
pe = MyProcess()

In [19]:
# Pass the fixture list to return the initial predictions for the group stages
pe.processing(fixtures)

Turkey  and  Italy
Winner:  Italy
Probability of  Turkey  winning:  0.303
Probability of Draw:  0.254
Probability of  Italy  winning:  0.443

Wales  and  Switzerland
Winner:  Wales
Probability of  Wales  winning:  0.587
Probability of Draw:  0.223
Probability of  Switzerland  winning:  0.190

Denmark  and  Finland
Winner:  Denmark
Probability of  Denmark  winning:  0.764
Probability of Draw:  0.170
Probability of  Finland  winning:  0.066

Belgium  and  Russia
Winner:  Belgium
Probability of  Belgium  winning:  0.734
Probability of Draw:  0.198
Probability of  Russia  winning:  0.068

England  and  Croatia
Winner:  England
Probability of  England  winning:  0.465
Probability of Draw:  0.292
Probability of  Croatia  winning:  0.243

Austria  and  North Macedonia
Winner:  Austria
Probability of  Austria  winning:  0.707
Probability of Draw:  0.206
Probability of  North Macedonia  winning:  0.088

Netherlands  and  Ukraine
Winner:  Netherlands
Probability of  Netherlands  winning:  0.598


In [20]:
# Creating a final table of teams and points based on the predictions
finalwinners=pd.DataFrame.from_dict(Counter(pe.wt), orient='index').reset_index()
finalwinners.columns=(['Team','Pred_Wins'])
finallosers=pd.DataFrame.from_dict(Counter(pe.lt), orient='index').reset_index()
finallosers.columns=(['Team','Pred_Loss'])

# Merging the predicted winners and losers
finalwin_lose=finalwinners.merge(finallosers,on='Team',how='outer')
finaltab=finalwin_lose.fillna(0) #Replace NaN values with 0

# calculating the points for the predicted games
finaltab['Points']=finaltab['Pred_Wins']*3 

In [21]:
# Euro 2021 Groups
groupA_teams = ["Italy", "Switzerland", "Turkey", "Wales"]
groupB_teams = ["Belgium", "Denmark", "Finland", "Russia"]
groupC_teams = ["Austria", "Netherlands", "North Macedonia", "Ukraine"]
groupD_teams = ["Croatia", "Czech Republic", "England", "Scotland"]
groupE_teams = ["Poland", "Slovakia", "Spain", "Sweden"]
groupF_teams = ["France", "Germany", "Hungary", "Portugal"]

In [22]:
# Sorting Groups by wins/points
groupA = finaltab.loc[finaltab["Team"].isin(groupA_teams)].sort_values(by=["Points"], ascending=False)
groupB = finaltab.loc[finaltab["Team"].isin(groupB_teams)].sort_values(by=["Points"], ascending=False)
groupC = finaltab.loc[finaltab["Team"].isin(groupC_teams)].sort_values(by=["Points"], ascending=False)
groupD = finaltab.loc[finaltab["Team"].isin(groupD_teams)].sort_values(by=["Points"], ascending=False)
groupE = finaltab.loc[finaltab["Team"].isin(groupE_teams)].sort_values(by=["Points"], ascending=False)
groupF = finaltab.loc[finaltab["Team"].isin(groupF_teams)].sort_values(by=["Points"], ascending=False)

**Group Stage Final Standings**

In [23]:
groupA

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Points
0,Italy,3.0,0.0,9.0
1,Wales,1.0,2.0,3.0
13,Turkey,1.0,2.0,3.0
17,Switzerland,1.0,2.0,3.0


In [24]:
groupB

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Points
3,Belgium,3.0,0.0,9.0
2,Denmark,2.0,1.0,6.0
12,Russia,1.0,2.0,3.0
19,Finland,0.0,3.0,0.0


In [25]:
groupC

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Points
6,Netherlands,3.0,0.0,9.0
14,Ukraine,2.0,1.0,6.0
5,Austria,1.0,2.0,3.0
20,North Macedonia,0.0,3.0,0.0


In [26]:
groupD

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Points
4,England,3.0,0.0,9.0
16,Croatia,2.0,1.0,6.0
7,Czech Republic,1.0,2.0,3.0
21,Scotland,0.0,3.0,0.0


In [27]:
groupE

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Points
9,Spain,3.0,0.0,9.0
15,Sweden,2.0,1.0,6.0
8,Poland,1.0,2.0,3.0
22,Slovakia,0.0,3.0,0.0


In [28]:
groupF

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Points
10,Portugal,3.0,0.0,9.0
11,France,2.0,1.0,6.0
18,Germany,1.0,2.0,3.0
23,Hungary,0.0,3.0,0.0


# **Round of 16**

In [29]:
# Matches based on the group stage results 
d = {'home_team': ["Wales", "Italy", "Netherlands", "Belgium", "Croatia", "Portugal", "England", "Spain"], 'away_team': ["Denmark", "Ukraine", "Czech Republic", "Turkey", "Sweden", "Russia", "France", "Austria"]}

group_16 = pd.DataFrame(data=d)

In [30]:
group_16

Unnamed: 0,home_team,away_team
0,Wales,Denmark
1,Italy,Ukraine
2,Netherlands,Czech Republic
3,Belgium,Turkey
4,Croatia,Sweden
5,Portugal,Russia
6,England,France
7,Spain,Austria


In [31]:
pe.processing(group_16)

Wales  and  Denmark
Winner:  Wales
Probability of  Wales  winning:  0.458
Probability of Draw:  0.237
Probability of  Denmark  winning:  0.305

Italy  and  Ukraine
Winner:  Italy
Probability of  Italy  winning:  0.679
Probability of Draw:  0.217
Probability of  Ukraine  winning:  0.103

Netherlands  and  Czech Republic
Winner:  Netherlands
Probability of  Netherlands  winning:  0.646
Probability of Draw:  0.212
Probability of  Czech Republic  winning:  0.142

Belgium  and  Turkey
Winner:  Belgium
Probability of  Belgium  winning:  0.692
Probability of Draw:  0.226
Probability of  Turkey  winning:  0.082

Croatia  and  Sweden
Winner:  Croatia
Probability of  Croatia  winning:  0.457
Probability of Draw:  0.316
Probability of  Sweden  winning:  0.227

Portugal  and  Russia
Winner:  Portugal
Probability of  Portugal  winning:  0.798
Probability of Draw:  0.146
Probability of  Russia  winning:  0.056

England  and  France
Winner:  England
Probability of  England  winning:  0.376
Probabilit

# **Quarter Finals**

In [32]:
# Matches based on the initial fixture list
d = {'home_team': ["Wales", "Italy", "Netherlands", "Belgium"], 'away_team': ["Croatia", "Portugal", "England", "Spain"]}

quarters = pd.DataFrame(data=d)

In [33]:
quarters

Unnamed: 0,home_team,away_team
0,Wales,Croatia
1,Italy,Portugal
2,Netherlands,England
3,Belgium,Spain


In [34]:
pe.processing(quarters)

Wales  and  Croatia
Winner:  Wales
Probability of  Wales  winning:  0.488
Probability of Draw:  0.229
Probability of  Croatia  winning:  0.283

Italy  and  Portugal
Winner:  Italy
Probability of  Italy  winning:  0.460
Probability of Draw:  0.292
Probability of  Portugal  winning:  0.249

Netherlands  and  England
Winner:  England
Probability of  Netherlands  winning:  0.304
Probability of Draw:  0.290
Probability of  England  winning:  0.405

Belgium  and  Spain
Winner:  Belgium
Probability of  Belgium  winning:  0.471
Probability of Draw:  0.323
Probability of  Spain  winning:  0.206



# **Semi Finals**

In [35]:
n = {'home_team': ["Wales", "Italy"], 'away_team': ["England", "Belgium"]}

semis = pd.DataFrame(data=n)

In [36]:
semis

Unnamed: 0,home_team,away_team
0,Wales,England
1,Italy,Belgium


In [37]:
pe.processing(semis)

Wales  and  England
Winner:  Wales
Probability of  Wales  winning:  0.402
Probability of Draw:  0.246
Probability of  England  winning:  0.353

Italy  and  Belgium
Winner:  Belgium
Probability of  Italy  winning:  0.344
Probability of Draw:  0.294
Probability of  Belgium  winning:  0.362



# **Final**

In [38]:
n = {'home_team': ["Wales"], 'away_team': ["Belgium"]}

final = pd.DataFrame(data=n)

In [39]:
final

Unnamed: 0,home_team,away_team
0,Wales,Belgium


In [40]:
pe.processing(final)

Wales  and  Belgium
Winner:  Belgium
Probability of  Wales  winning:  0.346
Probability of Draw:  0.245
Probability of  Belgium  winning:  0.409



Belgium is the predicted winner of the Euros 2021!