In [788]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_validate, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, mean_absolute_error, f1_score, mean_squared_error, recall_score, accuracy_score, log_loss, precision_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from collections import Counter

# Processing Euro2021 Fixtures

In [789]:
# Load the Euro 2020 fixtures CSV from the notebook's working directory.
fixtures_csv = "./uefa-euro-2020-GMTStandardTime.csv"
fixtures = pd.read_csv(fixtures_csv)

In [790]:
# Reduce the fixture list to the three columns used downstream and keep only
# the first 36 rows (the rows the group-stage prediction cells iterate over).
remove_feats = ["Location", "Group", "Match Number", "Round Number", "Date"]

# `columns=` instead of a positional axis: pandas 2.0 removed the positional form.
fixtures = fixtures.drop(columns=remove_feats)

fixtures.columns = ["home_team", "away_team", "result"]

# Slice `fixtures` itself. The original sliced an undefined `df`, which only
# worked when a stale variable was left over in the kernel.
fixtures = fixtures[:36]

# Processing International Results

In [791]:
# Load the historical international match results from the working directory.
international_results_csv = "./international_results.csv"
international_results = pd.read_csv(international_results_csv)

In [792]:
# The 24 team names used to filter the historical results.
teams = [
    'Turkey', 'Wales', 'Denmark', 'Belgium',
    'England', 'Austria', 'Netherlands', 'Scotland',
    'Poland', 'Spain', 'Hungary', 'France',
    'Finland', 'Italy', 'Ukraine', 'Sweden',
    'Croatia', 'Portugal', 'Switzerland', 'North Macedonia',
    'Russia', 'Czech Republic', 'Slovakia', 'Germany',
]

In [793]:
# Keep only the columns needed for modelling (teams and scores).
remove_feats = ["city", "country", "neutral", "tournament", "date"]
# `columns=` instead of a positional axis: pandas 2.0 removed the positional form.
international_results = international_results.drop(columns=remove_feats)

# Taking all results from after the end of the 2018 world cup (start of a new international football cycle)
# NOTE(review): 36794 is a row offset tied to one specific version of the CSV - confirm
# it still marks the end of the 2018 World Cup if the data file is refreshed.
international_results = international_results[36794:]

# Selecting games in which BOTH sides are Euro2021 teams. The `&` is required:
# the one-hot columns built later must only contain the 24 tournament teams.
both_sides_in_euros = (
    international_results.home_team.isin(teams)
    & international_results.away_team.isin(teams)
)
# .copy() so later column assignments act on an independent frame instead of
# a view of the unfiltered data (avoids SettingWithCopyWarning).
international_results = international_results[both_sides_in_euros].copy()

In [794]:
# Adding new result column
def _match_outcome(home_goals, away_goals):
    """Return "H", "A" or "D" for a home win, away win or draw.

    Raises:
        ValueError: if the two scores cannot be compared (e.g. a missing score).
            The original loop skipped such rows silently, which then surfaced
            as an unrelated length-mismatch error on assignment.
    """
    if home_goals > away_goals:
        return "H"
    if away_goals > home_goals:
        return "A"
    if home_goals == away_goals:
        return "D"
    raise ValueError(f"Cannot derive a result from scores {home_goals!r} and {away_goals!r}")


# Adding new result column
results = [
    _match_outcome(x, y)
    for x, y in zip(international_results["home_score"], international_results["away_score"])
]

# assign() returns a new frame, so this never writes into a slice of the
# unfiltered data (no SettingWithCopyWarning).
international_results = international_results.assign(result=results)

In [795]:
# The raw scores are no longer needed once the result label exists.
# `columns=` instead of a positional axis: pandas 2.0 removed the positional form.
international_results = international_results.drop(columns=["home_score", "away_score"])

In [796]:
# Encode the full-time result as an ordered number:
#   "A" (away team wins) -> 0, "D" (draw) -> 1, "H" (home team wins) -> 2
result_categories = ["A", "D", "H"]
encoder = OrdinalEncoder(categories=[result_categories])

# Results feature: replace the letter codes with their ordinal values.
encoded_result = encoder.fit_transform(international_results[["result"]])
international_results[["result"]] = encoded_result

In [797]:
# One-hot encode both team columns; each indicator column is named
# "<home_team|away_team>_<team name>".
team_columns = ['home_team', 'away_team']
final_df = pd.get_dummies(international_results, prefix=team_columns, columns=team_columns)

In [798]:
# Labels and features.
# The ordinal encoder produces floats (0.0 / 1.0 / 2.0); cast to integer class
# labels explicitly. The original `replace(..., inplace=True)` left the dtype
# unchanged and mutated a column selection of `final_df` in place.
labels = final_df["result"].astype(int)
X = final_df.drop(["result"], axis=1)


# Train test split: 30% held out, fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

In [799]:
# Fit a logistic regression with scikit-learn's default settings
# (fit() returns the estimator itself, so the call can be chained).
model = LogisticRegression().fit(x_train, y_train)

# Score the model on the data it was trained on.
pred = model.predict(x_train)
train_f1 = f1_score(y_train, pred, average='weighted')
train_accuracy = accuracy_score(y_train, pred)
train_precision = precision_score(y_train, pred, average='weighted')

print("Train Set: ")
print(f"F1:  {train_f1:.3f}")
print(f'Accuracy: {train_accuracy:.3f}')
print(f"Precision : {train_precision:.3f}")

Train Set: 
F1:  0.628
Accuracy: 0.643
Precision : 0.637


In [800]:
# Score the fitted model on the held-out split.
pred = model.predict(x_test)
test_f1 = f1_score(y_test, pred, average='weighted')
test_accuracy = accuracy_score(y_test, pred)
test_precision = precision_score(y_test, pred, average='weighted')

print("Test Set: ")
print(f"F1:  {test_f1:.3f}")
print(f'Accuracy: {test_accuracy:.3f}')
print(f"Precision : {test_precision:.3f}")

Test Set: 
F1:  0.420
Accuracy: 0.442
Precision : 0.432


# Predictions

Group Stages

In [801]:
# One-hot encode the group-stage fixtures the same way as the training data.
pred_set = pd.get_dummies(fixtures, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Align with the training matrix: same columns in the same order. This also
# drops 'result' (not a training column) and fills any indicator column that
# is absent from the fixtures with 0, so the model always receives exactly
# the features it was fitted on.
pred_set = pred_set.reindex(columns=X.columns, fill_value=0)
pred_set.head()

Unnamed: 0,home_team_Austria,home_team_Belgium,home_team_Croatia,home_team_Czech Republic,home_team_Denmark,home_team_England,home_team_Finland,home_team_France,home_team_Germany,home_team_Hungary,...,away_team_Portugal,away_team_Russia,away_team_Scotland,away_team_Slovakia,away_team_Spain,away_team_Sweden,away_team_Switzerland,away_team_Turkey,away_team_Ukraine,away_team_Wales
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [802]:
#Applying the model and recording the win,loss and draws for future schedules
wt=[]
lt=[]
dt=[]
predictions = model.predict(pred_set)
for i in range(len(fixtures)):
    print(fixtures.iloc[i, 0] + " and " +  fixtures.iloc[i, 1])
    if predictions[i] == 2:
        print("Winner: " + fixtures.iloc[i, 0])
        wt.append(fixtures.iloc[i, 0])
        lt.append(fixtures.iloc[i, 1])
    elif predictions[i] == 1:
        print("Draw")
        dt.append(fixtures.iloc[i, 0])
        dt.append(fixtures.iloc[i, 1])
    elif predictions[i] == 0:
        print("Winner: " + fixtures.iloc[i, 1])
        wt.append(fixtures.iloc[i, 0])
        lt.append(fixtures.iloc[i, 1])
   
    print('Probability of ' + fixtures.iloc[i, 0] + ' winning: ', '%.3f'%(model.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: ', '%.3f'%(model.predict_proba(pred_set)[i][1]))
    print('Probability of ' + fixtures.iloc[i, 1] + ' winning: ', '%.3f'%(model.predict_proba(pred_set)[i][0]))
    print("")

Turkey and Italy
Winner: Italy
Probability of Turkey winning:  0.214
Probability of Draw:  0.306
Probability of Italy winning:  0.480

Wales and Switzerland
Winner: Wales
Probability of Wales winning:  0.794
Probability of Draw:  0.063
Probability of Switzerland winning:  0.142

Denmark and Finland
Winner: Denmark
Probability of Denmark winning:  0.439
Probability of Draw:  0.414
Probability of Finland winning:  0.147

Belgium and Russia
Winner: Belgium
Probability of Belgium winning:  0.682
Probability of Draw:  0.208
Probability of Russia winning:  0.110

England and Croatia
Winner: England
Probability of England winning:  0.433
Probability of Draw:  0.258
Probability of Croatia winning:  0.309

Austria and North Macedonia
Winner: Austria
Probability of Austria winning:  0.362
Probability of Draw:  0.296
Probability of North Macedonia winning:  0.342

Netherlands and Ukraine
Winner: Netherlands
Probability of Netherlands winning:  0.436
Probability of Draw:  0.229
Probability of Ukra

In [803]:
# Membership of the six groups, used to split the points table per group.
(
    groupA_teams,
    groupB_teams,
    groupC_teams,
    groupD_teams,
    groupE_teams,
    groupF_teams,
) = (
    ["Italy", "Switzerland", "Turkey", "Wales"],
    ["Belgium", "Denmark", "Finland", "Russia"],
    ["Austria", "Netherlands", "North Macedonia", "Ukraine"],
    ["Croatia", "Czech Republic", "England", "Scotland"],
    ["Poland", "Slovakia", "Spain", "Sweden"],
    ["France", "Germany", "Hungary", "Portugal"],
)

In [804]:
# Creating a final table of teams and points based on the predictions

def _tally(names, column):
    """Count how often each team name occurs in ``names``.

    Args:
        names: list of team names, one entry per predicted win/loss/draw.
        column: name of the count column in the returned frame.

    Returns:
        DataFrame with columns ``Team`` and ``column``, one row per distinct
        team in first-seen order. An empty ``names`` yields an empty frame
        that still has both columns, so the merges below keep working when
        the model predicts, for example, no draws at all.
    """
    counts = Counter(names)
    return pd.DataFrame({
        "Team": pd.Series(list(counts.keys()), dtype="object"),
        column: pd.Series(list(counts.values()), dtype="int64"),
    })


# Creating a final table of teams and points based on the predictions
finalwt = _tally(wt, 'Pred_Wins')
finall = _tally(lt, 'Pred_Loss')
finald = _tally(dt, 'Pred_Draw')

# Merging the predicted wins, losses and draws; outer joins keep teams that
# are missing from one of the tallies.
finalwl = finalwt.merge(finall, on='Team', how='outer')
finaltab = finalwl.merge(finald, on='Team', how='outer')
finaltab = finaltab.fillna(0)  # a team absent from a tally has a count of 0

# 3 points per predicted win, 1 per predicted draw.
finaltab['Points'] = finaltab['Pred_Wins'] * 3 + finaltab['Pred_Draw'] * 1

In [805]:
def _group_table(team_names):
    """Return the rows of ``finaltab`` for ``team_names``, highest points first."""
    in_group = finaltab["Team"].isin(team_names)
    return finaltab.loc[in_group].sort_values(by=["Points"], ascending=False)


groupA = _group_table(groupA_teams)
groupB = _group_table(groupB_teams)
groupC = _group_table(groupC_teams)
groupD = _group_table(groupD_teams)
groupE = _group_table(groupE_teams)
groupF = _group_table(groupF_teams)

In [806]:
# Predicted Group A standings, ordered by points (highest first).
groupA

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Pred_Draw,Points
0,Turkey,2.0,0.0,1.0,7.0
13,Italy,2.0,1.0,0.0,6.0
1,Wales,1.0,2.0,0.0,3.0
22,Switzerland,0.0,2.0,1.0,1.0


In [807]:
# Predicted Group B standings, ordered by points (highest first).
groupB

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Pred_Draw,Points
2,Denmark,2.0,1.0,0.0,6.0
12,Finland,2.0,1.0,0.0,6.0
3,Belgium,1.0,2.0,0.0,3.0
19,Russia,1.0,2.0,0.0,3.0


In [808]:
# Predicted Group C standings, ordered by points (highest first).
groupC

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Pred_Draw,Points
6,Netherlands,2.0,1.0,0.0,6.0
14,Ukraine,2.0,1.0,0.0,6.0
5,Austria,1.0,2.0,0.0,3.0
18,North Macedonia,1.0,2.0,0.0,3.0


In [809]:
# Predicted Group D standings, ordered by points (highest first).
groupD

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Pred_Draw,Points
4,England,2.0,1.0,0.0,6.0
16,Croatia,2.0,1.0,0.0,6.0
7,Scotland,1.0,2.0,0.0,3.0
20,Czech Republic,1.0,2.0,0.0,3.0


In [810]:
# Predicted Group E standings, ordered by points (highest first).
groupE

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Pred_Draw,Points
9,Spain,2.0,0.0,1.0,7.0
15,Sweden,2.0,1.0,0.0,6.0
8,Poland,1.0,2.0,0.0,3.0
23,Slovakia,0.0,2.0,1.0,1.0


In [811]:
# Predicted Group F standings, ordered by points (highest first).
groupF

Unnamed: 0,Team,Pred_Wins,Pred_Loss,Pred_Draw,Points
10,Hungary,2.0,1.0,0.0,6.0
17,Portugal,2.0,1.0,0.0,6.0
11,France,1.0,2.0,0.0,3.0
21,Germany,1.0,2.0,0.0,3.0


**Round of 16**

In [812]:
# Round-of-16 pairings, entered by hand; position k of each list forms tie k.
# NOTE(review): presumably copied from a previous run's group tables - re-check
# after any change that alters the group-stage predictions.
round_of_16_home = ["Italy", "Turkey", "Netherlands", "Denmark", "Croatia", "Hungary", "England", "Sweden"]
round_of_16_away = ["Finland", "Ukraine", "Scotland", "Wales", "Spain", "Belgium", "Portugal", "Austria"]

# 'result' is unknown for future matches, so it is filled with NaN.
d = {'home_team': round_of_16_home, 'away_team': round_of_16_away, 'result': np.nan}

knockouts = pd.DataFrame(data=d)

In [813]:
# Start from a copy of the group-stage fixtures; the round-of-16 ties are
# appended to it before one-hot encoding.
group_16 = fixtures.copy()

In [814]:
# Stack the round-of-16 ties underneath the group-stage fixtures.
# DataFrame.append was removed in pandas 2.0; pd.concat(ignore_index=True) is
# the equivalent of append + reset_index(drop=True). Building from `fixtures`
# keeps the cell idempotent: re-running it no longer appends the ties twice.
group_16 = pd.concat([fixtures, knockouts], ignore_index=True)

In [815]:
# One-hot encode group fixtures + round-of-16 ties the same way as the training data.
pred_set = pd.get_dummies(group_16, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Align with the training matrix: same columns in the same order. This also
# drops 'result' (not a training column) and fills missing indicators with 0.
pred_set = pred_set.reindex(columns=X.columns, fill_value=0)
pred_set.head()

Unnamed: 0,home_team_Austria,home_team_Belgium,home_team_Croatia,home_team_Czech Republic,home_team_Denmark,home_team_England,home_team_Finland,home_team_France,home_team_Germany,home_team_Hungary,...,away_team_Portugal,away_team_Russia,away_team_Scotland,away_team_Slovakia,away_team_Spain,away_team_Sweden,away_team_Switzerland,away_team_Turkey,away_team_Ukraine,away_team_Wales
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [816]:
#Applying the model and recording the win,loss and draws for future schedules
wt=[]
lt=[]
dt=[]
predictions = model.predict_proba(pred_set)
predictions = predictions[36:]
group_16 = group_16[36:]

for i in range(len(group_16)):
    print(group_16.iloc[i, 0] + " and " +  group_16.iloc[i, 1])
    
    if predictions[i][0] > predictions[i][2]:
        print("Winner: " + group_16.iloc[i, 1])
        wt.append(group_16.iloc[i, 1])
        lt.append(group_16.iloc[i, 0])
    
    elif predictions[i][0] < predictions[i][2]:
        print("Winner: " + group_16.iloc[i, 0])
        wt.append(group_16.iloc[i, 0])
        lt.append(group_16.iloc[i, 1])

    print('Probability of ' + group_16.iloc[i, 0] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][2]))
    print('Probability of Draw: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][1]))
    print('Probability of ' + group_16.iloc[i, 1] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][0]))
    print("")

Italy and Finland
Winner: Italy
Probability of Italy winning:  0.449
Probability of Draw:  0.467
Probability of Finland winning:  0.085

Turkey and Ukraine
Winner: Ukraine
Probability of Turkey winning:  0.307
Probability of Draw:  0.367
Probability of Ukraine winning:  0.327

Netherlands and Scotland
Winner: Netherlands
Probability of Netherlands winning:  0.664
Probability of Draw:  0.075
Probability of Scotland winning:  0.261

Denmark and Wales
Winner: Denmark
Probability of Denmark winning:  0.547
Probability of Draw:  0.262
Probability of Wales winning:  0.191

Croatia and Spain
Winner: Croatia
Probability of Croatia winning:  0.295
Probability of Draw:  0.561
Probability of Spain winning:  0.144

Hungary and Belgium
Winner: Belgium
Probability of Hungary winning:  0.200
Probability of Draw:  0.125
Probability of Belgium winning:  0.675

England and Portugal
Winner: Portugal
Probability of England winning:  0.332
Probability of Draw:  0.294
Probability of Portugal winning:  0.374

**Quarter Finals**

In [817]:
# Start from a copy of the group-stage fixtures; the quarter-final ties are
# appended to it before one-hot encoding.
quarters = fixtures.copy()

In [818]:
# Quarter-final pairings, entered by hand; position k of each list forms tie k.
quarter_home = ["Italy", "Ukraine", "Netherlands", "Denmark"]
quarter_away = ["Croatia", "Belgium", "Portugal", "Sweden"]

# 'result' is unknown for future matches, so it is filled with NaN.
n = {'home_team': quarter_home, 'away_team': quarter_away, 'result': np.nan}

quarter_teams = pd.DataFrame(data=n)

In [819]:
# Stack the quarter-final ties underneath the group-stage fixtures.
# DataFrame.append was removed in pandas 2.0; pd.concat(ignore_index=True) is
# the equivalent of append + reset_index(drop=True). Building from `fixtures`
# keeps the cell idempotent: re-running it no longer appends the ties twice.
quarters = pd.concat([fixtures, quarter_teams], ignore_index=True)

In [820]:
# One-hot encode group fixtures + quarter-final ties the same way as the training data.
pred_set = pd.get_dummies(quarters, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Align with the training matrix: same columns in the same order. This also
# drops 'result' (not a training column) and fills missing indicators with 0.
pred_set = pred_set.reindex(columns=X.columns, fill_value=0)
pred_set.head()

Unnamed: 0,home_team_Austria,home_team_Belgium,home_team_Croatia,home_team_Czech Republic,home_team_Denmark,home_team_England,home_team_Finland,home_team_France,home_team_Germany,home_team_Hungary,...,away_team_Portugal,away_team_Russia,away_team_Scotland,away_team_Slovakia,away_team_Spain,away_team_Sweden,away_team_Switzerland,away_team_Turkey,away_team_Ukraine,away_team_Wales
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [821]:
#Applying the model and recording the win,loss and draws for future schedules
wt=[]
lt=[]
dt=[]
predictions = model.predict_proba(pred_set)
predictions = predictions[36:]
quarters = quarters[36:]

for i in range(len(quarters)):
    print(quarters.iloc[i, 0] + " and " +  quarters.iloc[i, 1])
    
    if predictions[i][0] > predictions[i][2]:
        print("Winner: " + quarters.iloc[i, 1])
        wt.append(quarters.iloc[i, 1])
        lt.append(quarters.iloc[i, 0])
    
    elif predictions[i][0] < predictions[i][2]:
        print("Winner: " + quarters.iloc[i, 0])
        wt.append(quarters.iloc[i, 0])
        lt.append(quarters.iloc[i, 1])

    print('Probability of ' + quarters.iloc[i, 0] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][2]))
    print('Probability of Draw: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][1]))
    print('Probability of ' + quarters.iloc[i, 1] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][0]))
    print("")

Italy and Croatia
Winner: Italy
Probability of Italy winning:  0.323
Probability of Draw:  0.455
Probability of Croatia winning:  0.222

Ukraine and Belgium
Winner: Belgium
Probability of Ukraine winning:  0.358
Probability of Draw:  0.097
Probability of Belgium winning:  0.545

Netherlands and Portugal
Winner: Portugal
Probability of Netherlands winning:  0.235
Probability of Draw:  0.263
Probability of Portugal winning:  0.502

Denmark and Sweden
Winner: Denmark
Probability of Denmark winning:  0.464
Probability of Draw:  0.223
Probability of Sweden winning:  0.313



In [822]:
# Semi-final pairings, entered by hand; position k of each list forms tie k.
semi_home = ["Italy", "Belgium"]
semi_away = ["Portugal", "Denmark"]

# 'result' is unknown for future matches, so it is filled with NaN.
n = {'home_team': semi_home, 'away_team': semi_away, 'result': np.nan}

semi_teams = pd.DataFrame(data=n)

In [823]:
# Start from a copy of the group-stage fixtures; the semi-final ties are
# appended to it before one-hot encoding.
semis = fixtures.copy()

In [824]:
# Stack the semi-final ties underneath the group-stage fixtures.
# DataFrame.append was removed in pandas 2.0; pd.concat(ignore_index=True) is
# the equivalent of append + reset_index(drop=True). Building from `fixtures`
# keeps the cell idempotent: re-running it no longer appends the ties twice.
semis = pd.concat([fixtures, semi_teams], ignore_index=True)

In [825]:
# One-hot encode group fixtures + semi-final ties the same way as the training data.
pred_set = pd.get_dummies(semis, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Align with the training matrix: same columns in the same order. This also
# drops 'result' (not a training column) and fills missing indicators with 0.
pred_set = pred_set.reindex(columns=X.columns, fill_value=0)
pred_set.head()

Unnamed: 0,home_team_Austria,home_team_Belgium,home_team_Croatia,home_team_Czech Republic,home_team_Denmark,home_team_England,home_team_Finland,home_team_France,home_team_Germany,home_team_Hungary,...,away_team_Portugal,away_team_Russia,away_team_Scotland,away_team_Slovakia,away_team_Spain,away_team_Sweden,away_team_Switzerland,away_team_Turkey,away_team_Ukraine,away_team_Wales
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [826]:
#Applying the model and recording the win,loss and draws for future schedules
wt=[]
lt=[]
dt=[]
predictions = model.predict_proba(pred_set)
predictions = predictions[36:]
semis = semis[36:]

for i in range(len(semis)):
    print(semis.iloc[i, 0] + " and " +  semis.iloc[i, 1])
    
    if predictions[i][0] > predictions[i][2]:
        print("Winner: " + semis.iloc[i, 1])
        wt.append(semis.iloc[i, 1])
        lt.append(semis.iloc[i, 0])
    
    elif predictions[i][0] < predictions[i][2]:
        print("Winner: " + semis.iloc[i, 0])
        wt.append(semis.iloc[i, 0])
        lt.append(semis.iloc[i, 1])

    print('Probability of ' + semis.iloc[i, 0] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][2]))
    print('Probability of Draw: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][1]))
    print('Probability of ' + semis.iloc[i, 1] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][0]))
    print("")

Italy and Portugal
Winner: Portugal
Probability of Italy winning:  0.240
Probability of Draw:  0.501
Probability of Portugal winning:  0.260

Belgium and Denmark
Winner: Belgium
Probability of Belgium winning:  0.674
Probability of Draw:  0.209
Probability of Denmark winning:  0.117



**Final**

In [827]:
# The final pairing, entered by hand.
final_home = ["Belgium"]
final_away = ["Portugal"]

# 'result' is unknown for future matches, so it is filled with NaN.
n = {'home_team': final_home, 'away_team': final_away, 'result': np.nan}

final_teams = pd.DataFrame(data=n)

In [828]:
# Start from a copy of the group-stage fixtures; the final is appended to it
# before one-hot encoding.
final = fixtures.copy()

In [829]:
# Stack the final underneath the group-stage fixtures.
# DataFrame.append was removed in pandas 2.0; pd.concat(ignore_index=True) is
# the equivalent of append + reset_index(drop=True). Building from `fixtures`
# keeps the cell idempotent: re-running it no longer appends the tie twice.
final = pd.concat([fixtures, final_teams], ignore_index=True)

In [830]:
# One-hot encode group fixtures + the final the same way as the training data.
pred_set = pd.get_dummies(final, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Align with the training matrix: same columns in the same order. This also
# drops 'result' (not a training column) and fills missing indicators with 0.
pred_set = pred_set.reindex(columns=X.columns, fill_value=0)
pred_set.head()

Unnamed: 0,home_team_Austria,home_team_Belgium,home_team_Croatia,home_team_Czech Republic,home_team_Denmark,home_team_England,home_team_Finland,home_team_France,home_team_Germany,home_team_Hungary,...,away_team_Portugal,away_team_Russia,away_team_Scotland,away_team_Slovakia,away_team_Spain,away_team_Sweden,away_team_Switzerland,away_team_Turkey,away_team_Ukraine,away_team_Wales
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [831]:
#Applying the model and recording the win,loss and draws for future schedules
wt=[]
lt=[]
dt=[]
predictions = model.predict_proba(pred_set)
predictions = predictions[36:]
final = final[36:]

for i in range(len(final)):
    print(final.iloc[i, 0] + " and " +  final.iloc[i, 1])
    
    if predictions[i][0] > predictions[i][2]:
        print("Winner: " + final.iloc[i, 1])
        wt.append(final.iloc[i, 1])
        lt.append(final.iloc[i, 0])
    
    elif predictions[i][0] < predictions[i][2]:
        print("Winner: " + final.iloc[i, 0])
        wt.append(final.iloc[i, 0])
        lt.append(final.iloc[i, 1])

    print('Probability of ' + final.iloc[i, 0] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][2]))
    print('Probability of Draw: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][1]))
    print('Probability of ' + final.iloc[i, 1] + ' winning: ', '%.3f'%(model.predict_proba(pred_set[36:])[i][0]))
    print("")

Belgium and Portugal
Winner: Belgium
Probability of Belgium winning:  0.463
Probability of Draw:  0.277
Probability of Portugal winning:  0.259



Predicted winners of Euro 2021: Belgium