In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio


In [19]:
df = pd.read_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\international_matches_FINAL.csv')

In [20]:
df.head()

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,2004-09-03,Spain,Scotland,Europe,Europe,3,67,788,535,1,...,No,Draw,94.0,84.0,86.5,89.3,89.5,80.2,79.7,81.8
1,2004-09-04,Austria,England,Europe,Europe,90,7,488,732,2,...,No,Draw,83.0,88.0,76.2,73.0,74.0,90.5,88.7,91.2
2,2004-09-04,Croatia,Hungary,Europe,Europe,25,76,662,523,3,...,No,Win,77.0,74.0,80.5,78.7,79.0,71.8,75.7,70.2
3,2004-09-04,Iceland,Bulgaria,Europe,Europe,80,41,514,595,1,...,No,Lose,78.0,78.0,68.8,77.0,69.2,70.5,79.7,78.5
4,2004-09-04,Italy,Norway,Europe,Europe,9,38,722,610,2,...,No,Win,97.0,79.0,91.8,92.3,87.5,79.2,81.3,79.0


In [21]:
df.shape

(4303, 25)

## EDA
### removing variables

In [18]:
df.columns

Index(['date', 'home_team', 'away_team', 'home_team_continent',
       'away_team_continent', 'home_team_fifa_rank', 'away_team_fifa_rank',
       'home_team_total_fifa_points', 'away_team_total_fifa_points',
       'home_team_score', 'away_team_score', 'tournament', 'city', 'country',
       'neutral_location', 'shoot_out', 'home_team_result',
       'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
       'home_team_mean_defense_score', 'home_team_mean_offense_score',
       'home_team_mean_midfield_score', 'away_team_mean_defense_score',
       'away_team_mean_offense_score', 'away_team_mean_midfield_score'],
      dtype='object')

In [24]:
# variance of shoot_out column
# make shoot_out column binary
df['shoot_out'] = df['shoot_out'].apply(lambda x: 1 if x == 'yes' else 0)
df['shoot_out'].var() # no variance so we can drop this column

0.0

In [25]:
df["home_team_result"].unique() # target variable

array(['Draw', 'Win', 'Lose'], dtype=object)

In [26]:
df['tournament'].nunique()

31

In [27]:
df['neutral_location'].unique()

array([False,  True])

In [28]:
df['city'].nunique(), df['country'].nunique()

(623, 103)

In [29]:
# remove variables that may cause leakage 
# remove variables that are not useful for modelling
df.drop(['home_team_score', # leakage
        'away_team_score', # leakage
        'date', # not useful for modelling
        'shoot_out', # no variance
        'city', # too many unique values
        'country', # not useful for this problem
        'neutral_location', # not useful for this problem
        'tournament'], # not useful for modelling
        inplace=True, axis=1)

In [30]:
df.shape

(4303, 17)

In [31]:
df.head()

Unnamed: 0,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,Spain,Scotland,Europe,Europe,3,67,788,535,Draw,94.0,84.0,86.5,89.3,89.5,80.2,79.7,81.8
1,Austria,England,Europe,Europe,90,7,488,732,Draw,83.0,88.0,76.2,73.0,74.0,90.5,88.7,91.2
2,Croatia,Hungary,Europe,Europe,25,76,662,523,Win,77.0,74.0,80.5,78.7,79.0,71.8,75.7,70.2
3,Iceland,Bulgaria,Europe,Europe,80,41,514,595,Lose,78.0,78.0,68.8,77.0,69.2,70.5,79.7,78.5
4,Italy,Norway,Europe,Europe,9,38,722,610,Win,97.0,79.0,91.8,92.3,87.5,79.2,81.3,79.0


### train-test-split

In [32]:
# split into X,y
y = df['home_team_result']
y = y.map({'Win': 2, 'Draw': 1, 'Lose': 0})
X = df.drop('home_team_result', axis=1)

# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [33]:
# categorical variables in X
cat_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

# numerical variables in X
num_columns = [col for col in X_train.columns if X_train[col].dtype != 'object']

### plots

In [34]:
train = pd.concat([X_train, y_train], axis=1)

In [35]:
train[num_columns].head()

Unnamed: 0,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
3276,29,47,1483,1418,78.0,82.0,77.0,75.3,80.2,77.0,75.7,76.5
386,42,8,685,1291,70.0,88.0,71.2,79.0,73.8,85.5,85.3,86.0
847,68,57,481,535,77.0,80.0,72.2,73.7,76.2,75.8,71.7,69.8
2918,27,33,843,778,79.0,79.0,76.8,74.3,77.8,79.2,76.7,80.0
1839,20,88,879,413,78.0,67.0,76.0,80.7,75.0,68.0,73.3,68.5


In [36]:
# plotly histogram of home_team_fifa_rank
fig = px.histogram(train, x='home_team_fifa_rank', 
                    color='home_team_result',
                    # show percentages
                    histnorm='percent',
                    )
fig.show()

In [37]:
# home_team_total_fifa_points
fig = px.histogram(train, x='home_team_total_fifa_points', 
                    color='home_team_result',
                    histnorm='percent')
fig.show()

In [29]:
# home_team_goalkeeper_score
fig = px.histogram(train, x='home_team_goalkeeper_score', 
                    color='home_team_result',
                    histnorm='percent',
                    #histnorm='probability density'
                    )
fig.show()

In [30]:
# away_team_goalkeeper_score
fig = px.histogram(train, x='away_team_goalkeeper_score', 
                    color='home_team_result',
                    histnorm='percent')
fig.show()

In [31]:
# home_team_defence_score
fig = px.histogram(train, x='home_team_mean_defense_score', color='home_team_result')
fig.show()

### get dummies

In [32]:
# one hot encode categorical variables
X_train = pd.get_dummies(X_train, columns=cat_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cat_columns, drop_first=True)

# check if there are any missing columns in X_test and X_train
missing_in_test = set(X_train.columns) - set(X_test.columns)
for c in missing_in_test:
    X_test[c] = 0
missing_in_train = set(X_test.columns) - set(X_train.columns)
for c in missing_in_train:
    X_train[c] = 0

# ensure columns are in the same order
X_test = X_test[X_train.columns]

In [33]:
X_train.shape

(2041, 191)

In [34]:
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)
X.to_csv('X.csv', index=False)
y.to_csv('y.csv', index=False)

In [35]:
y_binary = y.map({2: 1, 1: 0, 0: 0})

In [101]:
y_binary.value_counts()

0    1581
1    1335
Name: home_team_result, dtype: int64

In [102]:
y_binary.to_csv('y_binary.csv', index=False)

## Data wrangling group stages

In [66]:
group_df = pd.read_csv("fifa_worldcup_2022_groupstages.csv")

In [67]:
group_df.columns

Index(['date', 'home_team', 'away_team', 'home_team_continent',
       'away_team_continent', 'home_team_fifa_rank', 'away_team_fifa_rank',
       'home_team_total_fifa_points', 'away_team_total_fifa_points',
       'tournament', 'city', 'country', 'neutral_location',
       'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
       'home_team_mean_defense_score', 'home_team_mean_offense_score',
       'home_team_mean_midfield_score', 'away_team_mean_defense_score',
       'away_team_mean_offense_score', 'away_team_mean_midfield_score',
       'group'],
      dtype='object')

In [68]:
# remove variables that may cause leakage 
# remove variables that are not useful for modelling
group_df.drop(['date', # not useful for modelling
                'city', # too many unique values
                'country', # not useful for this problem
                'neutral_location', # not useful for this problem
                'tournament', # not useful for modelling
                'group'], # not useful for modelling
                inplace=True, axis=1)

In [69]:
group_df.shape

(48, 16)

In [70]:
group_cat_cols = [col for col in group_df.columns if group_df[col].dtype == 'object']

In [71]:
group_df.shape

(48, 16)

In [72]:
# encode categorical variables
group_df = pd.get_dummies(group_df, columns=group_cat_cols, drop_first=True)



In [75]:
# check if there are any missing columns in X_test and X_train
missing_in_groups = set(X_train.columns) - set(group_df.columns)

for col in missing_in_groups:
    group_df[col] = 0


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [76]:
group_df = group_df[X_train.columns]

In [79]:
group_df.to_csv('group_stage_X.csv', index=False)