In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from sklearn.model_selection import train_test_split


In [3]:
# Load the cleaned international match results into a DataFrame.
matches_csv = 'international_matches_cleaned.csv'
df = pd.read_csv(matches_csv)

In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,2011-09-02,Argentina,Venezuela,South America,South America,9,44,1016,615,1,...,No,Win,77.0,64.0,80.8,88.3,85.2,72.2,73.7,75.5
1,2011-09-02,Belarus,Bosnia and Herzegovina,Europe,Europe,41,39,645,655,0,...,No,Lose,75.0,76.0,63.2,67.3,68.0,72.0,77.7,78.0
2,2011-09-02,Bulgaria,England,Europe,Europe,51,4,585,1176,0,...,No,Lose,80.0,83.0,70.8,73.7,73.5,84.0,81.3,85.8
3,2011-09-02,Ecuador,Jamaica,South America,North America,66,43,491,622,5,...,No,Win,69.0,63.0,70.5,74.7,76.8,65.2,72.3,71.8
4,2011-09-02,Germany,Austria,Europe,Europe,3,68,1329,472,6,...,No,Win,87.0,75.0,83.0,83.7,85.0,75.8,74.0,74.5


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(2916, 25)

## EDA
### removing variables

In [6]:
# List every column name before deciding which ones to remove.
df.columns

Index(['date', 'home_team', 'away_team', 'home_team_continent',
       'away_team_continent', 'home_team_fifa_rank', 'away_team_fifa_rank',
       'home_team_total_fifa_points', 'away_team_total_fifa_points',
       'home_team_score', 'away_team_score', 'tournament', 'city', 'country',
       'neutral_location', 'shoot_out', 'home_team_result',
       'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
       'home_team_mean_defense_score', 'home_team_mean_offense_score',
       'home_team_mean_midfield_score', 'away_team_mean_defense_score',
       'away_team_mean_offense_score', 'away_team_mean_midfield_score'],
      dtype='object')

In [7]:
# Encode shoot_out as a 0/1 flag and check how much it varies.
# The raw values are capitalised ('No' in the preview above), so the match has
# to be case-insensitive: an exact comparison with 'yes' sets every row to 0
# and reports a variance of 0.0 no matter what the data contains.
# '1' is accepted as well so that re-running this cell on the already-encoded
# column leaves it unchanged instead of wiping the positives.
shoot_out_flag = (
    df['shoot_out']
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(['yes', '1'])
)
df['shoot_out'] = shoot_out_flag.astype(int)
df['shoot_out'].var()

0.0

In [8]:
# Distinct labels of the target variable.
pd.unique(df["home_team_result"])

array(['Win', 'Lose', 'Draw'], dtype=object)

In [9]:
# Number of distinct tournaments (missing values are not counted).
df['tournament'].nunique(dropna=True)

27

In [10]:
# Distinct values taken by the neutral_location flag.
pd.unique(df['neutral_location'])

array([ True, False])

In [11]:
# Distinct-value counts for the two location columns, as (city, country).
tuple(df[location_col].nunique() for location_col in ('city', 'country'))

(504, 95)

In [12]:
# Columns removed before modelling, each with the reason it is excluded.
columns_to_drop = [
    'home_team_score',   # leakage: the final score determines the result
    'away_team_score',   # leakage: the final score determines the result
    'date',              # not useful for modelling
    'shoot_out',         # no variance in the check above
    'city',              # too many unique values
    'country',           # not useful for this problem
    'neutral_location',  # not useful for this problem
]
# 'tournament' is kept for now: it may or may not turn out to be useful.
df.drop(columns=columns_to_drop, inplace=True)

In [13]:
# (rows, columns) after removing the unwanted columns.
df.shape

(2916, 18)

In [14]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,tournament,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,Argentina,Venezuela,South America,South America,9,44,1016,615,Friendly,Win,77.0,64.0,80.8,88.3,85.2,72.2,73.7,75.5
1,Belarus,Bosnia and Herzegovina,Europe,Europe,41,39,645,655,UEFA Euro qualification,Lose,75.0,76.0,63.2,67.3,68.0,72.0,77.7,78.0
2,Bulgaria,England,Europe,Europe,51,4,585,1176,UEFA Euro qualification,Lose,80.0,83.0,70.8,73.7,73.5,84.0,81.3,85.8
3,Ecuador,Jamaica,South America,North America,66,43,491,622,Friendly,Win,69.0,63.0,70.5,74.7,76.8,65.2,72.3,71.8
4,Germany,Austria,Europe,Europe,3,68,1329,472,UEFA Euro qualification,Win,87.0,75.0,83.0,83.7,85.0,75.8,74.0,74.5


### train-test-split

In [19]:
# Separate the target from the features.
# The outcome labels are encoded as integers: Lose=0, Draw=1, Win=2.
result_codes = {'Win': 2, 'Draw': 1, 'Lose': 0}
y = df['home_team_result'].map(result_codes)

# Series.map turns any label that is missing from the mapping into NaN.
# Stop here if that happens rather than carrying NaN targets into the split.
assert y.notna().all(), 'home_team_result contains labels outside Win/Draw/Lose'

X = df.drop(columns='home_team_result')

# Hold out 30% of the matches for testing; the fixed seed keeps the split
# reproducible. train_test_split is imported in the imports cell at the top.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [20]:
# Partition the feature columns by dtype in a single pass:
# object-typed columns are treated as categorical, everything else as numerical.
cat_columns = []
num_columns = []
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        cat_columns.append(col)
    else:
        num_columns.append(col)

### plots

In [29]:
# Put the training features and the encoded target side by side for plotting.
train_parts = [X_train, y_train]
train = pd.concat(train_parts, axis='columns')

In [30]:
# Preview the numerical columns of the training frame.
train.loc[:, num_columns].head(n=5)

Unnamed: 0,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
1362,6,9,1273,1174,85.0,83.0,77.8,80.0,82.0,82.8,85.0,78.2
1071,8,57,1207,582,83.0,73.0,85.2,84.7,83.2,73.5,72.3,73.0
84,35,60,713,540,56.0,67.0,71.0,73.3,74.8,75.0,72.3,71.8
2398,39,62,1461,1379,80.0,91.0,77.5,77.7,78.2,72.0,78.3,76.5
776,35,3,650,1506,85.0,79.0,74.2,73.7,75.8,78.2,87.3,81.8


In [43]:
# Histogram of home_team_fifa_rank, coloured by the encoded match result.
fig = px.histogram(
    data_frame=train,
    x='home_team_fifa_rank',
    color='home_team_result',
    histnorm='percent',  # bar heights as percentages rather than raw counts
)
fig.show()

In [44]:
# Histogram of home_team_total_fifa_points, coloured by the encoded match
# result, with bar heights shown as percentages.
points_hist_options = dict(
    x='home_team_total_fifa_points',
    color='home_team_result',
    histnorm='percent',
)
fig = px.histogram(train, **points_hist_options)
fig.show()

In [48]:
# Histogram of home_team_goalkeeper_score, coloured by the encoded match
# result, with bar heights shown as percentages.
fig = px.histogram(
    data_frame=train,
    x='home_team_goalkeeper_score',
    color='home_team_result',
    histnorm='percent',
)
fig.show()

In [51]:
# Histogram of away_team_goalkeeper_score, coloured by the encoded match
# result, with bar heights shown as percentages.
away_keeper_hist_options = {
    'x': 'away_team_goalkeeper_score',
    'color': 'home_team_result',
    'histnorm': 'percent',
}
fig = px.histogram(train, **away_keeper_hist_options)
fig.show()

In [39]:
# Histogram of home_team_mean_defense_score, coloured by the encoded match result.
# Bar heights are normalised to percentages, as in the other histograms of this
# section, so the outcome groups can be compared regardless of how many
# matches each one contains.
fig = px.histogram(
    train,
    x='home_team_mean_defense_score',
    color='home_team_result',
    histnorm='percent',
)
fig.show()

### get dummies

In [52]:
# One-hot encode the categorical variables.
# The column layout is learned from the training set only; drop_first removes
# one baseline level per categorical column.
X_train = pd.get_dummies(X_train, columns=cat_columns, drop_first=True)

# Encode the test set WITHOUT drop_first. With drop_first the test set would
# drop its own first level, which need not be the level dropped in training;
# rows of that level would then get 0 in a column the training set does have.
X_test = pd.get_dummies(X_test, columns=cat_columns, drop_first=False)

# Project the test set onto the training layout:
#   * dummy columns the test set lacks are added and filled with 0,
#   * columns unknown to the training set (unseen levels and the training
#     baseline levels) are discarded, so those rows are all-zero,
#   * column order matches X_train exactly.
# An outer align is deliberately not used here: it would add all-NaN columns
# to X_train for every level that occurs only in the test set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Give the filled-in columns the same dtype as their training counterparts.
X_test = X_test.astype(X_train.dtypes.to_dict())

assert list(X_train.columns) == list(X_test.columns)

In [53]:
# Write each split to its own CSV file, without the index column.
split_outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_path, split_data in split_outputs.items():
    split_data.to_csv(csv_path, index=False)