# Soccer Match Outcome Predictor

The following code will build a soccer match outcome prediction model using nural networks with team embeddings and match history embeddings.

In [42]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, Concatenate, Dropout, Flatten, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split


## Preprocessing the Dataset
Loading the Dataset

In [43]:
df = pd.read_csv('matches_expanded.csv')
df.head(10)

Unnamed: 0,match_id,home_team,away_team,home_goals,away_goals,match_outcome,Home_Ball_Possession,Home_Pass_Accuracy,Home_Total_Shots,Home_Expected_Goals,Away_Ball_Possession,Away_Pass_Accuracy,Away_Total_Shots,Away_Expected_Goals
0,1035037,Burnley,Manchester City,0,3,Loss,34%,79%,6.0,0.33,66%,90%,17.0,2.08
1,1035038,Arsenal,Nottingham Forest,2,1,Win,78%,90%,15.0,0.83,22%,69%,6.0,1.18
2,1035039,Bournemouth,West Ham,1,1,Draw,63%,82%,14.0,1.37,37%,68%,16.0,1.07
3,1035041,Everton,Fulham,0,1,Loss,41%,79%,19.0,2.73,59%,85%,9.0,1.5
4,1035040,Brighton,Luton,4,1,Win,71%,91%,27.0,4.01,29%,72%,9.0,1.48
5,1035042,Sheffield Utd,Crystal Palace,0,1,Loss,32%,62%,8.0,0.51,68%,83%,24.0,1.89
6,1035043,Newcastle,Aston Villa,5,1,Win,53%,85%,17.0,3.41,47%,85%,16.0,1.77
7,1035044,Brentford,Tottenham,2,2,Draw,30%,77%,11.0,2.22,70%,90%,18.0,1.27
8,1035045,Chelsea,Liverpool,1,1,Draw,65%,87%,10.0,1.35,35%,80%,13.0,1.28
9,1035046,Manchester United,Wolves,1,0,Win,51%,82%,15.0,2.21,49%,81%,23.0,2.35


Cleaning the Dataframe

In [44]:
df.columns = df.columns.str.lower()

df['home_ball_possession'] = df['home_ball_possession'].str.replace('%', '', regex=True).astype(float)
df['home_pass_accuracy'] = df['home_pass_accuracy'].str.replace('%', '', regex=True).astype(float)
df['away_ball_possession'] = df['away_ball_possession'].str.replace('%', '', regex=True).astype(float)
df['away_pass_accuracy'] = df['away_pass_accuracy'].str.replace('%', '', regex=True).astype(float)


df.head(10)

Unnamed: 0,match_id,home_team,away_team,home_goals,away_goals,match_outcome,home_ball_possession,home_pass_accuracy,home_total_shots,home_expected_goals,away_ball_possession,away_pass_accuracy,away_total_shots,away_expected_goals
0,1035037,Burnley,Manchester City,0,3,Loss,34.0,79.0,6.0,0.33,66.0,90.0,17.0,2.08
1,1035038,Arsenal,Nottingham Forest,2,1,Win,78.0,90.0,15.0,0.83,22.0,69.0,6.0,1.18
2,1035039,Bournemouth,West Ham,1,1,Draw,63.0,82.0,14.0,1.37,37.0,68.0,16.0,1.07
3,1035041,Everton,Fulham,0,1,Loss,41.0,79.0,19.0,2.73,59.0,85.0,9.0,1.5
4,1035040,Brighton,Luton,4,1,Win,71.0,91.0,27.0,4.01,29.0,72.0,9.0,1.48
5,1035042,Sheffield Utd,Crystal Palace,0,1,Loss,32.0,62.0,8.0,0.51,68.0,83.0,24.0,1.89
6,1035043,Newcastle,Aston Villa,5,1,Win,53.0,85.0,17.0,3.41,47.0,85.0,16.0,1.77
7,1035044,Brentford,Tottenham,2,2,Draw,30.0,77.0,11.0,2.22,70.0,90.0,18.0,1.27
8,1035045,Chelsea,Liverpool,1,1,Draw,65.0,87.0,10.0,1.35,35.0,80.0,13.0,1.28
9,1035046,Manchester United,Wolves,1,0,Win,51.0,82.0,15.0,2.21,49.0,81.0,23.0,2.35


Converting Team Names to Integer IDs

In [45]:
team_encoder = LabelEncoder()
df['home_team_id'] = team_encoder.fit_transform(df['home_team'])
df['away_team_id'] = team_encoder.transform(df['away_team'])
df.head(10)

Unnamed: 0,match_id,home_team,away_team,home_goals,away_goals,match_outcome,home_ball_possession,home_pass_accuracy,home_total_shots,home_expected_goals,away_ball_possession,away_pass_accuracy,away_total_shots,away_expected_goals,home_team_id,away_team_id
0,1035037,Burnley,Manchester City,0,3,Loss,34.0,79.0,6.0,0.33,66.0,90.0,17.0,2.08,5,12
1,1035038,Arsenal,Nottingham Forest,2,1,Win,78.0,90.0,15.0,0.83,22.0,69.0,6.0,1.18,0,15
2,1035039,Bournemouth,West Ham,1,1,Draw,63.0,82.0,14.0,1.37,37.0,68.0,16.0,1.07,2,18
3,1035041,Everton,Fulham,0,1,Loss,41.0,79.0,19.0,2.73,59.0,85.0,9.0,1.5,8,9
4,1035040,Brighton,Luton,4,1,Win,71.0,91.0,27.0,4.01,29.0,72.0,9.0,1.48,4,11
5,1035042,Sheffield Utd,Crystal Palace,0,1,Loss,32.0,62.0,8.0,0.51,68.0,83.0,24.0,1.89,16,7
6,1035043,Newcastle,Aston Villa,5,1,Win,53.0,85.0,17.0,3.41,47.0,85.0,16.0,1.77,14,1
7,1035044,Brentford,Tottenham,2,2,Draw,30.0,77.0,11.0,2.22,70.0,90.0,18.0,1.27,3,17
8,1035045,Chelsea,Liverpool,1,1,Draw,65.0,87.0,10.0,1.35,35.0,80.0,13.0,1.28,6,10
9,1035046,Manchester United,Wolves,1,0,Win,51.0,82.0,15.0,2.21,49.0,81.0,23.0,2.35,13,19


Generating Historical Performace Features



In [46]:
N = 5  # Number of past matches to consider

def get_past_stats(df, team_col, stat_col, N):
    """
    Computes rolling average for the last N matches per team.
    If rolling value is NaN (not enough history), fill it with the current match's stat.
    """
    rolling_avg = df.groupby(team_col)[stat_col].transform(lambda x: x.shift(1).rolling(N, min_periods=1).mean())
    
    # Fill NaNs with the current match stat as a fallback
    rolling_avg = rolling_avg.fillna(df[stat_col])
    
    return rolling_avg

historical_features = ["goals", "ball_possession", "pass_accuracy", "total_shots", "expected_goals"]

for feature in historical_features:
    home_feature_col = f'home_{feature}'
    away_feature_col = f'away_{feature}'

    df[f'{home_feature_col}_hist'] = get_past_stats(df, 'home_team', home_feature_col, N=5)
    df[f'{away_feature_col}_hist'] = get_past_stats(df, 'away_team', away_feature_col, N=5)


df.head(10)
df.to_csv('cleaned_matches_expanded.csv', index=False)

Standard Scaling

In [47]:
numerical_cols = [col for col in df.columns if '_hist' in col or col in [
    'home_ball_possession', 'home_pass_accuracy', 'home_total_shots', 'home_expected_goals'
    'away_ball_possession', 'away_pass_accuracy', 'away_total_shots', 'away_expected_goals'
]]

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

Label Encode Target (Win/Draw/Loss)

In [None]:
# Assuming perspective is home team
# Win = 2, Draw = 0, Loss = 1

outcome_encoder = LabelEncoder()
df['outcome_encoded'] = outcome_encoder.fit_transform(df['match_outcome'])
y = to_categorical(df['outcome_encoded'], num_classes=3)

print(df.columns)
df.head(10)

Index(['match_id', 'home_team', 'away_team', 'home_goals', 'away_goals',
       'match_outcome', 'home_ball_possession', 'home_pass_accuracy',
       'home_total_shots', 'home_expected_goals', 'away_ball_possession',
       'away_pass_accuracy', 'away_total_shots', 'away_expected_goals',
       'home_team_id', 'away_team_id', 'home_goals_hist', 'away_goals_hist',
       'home_ball_possession_hist', 'away_ball_possession_hist',
       'home_pass_accuracy_hist', 'away_pass_accuracy_hist',
       'home_total_shots_hist', 'away_total_shots_hist',
       'home_expected_goals_hist', 'away_expected_goals_hist',
       'outcome_encoded'],
      dtype='object')


Unnamed: 0,match_id,home_team,away_team,home_goals,away_goals,match_outcome,home_ball_possession,home_pass_accuracy,home_total_shots,home_expected_goals,...,away_goals_hist,home_ball_possession_hist,away_ball_possession_hist,home_pass_accuracy_hist,away_pass_accuracy_hist,home_total_shots_hist,away_total_shots_hist,home_expected_goals_hist,away_expected_goals_hist,outcome_encoded
0,1035037,Burnley,Manchester City,0,3,Loss,-1.259454,-0.335618,-1.499365,0.33,...,2.215913,-1.707908,1.590768,-0.403914,1.564515,-2.269026,1.250858,-2.259556,1.36223,1
1,1035038,Arsenal,Nottingham Forest,2,1,Win,1.873255,1.134908,-0.067011,0.83,...,-0.63502,2.511828,-2.365174,1.38123,-1.805179,-0.037465,-1.761608,-1.47007,-0.440598,2
2,1035039,Bournemouth,West Ham,1,1,Draw,0.805286,0.065435,-0.226161,1.37,...,-0.63502,1.073282,-1.016557,0.082944,-1.96564,-0.285416,0.976998,-0.617425,-0.660944,0
3,1035041,Everton,Fulham,0,1,Loss,-0.761068,-0.335618,0.569591,2.73,...,-0.63502,-1.036586,0.961414,-0.403914,0.762207,0.95434,-0.940026,1.529977,0.200407,1
4,1035040,Brighton,Luton,4,1,Win,1.374869,1.268592,1.842796,4.01,...,-0.63502,1.840506,-1.735819,1.543516,-1.323794,2.937951,-0.940026,3.551061,0.160344,2
5,1035042,Sheffield Utd,Crystal Palace,0,1,Loss,-1.401849,-2.608248,-1.181064,0.51,...,-0.63502,-1.899714,1.770584,-3.162771,0.441283,-1.773123,3.167882,-1.975341,0.981633,1
6,1035043,Newcastle,Aston Villa,5,1,Win,0.093307,0.466487,0.25129,3.41,...,-0.63502,0.114251,-0.117479,0.569801,0.762207,0.458438,0.976998,2.603678,0.741256,2
7,1035044,Brentford,Tottenham,2,2,Draw,-1.544245,-0.602986,-0.703613,2.22,...,0.790446,-2.09152,1.950399,-0.728485,1.564515,-1.02927,1.524719,0.724701,-0.260315,0
8,1035045,Chelsea,Liverpool,1,1,Draw,0.947682,0.733856,-0.862763,1.35,...,-0.63502,1.265088,-1.196373,0.894372,-0.040101,-1.277221,0.155416,-0.649005,-0.240284,0
9,1035046,Manchester United,Wolves,1,0,Win,-0.049089,0.065435,-0.067011,2.21,...,-2.060487,-0.077555,0.062336,0.082944,0.12036,-0.037465,2.894022,0.708911,1.903078,2
