In [None]:
# import libraries
import pandas as pd, numpy as np, joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from google.colab import files

In [None]:
# upload dataset
# Open the Colab upload widget; the result is indexed by the uploaded names.
uploaded = files.upload()
# Only the first uploaded entry is used; its key is handed to read_csv as a path.
fname = list(uploaded)[0]
df = pd.read_csv(fname)

Saving Match Winner.csv to Match Winner (2).csv


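Here we drop the in-match statistics (half-time scores, shots, corners, fouls and cards). They are only known once a match is being played, so keeping them would leak the result; the model should learn only from what it requires for the best pre-match prediction.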

In [None]:
# drop leakage columns
drop_cols = [
    # half-time score and result
    'HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HalfTimeResult',
    # shots
    'HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget',
    # corners and fouls
    'HomeCorners', 'AwayCorners', 'HomeFouls', 'AwayFouls',
    # cards
    'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards', 'AwayRedCards',
]
# Restrict the request to labels the frame actually has, so a CSV that is
# missing some of these columns is handled without raising.
_present_drop_cols = [name for name in drop_cols if name in df.columns]
df.drop(columns=_present_drop_cols, inplace=True)


Printing the dropped columns and the remaining columns gives us an overview of what data is still present in the dataset.

In [None]:
# Show the configured drop list next to the columns the frame still has.
# Note: drop_cols is printed as configured; names that were never present in
# the frame appear here as well.
print("Dropped columns:", drop_cols)
print("Remaining columns:", list(df.columns))

Dropped columns: ['HalfTimeHomeGoals', 'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots', 'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners', 'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards', 'HomeRedCards', 'AwayRedCards']
Remaining columns: ['match_id', 'Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals', 'FullTimeAwayGoals', 'FullTimeResult', 'home_form', 'home_avg_gf', 'home_avg_ga', 'home_h2h', 'home_h2h_prev_meetings', 'away_form', 'away_avg_gf', 'away_avg_ga', 'away_h2h', 'away_h2h_prev_meetings', 'month', 'day_of_week', 'result_label', 'home_team_enc', 'away_team_enc']


In [None]:
# create match_id and season
# read_csv leaves MatchDate as plain strings, which would sort lexicographically
# and make the `.dt` accessor below raise. Parse it first (a no-op if the column
# is already datetime).
# NOTE(review): relies on pandas inferring the date format — pass `format=` or
# `dayfirst=` here if the CSV uses an ambiguous layout; TODO confirm.
df['MatchDate'] = pd.to_datetime(df['MatchDate'])

# Discard any match_id left over from a previous run of this cell (or already
# shipped in the CSV); otherwise the rename below would produce two columns
# called 'match_id' and later `df[['match_id', ...]]` selections would break.
df = df.drop(columns='match_id', errors='ignore')

# A stable sort keeps same-day matches in their original file order, so the
# generated ids are reproducible across runs.
df = df.sort_values('MatchDate', kind='stable').reset_index(drop=True)

# match_id is the 0-based chronological position of the match.
df = df.reset_index().rename(columns={'index': 'match_id'})

# Only derive Season when the dataset does not already provide one.
if 'Season' not in df.columns:
    df['Season'] = df['MatchDate'].dt.year

Renaming the columns so that they are easily understood (`team`, `opponent`, `goals_for`, `goals_against`), and marking home and away rows with `is_home` set to 1 and 0 respectively.

In [None]:
# expand to team-match rows
# Every match contributes two rows: one seen from the home side and one seen
# from the away side, with goals relabelled relative to `team`.
_match_cols = ['match_id', 'MatchDate', 'Season', 'HomeTeam', 'AwayTeam',
               'FullTimeHomeGoals', 'FullTimeAwayGoals']
_home_view = {'HomeTeam': 'team', 'AwayTeam': 'opponent',
              'FullTimeHomeGoals': 'goals_for', 'FullTimeAwayGoals': 'goals_against'}
_away_view = {'AwayTeam': 'team', 'HomeTeam': 'opponent',
              'FullTimeAwayGoals': 'goals_for', 'FullTimeHomeGoals': 'goals_against'}

home = df[_match_cols].rename(columns=_home_view)
home['is_home'] = 1

away = df[_match_cols].rename(columns=_away_view)
away['is_home'] = 0

# Stack both perspectives, then order each team's rows by MatchDate.
tm = pd.concat([home, away], ignore_index=True)
tm = tm.sort_values(['team', 'MatchDate']).reset_index(drop=True)

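Here we award points for each match (3 for a win, 1 for a draw, 0 for a loss) and build features from earlier matches only: each team's form over its previous 5 matches, its running average of goals scored and conceded, and its head-to-head record against each opponent, using a rolling approach based on their past 5 matches against each other.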

In [None]:
# add points and rolling features
def _prev_window_mean(s):
    """Mean of up to the 5 preceding values; the current row is excluded by the shift."""
    return s.shift().rolling(window=5, min_periods=1).mean()


def _prev_running_mean(s):
    """Mean of all preceding values; the current row is excluded by the shift."""
    return s.shift().expanding().mean()


# 3 points when the team outscored its opponent, 1 when level, otherwise 0.
_scored_more = tm['goals_for'] > tm['goals_against']
_scored_same = tm['goals_for'] == tm['goals_against']
tm['points'] = np.select([_scored_more, _scored_same], [3, 1], default=0)

# Per-team history.
tm['form_rolling'] = tm.groupby('team')['points'].transform(_prev_window_mean)
tm['avg_goals_for'] = tm.groupby('team')['goals_for'].transform(_prev_running_mean)
tm['avg_goals_against'] = tm.groupby('team')['goals_against'].transform(_prev_running_mean)

# Per (team, opponent) history: recent points and number of earlier meetings.
_pairing = ['team', 'opponent']
tm['h2h_rolling'] = tm.groupby(_pairing)['points'].transform(_prev_window_mean)
tm['h2h_prev_meetings'] = tm.groupby(_pairing).cumcount()

In [None]:
# handle missing values
# Pass 1: numeric columns receive their own column median.
# Pass 2: anything still missing afterwards is filled with 'Unknown'.
# NOTE(review): this cell fills `df` only; the features computed on `tm` are
# not touched here.
_numeric_medians = df.median(numeric_only=True)
df.fillna(value=_numeric_medians, inplace=True)
df.fillna(value='Unknown', inplace=True)

In [None]:
# encode teams
# Fit one encoder on the names from both columns so a club gets the same
# integer code whether it appears as the home or the away side.
_team_cols = {'home_team_enc': 'HomeTeam', 'away_team_enc': 'AwayTeam'}
teams = pd.concat([df[src] for src in _team_cols.values()], ignore_index=True).unique()
le = LabelEncoder().fit(teams)
df['home_team_enc'] = le.transform(df[_team_cols['home_team_enc']])
df['away_team_enc'] = le.transform(df[_team_cols['away_team_enc']])

In [None]:
# Preview the first ten labels the encoder learned.
print("Encoded teams:", list(le.classes_[:10]), "...")
# NOTE(review): `X` is not defined in any cell visible above — presumably the
# feature matrix built elsewhere; confirm it exists before this cell runs.
print("Shape of features:", X.shape)

Encoded teams: ['Arsenal', 'Aston Villa', 'Birmingham', 'Blackburn', 'Blackpool', 'Bolton', 'Bournemouth', 'Bradford', 'Brentford', 'Brighton'] ...
Shape of features: (9380, 13)
