# Imports

In [1]:
import pandas as pd

# Constants

In [10]:
DATA_PATH_ROOT = '../data/'
INTERNATIONAL_FILE = 'international_rugby_results.csv'

# Run

In [16]:
df = pd.read_csv(DATA_PATH_ROOT + INTERNATIONAL_FILE)

In [17]:
df.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,competition,stadium,city,country,neutral,world_cup
2778,2024-07-13,Argentina,France,33,25,2024 France tour of Argentina,José Amalfitani Stadium,Buenos Aires,Argentina,False,False
2779,2024-08-10,Australia,South Africa,7,33,2024 Rugby Championship,Suncorp Stadium,Brisbane,Australia,False,False
2780,2024-08-10,New Zealand,Argentina,30,38,2024 Rugby Championship,Wellington Regional Stadium,Wellington,New Zealand,False,False
2781,2024-08-17,New Zealand,Argentina,42,10,2024 Rugby Championship,Eden Park,Auckland,New Zealand,True,False
2782,2024-08-17,Australia,South Africa,12,30,2024 Rugby Championship,Optus Stadium,Perth,Australia,False,False


In [19]:
df.describe()

Unnamed: 0,home_score,away_score
count,2783.0,2783.0
mean,18.806324,15.637082
std,13.411217,11.43964
min,0.0,0.0
25%,9.0,7.0
50%,17.0,14.0
75%,26.0,21.0
max,101.0,70.0


In [18]:
print(df.isnull().sum())

date            0
home_team       0
away_team       0
home_score      0
away_score      0
competition    23
stadium         0
city            0
country         0
neutral         0
world_cup       0
dtype: int64


In [30]:
# ensure date data is real date
df['date'] = pd.to_datetime(df['date'])

In [32]:
print(df['away_score'].max())
print(df['away_score'].min())
print(df['home_score'].max())
print(df['home_score'].min())


70
0
101
0


In [33]:
bins = range(0, df[['home_score', 'away_score']].max().max() + 7, 7)

df['home_score_bins'] = pd.cut(df['home_score'], bins=bins, right=True)
df['away_score_bins'] = pd.cut(df['away_score'], bins=bins, right=True)

home_score_counts = df['home_score_bins'].value_counts().sort_index()
away_score_counts = df['away_score_bins'].value_counts().sort_index()

print(home_score_counts)
print(away_score_counts)

home_score_bins
(0, 7]       437
(7, 14]      612
(14, 21]     604
(21, 28]     436
(28, 35]     287
(35, 42]     134
(42, 49]      64
(49, 56]      35
(56, 63]      27
(63, 70]      10
(70, 77]       4
(77, 84]       1
(84, 91]       0
(91, 98]       3
(98, 105]      2
Name: count, dtype: int64
away_score_bins
(0, 7]       517
(7, 14]      758
(14, 21]     626
(21, 28]     330
(28, 35]     187
(35, 42]      92
(42, 49]      42
(49, 56]      27
(56, 63]      10
(63, 70]       4
(70, 77]       0
(77, 84]       0
(84, 91]       0
(91, 98]       0
(98, 105]      0
Name: count, dtype: int64


# Feature engineering

Create a binary feature (home_advantage) that indicates whether the match was played at home or away. This can be helpful because home teams often have an advantage.

In [37]:
df['home_advantage'] = 1  # Since all home teams have the advantage in this dataset

Calculate each team’s recent form, i.e., how well a team performed in its last few matches (e.g., win/loss streak).

In [39]:
# Rolling average of home team performance over last 5 matches
df['home_team_recent5_score'] = df.groupby('home_team')['home_score'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
# Rolling average of away team performance over last 5 matches
df['away_team_recent5_score'] = df.groupby('away_team')['away_score'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)

In [40]:
df.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,competition,stadium,city,country,neutral,world_cup,home_score_bins,away_score_bins,home_advantage,home_team_recent5_score,away_team_recent5_score
2778,2024-07-13,Argentina,France,33,25,2024 France tour of Argentina,José Amalfitani Stadium,Buenos Aires,Argentina,False,False,"(28, 35]","(21, 28]",1,17.6,27.8
2779,2024-08-10,Australia,South Africa,7,33,2024 Rugby Championship,Suncorp Stadium,Brisbane,Australia,False,False,"(0, 7]","(28, 35]",1,21.2,26.2
2780,2024-08-10,New Zealand,Argentina,30,38,2024 Rugby Championship,Wellington Regional Stadium,Wellington,New Zealand,False,False,"(28, 35]","(35, 42]",1,35.4,26.4
2781,2024-08-17,New Zealand,Argentina,42,10,2024 Rugby Championship,Eden Park,Auckland,New Zealand,True,False,"(35, 42]","(7, 14]",1,24.6,21.6
2782,2024-08-17,Australia,South Africa,12,30,2024 Rugby Championship,Optus Stadium,Perth,Australia,False,False,"(7, 14]","(28, 35]",1,17.4,26.4


In [41]:
# DataFrame for home teams
home_df = df[['date', 'home_team', 'home_score']].copy()
home_df.rename(columns={'home_team': 'team', 'home_score': 'score'}, inplace=True)

# DataFrame for away teams
away_df = df[['date', 'away_team', 'away_score']].copy()
away_df.rename(columns={'away_team': 'team', 'away_score': 'score'}, inplace=True)

In [42]:
# Combine both DataFrames (home and away scores for each team)
combined_df = pd.concat([home_df, away_df])

# Sort by team and date
combined_df.sort_values(by=['team', 'date'], inplace=True)

In [44]:
# Calculate rolling average for each team (considering both home and away matches)
combined_df['team_recent_score'] = combined_df.groupby('team')['score'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)

In [46]:
# Merge the recent score back into the original DataFrame
df = pd.merge(df, combined_df[['date', 'team', 'team_recent_score']], 
              how='left', left_on=['date', 'home_team'], right_on=['date', 'team'])

df.rename(columns={'team_recent_score': 'home_team_recent_score'}, inplace=True)

df = pd.merge(df, combined_df[['date', 'team', 'team_recent_score']], 
              how='left', left_on=['date', 'away_team'], right_on=['date', 'team'])

df.rename(columns={'team_recent_score': 'away_team_recent_score'}, inplace=True)

In [48]:
# Drop the extra 'team' columns from merging
df.drop(columns=['team_x', 'team_y'], inplace=True)
