In [2]:
# Import required libraries
import pandas as pd
# pandas is used for data manipulation and analysis


In [4]:
# Read the CSV file containing match data
matches = pd.read_csv("/content/Matches.csv", index_col=0 )
# Loading the dataset with matches information, using the first column as index


In [5]:
# Display first few rows of the dataset
matches.head()
# This helps us understand the structure and content of our data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [6]:
# Get the dimensions of the dataset
matches.shape
# Returns a tuple with (number of rows, number of columns)


(1389, 27)

In [7]:
# Calculate expected number of matches
38 * 20 * 2
# 38 matches per season × 20 teams × 2 seasons = 1520 total matches expected

1520

In [8]:
# Check number of matches per team
matches["team"].value_counts()
# This shows how many matches each team has played
# Reveals potential data completeness issues, like missing Liverpool matches


Unnamed: 0_level_0,count
team,Unnamed: 1_level_1
Southampton,72
Brighton and Hove Albion,72
Manchester United,72
West Ham United,72
Newcastle United,72
Burnley,71
Leeds United,71
Crystal Palace,71
Manchester City,71
Wolverhampton Wanderers,71


In [9]:
# This shows while majority of the teams have played around 72 games, a few have played around 38 games. While this is alright because the EPL follows a relagtion format,
# where 3 bottom placed teams are relegate each year, but liverpool wasnt relegated, so theres missing values for the team liverpool.

In [10]:
# Investigate Liverpool's matches specifically
matches[matches["team"] == 'Liverpool']
# Filters dataset to show only Liverpool's matches
# Confirms missing data for Liverpool

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,W,4.0,3.0,Leeds United,...,Match Report,,20.0,4.0,17.0,0.0,2.0,2.0,2021,Liverpool
2,2020-09-20,16:30,Premier League,Matchweek 2,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,17.0,5.0,17.7,1.0,0.0,0.0,2021,Liverpool
4,2020-09-28,20:00,Premier League,Matchweek 3,Mon,Home,W,3.0,1.0,Arsenal,...,Match Report,,21.0,9.0,16.8,0.0,0.0,0.0,2021,Liverpool
6,2020-10-04,19:15,Premier League,Matchweek 4,Sun,Away,L,2.0,7.0,Aston Villa,...,Match Report,,14.0,8.0,15.8,1.0,0.0,0.0,2021,Liverpool
7,2020-10-17,12:30,Premier League,Matchweek 5,Sat,Away,D,2.0,2.0,Everton,...,Match Report,,22.0,8.0,15.0,1.0,0.0,0.0,2021,Liverpool
9,2020-10-24,20:00,Premier League,Matchweek 6,Sat,Home,W,2.0,1.0,Sheffield Utd,...,Match Report,,17.0,5.0,18.2,1.0,0.0,0.0,2021,Liverpool
11,2020-10-31,17:30,Premier League,Matchweek 7,Sat,Home,W,2.0,1.0,West Ham,...,Match Report,,8.0,2.0,18.6,1.0,1.0,1.0,2021,Liverpool
13,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Away,D,1.0,1.0,Manchester City,...,Match Report,,9.0,2.0,21.5,0.0,1.0,1.0,2021,Liverpool
14,2020-11-22,19:15,Premier League,Matchweek 9,Sun,Home,W,3.0,0.0,Leicester City,...,Match Report,,24.0,12.0,11.9,0.0,0.0,0.0,2021,Liverpool
16,2020-11-28,12:30,Premier League,Matchweek 10,Sat,Away,D,1.0,1.0,Brighton,...,Match Report,,6.0,2.0,20.9,0.0,0.0,0.0,2021,Liverpool


In [11]:
# Check distribution of matches across rounds
matches["round"].value_counts()
# Shows how many matches were played in each round of the season

Unnamed: 0_level_0,count
round,Unnamed: 1_level_1
Matchweek 1,39
Matchweek 16,39
Matchweek 34,39
Matchweek 32,39
Matchweek 31,39
Matchweek 29,39
Matchweek 28,39
Matchweek 26,39
Matchweek 25,39
Matchweek 24,39


In [12]:
# Check data types of all columns
matches.dtypes
# Understanding the current data types helps in proper data preprocessing

Unnamed: 0,0
date,object
time,object
comp,object
round,object
day,object
venue,object
result,object
gf,float64
ga,float64
opponent,object


In [13]:
# Convert date column to datetime format
matches["date"] = pd.to_datetime(matches["date"])
# Enables proper date-based operations and analysis


In [14]:
# Create numeric codes for venues
matches["venue_code"] = matches["venue"].astype("category").cat.codes
# Converts text-based venue information into numerical codes for machine learning

In [15]:
# Create numeric codes for opponents
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
# Converts opponent names into numerical codes for machine learning

In [16]:
# Extract hour from time string
matches["hour"] = matches["time"].str.replace(":+", "", regex=True).astype("int")
# Converts time information into numerical format for analysis

In [17]:
# Create day of week code
matches["day_code"] = matches["date"].dt.dayofweek
# Extracts day of week (0-6) from date for potential pattern analysis

In [21]:
# Create target variable for wins
matches["Target"] = (matches["result"] == "W").astype("int")
# Converts match results to binary format: 1 for win, 0 for loss/draw

In [22]:
# Creating ML Model
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Random Forest is used for its ability to handle non-linear relationships

In [23]:
# Initialize Random Forest model
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
# n_estimators: number of trees
# min_samples_split: minimum samples required to split internal node
# random_state: ensures reproducibility


In [24]:
# Split data into training and test sets based on date
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] > '2022-01-01']
# Using time-based split instead of random split to maintain temporal order

In [25]:
# Define predictor variables
predictors = ["venue_code", "opp_code", "hour", "day_code"]
# These features will be used to predict match outcomes

In [26]:
# Train the model
rf.fit(train[predictors], train["Target"])
# Fits the random forest model using training data

In [27]:
# Make predictions on test set
predictions = rf.predict(test[predictors])
# Uses trained model to predict outcomes for test data


In [28]:
# Import accuracy metric
from sklearn.metrics import accuracy_score
# For evaluating model performance

In [29]:
# Calculate accuracy
acc = accuracy_score(test["Target"], predictions)
# Computes the proportion of correct predictions
acc

0.6376811594202898

In [30]:
# Create confusion matrix
combined = pd.DataFrame(dict(actual=test["Target"], prediction=predictions))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])
# Shows breakdown of correct and incorrect predictions

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,149,23
1,77,27


In [31]:
# Import and calculate precision score
from sklearn.metrics import precision_score
precision_score(test["Target"], predictions)
# Calculates proportion of correct positive predictions

0.54

In [33]:
#Improving precision and accuracy

In [34]:
# Group matches by team
grouped_matches = matches.groupby("team")
# Organizes data by team for team-specific analysis

In [35]:
# Extract Manchester City's matches as example
group = grouped_matches.get_group("Manchester City")
group.head()
# Shows all matches played by Manchester City

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,Target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,1.0,0.0,0.0,2022,Manchester City,0,18,1630,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,1.0,0.0,0.0,2022,Manchester City,1,15,1500,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,0.0,0.0,0.0,2022,Manchester City,1,0,1230,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,0.0,0.0,0.0,2022,Manchester City,0,10,1500,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,1.0,0.0,0.0,2022,Manchester City,1,17,1500,5,0


In [36]:
# Define function for calculating rolling averages
def rolling_averages(group, cols, new_cols):
    """
    Calculate rolling averages for specified columns

    Args:
        group: Grouped DataFrame
        cols: Original columns to average
        new_cols: Names for new columns with rolling averages

    Returns:
        DataFrame with added rolling average columns
    """
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [37]:
# Define columns for rolling averages
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
# gf: goals for, ga: goals against, sh: shots, sot: shots on target
# dist: distance, fk: free kicks, pk: penalties, pkatt: penalty attempts

In [38]:
# Calculate rolling averages for example group
rolling_averages(group, cols, new_cols)
# Applies rolling average calculation to Manchester City's data

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,Target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,5,1,2.000000,2.333333,17.333333,4.666667,18.900000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,5,0,1.333333,2.000000,17.333333,3.666667,17.733333,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,...,5,1,1.000000,0.666667,16.666667,4.333333,18.233333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,...,6,0,1.000000,0.333333,14.333333,6.666667,18.466667,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,...,5,0,1.000000,0.666667,12.000000,5.666667,19.366667,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2022-03-14,20:00,Premier League,Matchweek 29,Mon,Away,D,0.0,0.0,Crystal Palace,...,0,0,2.333333,1.333333,19.000000,7.000000,15.366667,0.333333,0.333333,0.333333
44,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Away,W,2.0,0.0,Burnley,...,5,1,1.666667,0.333333,18.333333,7.333333,16.000000,0.333333,0.000000,0.000000
46,2022-04-10,16:30,Premier League,Matchweek 32,Sun,Home,D,2.0,2.0,Liverpool,...,6,0,2.000000,0.333333,20.000000,6.666667,16.133333,0.333333,0.000000,0.000000
49,2022-04-20,20:00,Premier League,Matchweek 30,Wed,Home,W,3.0,0.0,Brighton,...,2,1,1.333333,0.666667,15.666667,4.666667,16.700000,0.333333,0.000000,0.000000


In [39]:
# Calculate rolling averages for all teams
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
# Applies rolling averages calculation to all teams

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [40]:
# Remove team index level
matches_rolling = matches_rolling.droplevel('team')
# Simplifies DataFrame structure

In [41]:
# Reset index
matches_rolling.index = range(matches_rolling.shape[0])
# Creates sequential index for easier handling

In [44]:
# Define prediction function with rolling averages
def make_predictions(data, predictors):
    """
    Make predictions using random forest model

    Args:
        data: DataFrame with match data
        predictors: List of predictor columns

    Returns:
        combined: DataFrame with actual and predicted values
        precision: Precision score of predictions
    """
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [47]:
# Merge predictions with match details
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
# Adds context to predictions by including match details

In [51]:
# Create team name mappings
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Leeds United": "Leeds Utd",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)
# Creates standardized team names for consistency

In [52]:
# Apply team name mapping
combined["new_team"] = combined["team"].map(mapping)
# Standardizes team names in the dataset

In [57]:
combined["new_team"] = combined["team"].map(mapping)
combined.head()

Unnamed: 0,actual,prediction,date_x,team_x,opponent_x,result_x,date_y,team_y,opponent_y,result_y,date,team,opponent,result,new_team
31,1,0,2021-05-09,Arsenal,West Brom,W,2021-05-09,Arsenal,West Brom,W,2021-05-09,Arsenal,West Brom,W,Arsenal
32,0,0,2021-05-12,Arsenal,Chelsea,W,2021-05-12,Arsenal,Chelsea,W,2021-05-12,Arsenal,Chelsea,W,Arsenal
34,1,0,2021-05-23,Arsenal,Brighton,W,2021-05-23,Arsenal,Brighton,W,2021-05-23,Arsenal,Brighton,W,Arsenal
35,1,1,2021-08-13,Arsenal,Brentford,L,2021-08-13,Arsenal,Brentford,L,2021-08-13,Arsenal,Brentford,L,Arsenal
37,0,1,2021-08-28,Arsenal,Manchester City,L,2021-08-28,Arsenal,Manchester City,L,2021-08-28,Arsenal,Manchester City,L,Arsenal


In [63]:
# Merge predictions for both teams in each match
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])
# Combines predictions for both teams in each match


In [68]:
# Check the columns in the 'merged' DataFrame
print(merged.columns)


Index(['actual_x', 'prediction_x', 'date_x_x', 'team_x_x', 'opponent_x_x',
       'result_x_x', 'date_y_x', 'team_y_x', 'opponent_y_x', 'result_y_x',
       'date', 'team_x', 'opponent_x', 'result_x', 'new_team_x', 'actual_y',
       'prediction_y', 'date_x_y', 'team_x_y', 'opponent_x_y', 'result_x_y',
       'date_y_y', 'team_y_y', 'opponent_y_y', 'result_y_y', 'team_y',
       'opponent_y', 'result_y', 'new_team_y'],
      dtype='object')


In [73]:
result_counts = merged[(merged["prediction_x"] == 1) & (merged["prediction_y"] ==0)]["actual_x"].value_counts()
print(result_counts)


Series([], Name: count, dtype: int64)


In [66]:
29/44

0.6590909090909091