In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
warnings.filterwarnings('ignore')

In [None]:
# Load the match-level dataset; the path is relative to the notebook's working directory
matches = pd.read_csv('Clean_dataset.csv')


Saving Clean_dataset.csv to Clean_dataset (1).csv


In [None]:
# Preview the first five rows to sanity-check the load
matches.head()


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,xG,...,Attendance,Captain,Formation,Opp Formation,Referee,Venue_code,opp_code,Hour,Day_code,Target
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,2.6,...,30014.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Tim Robinson,0,10,12,5,1
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,2.5,...,60017.0,Virgil van Dijk,4-2-3-1,04-04-2002,Stuart Attwell,1,3,16,6,1
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,1.8,...,73738.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Anthony Taylor,0,16,16,6,1
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,0.9,...,60344.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Michael Oliver,1,19,15,5,0
4,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3,0,2.0,...,60347.0,Virgil van Dijk,4-2-3-1,4-2-3-1,Tony Harrington,1,2,15,5,1


In [None]:
# Check matches shape.
# Only the last expression of a cell is displayed, so the shape must be printed explicitly.
print(matches.shape)

# Check matches dtype (shown as the cell result)
matches.dtypes

Unnamed: 0,0
Date,object
Time,object
Comp,object
Round,object
Day,object
...,...
Attendance,float64
Captain,object
Formation,object
Opp Formation,object


In [None]:
# Convert date column to datetime
# (later cells sort by Date, compare it against date strings and use the .dt accessor)
matches["Date"] = pd.to_datetime(matches["Date"])
# Show the dtypes again to confirm the conversion
matches.dtypes

Unnamed: 0,0
Date,datetime64[ns]
Time,object
Comp,object
Round,object
Day,object
...,...
Attendance,float64
Captain,object
Formation,object
Opp Formation,object


In [None]:
# Integer-encode the Venue attribute (codes follow the sorted order of the distinct values)
matches["Venue_code"] = pd.Categorical(matches["Venue"]).codes

In [None]:
# Integer-encode the opponent name
matches["opp_code"] = pd.Categorical(matches["Opponent"]).codes
# Keep only the hour part of the kick-off time: everything from the first ":" on is stripped
matches["Hour"] = (
    matches["Time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)
# Day of the week taken from the match date (Monday=0 ... Sunday=6)
matches["Day_code"] = matches["Date"].dt.weekday
# Binary target: 1 when the result is a win ("W"), 0 otherwise
matches["Target"] = matches["Result"].eq("W").astype("int")

In [None]:
# Random forest: 50 trees, a node needs at least 10 samples to be split, fixed seed
rf = RandomForestClassifier(
    random_state=1,
    n_estimators=50,
    min_samples_split=10,
)

In [None]:
# Chronological hold-out: train on everything before 2023-07-01,
# test on matches dated 2023-08-01 through 2024-05-25 (both bounds inclusive)
test_start_date = "2023-08-01"
test_end_date = "2024-05-25"
train = matches.loc[matches["Date"] < '2023-07-01']
test = matches.loc[matches["Date"].between(test_start_date, test_end_date)]

In [None]:
# Baseline feature columns fed to the model
predictors = [
    "Venue_code",
    "opp_code",
    "Day_code",
    "Hour",
]

In [None]:
# Fit (train) the Random Forest model using the specified predictors and target variable
# (only the rows of `train` are used, so the test period stays unseen)
rf.fit(train[predictors], train["Target"])

In [None]:
# Use the trained Random Forest model to make predictions on the test set
# (one predicted Target value per row of `test`)
preds = rf.predict(test[predictors])

In [None]:
# Share of test matches whose Target value was predicted correctly
acc = accuracy_score(y_true=test["Target"], y_pred=preds)
acc

0.6197368421052631

In [None]:
# Pair every test-set label with the model's prediction
combined = pd.DataFrame({"actual": test["Target"], "predicted": preds})

# Confusion matrix as a crosstab: rows are actual values, columns are predicted values
pd.crosstab(combined["actual"], combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,367,95
1,194,104


In [None]:
# Precision = TP / (TP + FP): of the matches predicted as wins, the share that were really won.
# It is the metric to watch when false positives are costly.
precision_score(y_true=test["Target"], y_pred=preds)

0.5226130653266332

In [None]:
# Group the matches DataFrame by the "Team" column
# (used below to pull out a single team's matches)
grouped_matches = matches.groupby("Team")

In [None]:
# Take one team's matches in chronological order as a sample input for rolling_averages
group = grouped_matches.get_group("Manchester City").sort_values("Date")

In [None]:
# Define a function to calculate rolling averages for specified columns
def rolling_averages(group, cols, new_cols, window=3):
    """Add trailing rolling means of ``cols`` to a frame of matches.

    The rows are sorted by "Date"; for every row each column in ``cols`` is
    averaged over the ``window`` rows that precede it. The current row is
    excluded (``closed='left'``), so a match's features never contain that
    match's own statistics.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches to process; must contain "Date" and every column in ``cols``.
        The caller's frame is not modified.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Names of the output columns, in the same order as ``cols``.
    window : int, default 3
        Number of preceding rows averaged for each row.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with ``new_cols`` added. Rows for which
        any rolling value is missing (e.g. fewer than ``window`` earlier rows)
        are dropped.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f"cols and new_cols must have the same length, got {len(cols)} and {len(new_cols)}"
        )
    # sort_values returns a new frame, so the assignments below never touch the input
    group = group.sort_values("Date")
    # closed='left' shifts the window back by one row so the current match is left out
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    # Columns are paired by position: cols[i] -> new_cols[i]
    group[new_cols] = rolling_stats
    # Drop rows where rolling averages couldn't be calculated (to avoid NaN values)
    group = group.dropna(subset=new_cols)
    return group

In [None]:
# Per-match statistics that get averaged over each team's previous matches
cols = [
    "GF", "GA", "Sh", "SoT",
    "Dist", "FK", "PK", "PKatt_x",
]
# Output column names: the source name plus a "_rolling" suffix, in the same order
new_cols = [stat + "_rolling" for stat in cols]

In [None]:
# Call the rolling_averages function
# (trial run on the single-team sample; the result is only displayed, not stored)
rolling_averages(group, cols, new_cols)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,xG,...,Day_code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_x_rolling
2423,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1,0,1.3,...,5,1,2.000000,2.333333,17.333333,4.666667,19.700000,1.333333,0.333333,0.333333
2424,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,1.0,...,5,0,1.333333,2.000000,17.333333,3.666667,18.566667,0.666667,0.000000,0.000000
2425,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1,0,1.6,...,5,1,1.000000,0.666667,16.666667,4.333333,18.933333,0.666667,0.000000,0.000000
2426,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1,1,1.4,...,6,0,1.000000,0.333333,14.333333,6.666667,19.033333,1.000000,0.000000,0.000000
2427,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0,2,1.4,...,5,0,1.000000,0.666667,12.000000,5.666667,20.000000,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,3.0,...,5,1,3.000000,0.666667,17.333333,7.000000,18.266667,1.000000,0.333333,0.333333
10,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,W,2,1,2.1,...,5,1,3.000000,0.666667,15.666667,5.666667,17.300000,0.666667,0.333333,0.333333
11,2024-09-22,16:30,Premier League,Matchweek 5,Sun,Home,D,2,2,2.1,...,6,0,3.000000,1.000000,18.000000,6.333333,16.666667,0.666667,0.333333,0.333333
12,2024-09-28,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,0.9,...,5,0,2.333333,1.333333,24.666667,8.666667,17.466667,0.666667,0.000000,0.000000


In [None]:
# Apply the rolling_averages function to each team's matches in the matches DataFrame.
# The groups are iterated explicitly instead of using groupby(...).apply: recent pandas
# versions deprecate passing the grouping column to the applied function, and once it is
# excluded the "Team" column needed further down would be lost.
# The dict keys become the outer index level, named "Team", exactly like the apply result.
matches_rolling = pd.concat(
    {
        team: rolling_averages(team_matches, cols, new_cols)
        for team, team_matches in matches.groupby("Team")
    },
    names=["Team"],
)

In [None]:
# Inspect the per-team result; the index has an outer "Team" level at this point
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,xG,...,Day_code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_x_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,2689,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,0.6,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,2690,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,0.9,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,2691,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,0.7,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,2692,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,1.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,2693,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,135,2024-08-31,15:00,Premier League,Matchweek 3,Sat,Away,D,1,1,0.7,...,5,0,0.666667,3.333333,8.333333,3.000000,19.133333,1.000000,0.000000,0.000000
Wolverhampton Wanderers,136,2024-09-15,16:30,Premier League,Matchweek 4,Sun,Home,L,1,2,1.2,...,6,0,1.000000,3.000000,10.666667,3.000000,19.700000,0.666667,0.000000,0.000000
Wolverhampton Wanderers,137,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Away,L,1,3,0.5,...,5,0,1.333333,3.000000,11.666667,3.666667,18.333333,0.333333,0.000000,0.000000
Wolverhampton Wanderers,138,2024-09-28,17:30,Premier League,Matchweek 6,Sat,Home,L,1,2,0.6,...,5,0,1.000000,2.000000,11.000000,3.666667,16.933333,0.000000,0.000000,0.000000


In [None]:
# Remove the outer 'Team' index level; 'Team' remains available as a regular column
matches_rolling = matches_rolling.reset_index(level='Team', drop=True)
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,xG,...,Day_code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_x_rolling
2689,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,0.6,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
2690,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,0.9,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2691,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,0.7,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
2692,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,1.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
2693,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,2024-08-31,15:00,Premier League,Matchweek 3,Sat,Away,D,1,1,0.7,...,5,0,0.666667,3.333333,8.333333,3.000000,19.133333,1.000000,0.000000,0.000000
136,2024-09-15,16:30,Premier League,Matchweek 4,Sun,Home,L,1,2,1.2,...,6,0,1.000000,3.000000,10.666667,3.000000,19.700000,0.666667,0.000000,0.000000
137,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Away,L,1,3,0.5,...,5,0,1.333333,3.000000,11.666667,3.666667,18.333333,0.333333,0.000000,0.000000
138,2024-09-28,17:30,Premier League,Matchweek 6,Sat,Home,L,1,2,0.6,...,5,0,1.000000,2.000000,11.000000,3.666667,16.933333,0.000000,0.000000,0.000000


In [None]:
# Replace the old row labels (duplicated across teams) with a fresh 0..n-1 index
matches_rolling = matches_rolling.reset_index(drop=True)
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,xG,...,Day_code,Target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_x_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2,1,0.6,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0,1,0.9,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,0.7,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,1.0,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,1.5,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,2024-08-31,15:00,Premier League,Matchweek 3,Sat,Away,D,1,1,0.7,...,5,0,0.666667,3.333333,8.333333,3.000000,19.133333,1.000000,0.000000,0.000000
3092,2024-09-15,16:30,Premier League,Matchweek 4,Sun,Home,L,1,2,1.2,...,6,0,1.000000,3.000000,10.666667,3.000000,19.700000,0.666667,0.000000,0.000000
3093,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Away,L,1,3,0.5,...,5,0,1.333333,3.000000,11.666667,3.666667,18.333333,0.333333,0.000000,0.000000
3094,2024-09-28,17:30,Premier League,Matchweek 6,Sat,Home,L,1,2,0.6,...,5,0,1.000000,2.000000,11.000000,3.666667,16.933333,0.000000,0.000000,0.000000


In [None]:
# Train on the earlier seasons, predict the held-out season and score the predictions
def make_predictions(data, predictors, model=None,
                     train_end='2023-07-01',
                     test_start="2023-08-01",
                     test_end="2024-05-25"):
    """Fit a classifier on past matches and evaluate it on a later date range.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Date", "Target" and every column in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    model : estimator with ``fit`` / ``predict``, optional
        Classifier to train. Defaults to the notebook-level ``rf``, which keeps
        existing two-argument calls working unchanged.
    train_end : str, default '2023-07-01'
        Rows dated strictly before this value form the training set.
    test_start, test_end : str
        Inclusive bounds of the test period.

    Returns
    -------
    combined : pandas.DataFrame
        Columns "actual" and "predicted", indexed like the test rows.
    precision : float
        ``precision_score`` of the predictions on the test rows.
    """
    if model is None:
        model = rf
    train = data[data["Date"] < train_end]
    test = data[(data["Date"] >= test_start) & (data["Date"] <= test_end)]
    # Fitting happens in place, so the passed (or default) model is trained after the call
    model.fit(train[predictors], train["Target"])
    preds = model.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["Target"], predicted=preds), index=test.index)
    precision = precision_score(test["Target"], preds)
    return combined, precision

In [None]:
# Combine predictors and new_cols attributes
# (make_predictions returns the actual/predicted frame and the precision score)
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
precision

0.5594059405940595

In [None]:
# Merge a few more attributes into combined (so that the table makes more sense).
# Only the two prediction columns are kept before merging, so re-running this cell
# does not produce duplicated "_x"/"_y" columns.
combined = combined[["actual", "predicted"]].merge(
    matches_rolling[["Date", "Team", "Opponent", "Result"]],
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,actual,predicted,Date,Team,Opponent,Result
111,1,1,2023-08-12,Arsenal,Nott'ham Forest,W
112,1,0,2023-08-21,Arsenal,Crystal Palace,W
113,0,1,2023-08-26,Arsenal,Fulham,D
114,1,1,2023-09-03,Arsenal,Manchester Utd,W
115,1,1,2023-09-17,Arsenal,Everton,W
...,...,...,...,...,...,...
3084,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L
3085,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W
3086,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L
3087,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L


In [None]:
# `map_values` translates full football team names to their shortened versions,
# e.g. "Brighton and Hove Albion" -> "Brighton", "Manchester United" -> "Manchester Utd".
# Names without an entry must pass through unchanged, which is what MissingDict provides.
class MissingDict(dict):
    """dict that returns the looked-up key itself when the key is absent."""

    def __missing__(self, key):
        return key


map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    "Sheffield United": "Sheffield Utd",
    "Nottingham Forest": "Nott'ham Forest",
}
mapping = MissingDict(map_values)

In [None]:
# Add a New_team column: the Team name translated through `mapping`
# (presumably so it matches the spelling used in the Opponent column; names without a mapping stay as they are)
combined["New_team"] = combined["Team"].map(mapping)

In [None]:
# Self-join: pair each row with the row of the same date whose Opponent equals this row's New_team
merged = pd.merge(
    combined,
    combined,
    left_on=["Date", "New_team"],
    right_on=["Date", "Opponent"],
)
# "_x" columns belong to the team, "_y" columns to its opponent in the same match
merged

Unnamed: 0,actual_x,predicted_x,Date,Team_x,Opponent_x,Result_x,New_team_x,actual_y,predicted_y,Team_y,Opponent_y,Result_y,New_team_y
0,1,1,2023-08-12,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nott'ham Forest
1,1,0,2023-08-21,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
2,0,1,2023-08-26,Arsenal,Fulham,D,Arsenal,0,0,Fulham,Arsenal,D,Fulham
3,1,1,2023-09-03,Arsenal,Manchester Utd,W,Arsenal,0,1,Manchester United,Arsenal,L,Manchester Utd
4,1,1,2023-09-17,Arsenal,Everton,W,Arsenal,0,0,Everton,Arsenal,L,Everton
...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,0,0,2024-04-24,Wolverhampton Wanderers,Bournemouth,L,Wolves,1,0,Bournemouth,Wolves,W,Bournemouth
750,1,0,2024-04-27,Wolverhampton Wanderers,Luton Town,W,Wolves,0,0,Luton Town,Wolves,L,Luton Town
751,0,0,2024-05-04,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,Manchester City,Wolves,W,Manchester City
752,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,1,Crystal Palace,Wolves,W,Crystal Palace


In [None]:
# Keep matches where a win is predicted for the team (x) but not for its opponent (y),
# then count how those matches actually ended for the team
predicts_x_only = (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)
merged.loc[predicts_x_only, "actual_x"].value_counts()

Unnamed: 0_level_0,count
actual_x,Unnamed: 1_level_1
1,102
0,74


In [None]:
# The model's precision improves on these one-sided predictions.
# It is computed from `merged` rather than typed in by hand, so it stays correct when the
# data or the model changes; the mean of the 0/1 `actual_x` values is wins / predictions.
merged.loc[
    (merged["predicted_x"] == 1) & (merged["predicted_y"] == 0),
    "actual_x",
].mean()

0.5795454545454546

**Let's compute the precision of a few other algorithms using the same set of predictors**

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 100 trees of depth 3, learning rate 0.1, fixed seed
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=1,
    use_label_encoder=False,
    eval_metric="logloss",
)


def make_xgb_predictions(data, predictors):
    """Train the notebook-level ``xgb`` model and score it on the held-out period.

    Rows dated before 2023-07-01 are used for training; rows dated from
    2023-08-01 through 2024-05-25 (inclusive) are used for testing.

    Returns a tuple ``(combined, precision)``: a DataFrame with "actual" and
    "predicted" columns indexed like the test rows, and the precision score.
    """
    match_dates = data["Date"]
    in_train = match_dates < '2023-07-01'
    in_test = (match_dates >= "2023-08-01") & (match_dates <= "2024-05-25")
    train, test = data[in_train], data[in_test]

    # Fit on the training period, then predict the test period
    xgb.fit(train[predictors], train["Target"])
    preds = xgb.predict(test[predictors])

    # Actual vs. predicted target values, row-aligned with the test set
    combined = pd.DataFrame({"actual": test["Target"], "predicted": preds}, index=test.index)
    return combined, precision_score(test["Target"], preds)


combined_xgb, precision_xgb = make_xgb_predictions(matches_rolling, predictors + new_cols)
# Display precision score
print(f"XGBoost Precision: {precision_xgb:.2f}")

XGBoost Precision: 0.56


In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost: 1000 iterations, depth 6, learning rate 0.05, fixed seed, training log silenced
catboost = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_state=1,
    verbose=0,
)


def make_catboost_predictions(data, predictors):
    """Train the notebook-level ``catboost`` model and score it on the held-out period.

    Training rows are dated before 2023-07-01; test rows are dated from
    2023-08-01 through 2024-05-25 (inclusive).

    Returns ``(combined, precision)``: the actual/predicted frame indexed like
    the test rows, and the precision score.
    """
    match_dates = data["Date"]
    train = data[match_dates < '2023-07-01']
    test = data[(match_dates >= "2023-08-01") & (match_dates <= "2024-05-25")]

    catboost.fit(train[predictors], train["Target"])
    preds = catboost.predict(test[predictors])

    combined = pd.DataFrame({"actual": test["Target"], "predicted": preds}, index=test.index)
    return combined, precision_score(test["Target"], preds)


# Evaluate CatBoost model
combined_cat, precision_cat = make_catboost_predictions(matches_rolling, predictors + new_cols)
print(f"CatBoost Precision: {precision_cat:.2f}")

CatBoost Precision: 0.57


In [None]:
import lightgbm as lgb

# LightGBM: 100 trees of depth 3, learning rate 0.1, fixed seed, logging silenced
lightgbm = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=1,
    verbose=-1,
)


def make_lightgbm_predictions(data, predictors):
    """Train the notebook-level ``lightgbm`` model and score it on the held-out period.

    Training rows are dated before 2023-07-01; test rows are dated from
    2023-08-01 through 2024-05-25 (inclusive).

    Returns ``(combined, precision)``: the actual/predicted frame indexed like
    the test rows, and the precision score.
    """
    match_dates = data["Date"]
    train = data[match_dates < '2023-07-01']
    test = data[(match_dates >= "2023-08-01") & (match_dates <= "2024-05-25")]

    lightgbm.fit(train[predictors], train["Target"])
    preds = lightgbm.predict(test[predictors])

    combined = pd.DataFrame({"actual": test["Target"], "predicted": preds}, index=test.index)
    return combined, precision_score(test["Target"], preds)


# Evaluate LightGBM model
combined_lgb, precision_lgb = make_lightgbm_predictions(matches_rolling, predictors + new_cols)
print(f"LightGBM Precision: {precision_lgb:.2f}")

LightGBM Precision: 0.59


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn gradient boosting: 100 trees of depth 3, learning rate 0.1, fixed seed
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=1,
)


def make_gbc_predictions(data, predictors):
    """Train the notebook-level ``gbc`` model and score it on the held-out period.

    Training rows are dated before 2023-07-01; test rows are dated from
    2023-08-01 through 2024-05-25 (inclusive).

    Returns ``(combined, precision)``: the actual/predicted frame indexed like
    the test rows, and the precision score.
    """
    match_dates = data["Date"]
    train = data[match_dates < '2023-07-01']
    test = data[(match_dates >= "2023-08-01") & (match_dates <= "2024-05-25")]

    gbc.fit(train[predictors], train["Target"])
    preds = gbc.predict(test[predictors])

    combined = pd.DataFrame({"actual": test["Target"], "predicted": preds}, index=test.index)
    return combined, precision_score(test["Target"], preds)


# Evaluate Gradient Boosting model
combined_gbc, precision_gbc = make_gbc_predictions(matches_rolling, predictors + new_cols)
print(f"Gradient Boosting Precision: {precision_gbc:.2f}")

Gradient Boosting Precision: 0.58


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Base learners whose predictions are combined by the meta-model
base_models = [
    ('rf', RandomForestClassifier(random_state=1, n_estimators=50)),
    ('xgb', XGBClassifier(
        random_state=1,
        n_estimators=100,
        use_label_encoder=False,
        eval_metric="logloss",
    )),
    ('lgb', lgb.LGBMClassifier(random_state=1, n_estimators=100, verbose=-1)),
]

# Logistic-regression meta-model stacked on the base learners, using 5-fold cross-validation
stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
)


def make_stacking_predictions(data, predictors):
    """Train the notebook-level ``stack_model`` and score it on the held-out period.

    Training rows are dated before 2023-07-01; test rows are dated from
    2023-08-01 through 2024-05-25 (inclusive).

    Returns ``(combined, precision)``: the actual/predicted frame indexed like
    the test rows, and the precision score.
    """
    match_dates = data["Date"]
    train = data[match_dates < '2023-07-01']
    test = data[(match_dates >= "2023-08-01") & (match_dates <= "2024-05-25")]

    stack_model.fit(train[predictors], train["Target"])
    preds = stack_model.predict(test[predictors])

    combined = pd.DataFrame({"actual": test["Target"], "predicted": preds}, index=test.index)
    return combined, precision_score(test["Target"], preds)


# Evaluate Stacking model
combined_stack, precision_stack = make_stacking_predictions(matches_rolling, predictors + new_cols)
print(f"Stacking Precision: {precision_stack:.2f}")
precision_stack

Stacking Precision: 0.62


0.6201550387596899

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize the Neural Network.
# MLPs are sensitive to feature scale, and the predictors mix small integer codes with
# larger-valued rolling statistics, so the inputs are standardized first. Inside a pipeline
# the scaler is fitted on the training rows only and then applied to the test rows.
mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=1),
)


# Function for MLP predictions
def make_mlp_predictions(data, predictors):
    """Train the notebook-level ``mlp`` pipeline and score it on the held-out period.

    Training rows are dated before 2023-07-01; test rows are dated from
    2023-08-01 through 2024-05-25 (inclusive).

    Returns ``(combined, precision)``: a DataFrame with "actual" and
    "predicted" columns indexed like the test rows, and the precision score.
    """
    train = data[data["Date"] < '2023-07-01']
    test_start_date = "2023-08-01"
    test_end_date = "2024-05-25"
    test = data[(data["Date"] >= test_start_date) & (data["Date"] <= test_end_date)]

    mlp.fit(train[predictors], train["Target"])
    preds = mlp.predict(test[predictors])

    combined = pd.DataFrame(dict(actual=test["Target"], predicted=preds), index=test.index)
    precision = precision_score(test["Target"], preds)
    return combined, precision


# Evaluate Neural Network model
combined_mlp, precision_mlp = make_mlp_predictions(matches_rolling, predictors + new_cols)
print(f"MLP Precision: {precision_mlp:.2f}")
precision_mlp

MLP Precision: 0.65


0.6527777777777778

In [None]:
# Precision of every model evaluated above, keyed by display name
# ("Random Forest" uses `precision` from the rolling-feature make_predictions run)
precision_scores = {
    "Random Forest": precision,
    "XGBoost": precision_xgb,
    "CatBoost": precision_cat,
    "LightGBM": precision_lgb,
    "Gradient Boosting": precision_gbc,
    "Stacking": precision_stack,
    "MLP (Neural Network)": precision_mlp
}

# One row per algorithm
comparison_table = pd.DataFrame({
    "Algorithm": list(precision_scores.keys()),
    "Precision": list(precision_scores.values()),
})

# Best precision first
comparison_table = comparison_table.sort_values("Precision", ascending=False)
comparison_table

Unnamed: 0,Algorithm,Precision
6,MLP (Neural Network),0.652778
5,Stacking,0.620155
3,LightGBM,0.590164
4,Gradient Boosting,0.575269
2,CatBoost,0.565789
0,Random Forest,0.559406
1,XGBoost,0.55914
