<a href="https://colab.research.google.com/github/saampaark/PL-Football-Predictor/blob/main/predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Loading and preprocessing data
matches = pd.read_csv('matches.csv', index_col=0)
matches["date"] = pd.to_datetime(matches["date"])
# Categorical text columns are replaced by their integer category codes
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
# Drop everything from the first ':' onward and keep the leading part as an integer
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
# Binary target: 1 where result is "W", 0 otherwise
matches["target"] = (matches["result"] == "W").astype("int")

# Initializing the RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Splitting data into training and testing sets based on the date.
# The test side uses >= so that matches dated exactly 2022-01-01 are not
# silently dropped from both sets.
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] >= '2022-01-01']

# List of predictor columns
predictors = ["venue_code", "opp_code", "hour", "day_code"]

# Training RandomForestClassifier on the training data
rf.fit(train[predictors], train["target"])
preds = rf.predict(test[predictors])

# Actual vs. predicted outcome for every test row, and the resulting confusion table
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
confusion = pd.crosstab(index=combined["actual"], columns=combined["prediction"])

# Precision of the baseline model (kept under its own name so the later
# `precision` assignment does not overwrite it)
baseline_precision = precision_score(test["target"], preds)

# Adds, for each column in `cols`, the mean of that column over the three preceding rows (by date)
def rolling_averages(group, cols, new_cols):
  """Return a date-sorted copy of ``group`` extended with rolling means.

  Args:
    group: DataFrame with a "date" column and every column named in ``cols``.
    cols: names of the columns to average.
    new_cols: names of the columns that receive the averages, in the same
      order as ``cols``.

  Returns:
    A new DataFrame sorted by "date". Rows whose rolling window is
    incomplete (NaN in any of ``new_cols``) are removed. The input frame
    is left unchanged.
  """
  ordered = group.sort_values("date")
  # closed='left' keeps the current row out of its own window, so each
  # average is built only from the rows that come before it.
  ordered[new_cols] = ordered[cols].rolling(3, closed='left').mean()
  return ordered.dropna(subset=new_cols)

# Source columns that get a rolling average
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]

# Each rolling-average column is named after its source column plus "_rolling"
new_cols = [name + "_rolling" for name in cols]

# Run rolling_averages once per team, then discard the "team" index level
# added by the groupby and renumber the rows 0..n-1
matches_rolling = (
    matches.groupby("team")
    .apply(rolling_averages, cols, new_cols)
    .droplevel('team')
    .reset_index(drop=True)
)

# Function to train the model and make predictions
def make_predictions(data, predictors, cutoff='2022-01-01'):
  """Fit the module-level ``rf`` on rows before ``cutoff`` and score it on the rest.

  Args:
    data: DataFrame with a "date" column, a "target" column and every
      column named in ``predictors``.
    predictors: list of feature column names passed to the model.
    cutoff: first date of the test period. Rows dated before it are used
      for training, rows dated on or after it for testing.

  Returns:
    A ``(combined, precision)`` tuple. ``combined`` is a DataFrame indexed
    like the test rows with "actual" and "prediction" columns;
    ``precision`` is ``precision_score`` of the predictions against the
    actual targets.

  Note:
    The shared ``rf`` estimator is refitted in place on every call.
  """
  train = data[data["date"] < cutoff]
  # >= (not >) so rows dated exactly on the cutoff end up in the test set
  # instead of being dropped from both splits.
  test = data[data["date"] >= cutoff]
  rf.fit(train[predictors], train["target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
  precision = precision_score(test["target"], preds)
  return combined, precision

# Re-run the model with the rolling-average columns added to the original predictors
combined, precision = make_predictions(matches_rolling, [*predictors, *new_cols])

# Attach date, team, opponent and result to each prediction row; rows are
# matched on the index shared with matches_rolling
combined = combined.merge(
    matches_rolling[["date", "team", "opponent", "result"]],
    right_index=True,
    left_index=True,
)

# Class to handle missing values in dictionary mapping
class MissingDict(dict):
  """dict that answers a subscript lookup for an absent key with the key itself.

  Only ``d[key]`` is affected; ``key in d`` and ``d.get(key)`` behave as
  for a plain dict, and a missing key is never inserted.
  """

  def __missing__(self, key):
    # Invoked by dict.__getitem__ when `key` is not present.
    return key

# Long team names and the shortened spelling each one is replaced with
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

# Names that are not listed above pass through unchanged
mapping = MissingDict(map_values)

# Pair every row with the row for the other side of the same match: the
# left row's (date, new_team) must equal the right row's (date, opponent)
combined["new_team"] = combined["team"].map(mapping)
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

# Among pairs where the left side is predicted to win and the right side is
# not, count how often the left side actually won
merged.loc[(merged["prediction_x"] == 1) & (merged["prediction_y"] == 0), "actual_x"].value_counts()