<a href="https://colab.research.google.com/github/prasannashrestha011/PL_prediction_2025-26-season/blob/main/ManU_vs_LiverPool_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [108]:
import pandas as pd

In [109]:
import os
data_path="/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files"
season_csv_list=os.listdir(data_path)



#file path for each csv
season_files=[os.path.join(data_path,s) for s in season_csv_list]
print(season_files)


['/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2024.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2023.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2022.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2021.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2020.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2019.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2018.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2017.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2016.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2015.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2014.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2013.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2012.csv'

In [111]:
def parse_date(x):
  try:
    return pd.to_datetime(x,dayfirst=True)
  except:
    return pd.to_datetime(x,dayfirst=True,format="%d/%m/%y")

In [112]:
def get_season(date):
    if date.month >= 8:  # August or later → season starts that year
        return f"{date.year}-{date.year+1}"
    else:  # before August → season started previous year
        return f"{date.year-1}-{date.year}"


In [120]:
def get_last5_stats(df: pd.DataFrame, team: str):
    team_matches = df[(df["HomeTeam"] == team) | (df["AwayTeam"] == team)].sort_values("Date", ascending=False).head(5)

    goals_for = ((team_matches["HomeTeam"] == team) * team_matches["FTHG"] +
                 (team_matches["AwayTeam"] == team) * team_matches["FTAG"]).mean()

    goals_against = ((team_matches["HomeTeam"] == team) * team_matches["FTAG"] +
                     (team_matches["AwayTeam"] == team) * team_matches["FTHG"]).mean()

    wins = ((team_matches["FTR"] == "H") & (team_matches["HomeTeam"] == team) |
            (team_matches["FTR"] == "A") & (team_matches["AwayTeam"] == team)).sum()

    win_rate = wins / len(team_matches)

    return {
        "avg_goals_for_last5": goals_for,
        "avg_goals_against_last5": goals_against,
        "win_rate_last5": win_rate
    }


In [121]:
def add_last5_features(fixtures: pd.DataFrame):
    features = []
    for _, row in fixtures.iterrows():
        home_stats = get_last5_stats(fixtures, row["HomeTeam"])
        away_stats = get_last5_stats(fixtures, row["AwayTeam"])

        features.append({
            **row,
            **{f"home_{k}": v for k, v in home_stats.items()},
            **{f"away_{k}": v for k, v in away_stats.items()}
        })

    return pd.DataFrame(features)


In [125]:
from typing import Dict
#preparing dataset
season_fixtures:Dict[str,pd.DataFrame]={}
selected_teams={"Liverpool","Man United"}
for file_path in season_files:
    raw = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
    # parsing date and sorting them in desending order
    raw["Date"]=raw["Date"].apply(parse_date)
    raw=raw.sort_values(["Date"],ascending=False).reset_index(drop=True)

    #selecting fixtures of Man u and liverpool
    raw=raw[raw[["HomeTeam","AwayTeam"]].apply(set,axis=1)==selected_teams]
    raw['season']=raw['Date'].apply(get_season)

    for season_name, group in raw.groupby('season'):
        season_fixtures[season_name] = group
        season_fixtures[season_name] = add_last5_features(group)




In [129]:
combined=pd.concat(season_fixtures.values(),ignore_index=True)
selected_cols = [
    "season",
    "Date", "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR",          # target info

    # Match performance stats
    "HS", "AS",                     # total shots
    "HST", "AST",                   # shots on target
    "HF", "AF",                     # fouls
    "HC", "AC",                     # corners
    "HY", "AY",                     # yellow cards
    "HR", "AR",                     # red cards

    #last 5 match stats
    "home_avg_goals_for_last5",
    "home_avg_goals_against_last5",
    "home_win_rate_last5",
    "away_avg_goals_for_last5",
    "away_avg_goals_against_last5",
    "away_win_rate_last5",

    # Betting odds
    "B365H", "B365D", "B365A",
    "AvgH", "AvgD", "AvgA"
]

combined=combined[selected_cols]
#filling missing values with their mean
combined[["B365H", "B365D", "B365A", "AvgH", "AvgD", "AvgA"]] = (
    combined[["B365H", "B365D", "B365A", "AvgH", "AvgD", "AvgA"]].fillna(combined.mean(numeric_only=True))
)

combined.head()

Unnamed: 0,season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,...,home_win_rate_last5,away_avg_goals_for_last5,away_avg_goals_against_last5,away_win_rate_last5,B365H,B365D,B365A,AvgH,AvgD,AvgA
0,2024-2025,2025-01-05,Liverpool,Man United,2.0,2.0,D,19.0,13.0,6.0,...,0.5,1.0,2.5,0.0,1.29,6.25,8.0,1.29,6.05,9.49
1,2024-2025,2024-09-01,Man United,Liverpool,0.0,3.0,A,8.0,11.0,3.0,...,0.0,2.5,1.0,0.5,3.7,4.2,1.85,3.82,4.2,1.85
2,2023-2024,2024-04-07,Man United,Liverpool,2.0,2.0,D,9.0,28.0,5.0,...,0.0,1.0,1.0,0.0,4.75,4.5,1.6,4.69,4.65,1.63
3,2023-2024,2023-12-17,Liverpool,Man United,0.0,0.0,D,34.0,6.0,8.0,...,0.0,1.0,1.0,0.0,1.3,6.0,8.0,1.32,6.09,8.5
4,2022-2023,2023-03-05,Liverpool,Man United,7.0,0.0,H,18.0,8.0,8.0,...,0.5,1.0,4.0,0.5,2.38,3.6,2.88,2.4,3.68,2.85
