<a href="https://colab.research.google.com/github/prasannashrestha011/PL_prediction_2025-26-season/blob/main/ManU_vs_LiverPool_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import pandas as pd

In [4]:
import os
data_path="/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files"
season_csv_list=os.listdir(data_path)



#file path for each csv
season_files=[os.path.join(data_path,s) for s in season_csv_list]
print(season_files)


['/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2024.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2023.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2022.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2021.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2020.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2019.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2018.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2017.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2016.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2015.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2014.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2013.csv', '/content/drive/MyDrive/data/premier_league_2000_t_2025/csv_files/2012.csv'

In [5]:
def parse_date(x):
  try:
    return pd.to_datetime(x,dayfirst=True)
  except:
    return pd.to_datetime(x,dayfirst=True,format="%d/%m/%y")

In [6]:
def get_season(date):
    if date.month >= 8:  # August or later → season starts that year
        return f"{date.year}-{date.year+1}"
    else:  # before August → season started previous year
        return f"{date.year-1}-{date.year}"


In [7]:
def get_last5_stats(df:pd.DataFrame,team:str):
   team_matches5=df[(df["HomeTeam"]==team)|(df["AwayTeam"]==team)].sort_values(['Date'],ascending=False).head()
   goals_for=(  (team_matches5['HomeTeam']==team) * team_matches5['FTHG']   +
               (team_matches5['AwayTeam']==team) * team_matches5['FTAG'] ).mean()

   goals_against=((team_matches5['HomeTeam']==team) * team_matches5['FTAG']+
                   (team_matches5['AwayTeam']==team) * team_matches5['FTHG']).mean()

   avg_corners=((team_matches5['HomeTeam']==team)* team_matches5['HC']+
                (team_matches5['AwayTeam']==team)* team_matches5['AC']).mean()
   avg_redcards=((team_matches5['HomeTeam']==team) * team_matches5['HR']+
                 (team_matches5['AwayTeam']==team) * team_matches5['AR']).mean()
   # win rate in last 5 matches
   wins=((team_matches5['HomeTeam']==team) & (team_matches5['FTR']=='H') |
         (team_matches5['AwayTeam']==team)& (team_matches5['FTR']=='A')).sum()
   win_rate=wins/len(team_matches5)

   draws=((team_matches5["HomeTeam"]==team) & (team_matches5['FTR']=='D') |
          (team_matches5["AwayTeam"]==team) & (team_matches5['FTR']=='D')).sum()
   draw_rate=draws/len(team_matches5)

   return{
       "avg_goals_for_last5":goals_for,
       "avg_goals_against_last5":goals_against,
       "avg_corners_last5":avg_corners,
       "avg_redcards_last5":avg_redcards,
       "win_rate_last5":win_rate,
       "draw_rate_last5":draw_rate
   }


In [8]:
def add_last5_features(fixtures:pd.DataFrame)->pd.DataFrame:
    features=[]
    for _,row in fixtures.iterrows():
       home_stats_last5=get_last5_stats(fixtures,row['HomeTeam'])
       away_stats_last5=get_last5_stats(fixtures,row['AwayTeam'])

       features.append({
           **row,
           **{f"h_{k}": v for k,v in home_stats_last5.items()},
           **{f"a_{k}":v for k, v in away_stats_last5.items()},

       })
    return pd.DataFrame(features)


In [9]:
#calculating head to head stats between two teams in home and away fixtures
def get_h2h_win_rates(df,row):

  home=row["HomeTeam"] #liverpool
  away=row["AwayTeam"] #arsenal
  date=row["Date"]
  past_matches=df[(df["Date"]<date) &
             ((df["HomeTeam"]==home) & (df["AwayTeam"]==away)) |
             ((df["AwayTeam"]==home) & (df["HomeTeam"]==away))]
  if past_matches.empty:
    return {"home_away_win_rate": 0.5, "away_home_win_rate": 0.5}
  #home team h2h win rate
  home_wins=(((past_matches["HomeTeam"]==home) &(past_matches["FTR"]=='H'))
                |
             ((past_matches["AwayTeam"]==home) & (past_matches["FTR"]=='A'))).sum()
  #away team h2h win rate
  away_wins=(((past_matches["HomeTeam"]==away) & (past_matches["FTR"]=='H'))
                |
             ((past_matches["AwayTeam"]==away) & (past_matches["FTR"]=='A'))).sum()

  total_matches=len(past_matches)

  return {
      "home_away_win_rate":home_wins/total_matches,
      "away_home_win_rate":away_wins/total_matches
  }


In [10]:
from typing import Dict
#preparing dataset
season_fixtures:Dict[str,pd.DataFrame]={}
for file_path in season_files:
    raw = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
    # parsing date and sorting them in desending order
    raw["Date"]=raw["Date"].apply(parse_date)
    raw=raw.sort_values(["Date"],ascending=False).reset_index(drop=True)

    #selecting fixtures of Man u and liverpool only

    raw['season']=raw['Date'].apply(get_season)

    for season_name, group in raw.groupby('season'):
        season_fixtures[season_name] = group
        season_fixtures[season_name] = add_last5_features(group)




  win_rate=wins/len(team_matches5)
  draw_rate=draws/len(team_matches5)


In [11]:

def calculate_h2h_win_rates(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate head-to-head win rates for each match."""

    h2h: Dict = {}
    home_rates, away_rates,draw_rates = [], [], []

    for _, row in df.iterrows():
        home, away = row["HomeTeam"], row["AwayTeam"]
        key = tuple(sorted([home, away]))


        stats = h2h.get(key, {"team1": 0, "team2": 0,"draws":0, "matches": 0})
        total = stats["matches"]

        # Compute current H2H rates
        if total == 0:
            home_rate = away_rate = 0.5
            draw_rate=0.33
        else:
            if home == key[0]:
                home_rate = stats["team1"] / total
                away_rate = stats["team2"] / total
            else:
                home_rate = stats["team2"] / total
                away_rate = stats["team1"] / total
            draw_rate=stats["draws"]/total


        home_rates.append(home_rate)
        away_rates.append(away_rate)
        draw_rates.append(draw_rate)

        # Update stats with current match result
        stats["matches"]+=1
        if row["FTR"]=='H':
           stats["team1" if home==key[0] else "team2"]+=1
        elif row["FTR"]=='A':
           stats["team2" if home==key[0] else "team1"]+=1
        else:
          stats["draws"]+=1
        h2h[key] = stats

    # Assign to dataframe
    df["home_vs_away_winrate"] = home_rates
    df["away_vs_home_winrate"] = away_rates
    df["h2h_draw_rate"]=draw_rates
    return df


In [12]:
combined=pd.concat(season_fixtures.values(),ignore_index=True)
shot_eff=pd.DataFrame({
    "home_shot_effiency":combined["HST"].div(combined["HS"]).fillna(0),
    "away_shot_effiency":combined["AST"].div(combined["AS"]).fillna(0)
})

combined=pd.concat([combined,shot_eff],axis=1)
combined=calculate_h2h_win_rates(combined)
combined.sort_values(["season","Date"],ascending=False,inplace=True)
selected_cols = [


    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG",
    # head to head win rates
    "home_vs_away_winrate",
    "away_vs_home_winrate",
    "h2h_draw_rate",
    # Match performance stats
    "home_shot_effiency",
    "away_shot_effiency",


    #last 5 match stats for home and away team
    "h_avg_goals_for_last5",
    "h_avg_goals_against_last5",
    "h_win_rate_last5",
    "a_avg_goals_for_last5",
    "a_avg_goals_against_last5",
    "a_win_rate_last5",

    #discipline and intensity of the match
    "h_avg_corners_last5",
    "a_avg_corners_last5",
    "h_avg_redcards_last5",
    "a_avg_redcards_last5",

    # Betting odds
    "AvgH", "AvgD", "AvgA",


    "FTR"
]

combined=combined[selected_cols]
#filling missing values with their mean
combined[["AvgH", "AvgD", "AvgA"]] = (
    combined[["AvgH", "AvgD", "AvgA"]].fillna(combined.mean(numeric_only=True))
)


combined.reset_index(drop=True, inplace=True)


combined.dropna(subset=["HomeTeam","AwayTeam","FTHG","FTAG","FTR"], inplace=True)
combined.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,home_vs_away_winrate,away_vs_home_winrate,h2h_draw_rate,home_shot_effiency,away_shot_effiency,h_avg_goals_for_last5,...,a_avg_goals_against_last5,a_win_rate_last5,h_avg_corners_last5,a_avg_corners_last5,h_avg_redcards_last5,a_avg_redcards_last5,AvgH,AvgD,AvgA,FTR
1,Brentford,Man City,0.0,1.0,0.25,0.625,0.125,0.166667,0.4,1.4,...,0.8,0.6,5.2,3.6,0.0,0.0,4.73,4.27,1.64,A
2,Wolves,Brighton,1.0,1.0,0.142857,0.428571,0.428571,0.5,0.352941,1.0,...,1.4,0.4,4.2,4.8,0.0,0.0,3.67,3.6,1.99,D
3,Newcastle,Nott'm Forest,2.0,0.0,0.833333,0.166667,0.0,0.5,0.8,0.8,...,2.0,0.0,6.4,5.2,0.0,0.0,1.58,4.21,5.46,H
4,Everton,Crystal Palace,2.0,1.0,0.5,0.115385,0.384615,0.5,0.533333,1.4,...,0.8,0.6,4.2,4.0,0.0,0.0,2.53,3.21,2.89,H
5,Aston Villa,Burnley,2.0,1.0,0.416667,0.166667,0.416667,0.466667,0.4,1.2,...,2.4,0.0,5.2,2.4,0.0,0.0,1.59,4.0,5.76,H


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import numpy as np

X_cols = [
    # Head-to-head win rates
    "home_vs_away_winrate","away_vs_home_winrate","home_shot_effiency","away_shot_effiency",
    "h_avg_goals_for_last5","h_avg_goals_against_last5","h_win_rate_last5","a_avg_goals_for_last5","a_avg_goals_against_last5","a_win_rate_last5",
    "h_avg_corners_last5","a_avg_corners_last5","h_avg_redcards_last5","a_avg_redcards_last5",
    "AvgH", "AvgD", "AvgA"
]
y_cols='FTR'
X=combined[X_cols].reset_index(drop=True)
y=combined[y_cols].reset_index(drop=True)

scaler=StandardScaler()
lr=LabelEncoder()

X=X.replace([np.inf,-np.inf],np.nan)
X=X.fillna(0.5)

#scaling X
X_scaled=scaler.fit_transform(X)

#encoding y
y=lr.fit_transform(y)




In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


rf=RandomForestClassifier(    n_estimators=300,          # more trees
    max_depth=12,              # limit tree depth
    min_samples_split=5,       # prevent splits with too few samples
    min_samples_leaf=2,        # leaf must have at least 2 samples
    max_features='sqrt',       # sqrt(num_features) for split
    class_weight='balanced',   # handle class imbalance
    random_state=42)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5527426160337553
              precision    recall  f1-score   support

           0       0.55      0.60      0.57       582
           1       0.29      0.21      0.24       457
           2       0.64      0.70      0.67       857

    accuracy                           0.55      1896
   macro avg       0.50      0.50      0.50      1896
weighted avg       0.53      0.55      0.54      1896

