<a href="https://colab.research.google.com/github/prasannashrestha011/PL_prediction_/blob/main/pl_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [195]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [196]:
import pandas as pd
from typing import List,Dict,Tuple
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [197]:
def parse_match_result(df:pd.DataFrame)->pd.DataFrame:
  goals=df['FT'].str.split("-",expand=True)
  df=df.copy()
  df['home_goals']=goals[0].astype(int)
  df['away_goals']=goals[1].astype(int)
  return df

In [198]:
def summarise_season(matches:pd.DataFrame)->pd.DataFrame:
  teams:Dict[str,Dict[str,int]]=defaultdict(lambda:{
      "points":0,
      "wins":0,
      "losses":0,
      "draws":0,
      "goals_for":0,
      "goals_against":0,
  })
  for _,row in matches.iterrows():
    home,away=row["Team 1"],row["Team 2"]
    hg,ag=row["home_goals"],row["away_goals"]

    #goals stats for home team
    teams[home]["goals_for"]+=hg
    teams[home]["goals_against"]+=ag
    #goal stats for away team
    teams[away]["goals_for"]+=ag
    teams[away]["goals_against"]+=hg

    if hg>ag:
      teams[home]["points"]+=3
      teams[home]["wins"]+=1
      teams[away]["losses"]+=1
    elif ag>hg:
      teams[away]["points"]+=3
      teams[away]["wins"]+=1
      teams[home]["losses"]+=1
    else:
      teams[home]["points"]+=1
      teams[away]["points"]+=1
      teams[home]["draws"]+=1
      teams[away]["draws"]+=1

    #calculating goal difference
    data=[]
    for team,stats in teams.items():
       goal_diff=stats['goals_for']-stats['goals_for']
       data.append({
           "team":team,
           "points":stats["points"],
           "wins":stats["wins"],
           "losses":stats["losses"],
           "draws":stats["draws"],
           "goals_for":stats["goals_for"],
           "goals_against":stats["goals_against"],
           "goal_diff":goal_diff
       })
    summary=pd.DataFrame(data)
    summary=summary.sort_values(["points","goals_for","goal_diff"],ascending=[False,False,False]).reset_index(drop=True)
    summary["position"]=summary.index+1
  return summary


In [224]:
def prepare_training_data(season_files:List[str])->Tuple[pd.DataFrame,pd.Series,pd.DataFrame]:
  season_summaries:Dict[str,pd.DataFrame]={}
  for file_path in season_files:
    raw = pd.read_csv(file_path)
    parsed = parse_match_result(raw)
    summary = summarise_season(parsed)
    season_summaries[file_path] = summary

  file_sorted = season_files
  for i in range(len(file_sorted) - 1):
    prev_summary = season_summaries[file_sorted[i]].copy().set_index('team')
    current_summary=season_summaries[file_sorted[i+1]].copy().set_index('team')

    #collecting the average value for newly promted teams from relegated teams
    bottom_three=prev_summary.sort_values(['points','goals_for','goal_diff'],ascending=[True,True,True]).head(3)
    default_features=bottom_three.mean().to_dict()

    feature_rows=[]
    target_rows=[]
    for team,row in current_summary.iterrows():
        if team in prev_summary.index:
          feats=prev_summary.loc[team][["points","wins","losses","draws","goals_for","goals_against","goal_diff"]].to_dict()
        else:
          feats={k:default_features[k] for k in ["points","wins","losses","draws","goals_for","goals_against","goal_diff"]}
        feature_rows.append(feats)
        target_rows.append(row["position"])

  X_train=pd.DataFrame(feature_rows)
  y_train=pd.Series(target_rows)

  #features for the latest season to predict the upcoming season positions
  last_summary=season_summaries[file_sorted[-1]].copy().set_index("team")
  default_latest_features=last_summary.sort_values(["points","goals_for","goal_diff"],ascending=[True,True,True]).head(3).mean().to_dict()
  last_summary_sorted=last_summary.sort_values(["points","goals_for","goal_diff"],ascending=[False,False,False]).head(17)
  staying_teams=last_summary_sorted.index.tolist()

  #promoted teams in 2025 season
  promoted = ["Leeds United", "Burnley", "Sunderland"]
  latest_feature_rows=[]
  for team in staying_teams:
          feats = last_summary.loc[team][
              ["points", "wins", "draws", "losses", "goals_for", "goals_against", "goal_diff"]
          ].to_dict()
          latest_feature_rows.append((team, feats))
  for team in promoted:
      feats = {k: default_latest_features[k] for k in [
                  "points", "wins", "draws", "losses", "goals_for", "goals_against", "goal_diff"
          ]}
  feature_columns = ["points", "wins", "losses", "draws", "goals_for", "goals_against", "goal_diff"]

  X_train = pd.DataFrame(feature_rows, columns=feature_columns)

  latest_features_df = pd.DataFrame([feats for _, feats in latest_feature_rows],
                                      index=[t for t, _ in latest_feature_rows], columns=feature_columns)
  return X_train, y_train, latest_features_df









In [200]:
def build_train_model(X:pd.DataFrame,y:pd.Series)->Pipeline:
  model=Pipeline([
      ("scaler",StandardScaler()),
      ("rf",RandomForestClassifier(n_estimators=100,
                                   max_depth=8,
                                   random_state=42,
                                   class_weight="balanced"))
  ])
  model.fit(X,y)
  return model

In [226]:
def predict_pl_table(model:Pipeline,features:pd.DataFrame)->pd.DataFrame:
  probas=model.predict_proba(features)
  classes = model.named_steps["rf"].classes_
  exp_positions = probas.dot(classes)
  prediction_df = pd.DataFrame({
      "team": features.index,
      "expected_position": exp_positions
  })
  # sort teams by lowest expected position (i.e. best finish)
  prediction_df = prediction_df.sort_values("expected_position").reset_index(drop=True)
  # assign integer ranks 1..n based on sorted order
  prediction_df["predicted_rank"] = prediction_df.index + 1
  return prediction_df[["predicted_rank", "team", "expected_position"]]



In [202]:
X_train.columns

Index(['points', 'wins', 'losses', 'draws', 'goals_for', 'goals_against',
       'goal_diff'],
      dtype='object')

In [203]:
import os
data_path="/content/drive/MyDrive/data"
files = os.listdir(data_path)
season_files = [os.path.join(data_path, f) for f in files]

In [229]:
def main():
  X_train,y_train,latest_feature_df=prepare_training_data(season_files)
  model=build_train_model(X_train,y_train)
  df=predict_pl_table(model,latest_feature_df)
  print(df)


main()

    predicted_rank            team  expected_position
0                1         Arsenal           3.010000
1                2        Man City           3.300000
2                3       Liverpool           5.000000
3                4     Aston Villa           6.880000
4                5       Tottenham           7.982000
5                6         Chelsea           8.130000
6                7       Newcastle           9.302000
7                8         Everton           9.360000
8                9  Crystal Palace          10.400000
9               10      Man United          10.492000
10              11          Fulham          10.616667
11              12        Brighton          10.690000
12              13     Bournemouth          10.810000
13              14        West Ham          10.976429
14              15          Wolves          11.916667
15              16       Brentford          13.426667
16              17   Nott'm Forest          13.544000
