<a href="https://colab.research.google.com/github/prasannashrestha011/PL_prediction_2025-26-season/blob/main/Laliga_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [202]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [203]:

data_path="/content/drive/MyDrive/data/laliga"
os.listdir(data_path)
file_path=os.path.join(data_path,'LaLiga_Matches.csv')


In [204]:
import os
import pandas as pd
import matplotlib as plt
import numpy as np
from collections import defaultdict
from typing import Dict,List,Tuple

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [205]:
df=pd.read_csv(file_path)
df.head()

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR
0,1995-96,02-09-1995,La Coruna,Valencia,3,0,H,2.0,0.0,H
1,1995-96,02-09-1995,Sp Gijon,Albacete,3,0,H,3.0,0.0,H
2,1995-96,03-09-1995,Ath Bilbao,Santander,4,0,H,2.0,0.0,H
3,1995-96,03-09-1995,Ath Madrid,Sociedad,4,1,H,1.0,1.0,D
4,1995-96,03-09-1995,Celta,Compostela,0,1,A,0.0,0.0,D


In [206]:
df['HTHG']=df['HTHG'].fillna(np.median(df['HTHG']))
df['HTAG']=df['HTAG'].fillna(np.median(df['HTAG']))
df['HTR']=df['HTR'].fillna(df['HTR'].mode()[0])

df=df.drop(columns=['Date'])

In [207]:

season_matches = [
    (season, season_df)
    for season, season_df in df.groupby('Season')
    if season != '2023-24'
]


In [208]:
def summarise_season(matches:pd.DataFrame)->pd.DataFrame:
  teams:Dict[str,Dict[str,int]]=defaultdict(lambda:{
      "points":0,
      "wins":0,
      "losses":0,
      "draws":0,
      "goals_for":0,
      "goals_against":0,
      "goal_diff":0
  })
  for row,match in matches.iterrows():
      home,away=match['HomeTeam'],match['AwayTeam']
      hg,ag=match['FTHG'],match['FTAG']

      teams[home]["goals_for"]+=hg
      teams[home]["goals_against"]+=ag

      teams[away]["goals_for"]+=ag
      teams[away]["goals_against"]+=hg

      if hg>ag:
        teams[home]["points"]+=3
        teams[home]["wins"]+=1
        teams[away]["losses"]+=1
      elif hg<ag:
        teams[away]["points"]+=3
        teams[away]["wins"]+=1
        teams[home]["losses"]+=1
      else:
        teams[home]["draws"]+=1
        teams[home]["points"]+=1
        teams[away]["draws"]+=1
        teams[away]["points"]+=1
  data=[]
  for team,stats in teams.items():
     goal_diff=stats['goals_for']-stats['goals_against']
     data.append({
         "team":team,
         "points":stats['points'],
         "wins":stats["wins"],
         "losses":stats["losses"],
         "draws":stats["draws"],
         "goals_for":stats["goals_for"],
         "goals_against":stats["goals_against"],
         "goal_diff":goal_diff
     })
  summary=pd.DataFrame(data)
  sorted_summary=summary.sort_values(["wins","goals_for","goal_diff","points"],ascending=[False,False,False,False]).reset_index(drop=True)
  sorted_summary["position"]=sorted_summary.index+1
  return sorted_summary



In [209]:
season_summaries:Dict[str,pd.DataFrame]={}
season_list: List[str] = [
    season for season, _ in season_matches if season != '2023-24'
]
for season,matches in season_matches:
  season_summary=summarise_season(matches)
  season_summaries[season]=season_summary


In [210]:
# preparing training data
X_train=[]
y_train=[]
for i in range(len(season_list)-1):
  prev_season=season_summaries[season_list[i]].copy().set_index('team')
  current_season=season_summaries[season_list[i+1]].copy().set_index('team')

  bottom_three_stats=prev_season.sort_values(["position"],ascending=[False]).head(3)
  default_features=bottom_three_stats.mean().to_dict()

  features=["points","wins","losses","draws","goals_for","goals_against","goal_diff"]
  features_rows=[]
  target_rows=[]
  for team,stats in current_season.iterrows():
    if team in prev_season.index:
       feats=prev_season.loc[team][features].to_dict()
    else:
       feats={k:default_features[k] for k in features}
    features_rows.append(feats)
    target_rows.append(stats['position'])
X_train=pd.DataFrame(features_rows)
y_train=pd.Series(target_rows)

In [211]:
# preparing latest season stats for prediction
latest_season_summary=season_summaries[season_list[-2]].copy().set_index('team')
bottom_three_stats=latest_season_summary.sort_values(["position"],ascending=[False]).head(3)
default_features=bottom_three_stats.mean().to_dict()
staying_teams=latest_season_summary.sort_values(["position"],ascending=[True]).head(16)

promoted_teams=["Almeria","Las Palmas","Burgos"]
latest_season_features=[]
for team in staying_teams.index:
  feats=latest_season_summary.loc[team][features].to_dict()
  latest_season_features.append((team,feats))
for team in promoted_teams:
  feats={k:default_features[k] for k in features}
  latest_season_features.append((team,feats))

latest_season_features=pd.DataFrame([stats for _,stats in latest_season_features],
                                    index=[team for team,_ in latest_season_features])




In [215]:
latest_season_features

Unnamed: 0,points,wins,losses,draws,goals_for,goals_against,goal_diff
Real Madrid,86.0,26.0,4.0,8.0,80.0,31.0,49.0
Barcelona,73.0,21.0,7.0,10.0,68.0,38.0,30.0
Ath Madrid,71.0,21.0,9.0,8.0,65.0,43.0,22.0
Betis,65.0,19.0,11.0,8.0,62.0,40.0,22.0
Sevilla,70.0,18.0,4.0,16.0,53.0,30.0,23.0
Sociedad,62.0,17.0,10.0,11.0,40.0,37.0,3.0
Villarreal,59.0,16.0,11.0,11.0,63.0,37.0,26.0
Ath Bilbao,55.0,14.0,11.0,13.0,43.0,36.0,7.0
Celta,46.0,12.0,16.0,10.0,43.0,43.0,0.0
Osasuna,47.0,12.0,15.0,11.0,37.0,51.0,-14.0


In [212]:
# building pipeling for random forest model
model=Pipeline([
    ('scaler',StandardScaler()),
    ('rf',RandomForestClassifier(
        n_estimators=100,
        max_depth=9,
        class_weight='balanced',
        random_state=42
    ))
])
model.fit(X_train,y_train)

In [214]:
probas=model.predict_proba(latest_season_features)
classes=model.named_steps['rf'].classes_
exp_positions=probas.dot(classes)
predicted_table=pd.DataFrame({"team":latest_season_features.index,
                              "position":exp_positions})
predicted_table=predicted_table.sort_values(['position'],ascending=[True]).reset_index(drop=True)
predicted_table

Unnamed: 0,team,position
0,Barcelona,2.83
1,Real Madrid,3.34
2,Ath Madrid,3.635
3,Sociedad,5.12
4,Villarreal,5.53
5,Betis,5.765
6,Ath Bilbao,8.44
7,Sevilla,9.44
8,Osasuna,9.91
9,Mallorca,10.9275
