In [1]:
import pandas as pd
import numpy as np
import requests
import time
from io import StringIO
from bs4 import BeautifulSoup
import re

In [2]:
# Scrape the FBref "Big 5 European Leagues" season index page and build the
# per-season player-stats URL for every season listed in `desired_seasons`.
url = "https://fbref.com/en/comps/Big5/history/Big-5-European-Leagues-Seasons"
data = requests.get(url, timeout=30)
data.raise_for_status()  # surface 4xx/5xx responses instead of parsing an error page
soup = BeautifulSoup(data.text, "html.parser")
table = soup.find("table", {"id": "seasons"})
if table is None:
    raise ValueError(f"No table with id 'seasons' found at {url}")
table_rows = table.find_all("tr")

desired_seasons = ["2023-2024", "2022-2023", "2021-2022"]
seasons_pattern = "|".join(desired_seasons)

# The alternation MUST be grouped: without the parentheses the pattern reads
# "/2023-2024" OR "2022-2023" OR "2021-2022/", so the slashes only anchor the
# first and last alternatives.
season_regex = re.compile(rf"/({seasons_pattern})/")

# Debug: Print number of rows found
print(f"Total season rows found: {len(table_rows)}")

# Initialize list for final URLs
season_stats_urls = []

# Process rows to filter and build URLs (pure string work, no requests are
# made here, so no delay is needed between iterations)
for row in table_rows:
    # Find the first <a> tag in the row that carries an href attribute
    a_tag = row.find("a", href=True)
    if a_tag is None:
        continue
    row_href = a_tag["href"]

    # Keep only links whose path contains one of the desired seasons
    season = season_regex.search(row_href)
    if season is None:
        continue
    print(f"Match found for season: {season.group(1)}")  # Debug

    # Insert 'stats/players' in front of the last path segment
    modified_href = re.sub(r"(/[^/]+)$", r"/stats/players\1", row_href)
    full_url = f"https://fbref.com{modified_href}"
    season_stats_urls.append(full_url)

    print(f"Final URL: {full_url}")  # Debug: Print each final URL

# Debug: Print all final URLs before extraction
print(f"All season stats URLs: {len(season_stats_urls)}\n{season_stats_urls}")

# Extend the list with other leagues and seasons.
# NOTE: order matters: extract_data_from_url() only scrapes a competition name
# from the page heading for URL indices >= 3, i.e. for the entries added below.
season_stats_urls.extend(
    [
        "https://fbref.com/en/comps/10/2023-2024/stats/2023-2024-Championship-Stats",
        "https://fbref.com/en/comps/10/2022-2023/stats/2022-2023-Championship-Stats",
        "https://fbref.com/en/comps/10/2021-2022/stats/2021-2022-Championship-Stats",
        "https://fbref.com/en/comps/23/2023-2024/stats/2023-2024-Eredivisie-Stats",
        "https://fbref.com/en/comps/23/2022-2023/stats/2022-2023-Eredivisie-Stats",
        "https://fbref.com/en/comps/23/2021-2022/stats/2021-2022-Eredivisie-Stats",
    ]
)

Total season rows found: 31
Match found for season: /2023-2024
Final URL: https://fbref.com/en/comps/Big5/2023-2024/stats/players/2023-2024-Big-5-European-Leagues-Stats
Match found for season: 2022-2023
Final URL: https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats
Match found for season: 2021-2022/
Final URL: https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats
Total season stats URLs: 3
All season stats URLs: 3
['https://fbref.com/en/comps/Big5/2023-2024/stats/players/2023-2024-Big-5-European-Leagues-Stats', 'https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats', 'https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats']


In [3]:
def extract_data_from_url(index, url):
    """Download one FBref stats page and return its 'stats_standard' table.

    Parameters
    ----------
    index : int
        Position of ``url`` in the caller's URL list. For ``index >= 3`` the
        competition name is taken from the last word of the first
        ``<h2><span>`` on the page; for lower indices ``Competition`` is
        ``pd.NA`` (the notebook places the three Big-5 pages at indices 0-2).
    url : str
        Page URL. It must contain a ``/YYYY-YYYY/`` path segment, which is
        used as the ``Season`` value.

    Returns
    -------
    pandas.DataFrame
        The table with flattened column names, repeated header rows removed,
        added ``Fbref`` / ``Season`` / ``Competition`` columns (placed right
        after the first column), a ``Comp`` column (``pd.NA`` if the page has
        none), and a few columns renamed to short names. An empty DataFrame
        is returned if anything goes wrong; the error is printed, not raised.
    """
    try:
        # Validate the URL before spending a network request on it.
        season_match = re.search(r"/(\d{4}-\d{4})/", url)
        if season_match is None:
            raise ValueError(f"no /YYYY-YYYY/ season segment in URL: {url}")
        season = season_match.group(1)

        response = requests.get(url, timeout=30)
        response.raise_for_status()  # e.g. rate limiting would otherwise look like a missing table
        # Strip HTML comment markers so markup wrapped in comments is parsed
        # as regular elements.
        data = response.text.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(data, "html.parser")

        if index >= 3:
            h2_element = soup.find("h2")
            span_element = h2_element.find("span") if h2_element else None
            competition = (
                span_element.get_text().split()[-1] if span_element else pd.NA
            )
        else:
            competition = pd.NA

        table = soup.find("table", {"id": "stats_standard"})
        if table is None:
            raise ValueError("table with id 'stats_standard' not found in page")

        # Player identifiers, one per player cell, in table order.
        ids = [
            x["data-append-csv"] for x in table.find_all("td", {"data-stat": "player"})
        ]

        df = pd.read_html(StringIO(str(table)))[0]
        # Flatten the two-level header: "<group> <name>", or just "<name>"
        # when the top level is an auto-generated "Unnamed" placeholder.
        df.columns = [f"{i} {j}" if "Unnamed" not in i else j for i, j in df.columns]
        # Drop header rows repeated inside the table body; copy so the column
        # assignments below act on an independent frame, not a slice.
        df = df[df["Rk"] != "Rk"].copy()
        df["Fbref"] = ids  # raises (and is reported below) if the lengths differ
        df["Season"] = season
        df["Competition"] = competition

        if "Comp" not in df.columns:
            df["Comp"] = pd.NA

        # Move the added columns directly behind the first column.
        cols_to_move = ["Fbref", "Season", "Competition"]
        remaining_cols = [col for col in df.columns if col not in cols_to_move]
        df = df[remaining_cols[:1] + cols_to_move + remaining_cols[1:]]
        df = df.rename(
            {
                "Playing Time MP": "MP",
                "Playing Time Starts": "Starts",
                "Playing Time Min": "Min",
                "Playing Time 90s": "90s",
                "Performance G-PK": "Total_npG",
                "Expected npxG": "Total_npxG",
                "Per 90 Minutes npxG": "npxG",
                "Per 90 Minutes xAG": "xAG",
            },
            axis=1,
        )

        return df

    except Exception as e:
        # Deliberate best-effort: one failing page must not abort the whole run.
        print(f"An error occurred for URL index {index}: {e}")
        return pd.DataFrame()

In [4]:
# Debug: Print all final URLs before extraction
print(f"All season stats URLs: {len(season_stats_urls)}\n{season_stats_urls}")

# Collects one DataFrame per successfully scraped page.
all_dfs = []

# Fetch the pages one at a time, pausing after each request.
for index, url in enumerate(season_stats_urls):
    df = extract_data_from_url(index, url)

    if df.empty:
        # extract_data_from_url returns an empty frame when it hit an error.
        print(f"Empty DataFrame for URL index {index}")
    else:
        all_dfs.append(df)
        print(f"Data successfully extracted for URL index {index}")

    time.sleep(3)

All season stats URLs: 9
['https://fbref.com/en/comps/Big5/2023-2024/stats/players/2023-2024-Big-5-European-Leagues-Stats', 'https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats', 'https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats', 'https://fbref.com/en/comps/10/2023-2024/stats/2023-2024-Championship-Stats', 'https://fbref.com/en/comps/10/2022-2023/stats/2022-2023-Championship-Stats', 'https://fbref.com/en/comps/10/2021-2022/stats/2021-2022-Championship-Stats', 'https://fbref.com/en/comps/23/2023-2024/stats/2023-2024-Eredivisie-Stats', 'https://fbref.com/en/comps/23/2022-2023/stats/2022-2023-Eredivisie-Stats', 'https://fbref.com/en/comps/23/2021-2022/stats/2021-2022-Eredivisie-Stats']
Data successfully extracted for URL index 0
Data successfully extracted for URL index 1
Data successfully extracted for URL index 2
Data successfully extracted for URL index 3
Data successfully extracted for URL index 

In [8]:
# Inspect the list of per-URL DataFrames collected by the extraction loop.
all_dfs

[        Rk     Fbref     Season Competition             Player   Nation  \
 0        1  774cf58b  2023-2024        <NA>         Max Aarons  eng ENG   
 1        2  5bc43860  2023-2024        <NA>   Brenden Aaronson   us USA   
 2        3  4cd41883  2023-2024        <NA>    Paxten Aaronson   us USA   
 3        4  7310786d  2023-2024        <NA>  Keyliane Abdallah   fr FRA   
 4        5  32c2d95f  2023-2024        <NA>   Yunis Abdelhamid   ma MAR   
 ...    ...       ...        ...         ...                ...      ...   
 2960  2848  253bd2b3  2023-2024        <NA>     Lovro Zvonarek   hr CRO   
 2961  2849  79300479  2023-2024        <NA>    Martin Ødegaard   no NOR   
 2962  2850  405f6586  2023-2024        <NA>        Milan Đurić   ba BIH   
 2964  2851  405f6586  2023-2024        <NA>        Milan Đurić   ba BIH   
 2965  2852  75c62731  2023-2024        <NA>   Mateusz Łęgowski   pl POL   
 
         Pos           Squad                Comp Age  ... Per 90 Minutes Ast  \
 0    

In [34]:
# Columns carried forward from the scraped tables, in output order.
kept_columns = [
    "Fbref",
    "Season",
    "Player",
    "Squad",
    "Competition",
    "Comp",
    "Age",
    "MP",
    "Starts",
    "Min",
    "90s",
    "Total_npG",
    "Total_npxG",
    "npxG",
    "xAG",
]

# Target dtypes for the numeric columns.
numeric_dtypes = {
    "MP": "int",
    "Starts": "int",
    "Min": "int",
    "90s": "float64",
    "Total_npG": "int",
    "Total_npxG": "float64",
    "npxG": "float64",
    "xAG": "float64",
}

# Stack all per-page frames, keep the columns of interest and cast them.
players_typed = pd.concat(all_dfs)[kept_columns].astype(numeric_dtypes)

# "Comp" keeps only the text after its first space; rows where that is
# missing fall back to the separately scraped "Competition" value.
comp_name = players_typed["Comp"].str.split(" ", n=1).str[1]
comp_name = comp_name.fillna(players_typed["Competition"])

all_players_prev_seasons = players_typed.assign(Comp=comp_name).drop(
    columns=["Competition"]
)

In [35]:
# Independent copy holding only the columns needed for the finishing metric.
finishing_columns = [
    "Fbref",
    "Season",
    "Player",
    "Squad",
    "Comp",
    "Total_npG",
    "Total_npxG",
]
finishing_df = all_players_prev_seasons[finishing_columns].copy()

In [36]:
# Collapse to one row per player: sum non-penalty goals and non-penalty xG
# over every row sharing the same (Fbref, Player) pair.
player_totals = finishing_df.groupby(["Fbref", "Player"])[
    ["Total_npG", "Total_npxG"]
].sum()
finishing_df = player_totals.reset_index()

In [37]:
# Smoothed goals-to-xG ratio: the same constant is added to numerator and
# denominator, which pulls the ratio toward 1 for players with small totals.
# NOTE(review): the rationale for the value 55 is not shown here. Confirm.
smoothing_constant = 55
smoothed_goals = finishing_df["Total_npG"] + smoothing_constant
smoothed_xg = finishing_df["Total_npxG"] + smoothing_constant
finishing_df["finishing"] = (smoothed_goals / smoothed_xg).round(2)

In [44]:
# Show the 50 players with the highest finishing ratio.
ranked_by_finishing = finishing_df.sort_values(by="finishing", ascending=False)
print(ranked_by_finishing.head(50))

         Fbref             Player  Total_npG  Total_npxG  finishing
5480  e46012d4    Kevin De Bruyne         26        13.7       1.18
3882  a26fb8aa   Teun Koopmeiners         22        11.0       1.17
5697  ed1e53f3         Phil Foden         39        25.6       1.17
3864  a167537f      Julian Brandt         25        14.5       1.15
3252  8790f988  Gianluca Scamacca         30        19.2       1.15
1970  5148dcc5     Martin Terrier         35        23.1       1.15
1290  3515d404  James Ward-Prowse         18         9.3       1.14
5240  db7849ca     Franck Honorat         20        10.7       1.14
458   135d500d            Gabriel         20        11.1       1.13
3510  92e7e919      Son Heung-min         48        36.4       1.13
5149  d7f99582      Benson Manuel         11         4.1       1.12
247   0ae2b36f   Arno Verschueren         20        11.9       1.12
5057  d3d49142   Morgan Whittaker         20        12.2       1.12
1616  42fd9c7f      Kylian Mbappé         71    