# Downloading the dataset from FBref

## Installing the required libraries

In [213]:
# Use the %pip magic so the packages are installed into the environment of the running kernel
%pip install cloudscraper beautifulsoup4 pandas



In [214]:
from io import StringIO

import cloudscraper
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment # Import Comment

# Defining a function to scrape the data

In [215]:
def scrape_league_stats(url, league_name):
    """Scrape the 'stats_standard' player table from one league page.

    The page is expected to carry the table inside an HTML comment located in
    the ``div`` with id ``all_stats_standard``; the comment text is therefore
    parsed as a separate HTML document before the table is read.

    Parameters
    ----------
    url : str
        Address of the league stats page to download.
    league_name : str
        Label stored in the added ``League`` column and used in messages.

    Returns
    -------
    pandas.DataFrame or None
        The table with flattened column names and a ``League`` column, or
        ``None`` (after printing the reason) when the request is rejected or
        the div, comment or table cannot be found.
    """
    # Fetch the page
    scraper = cloudscraper.create_scraper()
    response = scraper.get(url)
    if not response.ok:
        # A blocked or failed request has no stats div; report the real cause
        # instead of the misleading "div not found" message.
        print(f"Request for {league_name} failed with HTTP status {response.status_code}.")
        return None
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the div that contains the commented table
    div = soup.find('div', id='all_stats_standard')
    if div is None:
        print(f"Div with id='all_stats_standard' not found for {league_name}.")
        return None

    # Extract the first HTML comment inside the div
    comment = div.find(string=lambda text: isinstance(text, Comment))
    if not comment:
        print(f"No comment found in the div for {league_name}.")
        return None

    # Parse the comment text as its own document and locate the table in it
    table_soup = BeautifulSoup(comment, 'html.parser')
    table = table_soup.find('table', id='stats_standard')
    if table is None:
        print(f"Table with id='stats_standard' not found in comment for {league_name}.")
        return None

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    # Flatten multi-level columns into single "<top> <bottom>" strings
    df.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]
    # Add a 'League' column for identification
    df['League'] = league_name
    print(f"Scraped {len(df)} rows for {league_name}")
    return df

In [216]:
# List of leagues and their FBref URLs (2024-2025 season)
leagues = [
    {'name': 'Premier League', 'url': 'https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats'},
    {'name': 'La Liga', 'url': 'https://fbref.com/en/comps/12/2024-2025/stats/2024-2025-La-Liga-Stats'},
    {'name': 'Bundesliga', 'url': 'https://fbref.com/en/comps/20/2024-2025/stats/2024-2025-Bundesliga-Stats'},
    {'name': 'Serie A', 'url': 'https://fbref.com/en/comps/11/2024-2025/stats/2024-2025-Serie-A-Stats'},
    {'name': 'Ligue 1', 'url': 'https://fbref.com/en/comps/13/2024-2025/stats/2024-2025-Ligue-1-Stats'},
    {'name': 'Primeira Liga', 'url': 'https://fbref.com/en/comps/32/2024-2025/stats/2024-2025-Primeira-Liga-Stats'}  # For Gyökeres
]

# Scrape all leagues, remembering which ones returned nothing
all_dfs = []
failed_leagues = []
for league in leagues:
    df_league = scrape_league_stats(league['url'], league['name'])
    if df_league is not None:
        all_dfs.append(df_league)
    else:
        failed_leagues.append(league['name'])

# Make partial results visible instead of silently building an incomplete dataset
if failed_leagues:
    print(f"Warning: no data scraped for: {', '.join(failed_leagues)}")

# pd.concat raises a cryptic ValueError on an empty list; fail with a clear message instead
if not all_dfs:
    raise ValueError("No league tables were scraped; nothing to combine.")

# Combine all league DataFrames into one
combined_df = pd.concat(all_dfs, ignore_index=True)
print(f"Combined DataFrame shape: {combined_df.shape}")
combined_df.head()

  df = pd.read_html(str(table))[0]


Scraped 596 rows for Premier League


  df = pd.read_html(str(table))[0]


Scraped 625 rows for La Liga


  df = pd.read_html(str(table))[0]


Scraped 511 rows for Bundesliga


  df = pd.read_html(str(table))[0]


Scraped 659 rows for Serie A


  df = pd.read_html(str(table))[0]


Scraped 575 rows for Ligue 1


  df = pd.read_html(str(table))[0]


Scraped 608 rows for Primeira Liga
Combined DataFrame shape: (3574, 38)


Unnamed: 0,Unnamed: 0_level_0 Rk,Unnamed: 1_level_0 Player,Unnamed: 2_level_0 Nation,Unnamed: 3_level_0 Pos,Unnamed: 4_level_0 Squad,Unnamed: 5_level_0 Age,Unnamed: 6_level_0 Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Unnamed: 36_level_0 Matches,League
0,1,Max Aarons,eng ENG,DF,Bournemouth,24,2000,3,1,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches,Premier League
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,18,2006,4,2,170,...,0.0,0.0,0.0,0.12,0.0,0.12,0.12,0.12,Matches,Premier League
2,3,Tyler Adams,us USA,MF,Bournemouth,25,1999,28,21,1965,...,0.14,0.0,0.14,0.07,0.05,0.12,0.07,0.12,Matches,Premier League
3,4,Tosin Adarabioyo,eng ENG,DF,Chelsea,26,1997,22,15,1409,...,0.13,0.06,0.13,0.06,0.01,0.07,0.06,0.07,Matches,Premier League
4,5,Simon Adingra,ci CIV,"FW,MF",Brighton,22,2002,29,12,1097,...,0.33,0.16,0.33,0.2,0.2,0.4,0.2,0.4,Matches,Premier League


In [217]:
# NOTE(review): `ballondor_nominees` is not defined in any cell shown in this notebook.
# It is presumably a list of player-name strings; define it in an earlier cell so that
# Restart Kernel -> Run All works.

# Filter for nominees (case-insensitive exact match on player name)
player_names_lower = combined_df['Unnamed: 1_level_0 Player'].str.lower()
nominee_names_lower = {p.lower() for p in ballondor_nominees}
combined_nominees = combined_df[player_names_lower.isin(nominee_names_lower)]

# Report nominees without any matching row (e.g. a spelling or accent difference),
# because they would otherwise be dropped from the dataset without notice
scraped_names_lower = set(player_names_lower.dropna())
unmatched_nominees = [p for p in ballondor_nominees if p.lower() not in scraped_names_lower]
if unmatched_nominees:
    print(f"Nominees with no match in the scraped data: {unmatched_nominees}")

# Sort and display
combined_nominees = combined_nominees.sort_values(by='Unnamed: 1_level_0 Player')
display(combined_nominees)

Unnamed: 0,Unnamed: 0_level_0 Rk,Unnamed: 1_level_0 Player,Unnamed: 2_level_0 Nation,Unnamed: 3_level_0 Pos,Unnamed: 4_level_0 Squad,Unnamed: 5_level_0 Age,Unnamed: 6_level_0 Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Unnamed: 36_level_0 Matches,League
2627,228,Achraf Hakimi,ma MAR,DF,Paris S-G,25,1998,25,24,2066,...,0.44,0.17,0.44,0.13,0.27,0.4,0.13,0.4,Matches,Ligue 1
330,319,Alexis Mac Allister,ar ARG,MF,Liverpool,25,1998,35,30,2599,...,0.35,0.17,0.35,0.1,0.16,0.26,0.1,0.26,Matches,Premier League
423,408,Cole Palmer,eng ENG,"MF,FW",Chelsea,22,2002,37,36,3191,...,0.65,0.31,0.54,0.49,0.31,0.79,0.38,0.68,Matches,Premier League
451,435,Declan Rice,eng ENG,MF,Arsenal,25,1999,35,33,2825,...,0.35,0.13,0.35,0.11,0.21,0.32,0.11,0.32,Matches,Premier League
1918,180,Denzel Dumfries,nl NED,"DF,FW",Inter,28,1996,29,20,1955,...,0.41,0.32,0.41,0.25,0.15,0.4,0.25,0.4,Matches,Serie A
2570,174,Désiré Doué,fr FRA,"FW,MF",Paris S-G,19,2005,31,18,1730,...,0.62,0.31,0.62,0.27,0.41,0.67,0.27,0.67,Matches,Ligue 1
232,225,Erling Haaland,no NOR,FW,Manchester City,24,2000,31,31,2736,...,0.82,0.62,0.72,0.72,0.1,0.82,0.62,0.72,Matches,Premier League
1717,478,Florian Wirtz,de GER,"MF,FW",Leverkusen,21,2003,31,25,2351,...,0.84,0.31,0.77,0.36,0.27,0.63,0.23,0.5,Matches,Bundesliga
2568,172,Gianluigi Donnarumma,it ITA,GK,Paris S-G,25,1999,24,24,2091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches,Ligue 1
1435,207,Harry Kane,eng ENG,FW,Bayern Munich,31,1993,31,28,2381,...,1.32,0.64,0.98,0.77,0.2,0.97,0.5,0.7,Matches,Bundesliga


In [218]:
# Short column names, listed in the same order as the scraped table's columns.
new_columns = [
    # Player identity
    'Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born',
    # Playing time
    'MP', 'Starts', 'Min', '90s',
    # Goals, assists, penalties and cards
    'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR',
    # Expected values
    'xG', 'npxG', 'xAG', 'npxG+xAG',
    # Progression
    'PrgC', 'PrgP', 'PrgR',
    # Per-90 variants: base stat name followed by 'Per90'
    *(
        f'{stat}Per90'
        for stat in (
            'Gls', 'Ast', 'G+A', 'G-PK', 'G+A-PK',
            'xG', 'xAG', 'xG+xAG', 'npxG', 'npxG+xAG',
        )
    ),
    # Trailing link column and the league label added during scraping
    'Matches', 'League',
]

In [219]:
# The rename is positional, so the number of names must match the number of columns.
# Stop here on a mismatch: later cells rely on the short names (e.g. 'Player', 'Squad'),
# and continuing would only fail later with a less helpful KeyError.
if len(combined_nominees.columns) != len(new_columns):
    raise ValueError(
        f"Error: DataFrame has {len(combined_nominees.columns)} columns, but {len(new_columns)} names provided. "
        f"Current columns: {list(combined_nominees.columns)}"
    )

combined_nominees.columns = new_columns
print("Columns renamed successfully!")

# Display the first few rows to verify
print("DataFrame with renamed columns:")
display(combined_nominees.head())

Columns renamed successfully!
DataFrame with renamed columns:


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+APer90,G-PKPer90,G+A-PKPer90,xGPer90,xAGPer90,xG+xAGPer90,npxGPer90,npxG+xAGPer90,Matches,League
2627,228,Achraf Hakimi,ma MAR,DF,Paris S-G,25,1998,25,24,2066,...,0.44,0.17,0.44,0.13,0.27,0.4,0.13,0.4,Matches,Ligue 1
330,319,Alexis Mac Allister,ar ARG,MF,Liverpool,25,1998,35,30,2599,...,0.35,0.17,0.35,0.1,0.16,0.26,0.1,0.26,Matches,Premier League
423,408,Cole Palmer,eng ENG,"MF,FW",Chelsea,22,2002,37,36,3191,...,0.65,0.31,0.54,0.49,0.31,0.79,0.38,0.68,Matches,Premier League
451,435,Declan Rice,eng ENG,MF,Arsenal,25,1999,35,33,2825,...,0.35,0.13,0.35,0.11,0.21,0.32,0.11,0.32,Matches,Premier League
1918,180,Denzel Dumfries,nl NED,"DF,FW",Inter,28,1996,29,20,1955,...,0.41,0.32,0.41,0.25,0.15,0.4,0.25,0.4,Matches,Serie A


In [220]:
# Show the full nominee table as the cell output
combined_nominees

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+APer90,G-PKPer90,G+A-PKPer90,xGPer90,xAGPer90,xG+xAGPer90,npxGPer90,npxG+xAGPer90,Matches,League
2627,228,Achraf Hakimi,ma MAR,DF,Paris S-G,25,1998,25,24,2066,...,0.44,0.17,0.44,0.13,0.27,0.4,0.13,0.4,Matches,Ligue 1
330,319,Alexis Mac Allister,ar ARG,MF,Liverpool,25,1998,35,30,2599,...,0.35,0.17,0.35,0.1,0.16,0.26,0.1,0.26,Matches,Premier League
423,408,Cole Palmer,eng ENG,"MF,FW",Chelsea,22,2002,37,36,3191,...,0.65,0.31,0.54,0.49,0.31,0.79,0.38,0.68,Matches,Premier League
451,435,Declan Rice,eng ENG,MF,Arsenal,25,1999,35,33,2825,...,0.35,0.13,0.35,0.11,0.21,0.32,0.11,0.32,Matches,Premier League
1918,180,Denzel Dumfries,nl NED,"DF,FW",Inter,28,1996,29,20,1955,...,0.41,0.32,0.41,0.25,0.15,0.4,0.25,0.4,Matches,Serie A
2570,174,Désiré Doué,fr FRA,"FW,MF",Paris S-G,19,2005,31,18,1730,...,0.62,0.31,0.62,0.27,0.41,0.67,0.27,0.67,Matches,Ligue 1
232,225,Erling Haaland,no NOR,FW,Manchester City,24,2000,31,31,2736,...,0.82,0.62,0.72,0.72,0.1,0.82,0.62,0.72,Matches,Premier League
1717,478,Florian Wirtz,de GER,"MF,FW",Leverkusen,21,2003,31,25,2351,...,0.84,0.31,0.77,0.36,0.27,0.63,0.23,0.5,Matches,Bundesliga
2568,172,Gianluigi Donnarumma,it ITA,GK,Paris S-G,25,1999,24,24,2091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches,Ligue 1
1435,207,Harry Kane,eng ENG,FW,Bayern Munich,31,1993,31,28,2381,...,1.32,0.64,0.98,0.77,0.2,0.97,0.5,0.7,Matches,Bundesliga


In [221]:
# Drop the row for the player named Vitinha at Genoa, because he is not the nominated one
is_vitinha = combined_nominees['Player'] == 'Vitinha'
plays_for_genoa = combined_nominees['Squad'] == 'Genoa'
combined_nominees = combined_nominees[~(is_vitinha & plays_for_genoa)]

In [222]:
# Show the nominee table again as the cell output, after the Vitinha (Genoa) filter
combined_nominees

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+APer90,G-PKPer90,G+A-PKPer90,xGPer90,xAGPer90,xG+xAGPer90,npxGPer90,npxG+xAGPer90,Matches,League
2627,228,Achraf Hakimi,ma MAR,DF,Paris S-G,25,1998,25,24,2066,...,0.44,0.17,0.44,0.13,0.27,0.4,0.13,0.4,Matches,Ligue 1
330,319,Alexis Mac Allister,ar ARG,MF,Liverpool,25,1998,35,30,2599,...,0.35,0.17,0.35,0.1,0.16,0.26,0.1,0.26,Matches,Premier League
423,408,Cole Palmer,eng ENG,"MF,FW",Chelsea,22,2002,37,36,3191,...,0.65,0.31,0.54,0.49,0.31,0.79,0.38,0.68,Matches,Premier League
451,435,Declan Rice,eng ENG,MF,Arsenal,25,1999,35,33,2825,...,0.35,0.13,0.35,0.11,0.21,0.32,0.11,0.32,Matches,Premier League
1918,180,Denzel Dumfries,nl NED,"DF,FW",Inter,28,1996,29,20,1955,...,0.41,0.32,0.41,0.25,0.15,0.4,0.25,0.4,Matches,Serie A
2570,174,Désiré Doué,fr FRA,"FW,MF",Paris S-G,19,2005,31,18,1730,...,0.62,0.31,0.62,0.27,0.41,0.67,0.27,0.67,Matches,Ligue 1
232,225,Erling Haaland,no NOR,FW,Manchester City,24,2000,31,31,2736,...,0.82,0.62,0.72,0.72,0.1,0.82,0.62,0.72,Matches,Premier League
1717,478,Florian Wirtz,de GER,"MF,FW",Leverkusen,21,2003,31,25,2351,...,0.84,0.31,0.77,0.36,0.27,0.63,0.23,0.5,Matches,Bundesliga
2568,172,Gianluigi Donnarumma,it ITA,GK,Paris S-G,25,1999,24,24,2091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches,Ligue 1
1435,207,Harry Kane,eng ENG,FW,Bayern Munich,31,1993,31,28,2381,...,1.32,0.64,0.98,0.77,0.2,0.97,0.5,0.7,Matches,Bundesliga


In [223]:
# Write the nominee table to a CSV file, leaving out the DataFrame index
output_csv = 'ballondor_2025_nominees_dataset.csv'
combined_nominees.to_csv(output_csv, index=False)