In [1]:
import pandas as pd
import numpy as np
import requests
import time
from io import StringIO
from bs4 import BeautifulSoup
import re

In [2]:
url = "https://fbref.com/en/comps/Big5/history/Big-5-European-Leagues-Seasons"
data = requests.get(url)

In [3]:
soup = BeautifulSoup(data.text)
table = soup.find("table", {"id": "seasons"})
table_rows = table.find_all("tr")

In [4]:
desired_seasons = ['2023-2024','2022-2023', '2021-2022']
seasons_pattern = "|".join(desired_seasons)

In [5]:
# Debug: Print number of rows found
print(f"Total season rows found: {len(table_rows)}")

# Initialize list for final URLs
season_stats_urls = []

# Process rows to filter and build URLs
for row in table_rows:
    # Find the href attribute of the first <a> tag in the row
    a_tag = row.find("a", href=True)
    if a_tag:
        row_href = a_tag["href"]
        
        # Check if the href matches any of the desired seasons
        season = re.search(rf'/{seasons_pattern}/', row_href)
        if season:
            print(f"Match found for season: {season.group(0)}")  # Debug: Print if match is found
            
            # Modify the URL to include 'stats/players/' in the correct position
            modified_href = re.sub(r'(/[^/]+)$', r'/stats/players\1', row_href)
            full_url = f"https://fbref.com{modified_href}"
            season_stats_urls.append(full_url)
            
            print(f"Final URL: {full_url}")  # Debug: Print each final URL
            
            time.sleep(3)  # Optional: Delay between requests to avoid overloading the server

# Debug: Print number of final URLs
print(f"Total season stats URLs: {len(season_stats_urls)}")


Total season rows found: 31
Match found for season: /2023-2024
Final URL: https://fbref.com/en/comps/Big5/2023-2024/stats/players/2023-2024-Big-5-European-Leagues-Stats
Match found for season: 2022-2023
Final URL: https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats
Match found for season: 2021-2022/
Final URL: https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats
Total season stats URLs: 3


In [6]:
season_stats_urls.extend([
    "https://fbref.com/en/comps/10/2023-2024/stats/2023-2024-Championship-Stats", "https://fbref.com/en/comps/10/2022-2023/stats/2022-2023-Championship-Stats", "https://fbref.com/en/comps/10/2021-2022/stats/2021-2022-Championship-Stats",
    "https://fbref.com/en/comps/23/2023-2024/stats/2023-2024-Eredivisie-Stats", "https://fbref.com/en/comps/23/2022-2023/stats/2022-2023-Eredivisie-Stats", "https://fbref.com/en/comps/23/2021-2022/stats/2021-2022-Eredivisie-Stats"
    ])

In [7]:
season_stats_urls

['https://fbref.com/en/comps/Big5/2023-2024/stats/players/2023-2024-Big-5-European-Leagues-Stats',
 'https://fbref.com/en/comps/Big5/2022-2023/stats/players/2022-2023-Big-5-European-Leagues-Stats',
 'https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats',
 'https://fbref.com/en/comps/10/2023-2024/stats/2023-2024-Championship-Stats',
 'https://fbref.com/en/comps/10/2022-2023/stats/2022-2023-Championship-Stats',
 'https://fbref.com/en/comps/10/2021-2022/stats/2021-2022-Championship-Stats',
 'https://fbref.com/en/comps/23/2023-2024/stats/2023-2024-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2022-2023/stats/2022-2023-Eredivisie-Stats',
 'https://fbref.com/en/comps/23/2021-2022/stats/2021-2022-Eredivisie-Stats']

In [9]:
all_dfs = []
for index, url in enumerate(season_stats_urls):
    try:
        data = requests.get(url).text.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(data)
        season = re.search(r'/(\d{4}-\d{4})/', url)
    
        if index >= 3:
        # Only extract competition for index 3 and above
            h2_element = soup.find("h2")
            if h2_element:
                span_element = h2_element.find("span")
                if span_element:
                    competition = span_element.get_text().split()[-1]
                else:
                    competition = pd.NA
                    print(f"No <span> tag found within <h2> tag for URL index {index}.")
            else:
                competition = pd.NA
                print(f"No <h2> tag found for URL index {index}.")
        else:
            competition = pd.NA
            print(f"Skipping competition extraction for URL index {index}.")
    
        table = soup.find("table", {"id": "stats_standard"})
        ids = [x["data-append-csv"] for x in table.find_all("td", {"data-stat": "player"})]

        df = pd.read_html(StringIO(str(table)))[0]
        df.columns = [f"{i} {j}" if "Unnamed" not in i else j for i, j in df.columns]
        df = df.loc[df["Rk"] != "Rk"]
        df["Fbref"] = ids
        df['Season'] = season.group(1)
        df['Competition'] = competition
        
        if 'Comp' not in df.columns:
            df['Comp'] = pd.NA
        
        cols_to_move = ['Fbref', 'Season', 'Competition']
        remaining_cols = [col for col in df.columns if col not in cols_to_move]
        new_order = remaining_cols[:1] + cols_to_move + remaining_cols[1:]
            
        df = df[new_order]
        df = df.rename({'Playing Time MP': 'MP','Playing Time Starts': 'Starts', 'Playing Time Min': 'Min', 'Playing Time 90s': '90s', 'Per 90 Minutes npxG': 'npxG', 'Per 90 Minutes xAG': 'xAG'}, axis=1)
        
        all_dfs.append(df)
        time.sleep(3)
    
    except Exception as e:
        print(f"An error occurred for URL index {index}: {e}")

Skipping competition extraction for URL index 0.
Skipping competition extraction for URL index 1.
Skipping competition extraction for URL index 2.


In [10]:
all_players_prev_seasons = pd.concat(all_dfs)

In [11]:
all_players_prev_seasons = all_players_prev_seasons[['Fbref', 'Season', 'Player','Squad', 'Competition', 'Comp', 'Age', 'MP', 'Starts', 'Min', '90s', 'npxG', 'xAG']]
all_players_prev_seasons = all_players_prev_seasons.astype({'MP':'int', 'Starts':'int', 'Min':'int', '90s':'float64', 'npxG':'float64', 'xAG':'float64'})

In [12]:
all_players_prev_seasons['Comp'] = all_players_prev_seasons['Comp'].str.split(' ',n=1).str[1]
all_players_prev_seasons['Comp'] = all_players_prev_seasons['Comp'].fillna(all_players_prev_seasons['Competition'])
all_players_prev_seasons = all_players_prev_seasons.drop(columns=['Competition'])

In [13]:
#all_players_prev_seasons = all_players_prev_seasons[all_players_prev_seasons['Fbref'] == '4e9a0555']

In [14]:
all_players_prev_seasons['Comp'].unique()

array(['Premier League', 'Bundesliga', 'Ligue 1', 'La Liga', 'Serie A',
       'Championship', 'Eredivisie'], dtype=object)

In [15]:
# Group by the relevant columns and perform aggregation
all_players_prev_seasons = all_players_prev_seasons.groupby(
    ['Fbref', 'Season', 'Player', 'Comp', 'Age']
).agg(
    MP=('MP', 'sum'),
    Starts=('Starts', 'sum'),
    Min=('Min', 'sum'),
    ninetys=('90s', 'sum'),
    npxG=('npxG', lambda x: (x * all_players_prev_seasons.loc[x.index, '90s']).sum() / all_players_prev_seasons.loc[x.index, '90s'].sum() if all_players_prev_seasons.loc[x.index, '90s'].sum() > 0 else 0),
    xAG=('xAG', lambda x: (x * all_players_prev_seasons.loc[x.index, '90s']).sum() / all_players_prev_seasons.loc[x.index, '90s'].sum() if all_players_prev_seasons.loc[x.index, '90s'].sum() > 0 else 0)
).reset_index()

# Rename the aggregated column from 'ninetys' to '90s'
all_players_prev_seasons = all_players_prev_seasons.rename(columns={'ninetys': '90s'})

In [16]:
# Define a function for weighted average with recency and league multipliers
def weighted_avg_with_recency_and_multiplier(values, weights, comps, recency_weights, multipliers):
    # Convert leagues to multipliers
    multipliers_series = pd.Series([multipliers.get(comp, 1) for comp in comps], index=weights.index)
    
    # Adjust values by league multiplier
    adjusted_values = values * multipliers_series
    
    # Apply recency weights
    adjusted_weights = weights * recency_weights
    
    # Calculate the weighted average
    total_weight = np.sum(adjusted_weights)
    if total_weight > 0:
        return np.sum(adjusted_values * adjusted_weights) / total_weight
    else:
        return np.nan  # or return 0 if you prefer

# Assign recency weights (more recent seasons get higher weights)
def calculate_recency_weights(seasons):
    unique_seasons = sorted(seasons.unique())
    season_weights = {season: weight for weight, season in enumerate(unique_seasons, 1)}
    return seasons.map(season_weights)

# Define multipliers
multipliers = {
    'Premier League': 1, 
    'Ligue 1': 0.75, 
    'Serie A': 0.75, 
    'Bundesliga': 0.75, 
    'La Liga': 0.75, 
    'Championship': 0.5, 
    'Eredivisie': 0.5
}

# Calculate recency weights for each season
all_players_prev_seasons['recency_weight'] = calculate_recency_weights(all_players_prev_seasons['Season'])

# Group by 'Fbref' and 'Player' and apply the weighted average function
all_players_prev_seasons = all_players_prev_seasons.groupby(['Fbref', 'Player']).agg(
    Age=('Age', 'max'),
    Seasons_count=('Season', 'nunique'),
    MP=('MP', 'sum'),
    Starts=('Starts', 'sum'),
    Min=('Min', 'sum'),
    ninetys=('90s', 'sum'),
    npxG=('npxG', lambda x: weighted_avg_with_recency_and_multiplier(
        x, 
        all_players_prev_seasons.loc[x.index, '90s'],
        all_players_prev_seasons.loc[x.index, 'Comp'],
        all_players_prev_seasons.loc[x.index, 'recency_weight'],
        multipliers
    )),
    xAG=('xAG', lambda x: weighted_avg_with_recency_and_multiplier(
        x, 
        all_players_prev_seasons.loc[x.index, '90s'],
        all_players_prev_seasons.loc[x.index, 'Comp'],
        all_players_prev_seasons.loc[x.index, 'recency_weight'],
        multipliers
    ))
).reset_index()

# Rename 'ninetys' to '90s'
all_players_prev_seasons = all_players_prev_seasons.rename(columns={'ninetys': '90s'})

# Round npxG and xAG to two decimals
all_players_prev_seasons[['npxG', 'xAG']] = all_players_prev_seasons[['npxG', 'xAG']].round(2)


In [17]:
#all_players_prev_seasons.to_csv('C:/Users/erknud3/fpl-optimization/model/data/all_players_prev_seasons.csv', index=False)

In [46]:
master = pd.read_csv("C:/Users/erknud3/fpl-optimization/model/FPL-ID-Map/Master.csv")

In [47]:
master_filtered = master[master['24-25'].notna()]
master_filtered = master_filtered[['code', 'fbref', '24-25', 'first_name', 'second_name', 'web_name']]
master_filtered = master_filtered.astype({'24-25': 'int'})
master_filtered = master_filtered.rename(columns={'24-25': 'fpl_id'})

In [106]:
merged_df = pd.merge(master_filtered, all_players_prev_seasons, left_on='fbref', right_on = 'Fbref', how='left')

In [107]:
merged_df = merged_df.drop(columns=['Fbref'])

In [108]:
r = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/")
fpl_data = r.json()
elements = pd.DataFrame(fpl_data["elements"])
elements = elements[['id', 'team', 'element_type', 'now_cost', 'selected_by_percent']]
elements.loc[elements.element_type == 1, 'position'] ='GKP'
elements.loc[elements.element_type == 2, 'position'] ='DEF'
elements.loc[elements.element_type == 3, 'position'] ='MID'
elements.loc[elements.element_type == 4, 'position'] ='FWD'
elements = elements.rename({"selected_by_percent": "tsb"}, axis=1)
elements['now_cost'] = np.round(elements['now_cost'] / 10, 1)
teams = pd.DataFrame(fpl_data['teams'])
teams = teams[['id', 'name', 'short_name']]

In [109]:
fpl_elements = elements.merge(teams, left_on='team', right_on='id', how='inner')
fpl_elements = fpl_elements.rename(columns={"id_x": "fpl_id", "id_y": "team_id", "name": "team_name"})
fpl_elements = fpl_elements[['fpl_id', 'team_id', 'team_name', 'short_name', 'element_type', 'now_cost', 'tsb', 'position']]

In [110]:
fpl_elements

Unnamed: 0,fpl_id,team_id,team_name,short_name,element_type,now_cost,tsb,position
0,1,1,Arsenal,ARS,3,5.4,0.0,MID
1,2,1,Arsenal,ARS,4,6.9,1.7,FWD
2,3,1,Arsenal,ARS,2,6.0,13.0,DEF
3,4,1,Arsenal,ARS,4,8.1,17.1,FWD
4,5,1,Arsenal,ARS,1,4.0,0.1,GKP
...,...,...,...,...,...,...,...,...
622,563,20,Wolves,WOL,3,5.5,0.1,MID
623,564,20,Wolves,WOL,2,4.5,0.1,DEF
624,565,20,Wolves,WOL,3,5.5,0.2,MID
625,566,20,Wolves,WOL,4,5.5,0.3,FWD


In [111]:
fpl_players = merged_df.merge(fpl_elements, on='fpl_id', how='inner')

In [112]:
fpl_players.keys()

Index(['code', 'fbref', 'fpl_id', 'first_name', 'second_name', 'web_name',
       'Player', 'Age', 'Seasons_count', 'MP', 'Starts', 'Min', '90s', 'npxG',
       'xAG', 'team_id', 'team_name', 'short_name', 'element_type', 'now_cost',
       'tsb', 'position'],
      dtype='object')

In [113]:
fpl_players = fpl_players[['fbref', 'fpl_id', 'first_name', 'second_name', 'Player', 'web_name', 'Age', 'team_id', 'team_name', 'short_name', 'element_type', 'position', 'Seasons_count', 'now_cost', 'tsb', 'MP', 'Starts', 'Min', '90s', 'npxG', 'xAG']]

In [114]:
fpl_players.sort_values(by= 'npxG' , ascending=False).head(20)

Unnamed: 0,fbref,fpl_id,first_name,second_name,Player,web_name,Age,team_id,team_name,short_name,...,position,Seasons_count,now_cost,tsb,MP,Starts,Min,90s,npxG,xAG
624,95bd120d,165,Deivid Washington,de Souza Eugênio,Deivid Washington,Deivid,18,6,Chelsea,CHE,...,FWD,1.0,5.0,0.1,2.0,0.0,13.0,0.1,3.23,0.0
519,0bdeb013,140,Cameron,Peupion,Cameron Peupion,Peupion,19,5,Brighton,BHA,...,MID,1.0,4.5,0.1,1.0,0.0,6.0,0.1,0.9,0.0
513,123d2733,519,George,Earthy,George Earthy,Earthy,18,19,West Ham,WHU,...,MID,1.0,4.5,0.1,3.0,0.0,34.0,0.4,0.82,0.0
300,1f44ac21,351,Erling,Haaland,Erling Haaland,Haaland,23,13,Man City,MCI,...,FWD,3.0,15.1,56.8,90.0,83.0,7232.0,80.4,0.74,0.16
452,8450467d,260,Ali,Al-Hamadi,Ali Al Hamadi,Al-Hamadi,21,10,Ipswich,IPS,...,FWD,1.0,5.0,0.4,14.0,1.0,293.0,3.3,0.7,0.08
536,d3d774cc,178,Marc,Guiu Paz,Marc Guiu,Marc Guiu,17,6,Chelsea,CHE,...,FWD,1.0,5.0,0.8,3.0,1.0,74.0,0.8,0.69,0.24
422,4d77b365,316,Darwin,Núñez Ribeiro,Darwin Núñez,Darwin,24,12,Liverpool,LIV,...,FWD,2.0,7.4,4.2,65.0,41.0,3742.0,41.5,0.66,0.27
33,c596fcb0,421,Callum,Wilson,Callum Wilson,Wilson,31,15,Newcastle,NEW,...,FWD,3.0,6.9,0.1,69.0,46.0,4254.0,47.3,0.62,0.08
101,4bcf39f6,229,Neal,Maupay,Neal Maupay,Maupay,26,8,Everton,EVE,...,FWD,3.0,5.0,1.5,90.0,50.0,4826.0,53.6,0.62,0.08
106,e342ad68,328,Mohamed,Salah,Mohamed Salah,M.Salah,31,12,Liverpool,LIV,...,MID,3.0,12.5,36.7,105.0,95.0,8586.0,95.5,0.54,0.32


In [135]:
def get_player_stats_new_season(url):
    soup = BeautifulSoup(requests.get(url).text.replace("<!--", "").replace("-->", ""))
    table = soup.find("table", {"id": "stats_standard"})
    ids = [x["data-append-csv"] for x in table.find_all("td", {"data-stat": "player"})]

    df = pd.read_html(StringIO(str(table)))[0]
    df.columns = [f"{i} {j}" if "Unnamed" not in i else j for i, j in df.columns]
    df = df.loc[df["Rk"] != "Rk"]
    df["fbref"] = ids
    
    df = df.rename({'Playing Time MP': 'MP','Playing Time Starts': 'Starts', 'Playing Time Min': 'Min', 'Playing Time 90s': '90s', 'Per 90 Minutes npxG': 'npxG', 'Per 90 Minutes xAG': 'xAG'}, axis=1)
    df = df[['fbref', 'Player','Squad', 'Age', 'MP', 'Starts', 'Min', '90s', 'npxG', 'xAG']]
    df = df.astype({'MP':'int', 'Starts':'int', 'Min':'int', '90s':'float64', 'npxG':'float64', 'xAG':'float64'})
    
    return df

In [150]:
new_season = get_player_stats_new_season("https://fbref.com/en/comps/9/stats/Premier-League-Stats")
new_season = new_season[['fbref', '90s', 'npxG', 'xAG']]

In [152]:
fpl_players_new_season = fpl_players.merge(new_season, on='fbref', how='left', suffixes=('', '_new'))

In [153]:
total_90s = fpl_players_new_season['90s'] + fpl_players_new_season['90s_new']

fpl_players_new_season['weighted_npxG'] = (
    fpl_players_new_season['npxG'] * (fpl_players_new_season['90s'] / total_90s) 
    + fpl_players_new_season['npxG_new'] * (fpl_players_new_season['90s_new'] / total_90s)
).round(2)

fpl_players_new_season['weighted_xAG'] = (
    fpl_players_new_season['xAG'] * (fpl_players_new_season['90s'] / total_90s) 
    + fpl_players_new_season['xAG_new'] * (fpl_players_new_season['90s_new'] / total_90s)
).round(2)

In [154]:
fpl_players_new_season.sort_values(by= 'weighted_npxG' , ascending=False).head(20)

Unnamed: 0,fbref,fpl_id,first_name,second_name,Player,web_name,Age,team_id,team_name,short_name,...,Starts,Min,90s,npxG,xAG,90s_new,npxG_new,xAG_new,weighted_npxG,weighted_xAG
301,1f44ac21,351,Erling,Haaland,Erling Haaland,Haaland,23,13,Man City,MCI,...,83.0,7232.0,80.4,0.74,0.16,2.0,0.64,0.0,0.74,0.16
453,8450467d,260,Ali,Al-Hamadi,Ali Al Hamadi,Al-Hamadi,21,10,Ipswich,IPS,...,1.0,293.0,3.3,0.7,0.08,0.3,0.0,0.0,0.64,0.07
107,e342ad68,328,Mohamed,Salah,Mohamed Salah,M.Salah,31,12,Liverpool,LIV,...,95.0,8586.0,95.5,0.54,0.32,1.0,0.7,0.49,0.54,0.32
572,9c36ed83,180,Nicolas,Jackson,Nicolas Jackson,N.Jackson,22,6,Chelsea,CHE,...,47.0,4562.0,50.7,0.52,0.14,0.7,0.65,0.3,0.52,0.14
202,178ae8f8,317,Diogo,Teixeira da Silva,Diogo Jota,Diogo J.,26,12,Liverpool,LIV,...,53.0,4640.0,51.6,0.51,0.22,0.9,1.1,0.0,0.52,0.22
537,d3d774cc,178,Marc,Guiu Paz,Marc Guiu,Marc Guiu,17,6,Chelsea,CHE,...,1.0,74.0,0.8,0.69,0.24,0.3,0.0,0.0,0.5,0.17
279,8e92be30,401,Alexander,Isak,Alexander Isak,Isak,23,15,Newcastle,NEW,...,72.0,6069.0,67.5,0.49,0.13,1.0,0.0,0.17,0.48,0.13
239,b66315ae,2,Gabriel,Fernando de Jesus,Gabriel Jesus,G.Jesus,26,1,Arsenal,ARS,...,62.0,5419.0,60.2,0.47,0.22,0.1,0.0,0.0,0.47,0.22
180,aed3a70f,58,Ollie,Watkins,Ollie Watkins,Watkins,27,2,Aston Villa,AVL,...,106.0,9296.0,103.2,0.44,0.16,1.4,0.62,0.04,0.44,0.16
26,4e9a0555,447,Chris,Wood,Chris Wood,Wood,31,16,Nott'm Forest,NFO,...,61.0,5324.0,59.1,0.43,0.08,1.6,0.56,0.45,0.43,0.09
