In [2]:
import pandas as pd
import numpy as np
import requests
import time
from io import StringIO
from bs4 import BeautifulSoup
import re
import os

In [88]:
# Seasons whose squad statistics should be collected.
desired_seasons = ["2023-2024", "2022-2023", "2021-2022"]

# League history pages; each one lists every season of that competition.
base_urls = [
    "https://fbref.com/en/comps/9/history/Premier-League-Seasons",  # Premier League
    "https://fbref.com/en/comps/10/history/Championship-Seasons",  # Championship
]

season_stats_urls = []
seasons_pattern = "|".join(desired_seasons)
# The alternation has to be grouped. Without the parentheses the pattern reads
# "/2023-2024" OR "2022-2023" OR "2021-2022/", so the slashes only bind to the
# first and last season and the matched text differs from season to season.
season_regex = re.compile(rf"/({seasons_pattern})/")

with requests.Session() as session:
    for base_url in base_urls:
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces HTTP errors instead of parsing an error page.
        response = session.get(base_url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", {"id": "seasons"})
        if table is None:
            raise RuntimeError(f"No table with id 'seasons' found at {base_url}")
        table_rows = table.find_all("tr")

        print(f"Total season rows found for {base_url}: {len(table_rows)}")

        for row in table_rows:
            a_tag = row.find("a", href=True)
            if not a_tag:
                continue

            row_href = a_tag["href"]
            season = season_regex.search(row_href)
            if not season:
                continue

            print(f"Match found for season: {season.group(1)}")

            # Insert "/stats/teams" in front of the last path segment so the
            # link points at the squad statistics page of that season.
            modified_href = re.sub(r"(/[^/]+)$", r"/stats/teams\1", row_href)
            full_url = f"https://fbref.com{modified_href}"
            season_stats_urls.append(full_url)

            print(f"Final URL: {full_url}")

        time.sleep(3)  # Pause between history pages; dynamic adjustment could be added here

print(f"Total season stats URLs: {len(season_stats_urls)}")
print(f"All season stats URLs:\n{season_stats_urls}")

Total season rows found for https://fbref.com/en/comps/9/history/Premier-League-Seasons: 127
Match found for season: /2023-2024
Final URL: https://fbref.com/en/comps/9/2023-2024/stats/teams/2023-2024-Premier-League-Stats
Match found for season: 2022-2023
Final URL: https://fbref.com/en/comps/9/2022-2023/stats/teams/2022-2023-Premier-League-Stats
Match found for season: 2021-2022/
Final URL: https://fbref.com/en/comps/9/2021-2022/stats/teams/2021-2022-Premier-League-Stats
Total season rows found for https://fbref.com/en/comps/10/history/Championship-Seasons: 25
Match found for season: /2023-2024
Final URL: https://fbref.com/en/comps/10/2023-2024/stats/teams/2023-2024-Championship-Stats
Match found for season: 2022-2023
Final URL: https://fbref.com/en/comps/10/2022-2023/stats/teams/2022-2023-Championship-Stats
Match found for season: 2021-2022/
Final URL: https://fbref.com/en/comps/10/2021-2022/stats/teams/2021-2022-Championship-Stats
Total season stats URLs: 6
All season stats URLs:
['h

In [135]:
def extract_data_from_url(url):
    """Scrape the squad 'for' and 'against' standard-stats tables of one season page.

    Args:
        url: Season statistics URL. It must contain a ``/YYYY-YYYY/`` path
            segment, which is used as the value of the ``Season`` column.

    Returns:
        A ``(df_for, df_against)`` tuple of DataFrames. When the expected table
        is present each frame holds the columns ``Squad``, ``Season``,
        ``Competition``, ``90s`` and ``npxG`` (``npxGC`` for the 'against'
        frame). A table that is missing from the page yields an empty frame
        for that side only. If anything raises (request failure, unexpected
        page layout, missing columns) the error is printed and two empty
        DataFrames are returned.
    """

    def _read_squad_table(soup, table_id, npxg_name, season, competition):
        """Return the trimmed squad table with id ``table_id`` (empty frame if absent)."""
        table = soup.find("table", {"id": table_id})
        if table is None:
            return pd.DataFrame()

        frame = pd.read_html(StringIO(str(table)))[0]
        # Flatten the two-level header: keep only the lower level when the
        # upper level is an auto-generated "Unnamed" placeholder.
        frame.columns = [
            f"{i} {j}" if "Unnamed" not in i else j for i, j in frame.columns
        ]
        frame["Season"] = season
        frame["Competition"] = competition
        frame = frame.rename(
            columns={"Playing Time 90s": "90s", "Per 90 Minutes npxG": npxg_name}
        )

        # Keep only the identifying columns and the two renamed metrics.
        if "Squad" in frame.columns:
            frame = frame[["Squad", "Season", "Competition", "90s", npxg_name]].copy()
        return frame

    try:
        # Fetch the page; comment markers are stripped so that any table
        # sitting inside an HTML comment becomes visible to the parser.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.text.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(data, "html.parser")

        # Season comes from the URL, competition from the first <h2> heading
        # (its first word is dropped).
        season = re.search(r"/(\d{4}-\d{4})/", url).group(1)
        h2_element = soup.find("h2")
        competition = (
            " ".join(h2_element.find("span").get_text().split()[1:])
            if h2_element and h2_element.find("span")
            else pd.NA
        )

        df_for = _read_squad_table(
            soup, "stats_squads_standard_for", "npxG", season, competition
        )
        df_against = _read_squad_table(
            soup, "stats_squads_standard_against", "npxGC", season, competition
        )

        # Only strip the marker when the column exists: previously a missing
        # 'against' table raised here and threw away a valid 'for' frame too.
        if "Squad" in df_against.columns:
            df_against["Squad"] = df_against["Squad"].str.replace(
                "vs ", "", regex=False
            )

        return df_for, df_against

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL.
        print(f"An error occurred for URL: {url} - {e}")
        return pd.DataFrame(), pd.DataFrame()


In [136]:
# Collect the per-season frames first and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated data on every pass.
frames_for = []
frames_against = []

for url in season_stats_urls:
    print(f"Processing URL: {url}")
    df_for, df_against = extract_data_from_url(url)

    if not df_for.empty:
        frames_for.append(df_for)
    if not df_against.empty:
        frames_against.append(df_against)

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing could be scraped.
combined_df_for = (
    pd.concat(frames_for, ignore_index=True) if frames_for else pd.DataFrame()
)
combined_df_against = (
    pd.concat(frames_against, ignore_index=True) if frames_against else pd.DataFrame()
)

print("Combined DataFrame 'For':")
print(combined_df_for.head())

print("Combined DataFrame 'Against':")
print(combined_df_against.head())


Processing URL: https://fbref.com/en/comps/9/2023-2024/stats/teams/2023-2024-Premier-League-Stats
Processing URL: https://fbref.com/en/comps/9/2022-2023/stats/teams/2022-2023-Premier-League-Stats
Processing URL: https://fbref.com/en/comps/9/2021-2022/stats/teams/2021-2022-Premier-League-Stats
Processing URL: https://fbref.com/en/comps/10/2023-2024/stats/teams/2023-2024-Championship-Stats
Processing URL: https://fbref.com/en/comps/10/2022-2023/stats/teams/2022-2023-Championship-Stats
Processing URL: https://fbref.com/en/comps/10/2021-2022/stats/teams/2021-2022-Championship-Stats
Combined DataFrame 'For':
         Squad     Season     Competition   90s  npxG
0      Arsenal  2023-2024  Premier League  38.0  1.80
1  Aston Villa  2023-2024  Premier League  38.0  1.59
2  Bournemouth  2023-2024  Premier League  38.0  1.41
3    Brentford  2023-2024  Premier League  38.0  1.47
4     Brighton  2023-2024  Premier League  38.0  1.37
Combined DataFrame 'Against':
         Squad     Season     Compe

In [105]:
# Strip every literal 'vs ' occurrence from the squad names of the 'against' frame
squad_without_prefix = combined_df_against['Squad'].str.replace('vs ', '', regex=False)
combined_df_against['Squad'] = squad_without_prefix

# Show the first rows so the cleaned names can be checked by eye
print("Updated Combined DataFrame 'Against':")
print(combined_df_against.head())


Updated Combined DataFrame 'Against':
         Squad     Season     Competition   90s  npxGC
0      Arsenal  2023-2024  Premier League  38.0   0.68
1  Aston Villa  2023-2024  Premier League  38.0   1.53
2  Bournemouth  2023-2024  Premier League  38.0   1.38
3    Brentford  2023-2024  Premier League  38.0   1.43
4     Brighton  2023-2024  Premier League  38.0   1.33


In [107]:
def calculate_weighted_npx(df, is_for=True):
    """Collapse per-season squad rows into one recency-weighted row per squad.

    For every squad the seasons are ordered newest first, the metric is scaled
    by a league multiplier, and a recency-weighted average is taken:

    * 1 season  -> weight 1
    * 2 seasons -> weights 0.7 / 0.3
    * 3 seasons -> weights 0.7 / 0.2 / 0.1

    When a squad has more than three rows only the three most recent seasons
    are used (both for the average and for the ``90s`` total); previously this
    case raised a length-mismatch error.

    League multipliers: Premier League 1; Championship 0.5 for ``npxG`` and 2
    for ``npxGC``; any other or missing competition 1.

    Args:
        df: Frame with the columns ``Squad``, ``Season``, ``Competition``,
            ``90s`` and ``npxG`` (``npxGC`` when ``is_for`` is False). It is
            not modified.
        is_for: True to weight ``npxG``, False to weight ``npxGC``.

    Returns:
        DataFrame with the columns ``Squad``, ``90s`` (sum over the seasons
        used) and ``npxG``/``npxGC`` (weighted value), one row per squad in
        sorted squad order. An empty input gives an empty frame with those
        columns.
    """
    metric = 'npxG' if is_for else 'npxGC'
    output_columns = ['Squad', '90s', metric]

    # Scraping may have produced nothing; the caller checks `.empty` before
    # saving, so hand back an empty frame instead of failing on the groupby.
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    league_multipliers = {
        'Premier League': 1.0,
        'Championship': 0.5 if is_for else 2.0,  # Different multipliers for npxG and npxGC
    }
    recency_weights_by_count = {
        1: [1.0],
        2: [0.7, 0.3],
        3: [0.7, 0.2, 0.1],
    }
    max_seasons = max(recency_weights_by_count)

    records = []
    for squad, group in df.groupby('Squad'):
        # Newest season first, capped at the number of seasons we have weights for.
        recent = (
            group.sort_values(by='Season', ascending=False)
            .head(max_seasons)
            .reset_index(drop=True)
        )
        weights = pd.Series(recency_weights_by_count[len(recent)])

        # Unknown or missing competitions fall back to a neutral multiplier.
        multipliers = recent['Competition'].map(league_multipliers).fillna(1.0)
        adjusted = recent[metric] * multipliers

        records.append({
            'Squad': squad,
            '90s': recent['90s'].sum(),
            metric: (adjusted * weights).sum() / weights.sum(),
        })

    return pd.DataFrame(records, columns=output_columns)

# Attacking side: recency-weighted npxG per squad, highest first
weighted_df_for = calculate_weighted_npx(combined_df_for, is_for=True)
print("Weighted DataFrame 'For':")
print(weighted_df_for.sort_values(by="npxG", ascending=False))

# Defensive side: recency-weighted npxGC per squad, lowest first
weighted_df_against = calculate_weighted_npx(combined_df_against, is_for=False)
print("Weighted DataFrame 'Against':")
print(weighted_df_against.sort_values(by="npxGC", ascending=True))

# Directory that receives both CSV exports
csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data"

# Nothing is written for a side that produced no rows
if weighted_df_for.empty:
    print("No 'for' data to save.")
else:
    weighted_df_for.to_csv(f"{csv_file_path}/team_stats_for.csv", index=False)
    print(f"'For' data successfully saved to team_stats_for.csv")

if weighted_df_against.empty:
    print("No 'against' data to save.")
else:
    weighted_df_against.to_csv(
        f"{csv_file_path}/team_stats_against.csv", index=False
    )
    print(f"'Against' data successfully saved to team_stats_against.csv")


Weighted DataFrame 'For':
              Squad    90s    npxG
23        Liverpool  114.0  2.0620
25  Manchester City  114.0  1.9320
0           Arsenal  114.0  1.7670
29    Newcastle Utd  114.0  1.7220
44        Tottenham  114.0  1.6520
12          Chelsea  114.0  1.5940
1       Aston Villa  114.0  1.4710
8          Brighton  114.0  1.4280
26   Manchester Utd  114.0  1.4180
7         Brentford  114.0  1.4010
16          Everton  114.0  1.2720
6       Bournemouth  122.0  1.2660
47         West Ham  114.0  1.2420
17           Fulham  122.0  1.2135
31  Nott'ham Forest  122.0  1.1540
14   Crystal Palace  114.0  1.1400
49           Wolves  114.0  1.0840
10          Burnley  122.0  0.9460
22   Leicester City  122.0  0.9290
24       Luton Town  130.0  0.8985
21     Leeds United  122.0  0.8950
40      Southampton  122.0  0.8695
38    Sheffield Utd  130.0  0.8640
20     Ipswich Town   46.0  0.7800
27    Middlesbrough  138.0  0.7005
30     Norwich City  130.0  0.6660
13    Coventry City  138.0  0

In [112]:
def get_team_stats_new_season(url):
    """Scrape the squad 'for' and 'against' standard-stats tables of a stats page.

    Unlike the per-season extraction, no ``Season`` or ``Competition`` columns
    are added.

    Args:
        url: URL of the statistics page to scrape.

    Returns:
        A ``(df_for, df_against)`` tuple of DataFrames. When the expected table
        is present each frame holds the columns ``Squad``, ``90s`` and ``npxG``
        (``npxGC`` for the 'against' frame). A table that is missing from the
        page yields an empty frame for that side only. If anything raises
        (request failure, unexpected page layout, missing columns) the error
        is printed and two empty DataFrames are returned.
    """

    def _read_table(soup, table_id, npxg_name):
        """Return the trimmed squad table with id ``table_id`` (empty frame if absent)."""
        table = soup.find("table", {"id": table_id})
        if table is None:
            return pd.DataFrame()

        frame = pd.read_html(StringIO(str(table)))[0]
        # Flatten the two-level header: keep only the lower level when the
        # upper level is an auto-generated "Unnamed" placeholder.
        frame.columns = [
            f"{i} {j}" if "Unnamed" not in i else j for i, j in frame.columns
        ]
        frame = frame.rename(
            columns={"Playing Time 90s": "90s", "Per 90 Minutes npxG": npxg_name}
        )

        # Keep only Squad and the two renamed metrics.
        if "Squad" in frame.columns:
            frame = frame[["Squad", "90s", npxg_name]].copy()
        return frame

    try:
        # Fetch the page; comment markers are stripped so that any table
        # sitting inside an HTML comment becomes visible to the parser.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.text.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(data, "html.parser")

        df_for = _read_table(soup, "stats_squads_standard_for", "npxG")
        df_against = _read_table(soup, "stats_squads_standard_against", "npxGC")

        # Only strip the marker when the column exists: previously a missing
        # 'against' table raised here and threw away a valid 'for' frame too.
        if "Squad" in df_against.columns:
            df_against["Squad"] = df_against["Squad"].str.replace(
                "vs ", "", regex=False
            )

        return df_for, df_against

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller decide.
        print(f"An error occurred for URL: {url} - {e}")
        return pd.DataFrame(), pd.DataFrame()

In [113]:
# Fetch the current-season squad tables; the result is the (for, against) tuple
new_season = get_team_stats_new_season(url="https://fbref.com/en/comps/9/stats/Premier-League-Stats")

In [124]:
# Keep the URL in a variable so the failure message reports the page that was
# actually requested; the old message used `url`, which this cell never set.
new_season_url = "https://fbref.com/en/comps/9/stats/Premier-League-Stats"

df_for, df_against = get_team_stats_new_season(new_season_url)

if not df_for.empty and not df_against.empty:
    # Both sides are needed downstream, so they are only accepted together.
    new_df_for = df_for.reset_index(drop=True)
    new_df_against = df_against.reset_index(drop=True)
else:
    new_df_for = pd.DataFrame()
    new_df_against = pd.DataFrame()
    print(f"No data extracted for URL: {new_season_url}")

In [125]:
# Reload the recency-weighted squad stats of previous seasons ('for' and 'against')
teams_previous_seasons_for, teams_previous_seasons_against = (
    pd.read_csv(f"C:/Users/erknud3/fpl-optimization/model/data/team_stats_{side}.csv")
    for side in ("for", "against")
)

In [126]:
# Inner-join previous-season stats with current-season stats on the squad name;
# overlapping current-season columns get the "_new" suffix.
new_season_for = teams_previous_seasons_for.merge(
    new_df_for,
    how="inner",
    on="Squad",
    suffixes=("", "_new"),
)
new_season_against = teams_previous_seasons_against.merge(
    new_df_against,
    how="inner",
    on="Squad",
    suffixes=("", "_new"),
)

In [127]:
# Ids 1..20 are assigned in row order.
# NOTE(review): this presumably relies on the rows being in the same order as
# the team ids used by the other data files - confirm.
expected_team_count = 20

for side, frame in (("for", new_season_for), ("against", new_season_against)):
    # The inner merge silently drops squads whose names do not match, which
    # would shift every following id; fail with a message that says so.
    if len(frame) != expected_team_count:
        raise ValueError(
            f"Expected {expected_team_count} teams in the '{side}' frame but found "
            f"{len(frame)}; check the squad names used in the merge."
        )
    # Make the cell safe to re-run: insert() refuses to add an existing column.
    if "team_id" in frame.columns:
        frame.drop(columns="team_id", inplace=True)
    frame.insert(0, "team_id", range(1, expected_team_count + 1))

In [129]:
def calculate_weighted_stats(df, is_for=True, weight_new_data=1.0):
    """Blend previous-season and current-season rates into one weighted column.

    Each rate is weighted by the number of 90s it is based on; the
    current-season 90s are additionally multiplied by ``weight_new_data``.
    The result is divided by the sum of those weights, so it always lies
    between the old and the new rate. (Previously the division used the plain
    90s total, which inflated the result whenever ``weight_new_data`` != 1.)

    Args:
        df: Frame with the columns ``90s``, ``90s_new`` and either
            ``npxG``/``npxG_new`` or ``npxGC``/``npxGC_new``. The weighted
            column is added to this frame in place.
        is_for: True to write ``weighted_npxG``, False to write
            ``weighted_npxGC``.
        weight_new_data: Extra weight applied to the current-season 90s.

    Returns:
        The same ``df`` object, with the weighted column (rounded to two
        decimals) added. Earlier versions returned None, so callers that
        relied only on the in-place update keep working.
    """
    metric = "npxG" if is_for else "npxGC"

    weight_old_data = 1.0
    weight_new_data = float(weight_new_data)

    # Effective weights: minutes played, scaled by the per-source weight.
    old_weight = df["90s"] * weight_old_data
    new_weight = df["90s_new"] * weight_new_data
    total_weight = old_weight + new_weight

    df[f"weighted_{metric}"] = (
        (df[metric] * old_weight + df[f"{metric}_new"] * new_weight) / total_weight
    ).round(2)

    return df

In [132]:
# Calculate weighted stats. The weighted column is added to the frames that
# are passed in, so new_season_for / new_season_against are what gets saved.
teams_new_season_for = calculate_weighted_stats(new_season_for, is_for=True, weight_new_data=1.5)
teams_new_season_against = calculate_weighted_stats(new_season_against, is_for=False, weight_new_data=1.5)

# The largest current-season 90s value is used as the gameweek number in the
# file names. An empty frame has no maximum (NaN), which must not be cast to int.
latest_90s = new_season_for["90s_new"].max()
max_mp = int(latest_90s) if pd.notna(latest_90s) else None

csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data"

if not new_season_for.empty:
    for_file_name = f"teams_new_season_for_gw{max_mp}.csv"
    new_season_for.to_csv(f"{csv_file_path}/{for_file_name}", index=False)
    # Report the name that was really written, including the gameweek number.
    print(f"'For' data successfully saved to {for_file_name}")
else:
    print("No 'for' data to save.")

if not new_season_against.empty:
    against_file_name = f"teams_new_season_against_gw{max_mp}.csv"
    new_season_against.to_csv(f"{csv_file_path}/{against_file_name}", index=False)
    print(f"'Against' data successfully saved to {against_file_name}")
else:
    print("No 'against' data to save.")

'For' data successfully saved to teams_new_season_for_gw.csv
'For' data successfully saved to teams_new_season_against_gw.csv


In [1]:
import pandas as pd
import os

In [3]:
newest_gw = 3

# Load the necessary CSV files
fpl_players_path = f"C:/Users/erknud3/fpl-optimization/model/data/New_Season_Data/fpl_players_new_season_gw{newest_gw}.csv"
teams_pred_npxG_path = f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/teams_pred_npxG_gw{newest_gw}.csv"
gc_probs_path = "C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/GC_probabilities.csv"
pen_share_path = "C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/pen_share.csv"

# Check every file that is read below (pen_share used to be left out of the
# check) and name the missing ones in the error.
required_paths = [fpl_players_path, teams_pred_npxG_path, gc_probs_path, pen_share_path]
missing_paths = [path for path in required_paths if not os.path.exists(path)]
if missing_paths:
    raise FileNotFoundError(
        f"One or more necessary files do not exist for gameweek {newest_gw}: {missing_paths}"
    )

fpl_players_new_season = pd.read_csv(fpl_players_path)
teams_pred_npxG = pd.read_csv(teams_pred_npxG_path)
gc_probs = pd.read_csv(gc_probs_path)
pen_share = pd.read_csv(pen_share_path)

# Players without both weighted rates cannot be projected, so drop them.
fpl_players_new_season = fpl_players_new_season.dropna(
    subset=["weighted_npxG", "weighted_xAG"]
)

# Show the available columns right after loading
print(
    "Columns in fpl_players_new_season after loading:",
    fpl_players_new_season.columns,
)

Columns in fpl_players_new_season after loading: Index(['fbref', 'fpl_id', 'first_name', 'second_name', 'Player', 'web_name',
       'Age', 'team_id', 'team_name', 'short_name', 'element_type', 'position',
       'Seasons_count', 'now_cost', 'tsb', 'MP', 'Starts', 'Min', '90s',
       'npxG', 'xAG', 'finishing', 'MP_new', '90s_new', 'npxG_new', 'xAG_new',
       'weighted_npxG', 'weighted_xAG'],
      dtype='object')


In [4]:
# Multiplier per element_type, applied to the expected-goal and penalty values
position_multipliers = {
    1: 10,  # GKP
    2: 6,   # DEF
    3: 5,   # MID
    4: 4,   # FWD
}

In [37]:
# Generate player_xp_goals
columns_to_keep = [
    "fpl_id",
    "Player",
    "web_name",
    "Age",
    "team_name",
    "team_id",
    "element_type",
    "now_cost",
    "tsb",
    "Min",
    "90s",
    "finishing",
    "MP_new",
    "90s_new",
    "weighted_npxG",
]
player_xp_goals = fpl_players_new_season[columns_to_keep].copy()

# One row per team, indexed by team_id, so each gameweek becomes a vectorised
# lookup instead of a row-by-row scan of teams_pred_npxG.
team_npxg_by_id = teams_pred_npxG.drop_duplicates(subset="team_id").set_index("team_id")

# Fail loudly, as the per-row lookup did, when a player refers to an unknown
# team or element_type.
unknown_team_rows = ~player_xp_goals["team_id"].isin(team_npxg_by_id.index)
if unknown_team_rows.any():
    raise KeyError(
        "teams_pred_npxG has no row for team_id(s): "
        f"{player_xp_goals.loc[unknown_team_rows, 'team_id'].unique().tolist()}"
    )
goal_multiplier = player_xp_goals["element_type"].map(position_multipliers)
if goal_multiplier.isna().any():
    raise KeyError("position_multipliers has no entry for some element_type values")

player_npxg = player_xp_goals["weighted_npxG"]
player_finishing = player_xp_goals["finishing"]
player_team_ids = player_xp_goals["team_id"]

for gw in range(1, 39):
    gw_column = str(gw)
    team_factor = player_team_ids.map(team_npxg_by_id[gw_column])
    # Same factor order as before: player rate x team factor x finishing x multiplier
    player_xp_goals[gw_column] = (
        player_npxg * team_factor * player_finishing * goal_multiplier
    )

player_xp_goals = player_xp_goals.round(2)

player_xp_goals = player_xp_goals.drop(columns=["weighted_npxG"])

xp_goals_1 = player_xp_goals[["fpl_id", "Player", "1"]]

print(xp_goals_1.sort_values(by="1", ascending=False).head(10))

     fpl_id              Player     1
195     317          Diogo Jota  3.39
103     328       Mohamed Salah  3.22
408     316        Darwin Núñez  3.22
272     401      Alexander Isak  2.94
294     351      Erling Haaland  2.68
349     327           Luis Díaz  2.65
219     392       Harvey Barnes  2.51
434     260       Ali Al Hamadi  2.19
47      503       Son Heung-min  2.12
496     372  Alejandro Garnacho  2.12


In [39]:
# Generate player_xp_pens
player_xp_pens = fpl_players_new_season[columns_to_keep].copy()

# Merge player_xp_pens with pen_share to get the penalty share for each player
player_xp_pens = player_xp_pens.merge(
    pen_share[["fpl_id", "pen_share"]], on="fpl_id", how="left"
)

# Fill missing pen_share values with 0 (players who don't take penalties)
player_xp_pens["pen_share"] = player_xp_pens["pen_share"].fillna(0)

# One row per team, indexed by team_id, so each gameweek becomes a vectorised
# lookup instead of a row-by-row scan of teams_pred_npxG.
team_npxg_by_id = teams_pred_npxG.drop_duplicates(subset="team_id").set_index("team_id")

takes_pens = player_xp_pens["pen_share"] > 0

# Only penalty takers need a team lookup (others are set to 0 regardless);
# fail loudly if one of them refers to an unknown team or element_type.
taker_team_ids = player_xp_pens.loc[takes_pens, "team_id"]
unknown_taker_teams = taker_team_ids[~taker_team_ids.isin(team_npxg_by_id.index)]
if not unknown_taker_teams.empty:
    raise KeyError(
        "teams_pred_npxG has no row for team_id(s): "
        f"{unknown_taker_teams.unique().tolist()}"
    )
pen_multiplier = player_xp_pens["element_type"].map(position_multipliers)
if pen_multiplier[takes_pens].isna().any():
    raise KeyError("position_multipliers has no entry for some element_type values")

player_team_ids = player_xp_pens["team_id"]
player_pen_share = player_xp_pens["pen_share"]

for gw in range(1, 39):
    gw_column = str(gw)
    team_factor = player_team_ids.map(team_npxg_by_id[gw_column])
    # NOTE(review): 0.1 and 0.77 are presumably the penalty rate relative to
    # the team factor and the conversion rate - confirm and name them.
    penalty_ev = 0.1 * team_factor * 0.77 * pen_multiplier * player_pen_share
    # Ensure EV is 0 if pen_share is 0
    player_xp_pens[gw_column] = penalty_ev.where(takes_pens, 0)

player_xp_pens = player_xp_pens.round(2)

player_xp_pens = player_xp_pens.drop(columns=["weighted_npxG"])

xp_pens_1 = player_xp_pens[["fpl_id", "Player", "1"]]

print(xp_pens_1.sort_values(by="1", ascending=False).head(10))

     fpl_id                 Player     1
71      328          Mohamed Salah  0.47
187     401         Alexander Isak  0.44
203      17            Bukayo Saka  0.43
272      99           Bryan Mbeumo  0.39
35      503          Son Heung-min  0.37
79      366        Bruno Fernandes  0.37
200     433     Morgan Gibbs-White  0.36
52      306            Jamie Vardy  0.34
116     220  Dominic Calvert-Lewin  0.31
219     199           Eberechi Eze  0.30


In [40]:
# Generate player_xp_assists
columns_to_keep_assists = [
    "fpl_id",
    "Player",
    "web_name",
    "Age",
    "team_name",
    "team_id",
    "element_type",
    "now_cost",
    "tsb",
    "Min",
    "90s",
    "finishing",
    "MP_new",
    "90s_new",
    "weighted_xAG",
]

# Check before selecting: the selection below would otherwise raise first and
# this clearer message would never be shown.
if "weighted_xAG" not in fpl_players_new_season.columns:
    raise KeyError(
        f"'weighted_xAG' not found in fpl_players_new_season DataFrame columns: {fpl_players_new_season.columns}"
    )

player_xp_assists = fpl_players_new_season[columns_to_keep_assists].copy()

# One row per team, indexed by team_id, so each gameweek becomes a vectorised
# lookup instead of a row-by-row scan of teams_pred_npxG.
team_npxg_by_id = teams_pred_npxG.drop_duplicates(subset="team_id").set_index("team_id")

# Fail loudly, as the per-row lookup did, when a player refers to an unknown team.
unknown_team_rows = ~player_xp_assists["team_id"].isin(team_npxg_by_id.index)
if unknown_team_rows.any():
    raise KeyError(
        "teams_pred_npxG has no row for team_id(s): "
        f"{player_xp_assists.loc[unknown_team_rows, 'team_id'].unique().tolist()}"
    )

player_xag = player_xp_assists["weighted_xAG"]
player_team_ids = player_xp_assists["team_id"]

for gw in range(1, 39):
    gw_column = str(gw)
    team_factor = player_team_ids.map(team_npxg_by_id[gw_column])
    # Same factor order as before: player rate x team factor x 3
    player_xp_assists[gw_column] = player_xag * team_factor * 3

player_xp_assists = player_xp_assists.round(2)

player_xp_assists = player_xp_assists.drop(columns=["weighted_xAG"])

xp_assits_1 = player_xp_assists[["fpl_id", "Player", "1"]]

print(xp_assits_1.sort_values(by="1", ascending=False).head(10))

     fpl_id                  Player     1
26      345         Kevin De Bruyne  1.52
116     366         Bruno Fernandes  1.33
96      402            Jacob Murphy  1.30
33      418         Kieran Trippier  1.30
157     311  Trent Alexander-Arnold  1.24
103     328           Mohamed Salah  1.21
295      17             Bukayo Saka  1.07
400       9      Gabriel Martinelli  1.03
408     316            Darwin Núñez  0.99
326     398          Anthony Gordon  0.94


In [41]:
# Generate player_xp_cs
columns_to_keep_cs = [
    "fpl_id",
    "Player",
    "web_name",
    "Age",
    "team_name",
    "team_id",
    "element_type",
    "now_cost",
    "tsb",
    "Min",
    "90s",
    "finishing",
    "MP_new",
    "90s_new",
]
player_xp_cs = fpl_players_new_season[columns_to_keep_cs].copy()

# Points per goals-conceded scenario. The outer key matches the "<gw>_<key>_goals"
# columns of gc_probs; a dict value maps element_type to points, a plain number
# applies to every element_type.
points_per_goal_scenario = {
    0: {1: 4, 2: 4, 3: 1, 4: 0},
    1: 0,
    2: {1: -1, 2: -1, 3: 0, 4: 0},
    4: {1: -2, 2: -2, 3: 0, 4: 0},
    6: {1: -3, 2: -3, 3: 0, 4: 0},
    8: {1: -4, 2: -4, 3: 0, 4: 0},
}

# One row per team, indexed by team_id, so each scenario becomes a vectorised
# lookup instead of six row-by-row scans of gc_probs per player and gameweek.
gc_probs_by_team = gc_probs.drop_duplicates(subset="team_id").set_index("team_id")

# Fail loudly, as the per-row lookup did, when a player refers to an unknown
# team or element_type.
unknown_team_rows = ~player_xp_cs["team_id"].isin(gc_probs_by_team.index)
if unknown_team_rows.any():
    raise KeyError(
        "gc_probs has no row for team_id(s): "
        f"{player_xp_cs.loc[unknown_team_rows, 'team_id'].unique().tolist()}"
    )
if not player_xp_cs["element_type"].isin(points_per_goal_scenario[0]).all():
    raise KeyError("points_per_goal_scenario has no entry for some element_type values")

player_team_ids = player_xp_cs["team_id"]
player_element_types = player_xp_cs["element_type"]

for gw in range(1, 39):
    expected_points = 0
    # Scenarios are added in dictionary order (0, 1, 2, 4, 6, 8), as before.
    for goals_conceded, scenario_points in points_per_goal_scenario.items():
        probability = player_team_ids.map(
            gc_probs_by_team[f"{gw}_{goals_conceded}_goals"]
        )
        if isinstance(scenario_points, dict):
            scenario_points = player_element_types.map(scenario_points)
        expected_points = expected_points + probability * scenario_points

    player_xp_cs[str(gw)] = expected_points

player_xp_cs = player_xp_cs.round(2)

xp_cs_1 = player_xp_cs[["fpl_id", "Player", "1"]]

print(xp_cs_1.sort_values(by="1", ascending=False).head(10))

     fpl_id               Player     1
204      24            Ben White  2.19
234      25  Oleksandr Zinchenko  2.19
405       6       Jurriën Timber  2.19
306       3    Gabriel Magalhães  2.19
29       73                 Neto  2.19
131      15           David Raya  2.19
435      18       William Saliba  2.19
63      339      Virgil van Dijk  1.79
258     337      Kostas Tsimikas  1.79
398     333       Jarell Quansah  1.79


In [44]:
# List the columns of each expected-points frame, one Index per line
print(
    *(
        xp_frame.columns
        for xp_frame in (player_xp_goals, player_xp_pens, player_xp_assists, player_xp_cs)
    ),
    sep="\n",
)

Index(['fpl_id', 'Player', 'web_name', 'Age', 'team_name', 'team_id',
       'element_type', 'now_cost', 'tsb', 'Min', '90s', 'finishing', 'MP_new',
       '90s_new', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35',
       '36', '37', '38'],
      dtype='object')
Index(['fpl_id', 'Player', 'web_name', 'Age', 'team_name', 'team_id',
       'element_type', 'now_cost', 'tsb', 'Min', '90s', 'finishing', 'MP_new',
       '90s_new', 'pen_share', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21',
       '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
       '34', '35', '36', '37', '38'],
      dtype='object')
Index(['fpl_id', 'Player', 'web_name', 'Age', 'team_name', 'team_id',
       'element_type', 'now_cost', 'tsb', 'Min', '90s', '

In [49]:
# Step 1: Player-level columns shared by every frame; these keep their names
# and serve as the merge keys.
common_columns = [
    'fpl_id',
    'Player',
    'web_name',
    'Age',
    'team_name',
    'team_id',
    'element_type',
    'now_cost',
    'tsb',
    'Min',
    '90s',
    'finishing',
    'MP_new',
    '90s_new',
]

# Step 2: Prefix everything except the common columns
def rename_gameweek_columns(df, prefix):
    """Return a copy of ``df`` whose non-common columns are renamed to ``<prefix>_<column>``.

    Columns listed in the module-level ``common_columns`` keep their names;
    the input frame is left untouched.
    """
    prefixed_names = {
        name: f"{prefix}_{name}"
        for name in df.columns
        if name not in common_columns
    }
    return df.rename(columns=prefixed_names)

# Prefix the per-gameweek columns of each metric frame
(
    player_xp_goals_prefixed,
    player_xp_pens_prefixed,
    player_xp_assists_prefixed,
    player_xp_cs_prefixed,
) = (
    rename_gameweek_columns(xp_frame, metric_prefix)
    for xp_frame, metric_prefix in (
        (player_xp_goals, 'goals'),
        (player_xp_pens, 'pens'),
        (player_xp_assists, 'assists'),
        (player_xp_cs, 'cs'),
    )
)

# Step 3: Outer-join the four frames on the shared player columns
merged_df = (
    player_xp_goals_prefixed
    .merge(player_xp_pens_prefixed, on=common_columns, how='outer')
    .merge(player_xp_assists_prefixed, on=common_columns, how='outer')
    .merge(player_xp_cs_prefixed, on=common_columns, how='outer')
)

# Step 4: Per gameweek, total = goals + pens + assists + cs
num_gameweeks = 38  # Assuming 38 gameweeks, adjust this if needed
for gw in range(1, num_gameweeks + 1):
    gw_cols = [f'goals_{gw}', f'pens_{gw}', f'assists_{gw}', f'cs_{gw}']
    # Skip a gameweek unless all four metric columns are present
    if set(gw_cols).issubset(merged_df.columns):
        merged_df[f'total_{gw}'] = merged_df[gw_cols].sum(axis=1)

# Step 5: Prepare the column reordering (common columns first, then gameweeks)
# Snapshot of every column currently in merged_df
all_columns = list(merged_df.columns)

# Everything that is not a common column is treated as a gameweek column
gw_columns = [name for name in all_columns if name not in common_columns]

# Order of the metrics inside one gameweek
metric_order = ['goals', 'pens', 'assists', 'cs', 'total']

# Helper used as the primary sort key for gameweek columns
def extract_number(col):
    """Return the integer in the second '_'-separated field of ``col``.

    Names without a second field, or whose second field is not an integer,
    give ``float('inf')``.
    """
    fields = col.split('_')
    if len(fields) < 2:
        return float('inf')
    try:
        return int(fields[1])
    except ValueError:
        return float('inf')

# Sort key: gameweek number first, then the position of the metric prefix in metric_order
def _gameweek_sort_key(column_name):
    metric_prefix = column_name.split('_')[0]
    return (extract_number(column_name), metric_order.index(metric_prefix))

ordered_gw_columns = sorted(gw_columns, key=_gameweek_sort_key)

# Common columns lead, the sorted gameweek columns follow
ordered_columns = common_columns + ordered_gw_columns

# Apply the new column order
merged_df = merged_df.loc[:, ordered_columns]

# Step 6: Show the first rows of the final frame
print(merged_df.head(10))

# Step 7: Write the final frame to CSV
merged_df.to_csv(f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/player_ev_gw{newest_gw}.csv", index=False)

print(f"Player expected values for all metrics merged and saved successfully for gameweek {newest_gw}.")


   fpl_id              Player    web_name   Age    team_name  team_id  \
0       2       Gabriel Jesus     G.Jesus  26.0      Arsenal        1   
1       3   Gabriel Magalhães     Gabriel  25.0      Arsenal        1   
2       4         Kai Havertz     Havertz  24.0      Arsenal        1   
3       6      Jurriën Timber    J.Timber  22.0      Arsenal        1   
4       9  Gabriel Martinelli  Martinelli  22.0      Arsenal        1   
5      10        Reiss Nelson      Nelson  23.0       Fulham        9   
6      13     Martin Ødegaard    Ødegaard  24.0      Arsenal        1   
7      14      Aaron Ramsdale    Ramsdale  25.0  Southampton       17   
8      15          David Raya        Raya  27.0      Arsenal        1   
9      16         Declan Rice        Rice  24.0      Arsenal        1   

   element_type  now_cost   tsb     Min  ...  pens_37  assists_37  cs_37  \
0             4       6.8   0.9  5419.0  ...     0.00        0.73   0.00   
1             2       6.0  12.6  9516.0  ...

In [13]:
# Prefix every column of the gameweek-1 frames with its metric name
(
    xp_goals_1_prefixed,
    xp_pens_1_prefixed,
    xp_assits_1_prefixed,
    xp_cs_1_prefixed,
) = (
    gw1_frame.add_prefix(metric_prefix)
    for gw1_frame, metric_prefix in (
        (xp_goals_1, 'goals_'),
        (xp_pens_1, 'pens_'),
        (xp_assits_1, 'assists_'),
        (xp_cs_1, 'cs_'),
    )
)

In [15]:
# Give the first two columns of each prefixed frame their plain key names back
for df in [xp_goals_1_prefixed, xp_pens_1_prefixed, xp_assits_1_prefixed, xp_cs_1_prefixed]:
    df.rename(
        columns=dict(zip(df.columns[:2], ("fpl_id", "Player"))),
        inplace=True,
    )

In [26]:
# Outer-join the four gameweek-1 frames on the player keys
merged_df = (
    xp_goals_1_prefixed
    .merge(xp_pens_1_prefixed, how='outer', on=["fpl_id", "Player"])
    .merge(xp_assits_1_prefixed, how='outer', on=["fpl_id", "Player"])
    .merge(xp_cs_1_prefixed, how='outer', on=["fpl_id", "Player"])
)

In [28]:
# Ten rows with the highest goals_1 value
print(merged_df.sort_values(by="goals_1", ascending=False).iloc[:10])

     fpl_id              Player  goals_1  pens_1  assists_1  cs_1
184     317          Diogo Jota     3.39    0.00       0.84  0.48
183     316        Darwin Núñez     3.22    0.00       0.99  0.00
190     328       Mohamed Salah     3.22    0.47       1.21  0.48
239     401      Alexander Isak     2.94    0.44       0.58  0.00
207     351      Erling Haaland     2.68    0.26       0.40  0.00
189     327           Luis Díaz     2.65    0.00       0.77  0.48
234     392       Harvey Barnes     2.51    0.00       0.86  0.45
151     260       Ali Al Hamadi     2.19    0.28       0.25  0.00
293     503       Son Heung-min     2.12    0.37       0.90  0.34
223     372  Alejandro Garnacho     2.12    0.00       0.65  0.30


In [8]:
import pandas as pd
import os

newest_gw = 3
# Load the necessary CSV files
fpl_players_path = f"C:/Users/erknud3/fpl-optimization/model/data/New_Season_Data/fpl_players_new_season_gw{newest_gw}.csv"
teams_pred_npxG_path = f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/teams_pred_npxG_gw{newest_gw}.csv"
gc_probs_path = "C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/GC_probabilities.csv"
pen_share_path = "C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/pen_share.csv"
xmins_path = "C:/Users/erknud3/fpl-optimization/model/data/New_Season_Data/fpl_players_xmins.csv"

# Check every file that is read below (pen_share and xMins used to be left
# out of the check) and name the missing ones in the error.
required_paths = [
    fpl_players_path,
    teams_pred_npxG_path,
    gc_probs_path,
    pen_share_path,
    xmins_path,
]
missing_paths = [path for path in required_paths if not os.path.exists(path)]
if missing_paths:
    raise FileNotFoundError(
        f"One or more necessary files do not exist for gameweek {newest_gw}: {missing_paths}"
    )

fpl_players_new_season = pd.read_csv(fpl_players_path)
teams_pred_npxG = pd.read_csv(teams_pred_npxG_path)
gc_probs = pd.read_csv(gc_probs_path)
pen_share = pd.read_csv(pen_share_path)
player_xmins = pd.read_csv(xmins_path)

# Players without both weighted rates cannot be projected, so drop them.
fpl_players_new_season = fpl_players_new_season.dropna(
    subset=["weighted_npxG", "weighted_xAG"]
)

# Attach xMins per player; players missing from player_xmins keep NaN because
# this is a left join.
fpl_players_new_season = fpl_players_new_season.merge(
    player_xmins[["fpl_id", "xMins"]], on="fpl_id", how="left"
)

In [10]:
# Column index of the player frame
fpl_players_new_season.columns

Index(['fbref', 'fpl_id', 'first_name', 'second_name', 'Player', 'web_name',
       'Age', 'team_id', 'team_name', 'short_name', 'element_type', 'position',
       'Seasons_count', 'now_cost', 'tsb', 'MP', 'Starts', 'Min', '90s',
       'npxG', 'xAG', 'finishing', 'MP_new', '90s_new', 'npxG_new', 'xAG_new',
       'weighted_npxG', 'weighted_xAG', 'xMins'],
      dtype='object')

In [6]:
# Display the table read from xmins_path; only its fpl_id and xMins columns
# are merged into fpl_players_new_season.
player_xmins

Unnamed: 0,Date,fbref,fpl_id,Player,Season,Comp,Round,Squad,Mins,Start,xMins,P(start),P(1_min),P(60_min),weighted_npxGI
0,2024-08-31,1f44ac21,351,Erling Haaland,2024-2025,Premier League,Matchweek 3,Manchester City,90,1,89.63,1.00,1.00,1.00,0.93
1,2024-09-01,4d77b365,316,Darwin Núñez,2024-2025,Premier League,Matchweek 3,Liverpool,15,0,11.84,0.00,0.69,0.00,0.93
2,2024-09-01,e342ad68,328,Mohamed Salah,2024-2025,Premier League,Matchweek 3,Liverpool,90,1,87.00,1.00,1.00,1.00,0.88
3,2024-08-31,e46012d4,345,Kevin De Bruyne,2024-2025,Premier League,Matchweek 3,Manchester City,87,1,88.68,1.00,1.00,1.00,0.83
4,2024-09-01,dc7f8a28,182,Cole Palmer,2024-2025,Premier League,Matchweek 3,Chelsea,90,1,87.00,1.00,1.00,1.00,0.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,2024-09-01,e5a76dfe,201,Dean Henderson,2024-2025,Premier League,Matchweek 3,Crystal Palace,90,1,90.00,1.00,1.00,1.00,0.00
348,2024-08-31,9e5708be,204,Sam Johnstone,2024-2025,Premier League,Matchweek 3,Wolves,90,1,28.32,0.32,0.32,0.32,0.00
349,2024-09-01,e9c0c1b2,383,André Onana,2024-2025,Premier League,Matchweek 3,Manchester Utd,90,1,90.00,1.00,1.00,1.00,0.00
350,2024-09-01,77d6fd4d,508,Guglielmo Vicario,2024-2025,Premier League,Matchweek 3,Tottenham,90,1,90.00,1.00,1.00,1.00,0.00
