In [1]:
import pandas as pd
import numpy as np
import requests
import time
from io import StringIO
from bs4 import BeautifulSoup
import re

In [88]:
desired_seasons = ["2023-2024", "2022-2023", "2021-2022"]

base_urls = [
    "https://fbref.com/en/comps/9/history/Premier-League-Seasons",  # Premier League
    "https://fbref.com/en/comps/10/history/Championship-Seasons",  # Championship
]

season_stats_urls = []
seasons_pattern = "|".join(desired_seasons)

with requests.Session() as session:
    for base_url in base_urls:
        response = session.get(base_url)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", {"id": "seasons"})
        table_rows = table.find_all("tr")

        print(f"Total season rows found for {base_url}: {len(table_rows)}")

        for row in table_rows:
            a_tag = row.find("a", href=True)
            if a_tag:
                row_href = a_tag["href"]
                season = re.search(rf"/{seasons_pattern}/", row_href)
                if season:
                    print(f"Match found for season: {season.group(0)}")

                    modified_href = re.sub(r"(/[^/]+)$", r"/stats/teams\1", row_href)
                    full_url = f"https://fbref.com{modified_href}"
                    season_stats_urls.append(full_url)

                    print(f"Final URL: {full_url}")

        time.sleep(3)  # Dynamic adjustment could be added here

print(f"Total season stats URLs: {len(season_stats_urls)}")
print(f"All season stats URLs:\n{season_stats_urls}")

Total season rows found for https://fbref.com/en/comps/9/history/Premier-League-Seasons: 127
Match found for season: /2023-2024
Final URL: https://fbref.com/en/comps/9/2023-2024/stats/teams/2023-2024-Premier-League-Stats
Match found for season: 2022-2023
Final URL: https://fbref.com/en/comps/9/2022-2023/stats/teams/2022-2023-Premier-League-Stats
Match found for season: 2021-2022/
Final URL: https://fbref.com/en/comps/9/2021-2022/stats/teams/2021-2022-Premier-League-Stats
Total season rows found for https://fbref.com/en/comps/10/history/Championship-Seasons: 25
Match found for season: /2023-2024
Final URL: https://fbref.com/en/comps/10/2023-2024/stats/teams/2023-2024-Championship-Stats
Match found for season: 2022-2023
Final URL: https://fbref.com/en/comps/10/2022-2023/stats/teams/2022-2023-Championship-Stats
Match found for season: 2021-2022/
Final URL: https://fbref.com/en/comps/10/2021-2022/stats/teams/2021-2022-Championship-Stats
Total season stats URLs: 6
All season stats URLs:
['h

In [135]:
def extract_data_from_url(url):
    try:
        # Fetch and parse the page content
        data = requests.get(url).text.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(data, "html.parser")

        # Extract season and competition information
        season = re.search(r"/(\d{4}-\d{4})/", url).group(1)
        h2_element = soup.find("h2")
        competition = (
            " ".join(h2_element.find("span").get_text().split()[1:])
            if h2_element and h2_element.find("span")
            else pd.NA
        )

        # Initialize empty DataFrames
        df_for = pd.DataFrame()
        df_against = pd.DataFrame()

        # Extract the 'for' data table
        table_for = soup.find("table", {"id": "stats_squads_standard_for"})
        if table_for is not None:
            df_for = pd.read_html(StringIO(str(table_for)))[0]
            df_for.columns = [
                f"{i} {j}" if "Unnamed" not in i else j for i, j in df_for.columns
            ]
            df_for["Season"] = season
            df_for["Competition"] = competition

        # Extract the 'against' data table
        table_against = soup.find("table", {"id": "stats_squads_standard_against"})
        if table_against is not None:
            df_against = pd.read_html(StringIO(str(table_against)))[0]
            df_against.columns = [
                f"{i} {j}" if "Unnamed" not in i else j for i, j in df_against.columns
            ]
            df_against["Season"] = season
            df_against["Competition"] = competition

        # Rename relevant columns
        rename_for = {
            "Playing Time 90s": "90s",
            "Per 90 Minutes npxG": "npxG",
        }
        df_for = df_for.rename(rename_for, axis=1)

        rename_against = {
            "Playing Time 90s": "90s",
            "Per 90 Minutes npxG": "npxGC",
        }
        df_against = df_against.rename(rename_against, axis=1)

        # Keep only Squad, Season, Competition, and renamed columns
        selected_columns_for = ["Squad", "Season", "Competition"] + list(rename_for.values())
        selected_columns_against = ["Squad", "Season", "Competition"] + list(rename_against.values())

        if "Squad" in df_for.columns:
            df_for = df_for[selected_columns_for]
        if "Squad" in df_against.columns:
            df_against = df_against[selected_columns_against]
            
        df_against['Squad'] = df_against['Squad'].str.replace('vs ', '', regex=False)

        return df_for, df_against

    except Exception as e:
        print(f"An error occurred for URL: {url} - {e}")
        return pd.DataFrame(), pd.DataFrame()


In [136]:
combined_df_for = pd.DataFrame()
combined_df_against = pd.DataFrame()

for url in season_stats_urls:
    print(f"Processing URL: {url}")
    df_for, df_against = extract_data_from_url(url)
    
    if not df_for.empty:
        combined_df_for = pd.concat([combined_df_for, df_for], ignore_index=True)
    if not df_against.empty:
        combined_df_against = pd.concat([combined_df_against, df_against], ignore_index=True)

print("Combined DataFrame 'For':")
print(combined_df_for.head())

print("Combined DataFrame 'Against':")
print(combined_df_against.head())


Processing URL: https://fbref.com/en/comps/9/2023-2024/stats/teams/2023-2024-Premier-League-Stats
Processing URL: https://fbref.com/en/comps/9/2022-2023/stats/teams/2022-2023-Premier-League-Stats
Processing URL: https://fbref.com/en/comps/9/2021-2022/stats/teams/2021-2022-Premier-League-Stats
Processing URL: https://fbref.com/en/comps/10/2023-2024/stats/teams/2023-2024-Championship-Stats
Processing URL: https://fbref.com/en/comps/10/2022-2023/stats/teams/2022-2023-Championship-Stats
Processing URL: https://fbref.com/en/comps/10/2021-2022/stats/teams/2021-2022-Championship-Stats
Combined DataFrame 'For':
         Squad     Season     Competition   90s  npxG
0      Arsenal  2023-2024  Premier League  38.0  1.80
1  Aston Villa  2023-2024  Premier League  38.0  1.59
2  Bournemouth  2023-2024  Premier League  38.0  1.41
3    Brentford  2023-2024  Premier League  38.0  1.47
4     Brighton  2023-2024  Premier League  38.0  1.37
Combined DataFrame 'Against':
         Squad     Season     Compe

In [105]:
# Remove the 'vs ' prefix from the 'Squad' column in combined_df_against
combined_df_against['Squad'] = combined_df_against['Squad'].str.replace('vs ', '', regex=False)

# Display the updated 'combined_df_against' DataFrame to verify the changes
print("Updated Combined DataFrame 'Against':")
print(combined_df_against.head())


Updated Combined DataFrame 'Against':
         Squad     Season     Competition   90s  npxGC
0      Arsenal  2023-2024  Premier League  38.0   0.68
1  Aston Villa  2023-2024  Premier League  38.0   1.53
2  Bournemouth  2023-2024  Premier League  38.0   1.38
3    Brentford  2023-2024  Premier League  38.0   1.43
4     Brighton  2023-2024  Premier League  38.0   1.33


In [107]:
def calculate_weighted_npx(df, is_for=True):
    def apply_league_multiplier(row):
        if row['Competition'] == 'Premier League':
            return 1  # No change for Premier League
        elif row['Competition'] == 'Championship':
            return 0.5 if is_for else 2  # Different multipliers for npxG and npxGC
        return 1  # Default multiplier if league is neither

    def weighted_avg(series, weights):
        return (series * weights).sum() / weights.sum()

    def process_team(group):
        num_seasons = len(group)
        group = group.sort_values(by='Season', ascending=False).reset_index(drop=True)
        
        # Define weights based on number of seasons
        if num_seasons == 1:
            recency_weights = pd.Series([1], index=group.index)  # Single season gets weight of 1
        elif num_seasons == 2:
            recency_weights = pd.Series([0.7, 0.3], index=group.index)  # Two seasons with weights 0.7 and 0.3
        else:
            recency_weights = pd.Series([0.7, 0.2, 0.1], index=group.index)  # Three seasons with weights 0.7, 0.2, and 0.1
        
        # Ensure weights match the length of the group
        recency_weights = recency_weights.head(num_seasons)

        # Apply league multipliers
        if is_for:
            group['npxG'] *= group.apply(apply_league_multiplier, axis=1)
            series_to_weight = group['npxG']
        else:
            group['npxGC'] *= group.apply(apply_league_multiplier, axis=1)
            series_to_weight = group['npxGC']

        # Calculate weighted averages
        weighted_value = weighted_avg(series_to_weight, recency_weights)
        
        # Calculate the sum of the '90s' column
        total_90s = group['90s'].sum()

        return pd.Series({
            'Squad': group['Squad'].iloc[0],
            '90s': total_90s,
            'npxG' if is_for else 'npxGC': weighted_value
        })

    df_grouped = df.groupby('Squad')
    
    df_grouped = df_grouped[['Squad', 'Season', 'Competition', '90s', 'npxG'] if is_for else ['Squad', 'Season', 'Competition','90s', 'npxGC']]

    # Process each squad while excluding the grouping columns
    weighted_df = df_grouped.apply(process_team).reset_index(drop=True)

    return weighted_df

# Calculate weighted npxG for df_for
weighted_df_for = calculate_weighted_npx(combined_df_for, is_for=True)
print("Weighted DataFrame 'For':")
print(weighted_df_for.sort_values(by="npxG", ascending=False))

# Calculate weighted npxGC for df_against
weighted_df_against = calculate_weighted_npx(combined_df_against, is_for=False)
print("Weighted DataFrame 'Against':")
print(weighted_df_against.sort_values(by="npxGC", ascending=True))

csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data"

if not weighted_df_for.empty:
    weighted_df_for.to_csv(f"{csv_file_path}/team_stats_for.csv", index=False)
    print(f"'For' data successfully saved to team_stats_for.csv")
else:
    print("No 'for' data to save.")

if not weighted_df_against.empty:
    weighted_df_against.to_csv(
        f"{csv_file_path}/team_stats_against.csv", index=False
    )
    print(f"'Against' data successfully saved to team_stats_against.csv")
else:
    print("No 'against' data to save.")


Weighted DataFrame 'For':
              Squad    90s    npxG
23        Liverpool  114.0  2.0620
25  Manchester City  114.0  1.9320
0           Arsenal  114.0  1.7670
29    Newcastle Utd  114.0  1.7220
44        Tottenham  114.0  1.6520
12          Chelsea  114.0  1.5940
1       Aston Villa  114.0  1.4710
8          Brighton  114.0  1.4280
26   Manchester Utd  114.0  1.4180
7         Brentford  114.0  1.4010
16          Everton  114.0  1.2720
6       Bournemouth  122.0  1.2660
47         West Ham  114.0  1.2420
17           Fulham  122.0  1.2135
31  Nott'ham Forest  122.0  1.1540
14   Crystal Palace  114.0  1.1400
49           Wolves  114.0  1.0840
10          Burnley  122.0  0.9460
22   Leicester City  122.0  0.9290
24       Luton Town  130.0  0.8985
21     Leeds United  122.0  0.8950
40      Southampton  122.0  0.8695
38    Sheffield Utd  130.0  0.8640
20     Ipswich Town   46.0  0.7800
27    Middlesbrough  138.0  0.7005
30     Norwich City  130.0  0.6660
13    Coventry City  138.0  0

In [112]:
def get_team_stats_new_season(url):
    try:
        # Fetch and parse the page content
        data = requests.get(url).text.replace("<!--", "").replace("-->", "")
        soup = BeautifulSoup(data, "html.parser")

        # Initialize empty DataFrames
        df_for = pd.DataFrame()
        df_against = pd.DataFrame()

        # Extract the 'for' data table
        table_for = soup.find("table", {"id": "stats_squads_standard_for"})
        if table_for is not None:
            df_for = pd.read_html(StringIO(str(table_for)))[0]
            df_for.columns = [
                f"{i} {j}" if "Unnamed" not in i else j for i, j in df_for.columns
            ]

        # Extract the 'against' data table
        table_against = soup.find("table", {"id": "stats_squads_standard_against"})
        if table_against is not None:
            df_against = pd.read_html(StringIO(str(table_against)))[0]
            df_against.columns = [
                f"{i} {j}" if "Unnamed" not in i else j for i, j in df_against.columns
            ]

        # Rename relevant columns
        rename_for = {
            "Playing Time 90s": "90s",
            "Per 90 Minutes npxG": "npxG",
        }
        df_for = df_for.rename(rename_for, axis=1)

        rename_against = {
            "Playing Time 90s": "90s",
            "Per 90 Minutes npxG": "npxGC",
        }
        df_against = df_against.rename(rename_against, axis=1)

        # Keep only Squad, Season, Competition, and renamed columns
        selected_columns_for = ["Squad"] + list(
            rename_for.values()
        )
        selected_columns_against = ["Squad"] + list(
            rename_against.values()
        )

        if "Squad" in df_for.columns:
            df_for = df_for[selected_columns_for]
        if "Squad" in df_against.columns:
            df_against = df_against[selected_columns_against]

        # Remove the 'vs ' prefix from the 'Squad' column in combined_df_against
        df_against["Squad"] = df_against["Squad"].str.replace("vs ", "", regex=False)

        return df_for, df_against

    except Exception as e:
        print(f"An error occurred for URL: {url} - {e}")
        return pd.DataFrame(), pd.DataFrame()

In [113]:
# Get new season stats
new_season = get_team_stats_new_season(
    "https://fbref.com/en/comps/9/stats/Premier-League-Stats"
)

In [124]:
new_df_for = pd.DataFrame()
new_df_against = pd.DataFrame()

df_for, df_against = get_team_stats_new_season("https://fbref.com/en/comps/9/stats/Premier-League-Stats")

if not df_for.empty and not df_against.empty:
    new_df_for = pd.concat([new_df_for, df_for], ignore_index=True)
    new_df_against = pd.concat(
        [new_df_against, df_against], ignore_index=True
    )
else:
    print(f"No data extracted for URL: {url}")

In [125]:
teams_previous_seasons_for = pd.read_csv("C:/Users/erknud3/fpl-optimization/model/data/team_stats_for.csv")
teams_previous_seasons_against = pd.read_csv("C:/Users/erknud3/fpl-optimization/model/data/team_stats_against.csv")

In [126]:
new_season_for = pd.merge(teams_previous_seasons_for, new_df_for, on="Squad", how="inner", suffixes=("", "_new"))
new_season_against = pd.merge(teams_previous_seasons_against, new_df_against, on="Squad", how="inner", suffixes=("", "_new"))

In [127]:
new_season_for.insert(0, 'team_id', range(1,21))
new_season_against.insert(0, 'team_id', range(1,21))

In [129]:
def calculate_weighted_stats(df, is_for=True, weight_new_data=1.0):
    """Calculate weighted stats with additional weight for new data."""
    # Calculate total 90s for normalization
    total_90s = df["90s"] + df["90s_new"]

    # Apply additional weight to new data
    weight_old_data = 1.0
    weight_new_data = float(weight_new_data)
    
    if is_for:
        df["weighted_npxG"] = (
            (df["npxG"] * df["90s"] / total_90s * weight_old_data)
            + (
                df["npxG_new"]
                * df["90s_new"]
                / total_90s
                * weight_new_data
            )
        ).round(2)
    else:
        df["weighted_npxGC"] = (
            (df["npxGC"] * df["90s"] / total_90s * weight_old_data)
            + (
                df["npxGC_new"]
                * df["90s_new"]
                / total_90s
                * weight_new_data
            )
        ).round(2)

In [132]:
# Calculate weighted stats
teams_new_season_for = calculate_weighted_stats(new_season_for, is_for=True, weight_new_data=1.5)
teams_new_season_against = calculate_weighted_stats(new_season_against, is_for=False, weight_new_data=1.5)

max_mp = new_season_for["90s_new"].max().astype(int)

csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data"

if not new_season_for.empty:
    new_season_for.to_csv(f"{csv_file_path}/teams_new_season_for_gw{max_mp}.csv", index=False)
    print(f"'For' data successfully saved to teams_new_season_for_gw.csv")
else:
    print("No 'for' data to save.")

if not new_season_against.empty:
    new_season_against.to_csv(f"{csv_file_path}/teams_new_season_against_gw{max_mp}.csv", index=False)
    print(f"'Against' data successfully saved to teams_new_season_against_gw.csv")
else:
    print("No 'against' data to save.")

'For' data successfully saved to teams_new_season_for_gw.csv
'For' data successfully saved to teams_new_season_against_gw.csv


In [1]:
import pandas as pd
import os

In [2]:
newest_gw = 3

# Load the necessary CSV files
fpl_players_path = f"C:/Users/erknud3/fpl-optimization/model/data/New_Season_Data/fpl_players_new_season_gw{newest_gw}.csv"
teams_pred_npxG_path = f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/teams_pred_npxG_gw{newest_gw}.csv"
gc_probs_path = f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/GC_probabilities.csv"
pen_share_path = f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/pen_share.csv"

# Check if files exist
if not (
    os.path.exists(fpl_players_path)
    and os.path.exists(teams_pred_npxG_path)
    and os.path.exists(gc_probs_path)
):
    raise FileNotFoundError(
        f"One or more necessary files do not exist for gameweek {newest_gw}."
    )

fpl_players_new_season = pd.read_csv(fpl_players_path)
teams_pred_npxG = pd.read_csv(teams_pred_npxG_path)
gc_probs = pd.read_csv(gc_probs_path)
pen_share = pd.read_csv(pen_share_path)

# Check if 'weighted_xAG' exists immediately after loading the DataFrame
print(
    "Columns in fpl_players_new_season after loading:",
    fpl_players_new_season.columns,
)

Columns in fpl_players_new_season after loading: Index(['fbref', 'fpl_id', 'first_name', 'second_name', 'Player', 'web_name',
       'Age', 'team_id', 'team_name', 'short_name', 'element_type', 'position',
       'Seasons_count', 'now_cost', 'tsb', 'MP', 'Starts', 'Min', '90s',
       'npxG', 'xAG', 'finishing', 'MP_new', '90s_new', 'npxG_new', 'xAG_new',
       'weighted_npxG', 'weighted_xAG'],
      dtype='object')


In [3]:
# Define position multipliers
position_multipliers = {
1: 10,
2: 6,
3: 5,
4: 4,
}  # GKP, DEF, MID, FWD multipliers

In [4]:
# Generate player_xp_goals
columns_to_keep = [
    "fpl_id",
    "Player",
    "web_name",
    "Age",
    "team_name",
    "team_id",
    "element_type",
    "now_cost",
    "tsb",
    "Min",
    "90s",
    "finishing",
    "MP_new",
    "90s_new",
    "weighted_npxG",
]
player_xp_goals = fpl_players_new_season[columns_to_keep].copy()

for gw in range(1, 39):
    gw_column = str(gw)
    player_xp_goals[gw_column] = player_xp_goals.apply(
        lambda row: row["weighted_npxG"]
        * teams_pred_npxG.loc[
            teams_pred_npxG["team_id"] == row["team_id"], gw_column
        ].values[0]
        * row["finishing"]
        * position_multipliers[row["element_type"]],
        axis=1,
    )

player_xp_goals = player_xp_goals.round(2)

xp_goals_1 = player_xp_goals[["Player", "1"]]

print(xp_goals_1.sort_values(by="1", ascending=False).head(10))

                 Player     1
202          Diogo Jota  3.39
107       Mohamed Salah  3.22
279      Alexander Isak  2.94
423        Darwin Núñez  2.93
301      Erling Haaland  2.68
225       Harvey Barnes  2.51
357           Luis Díaz  2.37
453       Ali Al Hamadi  2.19
50        Son Heung-min  2.12
523  Alejandro Garnacho  2.12


In [5]:
# Generate player_xp_pens
player_xp_pens = fpl_players_new_season[columns_to_keep].copy()

# Merge player_xp_pens with pen_share to get the penalty share for each player
player_xp_pens = player_xp_pens.merge(
    pen_share[["fpl_id", "pen_share"]], on="fpl_id", how="left"
)

# Fill missing pen_share values with 0 (players who don't take penalties)
player_xp_pens["pen_share"] = player_xp_pens["pen_share"].fillna(0)

for gw in range(1, 39):
    gw_column = str(gw)
    player_xp_pens[gw_column] = player_xp_pens.apply(
        lambda row: (
            (
                0.1
                * teams_pred_npxG.loc[
                    teams_pred_npxG["team_id"] == row["team_id"], gw_column
                ].values[0]
                * 0.77
                * position_multipliers[row["element_type"]]
                * row["pen_share"]
            )
            if row["pen_share"] > 0
            else 0
        ),  # Ensure EV is 0 if pen_share is 0
        axis=1,
    )

player_xp_pens = player_xp_pens.round(2)

xp_pens_1 = player_xp_pens[["Player", "1"]]

print(xp_pens_1.sort_values(by="1", ascending=False).head(10))

                    Player     1
109          Mohamed Salah  0.47
281         Alexander Isak  0.44
304            Bukayo Saka  0.43
33           Callum Wilson  0.42
424           Bryan Mbeumo  0.39
52           Son Heung-min  0.37
124        Bruno Fernandes  0.37
298     Morgan Gibbs-White  0.36
76             Jamie Vardy  0.34
180  Dominic Calvert-Lewin  0.31


In [6]:
# Generate player_xp_assists
columns_to_keep_assists = [
    "fpl_id",
    "Player",
    "web_name",
    "Age",
    "team_name",
    "team_id",
    "element_type",
    "now_cost",
    "tsb",
    "Min",
    "90s",
    "MP_new",
    "90s_new",
    "weighted_xAG",
]
player_xp_assists = fpl_players_new_season[columns_to_keep_assists].copy()

if "weighted_xAG" not in fpl_players_new_season.columns:
    raise KeyError(
        f"'weighted_xAG' not found in fpl_players_new_season DataFrame columns: {fpl_players_new_season.columns}"
    )

for gw in range(1, 39):
    gw_column = str(gw)
    player_xp_assists[gw_column] = player_xp_assists.apply(
        lambda row: row["weighted_xAG"]
        * teams_pred_npxG.loc[
            teams_pred_npxG["team_id"] == row["team_id"], gw_column
        ].values[0]
        * 3,
        axis=1,
    )

player_xp_assists = player_xp_assists.round(2)

xp_assits_1 = player_xp_assists[["Player", "1"]]

print(xp_assits_1.sort_values(by="1", ascending=False).head(10))

                     Player     1
28          Kevin De Bruyne  1.52
122         Bruno Fernandes  1.33
100            Jacob Murphy  1.30
35          Kieran Trippier  1.30
163  Trent Alexander-Arnold  1.24
107           Mohamed Salah  1.21
302             Bukayo Saka  1.07
413      Gabriel Martinelli  1.03
423            Darwin Núñez  0.99
334          Anthony Gordon  0.94


In [7]:
# Generate player_xp_cs
columns_to_keep_cs = [
    "fpl_id",
    "Player",
    "web_name",
    "Age",
    "team_name",
    "team_id",
    "element_type",
    "now_cost",
    "tsb",
    "Min",
    "90s",
    "MP_new",
    "90s_new",
]
player_xp_cs = fpl_players_new_season[columns_to_keep_cs].copy()

points_per_goal_scenario = {
    0: {1: 4, 2: 4, 3: 1, 4: 0},
    1: 0,
    2: {1: -1, 2: -1, 3: 0, 4: 0},
    4: {1: -2, 2: -2, 3: 0, 4: 0},
    6: {1: -3, 2: -3, 3: 0, 4: 0},
    8: {1: -4, 2: -4, 3: 0, 4: 0},
}

for gw in range(1, 39):
    cs_column = f"{gw}_0_goals"
    gc_1_column = f"{gw}_1_goals"
    gc_2_column = f"{gw}_2_goals"
    gc_4_column = f"{gw}_4_goals"
    gc_6_column = f"{gw}_6_goals"
    gc_8_column = f"{gw}_8_goals"

    player_xp_cs[str(gw)] = player_xp_cs.apply(
        lambda row: (
            gc_probs.loc[
                gc_probs["team_id"] == row["team_id"], cs_column
            ].values[0]
            * points_per_goal_scenario[0][row["element_type"]]
            + gc_probs.loc[
                gc_probs["team_id"] == row["team_id"], gc_1_column
            ].values[0]
            * points_per_goal_scenario[1]
            + gc_probs.loc[
                gc_probs["team_id"] == row["team_id"], gc_2_column
            ].values[0]
            * points_per_goal_scenario[2][row["element_type"]]
            + gc_probs.loc[
                gc_probs["team_id"] == row["team_id"], gc_4_column
            ].values[0]
            * points_per_goal_scenario[4][row["element_type"]]
            + gc_probs.loc[
                gc_probs["team_id"] == row["team_id"], gc_6_column
            ].values[0]
            * points_per_goal_scenario[6][row["element_type"]]
            + gc_probs.loc[
                gc_probs["team_id"] == row["team_id"], gc_8_column
            ].values[0]
            * points_per_goal_scenario[8][row["element_type"]]
        ),
        axis=1,
    )

player_xp_cs = player_xp_cs.round(2)

xp_cs_1 = player_xp_cs[["Player", "1"]]

print(xp_cs_1.sort_values(by="1", ascending=False).head(10))

                  Player     1
240  Oleksandr Zinchenko  2.19
459      Karl Jakob Hein  2.19
402         Jakub Kiwior  2.19
210            Ben White  2.19
313    Gabriel Magalhães  2.19
137           David Raya  2.19
197       Kieran Tierney  2.19
31                  Neto  2.19
304    Takehiro Tomiyasu  2.19
466                  NaN  2.19


In [21]:
# Step 1: Rename the columns to add prefixes
player_xp_goals_prefixed = player_xp_goals.add_prefix('goals_')
player_xp_pens_prefixed = player_xp_pens.add_prefix('pens_')
player_xp_assists_prefixed = player_xp_assists.add_prefix('assists_')
player_xp_cs_prefixed = player_xp_cs.add_prefix('cs_')

# Step 2: Ensure 'Player' column is correctly renamed back
for df in [player_xp_goals_prefixed, player_xp_pens_prefixed, player_xp_assists_prefixed, player_xp_cs_prefixed]:
    df.rename(columns={
        f"{df.columns[0]}": "fpl_id",
        f"{df.columns[1]}": "Player",  # Ensuring Player column is correctly renamed
        f"{df.columns[2]}": "web_name",
        f"{df.columns[3]}": "Age",
        f"{df.columns[4]}": "team_name",
        f"{df.columns[5]}": "team_id",
        f"{df.columns[6]}": "element_type",
        f"{df.columns[7]}": "now_cost",
        f"{df.columns[8]}": "tsb",
        f"{df.columns[9]}": "Min",
        f"{df.columns[10]}": "90s",
        f"{df.columns[11]}": "MP_new",
        f"{df.columns[12]}": "90s_new"
    }, inplace=True)


# Step 3: Remove rows with NaN values in the Player column in all relevant dataframes
player_xp_goals_prefixed = player_xp_goals_prefixed.dropna(subset=["Player"])
player_xp_pens_prefixed = player_xp_pens_prefixed.dropna(subset=["Player"])
player_xp_assists_prefixed = player_xp_assists_prefixed.dropna(subset=["Player"])
player_xp_cs_prefixed = player_xp_cs_prefixed.dropna(subset=["Player"])

# Step 4: Merge the dataframes
merged_df = player_xp_goals_prefixed.merge(player_xp_pens_prefixed, on=["fpl_id", "Player", "web_name", "Age", "team_name", "team_id", "element_type", "now_cost", "tsb", "Min", "90s", "MP_new", "90s_new"], how='outer')
merged_df = merged_df.merge(player_xp_assists_prefixed, on=["fpl_id", "Player", "web_name", "Age", "team_name", "team_id", "element_type", "now_cost", "tsb", "Min", "90s", "MP_new", "90s_new"], how='outer')
merged_df = merged_df.merge(player_xp_cs_prefixed, on=["fpl_id", "Player", "web_name", "Age", "team_name", "team_id", "element_type", "now_cost", "tsb", "Min", "90s", "MP_new", "90s_new"], how='outer')

# Step 5: Inspect the merged dataframe
merged_1 = merged_df[["Player", "goals_1", "pens_1", "assists_1", "cs_1"]]
print(merged_1.sort_values(by="goals_1", ascending=False).head(10))

# Step 6: Save the merged dataframe to a CSV file
merged_df.to_csv(f"C:/Users/erknud3/fpl-optimization/model/data/Prediction_Data/player_ev_gw{newest_gw}.csv", index=False)

print(f"Player expected values for all metrics merged and saved successfully for gameweek {newest_gw}.")


player_xp_goals_prefixed columns: Index(['goals_fpl_id', 'goals_Player', 'goals_web_name', 'goals_Age',
       'goals_team_name', 'goals_team_id', 'goals_element_type',
       'goals_now_cost', 'goals_tsb', 'goals_Min', 'goals_90s',
       'goals_finishing', 'goals_MP_new', 'goals_90s_new',
       'goals_weighted_npxG', 'goals_1', 'goals_2', 'goals_3', 'goals_4',
       'goals_5', 'goals_6', 'goals_7', 'goals_8', 'goals_9', 'goals_10',
       'goals_11', 'goals_12', 'goals_13', 'goals_14', 'goals_15', 'goals_16',
       'goals_17', 'goals_18', 'goals_19', 'goals_20', 'goals_21', 'goals_22',
       'goals_23', 'goals_24', 'goals_25', 'goals_26', 'goals_27', 'goals_28',
       'goals_29', 'goals_30', 'goals_31', 'goals_32', 'goals_33', 'goals_34',
       'goals_35', 'goals_36', 'goals_37', 'goals_38'],
      dtype='object')
player_xp_goals_prefixed columns after renaming: Index(['fpl_id', 'Player', 'web_name', 'Age', 'team_name', 'team_id',
       'element_type', 'now_cost', 'tsb', 'Min'

In [13]:
# Inspect the key columns for consistency across all dataframes
print(player_xp_goals_prefixed[["fpl_id", "Player", "team_id"]].drop_duplicates().head())
print(player_xp_pens_prefixed[["fpl_id", "Player", "team_id"]].drop_duplicates().head())
print(player_xp_assists_prefixed[["fpl_id", "Player", "team_id"]].drop_duplicates().head())
print(player_xp_cs_prefixed[["fpl_id", "Player", "team_id"]].drop_duplicates().head())

   fpl_id           Player  team_id
0     134     James Milner        5
1     344              NaN       13
2     236  James Tarkowski        8
3     238     Ashley Young        8
4     414       John Ruddy       15
   fpl_id           Player  team_id
0     134     James Milner        5
1     344              NaN       13
2     236  James Tarkowski        8
3     238     Ashley Young        8
4     414       John Ruddy       15
   fpl_id           Player  team_id
0     134     James Milner        5
1     344              NaN       13
2     236  James Tarkowski        8
3     238     Ashley Young        8
4     414       John Ruddy       15
   fpl_id           Player  team_id
0     134     James Milner        5
1     344              NaN       13
2     236  James Tarkowski        8
3     238     Ashley Young        8
4     414       John Ruddy       15
