In [1]:
import pandas as pd
from rapidfuzz import process, fuzz
import unidecode
import pulp

In [2]:
df = pd.read_html("fbref_shooting_2324.html")[0]
df.columns = df.columns.droplevel(0)
df.to_csv("fbref_shooting_2324_raw.csv", index=False)

In [3]:
df = df[df['Rk'] != 'Rk']
df['npxG'] = pd.to_numeric(df['npxG'])
df = df[df['Squad'] != 'Burnley']
df = df[df['Squad'] != 'Luton Town']
df = df[df['Squad'] != 'Sheffield Utd']

In [4]:
df_fpl_data = pd.read_csv("https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/cleaned_players.csv")

In [5]:
df1 = df[["Player", "npxG"]]

df2 = df_fpl_data[['first_name', 'second_name', 'element_type', 'now_cost']]

# Function to preprocess names (remove accents, lowercase)
def preprocess_name(name):
    return unidecode.unidecode(name).lower()

df1['processed_name'] = df1['Player'].apply(preprocess_name)
df2['processed_name'] = (df2['first_name'] + ' ' + df2['second_name']).apply(preprocess_name)

# Fuzzy matching
def match_names_with_score(row, choices, scorer=fuzz.token_sort_ratio, threshold=60):
    match, score, _ = process.extractOne(row['processed_name'], choices, scorer=scorer)
    if score >= threshold:
        return match, score
    else:
        return None, None

# Perform the fuzzy matching
choices = df2['processed_name'].tolist()
df1['matched_name'], df1['score'] = zip(*df1.apply(match_names_with_score, axis=1, choices=choices))

# Merge DataFrames
merged_df = pd.merge(df1, df2, left_on='matched_name', right_on='processed_name', how='left')

# Drop intermediate columns if needed
merged_df = merged_df.drop(columns=['processed_name_x', 'processed_name_y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['processed_name'] = df1['Player'].apply(preprocess_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['processed_name'] = (df2['first_name'] + ' ' + df2['second_name']).apply(preprocess_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['matched_name'], df1['score'] = zip(*df1.apply

In [6]:
merged_df[merged_df['npxG'] >= 1].to_clipboard() # do some manual cleaning 'off-screen'

In [7]:
df_cleaned = pd.read_csv("clean_fpl_xg_pos.csv")

In [8]:
df_merged = df[['Player', 'Squad', 'npxG']].merge(df_cleaned[['Player', 'first_name', 'second_name', 'element_type', 'now_cost']])
df_merged = df_merged.rename(columns={"Player": "Name", "Squad": "Club", "element_type": "Position", "now_cost": "Cost"})

In [9]:
flat_pen_taker_bonus = (99 + 106 + 81 + 106 + 92) / 20 / 5 # number of penalties per season / 20 (for each team) averaged over last 5 seasons
top_bottom_diff = 1.2 # https://www.perplexity.ai/search/number-of-penalties-from-teams-hMulbzGRRaugUOmQUGiNaQ#1 (top ten teams averaged around 6 penalties, 1 higher)
print(f"flat_pen_taker_bonus: {flat_pen_taker_bonus}")
print(f"top ten pen taker bonus: {flat_pen_taker_bonus + top_bottom_diff}")
print(f"bottom ten pen taker bonus: {flat_pen_taker_bonus - top_bottom_diff}")

df_merged['xG'] = df_merged['npxG']
df_merged.loc[df_merged['second_name'] == 'Haaland', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Saka', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Salah', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Heung-min', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Palmer', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Borges Fernandes', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Watkins', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Pedro', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Isak', 'xG'] += flat_pen_taker_bonus + top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Mbeumo', 'xG'] += flat_pen_taker_bonus + top_bottom_diff

df_merged.loc[df_merged['second_name'] == 'Unal', 'xG'] += flat_pen_taker_bonus - top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Mateta', 'xG'] += flat_pen_taker_bonus - top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Calvert-Lewin', 'xG'] += flat_pen_taker_bonus - top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Muniz Carvalho', 'xG'] += flat_pen_taker_bonus - top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Gibbs-White', 'xG'] += flat_pen_taker_bonus - top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Paqueta', 'xG'] += flat_pen_taker_bonus - top_bottom_diff
df_merged.loc[df_merged['second_name'] == 'Sarabia', 'xG'] += flat_pen_taker_bonus - top_bottom_diff

flat_pen_taker_bonus: 4.84
top ten pen taker bonus: 6.04
bottom ten pen taker bonus: 3.6399999999999997


In [10]:
max_players = 10  # Total number of players
max_per_club = 3  # Max players per club

# Define possible formations
formations = {
    '3-5-2': {'FWD': 2, 'MID': 5, 'DEF': 3, 'budget': 790},
    '4-4-2': {'FWD': 2, 'MID': 4, 'DEF': 4, 'budget': 785},
    '3-4-3': {'FWD': 3, 'MID': 4, 'DEF': 3, 'budget': 790},
    '4-5-1': {'FWD': 1, 'MID': 5, 'DEF': 4, 'budget': 785}
}

best_total_xG = 0
best_formation = None
best_selected_players = []
best_captain = None
best_total_cost = 0
best_players_by_position = {}

# Iterate through each formation
for formation_name, max_per_position in formations.items():
    # Create a linear programming problem
    prob = pulp.LpProblem(f"Fantasy_Football_Team_Selection_{formation_name}", pulp.LpMaximize)

    # Decision variables: 1 if the player is selected, 0 otherwise
    x = pulp.LpVariable.dicts("x", df_merged['second_name'], cat="Binary")

    # Decision variables for captains: 1 if the player is selected as captain, 0 otherwise
    c = pulp.LpVariable.dicts("c", df_merged['second_name'], cat="Binary")

    # Objective function: Maximize the total expected goals (xG), considering captaincy
    prob += pulp.lpSum([df_merged.loc[i, 'xG'] * (x[df_merged.loc[i, 'second_name']] + c[df_merged.loc[i, 'second_name']]) for i in df_merged.index])

    # Constraint 1: Budget constraint
    prob += pulp.lpSum([df_merged.loc[i, 'Cost'] * x[df_merged.loc[i, 'second_name']] for i in df_merged.index]) <= max_per_position['budget']

    # Constraint 2: Position constraints
    for position, max_count in max_per_position.items():
        if position != 'budget':  # Skip the 'budget' key
            prob += pulp.lpSum([x[df_merged.loc[i, 'second_name']] for i in df_merged.index if df_merged.loc[i, 'Position'] == position]) <= max_count

    # Constraint 3: Total number of players
    prob += pulp.lpSum([x[df_merged.loc[i, 'second_name']] for i in df_merged.index]) == max_players

    # Constraint 4: Club constraints
    clubs = df_merged['Club'].unique()
    for club in clubs:
        prob += pulp.lpSum([x[df_merged.loc[i, 'second_name']] for i in df_merged.index if df_merged.loc[i, 'Club'] == club]) <= max_per_club

    # Constraint 5: Exactly one captain
    prob += pulp.lpSum([c[df_merged.loc[i, 'second_name']] for i in df_merged.index]) == 1

    # Constraint 6: A player can only be captain if they are also in the team
    for i in df_merged.index:
        prob += c[df_merged.loc[i, 'second_name']] <= x[df_merged.loc[i, 'second_name']]

    # Solve the problem
    prob.solve()

    # Calculate total xG and cost for this formation
    total_xG = sum(df_merged.loc[i, 'xG'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1) + \
               sum(df_merged.loc[i, 'xG'] for i in df_merged.index if c[df_merged.loc[i, 'second_name']].varValue == 1)
    total_cost = sum(df_merged.loc[i, 'Cost'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1)

    # Check if this formation is better
    if total_xG > best_total_xG:
        best_total_xG = total_xG
        best_formation = formation_name
        best_selected_players = [df_merged.loc[i, 'second_name'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1]
        best_captain = [df_merged.loc[i, 'second_name'] for i in df_merged.index if c[df_merged.loc[i, 'second_name']].varValue == 1][0]
        best_total_cost = total_cost
        
        # Store selected players by position
        best_players_by_position = {
            position: [df_merged.loc[i, 'second_name'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1 and df_merged.loc[i, 'Position'] == position]
            for position in ['FWD', 'MID', 'DEF']
        }

# Print the best formation and selected players
print(f"Best Formation: {best_formation}")
print("Selected Players:")
for player in best_selected_players:
    print(player)
print(f"\nCaptain: {best_captain}")
print(f"Total Expected Goals (xG): {best_total_xG}")
print(f"Total Cost: {best_total_cost} million")

# Print the selected players by position
print("\nSelected Players by Position:")
for position, players in best_players_by_position.items():
    print(f"{position}s: {', '.join(players)}")

Best Formation: 3-4-3
Selected Players:
Aït-Nouri
Cash
Díaz
Doucouré
Haaland
Isak
Mbeumo
Romero
Salah
Watkins

Captain: Haaland
Total Expected Goals (xG): 169.04
Total Cost: 790 million

Selected Players by Position:
FWDs: Haaland, Isak, Watkins
MIDs: Díaz, Doucouré, Mbeumo, Salah
DEFs: Aït-Nouri, Cash, Romero
