In [1]:
import pandas as pd
from rapidfuzz import process, fuzz
import unidecode
import pulp

In [2]:
df = pd.read_html("fbref_shooting_2324.html")[0]

In [3]:
df.columns = df.columns.droplevel(0)

In [4]:
df.to_csv("fbref_shooting_2324_raw.csv", index=False)

In [5]:
df = df[df['Rk'] != 'Rk']

In [6]:
df['npxG'] = pd.to_numeric(df['npxG'])

In [7]:
df_fpl_data = pd.read_csv("https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2024-25/cleaned_players.csv")

In [8]:
df = df[df['Squad'] != 'Burnley']
df = df[df['Squad'] != 'Luton Town']
df = df[df['Squad'] != 'Sheffield Utd']

In [9]:
# Sample DataFrames
df1 = df[["Player", "npxG"]]

df2 = df_fpl_data[['first_name', 'second_name', 'element_type', 'now_cost']]

# Function to preprocess names (remove accents, lowercase)
def preprocess_name(name):
    return unidecode.unidecode(name).lower()

df1['processed_name'] = df1['Player'].apply(preprocess_name)
df2['processed_name'] = (df2['first_name'] + ' ' + df2['second_name']).apply(preprocess_name)

# Fuzzy matching
def match_names_with_score(row, choices, scorer=fuzz.token_sort_ratio, threshold=60):
    match, score, _ = process.extractOne(row['processed_name'], choices, scorer=scorer)
    if score >= threshold:
        return match, score
    else:
        return None, None

# Perform the fuzzy matching
choices = df2['processed_name'].tolist()
df1['matched_name'], df1['score'] = zip(*df1.apply(match_names_with_score, axis=1, choices=choices))

# Merge DataFrames
merged_df = pd.merge(df1, df2, left_on='matched_name', right_on='processed_name', how='left')

# Drop intermediate columns if needed
merged_df = merged_df.drop(columns=['processed_name_x', 'processed_name_y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['processed_name'] = df1['Player'].apply(preprocess_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['processed_name'] = (df2['first_name'] + ' ' + df2['second_name']).apply(preprocess_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['matched_name'], df1['score'] = zip(*df1.apply

In [10]:
merged_df[merged_df['npxG'] >= 1].to_clipboard() # do some manual cleaning 'off-screen'

In [11]:
df_cleaned = pd.read_csv("clean_fpl_xg_pos.csv")

In [12]:
df_merged = df[['Player', 'Squad', 'npxG']].merge(df_cleaned[['Player', 'first_name', 'second_name', 'element_type', 'now_cost']])
df_merged = df_merged.rename(columns={"Player": "Name", "Squad": "Club", "element_type": "Position", "now_cost": "Cost"})

In [13]:
# Parameters
budget = 790  # FPL budget in millions
max_players = 10  # Total number of players
max_per_position = {
    "FWD": 2,
    "MID": 5,
    "DEF": 3
}

max_per_club = 3  # Max players per club

# Create a linear programming problem
prob = pulp.LpProblem("Fantasy_Football_Team_Selection", pulp.LpMaximize)

# Decision variables: 1 if the player is selected, 0 otherwise
x = pulp.LpVariable.dicts("x", df_merged['second_name'], cat="Binary")

# Decision variables for captains: 1 if the player is selected as captain, 0 otherwise
c = pulp.LpVariable.dicts("c", df_merged['second_name'], cat="Binary")

# Objective function: Maximize the total expected goals (npxG), considering captaincy
prob += pulp.lpSum([df_merged.loc[i, 'npxG'] * (x[df_merged.loc[i, 'second_name']] + c[df_merged.loc[i, 'second_name']]) for i in df_merged.index])

# Constraint 1: Budget constraint
prob += pulp.lpSum([df_merged.loc[i, 'Cost'] * x[df_merged.loc[i, 'second_name']] for i in df_merged.index]) <= budget

# Constraint 2: Position constraints
for position, max_count in max_per_position.items():
    prob += pulp.lpSum([x[df_merged.loc[i, 'second_name']] for i in df_merged.index if df_merged.loc[i, 'Position'] == position]) <= max_count

# Constraint 3: Total number of players
prob += pulp.lpSum([x[df_merged.loc[i, 'second_name']] for i in df_merged.index]) == max_players

# Constraint 4: Club constraints
clubs = df_merged['Club'].unique()
for club in clubs:
    prob += pulp.lpSum([x[df_merged.loc[i, 'second_name']] for i in df_merged.index if df_merged.loc[i, 'Club'] == club]) <= max_per_club

# Constraint 5: Exactly one captain
prob += pulp.lpSum([c[df_merged.loc[i, 'second_name']] for i in df_merged.index]) == 1

# Constraint 6: A player can only be captain if they are also in the team
for i in df_merged.index:
    prob += c[df_merged.loc[i, 'second_name']] <= x[df_merged.loc[i, 'second_name']]

# Solve the problem
prob.solve()

# Display the selected players
selected_players = [df_merged.loc[i, 'second_name'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1]
captain = [df_merged.loc[i, 'second_name'] for i in df_merged.index if c[df_merged.loc[i, 'second_name']].varValue == 1][0]
total_npxG = sum(df_merged.loc[i, 'npxG'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1) + \
             sum(df_merged.loc[i, 'npxG'] for i in df_merged.index if c[df_merged.loc[i, 'second_name']].varValue == 1)
total_cost = sum(df_merged.loc[i, 'Cost'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1)

# Print the selected players
print("Selected Players:")
for player in selected_players:
    print(player)
print(f"\nCaptain: {captain}")
print(f"Total Expected Goals (npxG): {total_npxG}")
print(f"Total Cost: {total_cost} million")

# Print the selected players by position
print("\nSelected Players by Position:")
for position in max_per_position.keys():
    players_in_position = [df_merged.loc[i, 'second_name'] for i in df_merged.index if x[df_merged.loc[i, 'second_name']].varValue == 1 and df_merged.loc[i, 'Position'] == position]
    print(f"{position}s: {', '.join(players_in_position)}")

Selected Players:
Aït-Nouri
Bowen
Burn
Cash
Díaz
Doucouré
Haaland
Jackson
Saka
Salah

Captain: Haaland
Total Expected Goals (npxG): 133.6
Total Cost: 790 million

Selected Players by Position:
FWDs: Haaland, Jackson
MIDs: Bowen, Díaz, Doucouré, Saka, Salah
DEFs: Aït-Nouri, Burn, Cash
