In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import pulp
import random
import numpy as np

# Step 1: Load and Preprocess Data
# Cleaned per-season player files, listed from the oldest season to the newest.
file_paths = [
    "E:/New folder/fpl_data/2021-22/cleaned_players_starts.csv",
    "E:/New folder/fpl_data/2022-23/cleaned_players_starts.csv",
    "E:/New folder/fpl_data/2023-24/cleaned_players_starts.csv",
    "E:/New folder/fpl_data/2024-25/cleaned_players_starts.csv"
]

# Parse every CSV into its own frame, then stack all rows into one table.
# The stacked frame keeps each file's own row labels (no re-indexing here).
dfs = list(map(pd.read_csv, file_paths))
df = pd.concat(dfs)

# Assign Weights to Seasons
# Sample weight per season, keyed by the string form of the 'Season' value.
season_weights = {'2021': 0.2, '2022': 0.3, '2023': 0.4, '2024': 1.0}

# Convert the season column to strings so it matches the dictionary keys;
# any season without an entry in the mapping ends up with a NaN weight.
season_keys = df['Season'].astype(str)
df['weight'] = season_keys.map(season_weights)

# One-Hot Encoding for categorical features
categorical_features = ['element_type', 'status']

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4. Try the current name first and fall back to the legacy one so
# the cell runs on both old and new installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Dense 0/1 indicator matrix, labelled with the encoder's generated column
# names (e.g. 'element_type_GK', 'status_a').
encoded_features = pd.DataFrame(
    encoder.fit_transform(df[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features),
)

# Update the DataFrame with the encoded features.
# The index is reset first so the rows line up positionally with
# `encoded_features`, which carries a fresh 0..n-1 index.
df = df.drop(columns=categorical_features).reset_index(drop=True)
df = pd.concat([df, encoded_features], axis=1)

# Drop unnecessary columns
df = df.drop(columns=['team_name'])

# Features and target
# Columns that must not be fed to the model: the target itself, the season
# label, the sample weight, and the two name columns.
non_feature_columns = ['total_points', 'Season', 'weight', 'first_name', 'second_name']
y = df['total_points']
X = df.drop(columns=non_feature_columns)

# Split data into training and validation sets (80/20), keeping the season
# proportions equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df['Season'],
)

# Train a Random Forest model with sample weights.
# The weights are looked up by the row labels that landed in the training split.
train_weights = df.loc[X_train.index, 'weight']
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train, sample_weight=train_weights)

# Step 2: Predict for the 2024-25 Season
# Boolean mask of the rows whose 'Season' value is 2024.
is_current_season = df['Season'] == 2024

# Same column exclusions as used for training, applied to the 2024 rows only.
X_current = df[is_current_season].drop(
    columns=['total_points', 'Season', 'weight', 'first_name', 'second_name']
)

# Store the predictions back on the matching rows; other rows get NaN.
df.loc[is_current_season, 'predicted_points'] = model.predict(X_current)

# Step 3: Optimization Problem Setup Using PuLP
# Keep only 2024 rows whose one-hot 'status_a' flag is set.
# `.copy()` makes this an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning or write into a temporary slice.
current_season_data = df[(df['Season'] == 2024) & (df['status_a'] == 1)].copy()

# Add random factor to predicted points.
# Each player gets an independent multiplier drawn uniformly from
# [1 - random_factor, 1 + random_factor]. Draws come from the `random` module
# in row order; no seed is set here, so results differ between runs.
random_factor = 0.05
noise = np.array(
    [random.uniform(-random_factor, random_factor) for _ in range(len(current_season_data))],
    dtype=float,
)
# Single vectorised assignment: the column is always created, even when the
# frame is empty, and no per-row `.loc` writes are needed.
current_season_data['randomized_points'] = current_season_data['predicted_points'] * (1 + noise)

# Maximisation problem that the objective and constraints are attached to.
problem = pulp.LpProblem("FPL_Player_Selection_2024_25", pulp.LpMaximize)

# Define decision variables: one binary variable per row label, 1 = selected.
player_vars = {}
for player_id in current_season_data.index:
    player_vars[player_id] = pulp.LpVariable(f"player_{player_id}", cat="Binary")

# Players that must never be selected, matched on first AND second name.
first_names = current_season_data['first_name']
second_names = current_season_data['second_name']
is_gross = (first_names == 'Pascal') & (second_names == 'Groß')
is_alvarez = (first_names == 'Julián') & (second_names == 'Álvarez')
omit_players = current_season_data[is_gross | is_alvarez].index

# Pin each excluded player's variable to zero.
for player_id in omit_players:
    problem += player_vars[player_id] == 0
    

# Objective function: Maximize total randomized points, scaled per position.
# Forwards are weighted 1.5, midfielders 0.9, every other position 1.0.
objective_terms = []
for player_id in current_season_data.index:
    if current_season_data.loc[player_id, 'element_type_FWD'] == 1:
        position_multiplier = 1.5
    elif current_season_data.loc[player_id, 'element_type_MID'] == 1:
        position_multiplier = 0.9
    else:
        position_multiplier = 1.0
    objective_terms.append(
        position_multiplier
        * current_season_data.loc[player_id, 'randomized_points']
        * player_vars[player_id]
    )
problem += pulp.lpSum(objective_terms)

def _total_cost():
    """Return a fresh PuLP expression for the summed 'now_cost' of selected players."""
    return pulp.lpSum([cost * player_vars[player_id]
                       for player_id, cost in current_season_data['now_cost'].items()])


def _count_selected(mask=None):
    """Return a fresh PuLP expression counting the selected players.

    mask: optional boolean Series aligned with current_season_data; when given,
    only the rows where it is True are counted.
    """
    rows = current_season_data if mask is None else current_season_data[mask]
    return pulp.lpSum([player_vars[player_id] for player_id in rows.index])


# A new expression is built for every constraint on purpose, so that no
# expression object is shared between two constraints.

# Constraint 1: Total cost between 990 and 1000 in 'now_cost' units.
# The report divides cost by 10 to print £m, so this is £99.0m to £100.0m.
MIN_TOTAL_COST = 990
MAX_TOTAL_COST = 1000
problem += _total_cost() <= MAX_TOTAL_COST
problem += _total_cost() >= MIN_TOTAL_COST

# Constraint 2: Exactly 15 players
SQUAD_SIZE = 15
problem += _count_selected() == SQUAD_SIZE

# Constraints 3-6: per-position bounds (one-hot column, minimum, maximum):
#   1-2 Goalkeepers, 3-5 Defenders, 2-5 Midfielders, 1-3 Forwards.
# The maxima add up to 2 + 5 + 5 + 3 = 15, i.e. exactly SQUAD_SIZE, so if every
# player belongs to exactly one of these positions each maximum is reached.
position_limits = [
    ('element_type_GK', 1, 2),
    ('element_type_DEF', 3, 5),
    ('element_type_MID', 2, 5),
    ('element_type_FWD', 1, 3),
]
for position_column, min_count, max_count in position_limits:
    in_position = current_season_data[position_column] == 1
    problem += _count_selected(in_position) >= min_count
    problem += _count_selected(in_position) <= max_count

# Constraint 7: No more than 3 players from the same team
MAX_PER_TEAM = 3
for team_id in current_season_data['team_id'].unique():
    problem += _count_selected(current_season_data['team_id'] == team_id) <= MAX_PER_TEAM

# Solve the problem
problem.solve()

# Print the optimization status
print("Optimization status:", pulp.LpStatus[problem.status])

# Get the selected players.
# Solver output for binary variables is floating point (and None when a
# variable received no value), so compare against a 0.5 threshold instead of
# testing exact equality with 1, which can silently drop a chosen player.
selected_players = [
    player_id for player_id, var in player_vars.items()
    if var.varValue is not None and var.varValue > 0.5
]

# Print selected player information
selected_player_data = current_season_data.loc[selected_players]
print("\nSelected players:")
print(selected_player_data[['first_name', 'second_name', 'element_type_GK', 'element_type_DEF', 'element_type_MID', 'element_type_FWD', 'team_id', 'now_cost', 'predicted_points']])

# Print total cost and predicted points.
# The totals use the model's 'predicted_points', not the randomized values
# that drove the objective; cost is divided by 10 to display it in £m.
total_cost = selected_player_data['now_cost'].sum()
total_points = selected_player_data['predicted_points'].sum()
print(f"\nTotal cost: £{total_cost/10:.1f}m")
print(f"Total predicted points: {total_points:.2f}")



Optimization status: Optimal

Selected players:
           first_name                second_name  element_type_GK  \
2361              Kai                    Havertz              0.0   
2375          William                     Saliba              0.0   
2381         Benjamin                      White              0.0   
2416            Ollie                    Watkins              0.0   
2442          Dominic           Solanke-Mitchell              0.0   
2543             Cole                     Palmer              0.0   
2554          Joachim                   Andersen              0.0   
2597           Jordan                   Pickford              1.0   
2612            Bernd                       Leno              1.0   
2619          Antonee                   Robinson              0.0   
2728  Rodrigo 'Rodri'                  Hernandez              0.0   
2764            Bruno  Guimarães Rodriguez Moura              0.0   
2768          Anthony                     Gordon       

In [24]:
# Show the column labels of current_season_data (inspection only).
print(current_season_data.columns)

Index(['first_name', 'second_name', 'goals_scored', 'assists', 'total_points',
       'minutes', 'goals_conceded', 'creativity', 'influence', 'threat',
       'bonus', 'bps', 'ict_index', 'clean_sheets', 'red_cards',
       'yellow_cards', 'selected_by_percent', 'now_cost', 'starts', 'team_id',
       'squad_strength', 'Season', 'weight', 'element_type_DEF',
       'element_type_FWD', 'element_type_GK', 'element_type_MID', 'status_a',
       'status_d', 'status_i', 'status_n', 'status_s', 'status_u',
       'predicted_points', 'randomized_points'],
      dtype='object')
