In [1]:
import gurobipy as gb
from gurobipy import *
import pandas as pd
import numpy as np

In [61]:
# Read these two columns as strings rather than letting pandas infer their type
dtype_options = dict(nation_position='str', nation_logo_url='str')

players = pd.read_csv('players_22.csv', dtype=dtype_options)

In [62]:
# Show the (rows, columns) dimensions of the freshly loaded table
players.shape

(19239, 110)

In [58]:
# Count missing values per column across the attribute columns from 'pace' to 'goalkeeping_speed'
players.loc[:, 'pace':'goalkeeping_speed'].isna().sum()

pace                            2132
shooting                        2132
passing                         2132
dribbling                       2132
defending                       2132
physic                          2132
attacking_crossing                 0
attacking_finishing                0
attacking_heading_accuracy         0
attacking_short_passing            0
attacking_volleys                  0
skill_dribbling                    0
skill_curve                        0
skill_fk_accuracy                  0
skill_long_passing                 0
skill_ball_control                 0
movement_acceleration              0
movement_sprint_speed              0
movement_agility                   0
movement_reactions                 0
movement_balance                   0
power_shot_power                   0
power_jumping                      0
power_stamina                      0
power_strength                     0
power_long_shots                   0
mentality_aggression               0
m

In [59]:
#players.dropna(subset=players.loc[:, 'pace': 'goalkeeping_reflexes'].columns, inplace=True)
# Rows with missing values are kept on purpose: some gaps are tied to a player's position. For example, goalkeepers have no scores for pace, shooting, etc.

In [64]:
# Re-check the dimensions; the dropna above is commented out, so no rows have been removed
players.shape

(19239, 110)

In [65]:
# Split every position-score column between 'ls' and 'gk'. A value such as '83+3' becomes
# a numeric base score kept under the original name ('ls' -> 83) and a signed modifier
# stored in '<col>_potential' ('ls_potential' -> 3).
for col in players.loc[:, 'ls':'gk'].columns:
    potential_col = col + '_potential'

    if pd.api.types.is_numeric_dtype(players[col]):
        # The column is already numeric (this cell was re-run, or the CSV held plain
        # numbers), so there is nothing to parse and `.str` would raise. Only make sure
        # the modifier column exists, and never overwrite one produced by an earlier run.
        if potential_col not in players.columns:
            players[potential_col] = 0
        continue

    # Group 0 captures the base score, group 1 the optional signed modifier.
    parts = players[col].str.extract(r'(\d+)([\+\-]?\d*)')
    players[col] = pd.to_numeric(parts[0], errors='coerce')
    # An absent modifier parses to NaN and is recorded as 0. The filled Series is assigned
    # back explicitly: calling fillna(inplace=True) on a selected column is chained
    # assignment, which does not update the frame under pandas Copy-on-Write.
    players[potential_col] = pd.to_numeric(parts[1], errors='coerce').fillna(0)

In [68]:
# Keep only the rows that have a club position, then confirm none are left missing
players = players.dropna(subset=['club_position'])
players['club_position'].isna().sum()

0

In [69]:
# One-hot encode club_position into club_position_<POS> indicator columns
players = pd.get_dummies(players, columns=['club_position'])
players.columns

Index(['sofifa_id', 'player_url', 'short_name', 'long_name',
       'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur',
       'age',
       ...
       'club_position_RCM', 'club_position_RDM', 'club_position_RES',
       'club_position_RF', 'club_position_RM', 'club_position_RS',
       'club_position_RW', 'club_position_RWB', 'club_position_ST',
       'club_position_SUB'],
      dtype='object', length=165)

In [70]:
# Missing-value count per column, restricted to players whose nationality is Brazil
players.loc[players['nationality_name'].eq('Brazil')].isna().sum()

sofifa_id                        0
player_url                       0
short_name                       0
long_name                        0
player_positions                 0
overall                          0
potential                        0
value_eur                        0
wage_eur                         0
age                              0
dob                              0
height_cm                        0
weight_kg                        0
club_team_id                     0
club_name                        0
league_name                      0
league_level                     0
club_jersey_number               0
club_loaned_from               866
club_joined                     31
club_contract_valid_until        0
nationality_id                   0
nationality_name                 0
nation_team_id                 874
nation_position                874
nation_jersey_number           874
preferred_foot                   0
weak_foot                        0
skill_moves         

In [71]:
# Candidate pool for the optimisation: every player whose nationality is Brazil
brazil_players = players.loc[players['nationality_name'].eq('Brazil')]
brazil_players.shape

(897, 165)

In [97]:
# Build and solve a binary program that selects a 23-player Brazil squad maximising the
# summed `overall` rating: 11 starters in fixed club positions plus 12 players whose
# club position is SUB.

# NOTE: `id` shadows the Python builtin; the name is kept in case other cells rely on it.
id = brazil_players['sofifa_id'].to_numpy()
n = len(id)

model = gb.Model("World Cup Team Selection")

overall_score = brazil_players['overall'].to_numpy()


def _position_flags(position_code):
    """Return a 0/1 integer array marking the players whose club position is `position_code`."""
    # The indicator columns may be bool depending on the pandas version; cast them so the
    # coefficients handed to Gurobi are plain integers.
    return brazil_players['club_position_' + position_code].to_numpy(dtype=int)


# Goalkeepers
goalkeeper = _position_flags('GK')
# Defenders
center_back = _position_flags('CB')
left_back = _position_flags('LB')
right_back = _position_flags('RB')
# Midfielders
central_mid = _position_flags('CM')
left_mid = _position_flags('LM')
right_mid = _position_flags('RM')
# Forwards
striker = _position_flags('ST')
center_forward = _position_flags('CF')
# Substitute
substitute = _position_flags('SUB')

# Add variables: X[i] == 1 when the i-th Brazilian player is selected
X = model.addVars(n, vtype=GRB.BINARY, name=["Player with ID = " + str(i) for i in id])

# Set Objective: maximise the total overall rating of the selected players.
# quicksum builds Gurobi linear expressions more efficiently than Python's built-in sum.
model.setObjective(gb.quicksum(X[i] * overall_score[i] for i in range(n)), GRB.MAXIMIZE)

# Add Constraints
# Exactly 23 players in total.
model.addConstr(gb.quicksum(X[i] for i in range(n)) == 23, name="squad_size")

# (constraint name, indicator array, exact number of players required)
required_counts = [
    ("substitutes", substitute, 12),
    ("goalkeeper", goalkeeper, 1),
    ("center_back", center_back, 2),
    ("left_back", left_back, 1),
    ("right_back", right_back, 1),
    ("central_mid", central_mid, 2),
    ("left_mid", left_mid, 1),
    ("right_mid", right_mid, 1),
    ("striker", striker, 1),
    ("center_forward", center_forward, 1),
]
for label, flags, count in required_counts:
    model.addConstr(gb.quicksum(X[i] * flags[i] for i in range(n)) == count, name=label)

model.optimize()

Gurobi Optimizer version 10.0.2 build v10.0.2rc0 (mac64[x86])

CPU model: Intel(R) Core(TM) i5-5350U CPU @ 1.80GHz
Thread count: 2 physical cores, 4 logical processors, using up to 4 threads

Optimize a model with 11 rows, 897 columns and 1448 nonzeros
Model fingerprint: 0x438f9e72
Variable types: 0 continuous, 897 integer (897 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [5e+01, 9e+01]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 2e+01]
Found heuristic solution: objective 1613.0000000
Presolve removed 10 rows and 886 columns
Presolve time: 0.01s
Presolved: 1 rows, 11 columns, 11 nonzeros
Found heuristic solution: objective 1874.0000000
Variable types: 0 continuous, 11 integer (3 binary)

Root relaxation: cutoff, 0 iterations, 0.00 seconds (0.00 work units)

Explored 1 nodes (0 simplex iterations) in 0.06 seconds (0.00 work units)
Thread count was 4 (of 4 available processors)

Solution count 2: 1874 1613 

Optimal solution found 

In [104]:
# Selected players are reported by sofifa_id (embedded in the variable name) rather than by
# name: when short_name was used as the variable name, Gurobi had trouble decoding some
# special characters.
if model.SolCount == 0:
    # Reading the X attribute without a solution raises, so report the status instead.
    print("No feasible squad found; model status:", model.Status)
else:
    for v in model.getVars():
        # Binary values come back as floats that are only integral up to Gurobi's
        # integrality tolerance, so test against 0.5 rather than 0.
        if v.X > 0.5:
            print(v.VarName, "is selected")

Player with ID = 210257 is selected
Player with ID = 164240 is selected
Player with ID = 201942 is selected
Player with ID = 207863 is selected
Player with ID = 135507 is selected
Player with ID = 191043 is selected
Player with ID = 230658 is selected
Player with ID = 230666 is selected
Player with ID = 189242 is selected
Player with ID = 194404 is selected
Player with ID = 212462 is selected
Player with ID = 231943 is selected
Player with ID = 233419 is selected
Player with ID = 199304 is selected
Player with ID = 230294 is selected
Player with ID = 251573 is selected
Player with ID = 176676 is selected
Player with ID = 201400 is selected
Player with ID = 229927 is selected
Player with ID = 238794 is selected
Player with ID = 222715 is selected
Player with ID = 239580 is selected
Player with ID = 221540 is selected
