<a href="https://colab.research.google.com/github/shipley7/fifa/blob/main/FIFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
df = pd.read_csv("combined_match_data.csv") # getting the data
df.head() # printing out the data

Unnamed: 0,index,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,...,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist,home,game_id,top100,source_file
0,0,14 Players,,,,,990.0,3.0,2.0,1.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
1,1,Arjen Robben,10.0,nl NED,RW,33-207,30.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
2,2,Arturo Vidal,23.0,cl CHI,LM,30-088,90.0,0.0,1.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
3,3,Corentin Tolisso,24.0,fr FRA,RM,23-015,90.0,1.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
4,4,David Alaba,27.0,at AUT,LB,25-055,90.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv


In [3]:
df = df[df["Player"].str.contains("Players")==False] # removing excess rows that have no player information
df = df.drop(['index', "#"], axis = 1) # index and player number have no impact on anything
df[["Age Years", "Age Days"]] = (df.Age.str.split("-", expand = True)).apply(pd.to_numeric) # splitting age column into two other columns
df[["league", "season"]] = df["source_file"].replace("_player_data.csv", "", regex = True).str.split("_", expand = True) # splitting the source file into a league and season
df["season"] = (df["season"].str[:4].astype(int)) + 1 # season will be an integer representing the year the season ended
df = pd.get_dummies(df, columns=["league"]) # league will be a boolean variable representing which leage we are using
df = df.drop(['Age','source_file', "game_id"], axis = 1) # removing age, source file, and game id as they are no longer needed
df = df.fillna(0) # filling any NA with 0
df # just to check

Unnamed: 0,Player,Nation,Pos,Min,Gls,Ast,PK,PKatt,Sh,SoT,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
1,Arjen Robben,nl NED,RW,30.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1,0,33.0,207.0,2018,True,False,False,False,False
2,Arturo Vidal,cl CHI,LM,90.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,0,30.0,88.0,2018,True,False,False,False,False
3,Corentin Tolisso,fr FRA,RM,90.0,1.0,0.0,0.0,0.0,4.0,3.0,...,1,0,23.0,15.0,2018,True,False,False,False,False
4,David Alaba,at AUT,LB,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,25.0,55.0,2018,True,False,False,False,False
5,Franck Ribéry,fr FRA,LW,76.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,34.0,133.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452035,Ondrej Duda,sk SVK,DM,90.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0,0,30.0,171.0,2025,False,False,False,False,True
452036,Paweł Dawidowicz,pl POL,DM,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,5.0,2025,False,False,False,False,True
452037,Simone Perilli,it ITA,GK,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,138.0,2025,False,False,False,False,True
452038,Suat Serdar,de GER,DM,80.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0,0,28.0,44.0,2025,False,False,False,False,True


In [4]:
# splitting into keepers and outfielders since they all have different statistics
keepers = df[df["Pos"] == "GK"]
outfielders = df[df["Pos"] != "GK"]

In [5]:
# for keepers, removing everything that a keeper doesn't normally do
keepers = keepers.drop(["Pos", 'Gls', 'Ast', 'PK', 'PKatt', 'Sh',
       'SoT', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG',
       'xAG', 'SCA', 'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries',
       'PrgC', 'Att_x.1', 'Succ'], axis = 1)
keepers

Unnamed: 0,Player,Nation,Min,CrdY,CrdR,SoTA,GA,Saves,Save%,PSxG,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
13,Sven Ulreich,de GER,90.0,0.0,0.0,4.0,1.0,3.0,75.0,0.9,...,1,0,29.0,15.0,2018,True,False,False,False,False
19,Bernd Leno,de GER,90.0,0.0,0.0,8.0,3.0,5.0,75.0,3.5,...,0,0,25.0,167.0,2018,True,False,False,False,False
36,Koen Casteels,be BEL,90.0,0.0,0.0,5.0,3.0,2.0,40.0,1.9,...,1,0,25.0,55.0,2018,True,False,False,False,False
56,Roman Bürki,ch SUI,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,26.0,278.0,2018,True,False,False,False,False
70,Rune Jarstein,no NOR,90.0,0.0,0.0,2.0,0.0,2.0,100.0,0.1,...,1,0,32.0,324.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451966,Michele Di Gregorio,it ITA,90.0,0.0,0.0,5.0,2.0,3.0,60.0,1.4,...,0,0,27.0,302.0,2025,False,False,False,False,True
451981,Marco Carnesecchi,it ITA,90.0,0.0,0.0,10.0,3.0,7.0,70.0,3.0,...,1,0,24.0,328.0,2025,False,False,False,False,True
452006,Zion Suzuki,jp JPN,90.0,0.0,0.0,4.0,2.0,2.0,50.0,1.9,...,0,0,22.0,277.0,2025,False,False,False,False,True
452010,Devis Vásquez,co COL,90.0,0.0,0.0,2.0,2.0,0.0,0.0,1.5,...,1,0,27.0,13.0,2025,False,False,False,False,True


In [6]:
position_dummies = outfielders.Pos.str.get_dummies(sep = ",") # since most players play mutliple positions in a match, we will have to multi-hot encode
outfielders = pd.concat([outfielders.drop(columns=["Pos"]), position_dummies], axis = 1) # combining our binary position columns with the remaining dataframe
outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']] = outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']].astype('bool') # change position to boolean (did they play this position in the match or not?)
outfielders = outfielders.drop(['SoTA', 'GA', 'Saves', 'Save%', 'PSxG',
       'Cmp_y', 'Att_y', 'Cmp%_y', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen',
       'Att_y.1', 'Launch%.1', 'AvgLen.1', 'Opp', 'Stp', 'Stp%', '#OPA',
       'AvgDist', 'GK'], axis = 1) # removing keeper statistics
outfielders

Unnamed: 0,Player,Nation,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,DM,FW,LB,LM,LW,MF,RB,RM,RW,WB
1,Arjen Robben,nl NED,30.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,Arturo Vidal,cl CHI,90.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,False,False,False,True,False,False,False,False,False,False
3,Corentin Tolisso,fr FRA,90.0,1.0,0.0,0.0,0.0,4.0,3.0,0.0,...,False,False,False,False,False,False,False,True,False,False
4,David Alaba,at AUT,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,False
5,Franck Ribéry,fr FRA,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452034,Nicolás Valentini,ar ARG,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
452035,Ondrej Duda,sk SVK,90.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452036,Paweł Dawidowicz,pl POL,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452038,Suat Serdar,de GER,80.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,True,False,False,False,False,False,False,False,False,False


In [None]:
selected_season = int(input("Please enter the season you would like to investigate:"))

top_100_outfielders = outfielders[(outfielders['top100'] != 0) & (outfielders['season'] == selected_season)] # get players who at some point have been in the top 100
remainders = outfielders[(outfielders["top100"] == 0) & (outfielders['season'] == selected_season)] # players who are not in the top 100

top_100_outfielders_agg = top_100_outfielders.groupby("Player").mean(numeric_only=True).reset_index() # aggregate each top 100 player's stats (mean) into one row
remainders_agg = remainders.groupby("Player").mean(numeric_only=True).reset_index() # aggreate remainding players stats into one row

# Should we keep leagues? 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a' For now we will not
top_agg = top_100_outfielders_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1) # removing columns I do not believe will help
rest_agg = remainders_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1) # removing columns I do not believe will help

# We should standardize our data
scaler = StandardScaler()
top_scaled_agg = scaler.fit_transform(top_agg)
rest_scaled_agg = scaler.fit_transform(rest_agg)

knn_out_agg = NearestNeighbors(n_neighbors=5, metric="euclidean") # KNN
knn_out_agg.fit(top_scaled_agg) # Fit with the top 100 player data

distances, indices = knn_out_agg.kneighbors(rest_scaled_agg) # for all non-top 100 players, find their neighbors (who will be top 100)
player = input("Please enter the name of an outfielder:") # Let's add player input for fun
player_number = remainders_agg[remainders_agg["Player"] == player].index[0] # Find the row where the player is
print("Candidate:", remainders_agg.iloc[player_number]['Player']) # Ensuring we have the right player
print("Similar top 100 players:")

# The top five top 100 players most similar to this rando for that year
for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {top_100_outfielders_agg.iloc[number]['Player']}")

# We will now do the same as above, but this time we will enter a top 100 player
# Then, we find the top five non-top 100 players who are most similar according to KNN
knn_out_agg_2 = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_out_agg_2.fit(rest_scaled_agg)

distances, indices = knn_out_agg_2.kneighbors(top_scaled_agg)
player = input("Please enter the name of an outfielder:")
player_number = top_100_outfielders_agg[top_100_outfielders_agg["Player"] == player].index[0]
print("Candidate:", top_100_outfielders_agg.iloc[player_number]['Player'])
print("Similar normal 100 players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {remainders_agg.iloc[number]['Player']}")

Please enter the season you would like to investigate:2018
Please enter the name of an outfielder:Anthony Martial
Candidate: Anthony Martial
Similar top 100 players:
1. Memphis
2. Lorenzo Insigne
3. Leroy Sané
4. Sadio Mané
5. Philippe Coutinho


In [None]:
# Now, let's do the keepers
selected_season = int(input("Please enter the season you would like to investigate:"))

top_100_keepers = keepers[(keepers['top100'] != 0) & (keepers['season'] == selected_season)]
remainders = keepers[(keepers["top100"] == 0) & (keepers['season'] == selected_season)]

top_100_keepers_agg = top_100_keepers.groupby("Player").mean(numeric_only=True).reset_index()
remainders_agg = remainders.groupby("Player").mean(numeric_only=True).reset_index()

top_agg = top_100_keepers_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1)
rest_agg = remainders_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1)

scaler = StandardScaler()
top_scaled_agg = scaler.fit_transform(top_agg)
rest_scaled_agg = scaler.fit_transform(rest_agg)

knn_keep_agg = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_keep_agg.fit(top_scaled_agg)

distances, indices = knn_keep_agg.kneighbors(rest_scaled_agg)
player = input("Please enter the name of a goalkeeper:")
player_number = remainders_agg[remainders_agg["Player"] == player].index[0]
print("Candidate:", remainders_agg.iloc[player_number]['Player'])
print("Similar top 100 players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {top_100_keepers_agg.iloc[number]['Player']}")

# "Reverse, reverse"
knn_keep_agg_2 = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_keep_agg_2.fit(rest_scaled_agg)

distances, indices = knn_keep_agg_2.kneighbors(top_scaled_agg)
player = input("Please enter the name of a goalkeeper:")
player_number = top_100_keepers_agg[top_100_keepers_agg["Player"] == player].index[0]
print("Candidate:", top_100_keepers_agg.iloc[player_number]['Player'])
print("Similar normal players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {remainders_agg.iloc[number]['Player']}")

Please enter the season you would like to investigate:2018
Please enter the name of a goalkeeper:Nick Pope
Candidate: Nick Pope
Similar top 100 players:
1. David de Gea
2. Jan Oblak
3. Jordan Pickford
4. Thibaut Courtois
5. Hugo Lloris


In [10]:
# Setup window for predictions
outfielders_pred = outfielders.groupby(["Player", "season"]).agg({
    'Min':'sum', 'Gls':'sum', 'Ast':'sum', 'PK':'sum', 'PKatt':'sum', 'Sh':'sum', 'SoT':'sum', 'CrdY':'sum', 'CrdR':'sum',
       'Touches':'sum', 'Tkl':'sum', 'Int':'sum', 'Blocks':'sum', 'xG':'sum', 'npxG':'sum', 'xAG':'sum', 'SCA':'sum', 'GCA':'sum',
       'Cmp_x':'sum', 'Att_x':'sum', 'Cmp%_x':'mean', 'PrgP':'sum', 'Carries':'sum', 'PrgC':'sum', 'Att_x.1':'sum',
       'Succ':'sum', 'home':'sum', 'top100':'mean', 'Age Years':'mean', 'Age Days':'mean', 'league_bundesliga':'mean',
       'league_la-liga':'mean', 'league_ligue-1':'mean', 'league_premier-league':'mean',
       'league_serie-a':'mean', 'AM':'mean', 'CB':'mean', 'CM':'mean', 'DF':'mean', 'DM':'mean', 'FW':'mean', 'LB':'mean', 'LM':'mean', 'LW':'mean',
       'MF':'mean', 'RB':'mean', 'RM':'mean', 'RW':'mean', 'WB':'mean'
}).reset_index() # how we are aggregating each of the player's statistics

outfielders_pred.top100 = (outfielders_pred.top100 != 0.0).astype(int) # make our target variable binary

train_seasons = [2018, 2019, 2020, 2021, 2022, 2023, 2024] # every season except 2025 is our training data
test_season = [2025] # we will test on the most recent season
train_df = outfielders_pred[outfielders_pred['season'].isin(train_seasons)] # get a dataframe of just training data
test_df = outfielders_pred[outfielders_pred['season'].isin(test_season)] # get a dataframe of just testing data

# Setting up X_train, y_train, X_test, and y_test
# Removing unnessary columns from training set
X_train = train_df.drop(columns = ['Player', 'season', 'home', 'Age Years', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_train = train_df['top100']
X_test = test_df.drop(columns = ['Player', 'season', 'home', 'Age Years', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_test = test_df['top100']
id_train = train_df.Player.reset_index().drop("index", axis = 1)
id_test = test_df.Player.reset_index().drop("index", axis = 1)

stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # The list of outfielders in the current top 100 who we do have in our dataset (77 of them)

In [11]:
# Resuable function
def predictions(model):
  y_pred = model.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
  y_pred_prob = model.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
  potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
  for i in range(len(y_pred)):
    potential_indexes.append(i) # add the index of the future top 100 player into our array
  stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player
  rankings = []

  # highest predicted probabilities will be how we base our rankings
  for i in range(len(stardom)):
    for j in range(i + 1):
      if (stardom[j] < stardom[i]):
        temp = stardom[i]
        temp2 = potential_indexes[i]
        stardom[i] = stardom[j]
        potential_indexes[i] = potential_indexes[j]
        stardom[j] = temp
        potential_indexes[j] = temp2

  correct = 0 # see how many were succesfully identified in top 100

  for i in range(100):
    rankings.append(id_test.iloc[potential_indexes[i], 0]) # append the player's with the highest predicted probabilities into our rankings
    if str(id_test.iloc[potential_indexes[i], 0]) in stars:
      correct += 1 # increase our correct if the player here actually appears in the top 100 somewhere

  return rankings, correct # return our rankings and correctness

In [12]:
log = LogisticRegression(class_weight="balanced", random_state = 42) # Logistic regression model
log.fit(X_train, y_train) # fit the model
log_rankings, log_correct = predictions(log) # get rankings and correctness

print("Players correct:", log_correct) # 47 players were correct
print("Percentage correct:", str(log_correct / len(stars) * 100) + "%") # 61% of the possible 77 players identified as true top 100s were in the top 100

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Players correct: 47
Percentage correct: 61.038961038961034%


In [13]:
# We will also try a SVM
clf = svm.SVC(class_weight="balanced", probability=True, random_state = 42) # SVM model
clf.fit(X_train, y_train) # fit the model
svm_rankings, svm_correct = predictions(clf) # get the rankings and correctness

print("Players correct:", svm_correct) # 43 players were correct
print("Percentage correct:", str(svm_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100

Players correct: 43
Percentage correct: 55.84415584415584%


In [14]:
# And Random Forests too
rf = RandomForestClassifier(class_weight='balanced', random_state = 42) # RF model
rf.fit(X_train, y_train) # fit the model
rf_rankings, rf_correct = predictions(rf) # get the rankings and correctness.

print("Players correct:", rf_correct) # 43 players were correct
print("Percentage correct:", str(rf_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100

Players correct: 43
Percentage correct: 55.84415584415584%


In [None]:
# We are now trying RandomizedSearchCV to see if it changes anything
log_tuned = LogisticRegression(random_state = 42) # base log model
param_grid_log = {
    'penalty':['l1','l2','elasticnet','none'],
    'C' : np.linspace(0.1, 2.0, 10),
    'solver': ['lbfgs','liblinear','saga'],
    'max_iter'  : [500,1000,2500,5000],
    'l1_ratio'  : np.linspace(0.0, 1.0, 5),
    'class_weight' : ['balanced']
} # parameter grid (ChatGPT generated this grid for me to use)


# First, we will try f1
random_search_log1 = RandomizedSearchCV(
    estimator = log_tuned,
    param_distributions = param_grid_log,
    n_iter = 100,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_log1.fit(X_train, y_train)
print('F1', random_search_log1.best_estimator_)
print('F1', random_search_log1.best_params_)

log1_correct = predictions(random_search_log1)[1]

print("F1 Players correct:", log1_correct)
print("F1 Percentage correct:", str(log1_correct / len(stars) * 100) + "%")

# Next, we will try ROC_AUC
random_search_log2 = RandomizedSearchCV(
    estimator = log_tuned,
    param_distributions = param_grid_log,
    n_iter = 100,
    scoring = "roc_auc",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_log2.fit(X_train, y_train)
print('R0C_AUC', random_search_log2.best_estimator_)
print('ROC_AUC', random_search_log2.best_params_)

log2_correct = predictions(random_search_log2)[1]

print("ROC_AUC Players correct:", log2_correct)
print("ROC_AUC Percentage correct:", str(log2_correct / len(stars) * 100) + "%")


# Finally, average precision
random_search_log3 = RandomizedSearchCV(
    estimator = log_tuned,
    param_distributions = param_grid_log,
    n_iter = 100,
    scoring = "average_precision",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_log3.fit(X_train, y_train)
print('Average Precision', random_search_log3.best_estimator_)
print('Average Precision', random_search_log3.best_params_)

log3_correct = predictions(random_search_log3)[1]

print("Average Precision Players correct:", log3_correct)
print("Average Precision Percentage correct:", str(log3_correct / len(stars) * 100) + "%")

135 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
22 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

F1 LogisticRegression(C=np.float64(0.5222222222222223), class_weight='balanced',
                   l1_ratio=np.float64(0.25), max_iter=2500, random_state=42)
F1 {'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 2500, 'l1_ratio': np.float64(0.25), 'class_weight': 'balanced', 'C': np.float64(0.5222222222222223)}
F1 Players correct: 49
F1 Percentage correct: 63.63636363636363%


135 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
37 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

R0C_AUC LogisticRegression(C=np.float64(0.1), class_weight='balanced',
                   l1_ratio=np.float64(0.5), max_iter=500, random_state=42,
                   solver='liblinear')
ROC_AUC {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 500, 'l1_ratio': np.float64(0.5), 'class_weight': 'balanced', 'C': np.float64(0.1)}
ROC_AUC Players correct: 48
ROC_AUC Percentage correct: 62.33766233766234%


135 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Average Precision LogisticRegression(C=np.float64(0.3111111111111111), class_weight='balanced',
                   l1_ratio=np.float64(1.0), max_iter=1000, penalty='l1',
                   random_state=42, solver='liblinear')
Average Precision {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 1000, 'l1_ratio': np.float64(1.0), 'class_weight': 'balanced', 'C': np.float64(0.3111111111111111)}
Average Precision Players correct: 47
Average Precision Percentage correct: 61.038961038961034%


In [None]:
# We are now trying RandomizedSearchCV to see if it changes anything
rf_tuned = RandomForestClassifier(random_state = 42) # base rf model
param_grid_rf = {
    'n_estimators': np.arange(100, 1001, 100),
    'max_depth': [None, 5, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': ['balanced']
} # parameter grid (ChatGPT recommended this is the grid I use)


# First, we wil try f1
random_search_rf1 = RandomizedSearchCV(
    estimator = rf_tuned,
    param_distributions = param_grid_rf,
    n_iter = 100,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_rf1.fit(X_train, y_train)
print('F1', random_search_rf1.best_estimator_)
print('F1', random_search_rf1.best_params_)

rf1_correct = predictions(random_search_rf1)

print("F1 Players correct:", rf1_correct)
print("F1 Percentage correct:", str(rf1_correct / len(stars) * 100) + "%")


# Next, we will use ROC_AUC
random_search_rf2 = RandomizedSearchCV(
    estimator = rf_tuned,
    param_distributions = param_grid_rf,
    n_iter = 100,
    scoring = "roc_auc",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_rf2.fit(X_train, y_train)
print('R0C_AUC', random_search_rf2.best_estimator_)
print('ROC_AUC', random_search_rf2.best_params_)

rf2_correct = predictions(random_search_rf2)

print("ROC_AUC Players correct:", rf2_correct)
print("ROC_AUC Percentage correct:", str(rf2_correct / len(stars) * 100) + "%")


# Finally, average precision
random_search_rf3 = RandomizedSearchCV(
    estimator = rf_tuned,
    param_distributions = param_grid_rf,
    n_iter = 100,
    scoring = "average_precision",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_rf3.fit(X_train, y_train)
print('Average Precision', random_search_rf3.best_estimator_)
print('Average Precision', random_search_rf3.best_params_)

rf3_correct = predictions(random_search_rf3)

print("Average Precision Players correct:", rf3_correct)
print("Average Precision Percentage correct:", str(rf3_correct / len(stars) * 100) + "%")

F1 RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=30,
                       min_samples_leaf=10, min_samples_split=10,
                       n_estimators=np.int64(800), random_state=42)
F1 {'n_estimators': np.int64(800), 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 30, 'class_weight': 'balanced', 'bootstrap': False}
F1 Players correct: 45
F1 Percentage correct: 58.44155844155844%
R0C_AUC RandomForestClassifier(class_weight='balanced', max_depth=50,
                       min_samples_leaf=10, min_samples_split=20,
                       n_estimators=np.int64(500), random_state=42)
ROC_AUC {'n_estimators': np.int64(500), 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 50, 'class_weight': 'balanced', 'bootstrap': True}
ROC_AUC Players correct: 44
ROC_AUC Percentage correct: 57.14285714285714%
Average Precision RandomForestClassifier(class_weight='balanced', min_samples_leaf=10,
   

In [15]:
# Let's test the best log model based on correctness.
best_log = LogisticRegression(solver = 'lbfgs', penalty = 'l2', C=np.float64(0.5222222222222223), class_weight='balanced', l1_ratio=np.float64(0.25), max_iter=2500, random_state=42)
best_log.fit(X_train, y_train)
best_log_rankings, best_log_correct = predictions(best_log)

print("Best Log Players correct:", best_log_correct)
print("Best Log Percentage correct:", str(best_log_correct / len(stars) * 100) + "%")
print(best_log_rankings) # These rankings are not bad

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best Log Players correct: 49
Best Log Percentage correct: 63.63636363636363%
['Kylian Mbappé', 'Joshua Kimmich', 'Lamine Yamal', 'Ousmane Dembélé', 'Mohamed Salah', 'Raphinha', 'Michael Olise', 'Bradley Barcola', 'Pierre Højbjerg', 'Omar Marmoush', 'Vitinha', 'Achraf Hakimi', 'Pedri', 'Désiré Doué', 'Leonardo Balerdi', 'Harry Kane', 'Granit Xhaka', 'Hugo Ekitike', 'Virgil van Dijk', 'Joško Gvardiol', 'Kim Min-jae', 'Florian Wirtz', 'Cole Palmer', 'Pau Cubarsí', 'Mason Greenwood', 'João Neves', 'William Saliba', 'Khvicha Kvaratskhelia', 'Robert Lewandowski', 'Federico Valverde', 'Mateo Kovačić', 'Matheus Cunha', 'Leroy Sané', 'Fabián Ruiz Peña', 'Angelo Stiller', 'Luis Díaz', 'Alexander Isak', 'Lucas Beraldo', 'Iñigo Martínez', 'Rayan Cherki', 'Luka Modrić', 'Amir Rrahmani', 'Mateo Retegui', 'Corentin Tolisso', 'Dominik Szoboszlai', 'Jamal Musiala', 'Andrey Santos', 'Serhou Guirassy', 'Adrien Rabiot', 'Maghnes Akliouche', 'Alexander Sørloth', 'Rúben Dias', 'Levi Colwill', 'Ademola Lookm

In [16]:
# Let's try the best random forest model based on correctness.
best_rf = RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=30, max_features = 'sqrt', min_samples_leaf=10, min_samples_split=10, n_estimators=np.int64(800), random_state=42)
best_rf.fit(X_train, y_train)
best_rf_rankings, best_rf_correct = predictions(best_rf)

print("Best RF Players correct:", best_rf_correct)
print("Best RF Percentage correct:", str(best_rf_correct / len(stars) * 100) + "%")
print(best_rf_rankings) # Holy shit, the predicted number one player in 2025 is actually the player who won the Ballon D'Or

Best RF Players correct: 45
Best RF Percentage correct: 58.44155844155844%
['Ousmane Dembélé', 'Michael Olise', 'Cole Palmer', 'Alexander Isak', 'Mohamed Salah', 'Kylian Mbappé', 'Joshua Kimmich', 'Bradley Barcola', 'Raphinha', 'Hugo Ekitike', 'Harry Kane', 'Lamine Yamal', 'Mason Greenwood', 'Omar Marmoush', 'Erling Haaland', 'Granit Xhaka', 'Ollie Watkins', 'Mateo Retegui', 'Achraf Hakimi', 'Vinicius Júnior', 'Pierre Højbjerg', 'Luis Díaz', 'Pedri', 'Matheus Cunha', 'Rafael Leão', 'Serhou Guirassy', 'Iñigo Martínez', 'Julián Álvarez', 'Ademola Lookman', 'Antoine Semenyo', 'Lewis Dunk', 'Rayan Cherki', 'Maghnes Akliouche', 'Florian Wirtz', 'Bruno Fernandes', 'Angelo Stiller', 'Virgil van Dijk', 'Joško Gvardiol', 'Pau Cubarsí', 'Désiré Doué', 'Moise Kean', 'Federico Valverde', 'Leroy Sané', 'Jude Bellingham', 'Christian Pulisic', 'Khvicha Kvaratskhelia', 'Éderson', 'Antonio Rüdiger', 'Nicolò Barella', 'Alessandro Bastoni', 'Luka Modrić', 'Nicolás Paz', 'Martin Ødegaard', 'Bruno Guimarãe

In [None]:
# Where to next?

# Ranking of top 100 (1st, 10th, unranked) - Logistic Regression sort of does this, will need to elaborate further
# Above top 25, where could they land
# Transferability (need club name)?
# Team chemistry (combine with KNN)?
# Play-by-play (+ or -)?

# log = LogisticRegression(class_weight = "balanced", max_iter=5000) # Logistic regression model
# log = LogisticRegression(C = 0.25, class_weight = "balanced", solver = 'liblinear') # Logistic regression model