<a href="https://colab.research.google.com/github/shipley7/fifa/blob/main/FIFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
df = pd.read_csv("combined_match_data.csv") # getting the data
df.head() # printing out the data

Unnamed: 0,index,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,...,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist,home,game_id,top100,source_file
0,0,14 Players,,,,,990.0,3.0,2.0,1.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
1,1,Arjen Robben,10.0,nl NED,RW,33-207,30.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
2,2,Arturo Vidal,23.0,cl CHI,LM,30-088,90.0,0.0,1.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
3,3,Corentin Tolisso,24.0,fr FRA,RM,23-015,90.0,1.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
4,4,David Alaba,27.0,at AUT,LB,25-055,90.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv


In [3]:
df = df[df["Player"].str.contains("Players")==False] # removing excess rows that have no player information
df = df.drop(['index', "#"], axis = 1) # index and player number have no impact on anything
df[["Age Years", "Age Days"]] = (df.Age.str.split("-", expand = True)).apply(pd.to_numeric) # splitting age column into two other columns
df[["league", "season"]] = df["source_file"].replace("_player_data.csv", "", regex = True).str.split("_", expand = True) # splitting the source file into a league and season
df["season"] = (df["season"].str[:4].astype(int)) + 1 # season will be an integer representing the year the season ended
df = pd.get_dummies(df, columns=["league"]) # league will be a boolean variable representing which leage we are using
df = df.drop(['Age','source_file', "game_id"], axis = 1) # removing age, source file, and game id as they are no longer needed
df = df.fillna(0) # filling any NA with 0
df # just to check

Unnamed: 0,Player,Nation,Pos,Min,Gls,Ast,PK,PKatt,Sh,SoT,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
1,Arjen Robben,nl NED,RW,30.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1,0,33.0,207.0,2018,True,False,False,False,False
2,Arturo Vidal,cl CHI,LM,90.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,0,30.0,88.0,2018,True,False,False,False,False
3,Corentin Tolisso,fr FRA,RM,90.0,1.0,0.0,0.0,0.0,4.0,3.0,...,1,0,23.0,15.0,2018,True,False,False,False,False
4,David Alaba,at AUT,LB,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,25.0,55.0,2018,True,False,False,False,False
5,Franck Ribéry,fr FRA,LW,76.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,34.0,133.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452035,Ondrej Duda,sk SVK,DM,90.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0,0,30.0,171.0,2025,False,False,False,False,True
452036,Paweł Dawidowicz,pl POL,DM,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,5.0,2025,False,False,False,False,True
452037,Simone Perilli,it ITA,GK,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,138.0,2025,False,False,False,False,True
452038,Suat Serdar,de GER,DM,80.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0,0,28.0,44.0,2025,False,False,False,False,True


In [4]:
# splitting into keepers and outfielders since they all have different statistics
keepers = df[df["Pos"] == "GK"]
outfielders = df[df["Pos"] != "GK"]

In [5]:
# for keepers, removing everything that a keeper doesn't normally do
keepers = keepers.drop(["Pos", 'Gls', 'Ast', 'PK', 'PKatt', 'Sh',
       'SoT', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG',
       'xAG', 'SCA', 'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries',
       'PrgC', 'Att_x.1', 'Succ'], axis = 1)
keepers

Unnamed: 0,Player,Nation,Min,CrdY,CrdR,SoTA,GA,Saves,Save%,PSxG,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
13,Sven Ulreich,de GER,90.0,0.0,0.0,4.0,1.0,3.0,75.0,0.9,...,1,0,29.0,15.0,2018,True,False,False,False,False
19,Bernd Leno,de GER,90.0,0.0,0.0,8.0,3.0,5.0,75.0,3.5,...,0,0,25.0,167.0,2018,True,False,False,False,False
36,Koen Casteels,be BEL,90.0,0.0,0.0,5.0,3.0,2.0,40.0,1.9,...,1,0,25.0,55.0,2018,True,False,False,False,False
56,Roman Bürki,ch SUI,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,26.0,278.0,2018,True,False,False,False,False
70,Rune Jarstein,no NOR,90.0,0.0,0.0,2.0,0.0,2.0,100.0,0.1,...,1,0,32.0,324.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451966,Michele Di Gregorio,it ITA,90.0,0.0,0.0,5.0,2.0,3.0,60.0,1.4,...,0,0,27.0,302.0,2025,False,False,False,False,True
451981,Marco Carnesecchi,it ITA,90.0,0.0,0.0,10.0,3.0,7.0,70.0,3.0,...,1,0,24.0,328.0,2025,False,False,False,False,True
452006,Zion Suzuki,jp JPN,90.0,0.0,0.0,4.0,2.0,2.0,50.0,1.9,...,0,0,22.0,277.0,2025,False,False,False,False,True
452010,Devis Vásquez,co COL,90.0,0.0,0.0,2.0,2.0,0.0,0.0,1.5,...,1,0,27.0,13.0,2025,False,False,False,False,True


In [6]:
position_dummies = outfielders.Pos.str.get_dummies(sep = ",") # since most players play mutliple positions in a match, we will have to multi-hot encode
outfielders = pd.concat([outfielders.drop(columns=["Pos"]), position_dummies], axis = 1) # combining our binary position columns with the remaining dataframe
outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']] = outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']].astype('bool') # change position to boolean (did they play this position in the match or not?)
outfielders = outfielders.drop(['SoTA', 'GA', 'Saves', 'Save%', 'PSxG',
       'Cmp_y', 'Att_y', 'Cmp%_y', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen',
       'Att_y.1', 'Launch%.1', 'AvgLen.1', 'Opp', 'Stp', 'Stp%', '#OPA',
       'AvgDist', 'GK'], axis = 1) # removing keeper statistics
outfielders

Unnamed: 0,Player,Nation,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,DM,FW,LB,LM,LW,MF,RB,RM,RW,WB
1,Arjen Robben,nl NED,30.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,Arturo Vidal,cl CHI,90.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,False,False,False,True,False,False,False,False,False,False
3,Corentin Tolisso,fr FRA,90.0,1.0,0.0,0.0,0.0,4.0,3.0,0.0,...,False,False,False,False,False,False,False,True,False,False
4,David Alaba,at AUT,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,False
5,Franck Ribéry,fr FRA,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452034,Nicolás Valentini,ar ARG,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
452035,Ondrej Duda,sk SVK,90.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452036,Paweł Dawidowicz,pl POL,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452038,Suat Serdar,de GER,80.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,True,False,False,False,False,False,False,False,False,False


In [57]:
# Setup window for predictions
outfielders_pred = outfielders.groupby(["Player", "season"]).agg({
    'Min':'sum', 'Gls':'sum', 'Ast':'sum', 'PK':'sum', 'PKatt':'sum', 'Sh':'sum', 'SoT':'sum', 'CrdY':'sum', 'CrdR':'sum',
       'Touches':'sum', 'Tkl':'sum', 'Int':'sum', 'Blocks':'sum', 'xG':'sum', 'npxG':'sum', 'xAG':'sum', 'SCA':'sum', 'GCA':'sum',
       'Cmp_x':'sum', 'Att_x':'sum', 'Cmp%_x':'mean', 'PrgP':'sum', 'Carries':'sum', 'PrgC':'sum', 'Att_x.1':'sum',
       'Succ':'sum', 'home':'sum', 'top100':'mean', 'Age Years':'mean', 'Age Days':'mean', 'league_bundesliga':'mean',
       'league_la-liga':'mean', 'league_ligue-1':'mean', 'league_premier-league':'mean',
       'league_serie-a':'mean', 'AM':'mean', 'CB':'mean', 'CM':'mean', 'DF':'mean', 'DM':'mean', 'FW':'mean', 'LB':'mean', 'LM':'mean', 'LW':'mean',
       'MF':'mean', 'RB':'mean', 'RM':'mean', 'RW':'mean', 'WB':'mean'
}).reset_index() # how we are aggregating each of the player's statistics

outfielders_pred.top100 = (outfielders_pred.top100 != 0.0).astype(int) # make our target variable binary

train_seasons = [2018, 2019, 2020, 2021, 2022, 2023, 2024] # every season except 2025 is our training data
test_season = [2025] # we will test on the most recent season
train_df = outfielders_pred[outfielders_pred['season'].isin(train_seasons)] # get a dataframe of just training data
test_df = outfielders_pred[outfielders_pred['season'].isin(test_season)] # get a dataframe of just testing data

# Setting up X_train, y_train, X_test, and y_test
# Removing unnessary columns from training set
X_train = train_df.drop(columns = ['Player', 'season', 'home', 'Age Years', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_train = train_df['top100']
X_test = test_df.drop(columns = ['Player', 'season', 'home', 'Age Years', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_test = test_df['top100']
id_train = train_df.Player.reset_index().drop("index", axis = 1)
id_test = test_df.Player.reset_index().drop("index", axis = 1)

stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # The list of outfielders in the current top 100 who we do have in our dataset (77 of them)

In [38]:
# Resuable function
def predictions(model):
  y_pred = model.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
  y_pred_prob = model.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
  potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
  for i in range(len(y_pred)):
    potential_indexes.append(i) # add the index of the future top 100 player into our array
  stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player
  rankings = []

  # highest predicted probabilities will be how we base our rankings
  for i in range(len(stardom)):
    for j in range(i + 1):
      if (stardom[j] < stardom[i]):
        temp = stardom[i]
        temp2 = potential_indexes[i]
        stardom[i] = stardom[j]
        potential_indexes[i] = potential_indexes[j]
        stardom[j] = temp
        potential_indexes[j] = temp2

  correct = 0 # see how many were succesfully identified in top 100

  for i in range(100):
    rankings.append(id_test.iloc[potential_indexes[i], 0]) # append the player's with the highest predicted probabilities into our rankings
    if str(id_test.iloc[potential_indexes[i], 0]) in stars:
      correct += 1 # increase our correct if the player here actually appears in the top 100 somewhere

  return rankings, correct # return our rankings and correctness

In [32]:
log1 = LogisticRegression(class_weight="balanced", random_state = 42) # Logistic regression model
log1.fit(X_train, y_train) # fit the model
log1_rankings, log1_correct = predictions(log1) # get rankings and correctness

print("Balanced Players correct:", log1_correct) # 47 players were correct
print("Balanced Percentage correct:", str(log1_correct / len(stars) * 100) + "%") # 61.04% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Rankings:", log1_rankings)
print(" ")


log2 = LogisticRegression(class_weight=None, random_state = 42) # Logistic regression model
log2.fit(X_train, y_train) # fit the model
log2_rankings, log2_correct = predictions(log2) # get rankings and correctness

print("None Players correct:", log2_correct) # 43 players were correct
print("None Percentage correct:", str(log2_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("None Rankings:", log2_rankings)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8801603206412826
Balanced Players correct: 47
Balanced Percentage correct: 61.038961038961034%
Balanced Rankings: ['Kylian Mbappé', 'Joshua Kimmich', 'Lamine Yamal', 'Pierre Højbjerg', 'Ousmane Dembélé', 'Vitinha', 'Mason Greenwood', 'Mohamed Salah', 'Michael Olise', 'Florian Wirtz', 'Leonardo Balerdi', 'Omar Marmoush', 'Raphinha', 'Virgil van Dijk', 'Achraf Hakimi', 'Joško Gvardiol', 'Cole Palmer', 'Federico Valverde', 'Granit Xhaka', 'Mateo Kovačić', 'Hugo Ekitike', 'Pau Cubarsí', 'William Saliba', 'Kim Min-jae', 'Harry Kane', 'Matheus Cunha', 'Rúben Dias', 'Amir Rrahmani', 'Bradley Barcola', 'Khvicha Kvaratskhelia', 'Lucas Beraldo', 'Pedri', 'Nicolás Paz', 'Antoine Semenyo', 'João Neves', 'Marcos Alonso', 'Bafodé Diakité', 'Robert Lewandowski', 'Levi Colwill', 'Fabián Ruiz Peña', 'Jude Bellingham', 'Leroy Sané', 'Tijjani Reijnders', 'Bruno Fernandes', 'Ademola Lookman', 'Erling Haaland', 'Alexander Isak', 'Iñigo Martínez', 'Luis Díaz', 'Jamal Musiala', 'Vinicius Júnior', 'Dominik 

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9763527054108216
None Players correct: 43
None Percentage correct: 55.84415584415584%
None Rankings: ['Kylian Mbappé', 'Joshua Kimmich', 'Lamine Yamal', 'Mohamed Salah', 'Ousmane Dembélé', 'Vitinha', 'Pierre Højbjerg', 'Mason Greenwood', 'Michael Olise', 'Omar Marmoush', 'Florian Wirtz', 'Cole Palmer', 'Granit Xhaka', 'Federico Valverde', 'Raphinha', 'Harry Kane', 'Joško Gvardiol', 'Bradley Barcola', 'Achraf Hakimi', 'Pedri', 'Mateo Kovačić', 'Hugo Ekitike', 'Erling Haaland', 'Matheus Cunha', 'Tijjani Reijnders', 'Virgil van Dijk', 'Khvicha Kvaratskhelia', 'Alexander Isak', 'Pau Cubarsí', 'Leonardo Balerdi', 'Jude Bellingham', 'Nicolás Paz', 'Luis Díaz', 'Robert Lewandowski', 'Karl Etta', 'Bruno Fernandes', 'Nathan Butler-Oyedeji', 'Shumaira Mheuka', 'Fabián Ruiz Peña', 'João Neves', 'Adrian Niño', 'Boubacar Traoré', 'William Saliba', 'Ayanda Sishuba', 'Kim Min-jae', 'Anthony Briançon', 'Remy Rees-Dottin', 'Michael Golding', 'Faik Sakar', 'Aaron Ciammaglichella', 'Jay Stansfield', 'D

In [None]:
# We will also try a SVM (rbf kernel)
clf_rbf1 = svm.SVC(kernel = 'rbf', class_weight="balanced", probability=True, random_state = 42) # SVM model
clf_rbf1.fit(X_train, y_train) # fit the model
rbf1_rankings, rbf1_correct = predictions(clf_rbf1) # get the rankings and correctness

print("Balanced Players correct:", rbf1_correct) # 43 players were correct
print("Balanced Percentage correct:", str(rbf1_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Rankings:", rbf1_rankings)
print(" ")


clf_rbf2 = svm.SVC(kernel = 'rbf', class_weight=None, probability=True, random_state = 42) # SVM model
clf_rbf2.fit(X_train, y_train) # fit the model
rbf2_rankings, rbf2_correct = predictions(clf_rbf2) # get the rankings and correctness

print("None Players correct:", rbf2_correct) # 46 players were correct
print("None Percentage correct:", str(rbf2_correct / len(stars) * 100) + "%") # 59.74% of the possible 77 players identified as true top 100s were in the top 100
print("None Rankings:", rbf2_rankings)

Balanced Players correct: 43
Balanced Percentage correct: 55.84415584415584%
Balanced Rankings: ['Kylian Mbappé', 'Michael Olise', 'Lamine Yamal', 'Ousmane Dembélé', 'Omar Marmoush', 'Florian Wirtz', 'Vinicius Júnior', 'Mateo Kovačić', 'Luka Modrić', 'Moise Kean', 'Achraf Hakimi', 'Erling Haaland', 'Marquinhos', 'Hugo Ekitike', 'Hakan Çalhanoğlu', 'Mason Greenwood', 'Khvicha Kvaratskhelia', 'Dayot Upamecano', 'Nicolás Paz', 'Fabián Ruiz Peña', 'Bradley Barcola', 'Manuel Akanji', 'Waldemar Anton', 'Rayan Cherki', 'Cole Palmer', 'Warren Zaïre-Emery', 'Rodrygo', 'İlkay Gündoğan', 'Jude Bellingham', 'Nikola Krstović', 'Désiré Doué', 'Martin Ødegaard', 'Lucas Beraldo', 'Alexander Isak', 'Ademola Lookman', 'Nico Schlotterbeck', 'Andrea Cambiaso', 'João Neves', 'Federico Valverde', 'Pierre Kalulu', 'Yoane Wissa', 'Robert Lewandowski', 'Luis Díaz', 'Roberto Piccoli', 'Lee Kang-in', 'Dani Ceballos', 'Granit Xhaka', 'Antonio Rüdiger', 'Ante Budimir', 'Willian Pacho', 'Rodrigo De Paul', 'Álex Gri

In [39]:
# And Random Forests too
rf1 = RandomForestClassifier(class_weight='balanced', random_state = 42) # RF model
rf1.fit(X_train, y_train) # fit the model
rf1_rankings, rf1_correct = predictions(rf1) # get the rankings and correctness.

print("Balanced Players correct:", rf1_correct) # 43 players were correct
print("Balanced Percentage correct:", str(rf1_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Rankings:", rf1_rankings)
print(" ")


rf2 = RandomForestClassifier(class_weight=None, random_state = 42) # RF model
rf2.fit(X_train, y_train) # fit the model
rf2_rankings, rf2_correct = predictions(rf2) # get the rankings and correctness.

print("None Players correct:", rf2_correct) # 51 players were correct
print("None Subsample Percentage correct:", str(rf2_correct / len(stars) * 100) + "%") # 66.23% of the possible 77 players identified as true top 100s were in the top 100
print("None Rankings:", rf2_rankings)
print(" ")


rf3 = RandomForestClassifier(class_weight='balanced_subsample', random_state = 42) # RF model
rf3.fit(X_train, y_train) # fit the model
rf3_rankings, rf3_correct = predictions(rf3) # get the rankings and correctness.

print("Balanced Subsample Players correct:", rf3_correct) # 43 players were correct
print("Balanced Subsample Percentage correct:", str(rf3_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Subsample Rankings:", rf3_rankings)

Balanced Players correct: 43
Balanced Percentage correct: 55.84415584415584%
Balanced Rankings: ['Kylian Mbappé', 'Ousmane Dembélé', 'Raphinha', 'Omar Marmoush', 'Alexander Isak', 'Cole Palmer', 'Bradley Barcola', 'Hugo Ekitike', 'Mason Greenwood', 'Michael Olise', 'Harry Kane', 'Mohamed Salah', 'Vinicius Júnior', 'Lamine Yamal', 'Serhou Guirassy', 'Achraf Hakimi', 'Granit Xhaka', 'Rafael Leão', 'Désiré Doué', 'Maghnes Akliouche', 'Luis Díaz', 'Erling Haaland', 'Matheus Cunha', 'Mateo Retegui', 'Nicolò Barella', 'Joshua Kimmich', 'Khvicha Kvaratskhelia', 'Florian Wirtz', 'Julián Álvarez', 'Lewis Dunk', 'Bruno Guimarães', 'Ademola Lookman', 'Ollie Watkins', 'Pedri', 'Christian Pulisic', 'Bernardo Silva', 'Iñigo Martínez', 'Pierre Højbjerg', 'Yoane Wissa', 'Éderson', 'Federico Dimarco', 'Luka Modrić', 'Alessandro Bastoni', 'Martin Ødegaard', 'Angelo Stiller', 'Leroy Sané', 'Alex Iwobi', 'Vincenzo Grifo', 'Rayan Cherki', 'Federico Valverde', 'Son Heung-min', 'İlkay Gündoğan', 'Virgil van 

In [59]:
# We are now trying RandomizedSearchCV to see if it changes anything
def custom_best_score(estimator, X, y):
  y_pred = estimator.predict(X)
  y_pred_prob = estimator.predict_proba(X)
  potential_indexes = []
  for i in range(len(y_pred)):
    potential_indexes.append(i) # add the index of the future top 100 player into our array
  stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player

  # highest predicted probabilities will be how we base our rankings
  for i in range(len(stardom)):
    for j in range(i + 1):
      if (stardom[j] < stardom[i]):
        temp = stardom[i]
        temp2 = potential_indexes[i]
        stardom[i] = stardom[j]
        potential_indexes[i] = potential_indexes[j]
        stardom[j] = temp
        potential_indexes[j] = temp2

  correct = 0 # see how many were succesfully identified in top 100

  for i in range(100):
    if (y.iloc[potential_indexes[i]] == 1):
      correct += 1 # increase our correct if the player here actually appears in the top 100 somewhere
  return correct / 100

In [None]:
log_tuned = LogisticRegression(random_state = 42) # base log model
param_grid_log = {
    'penalty':['l1','l2','elasticnet','none'],
    'C' : np.linspace(0.1, 2.0, 10),
    'solver': ['lbfgs','liblinear','saga'],
    'max_iter'  : [500,1000,2500,5000],
    'l1_ratio'  : np.linspace(0.0, 1.0, 5),
    'class_weight' : ['balanced', None]
} # parameter grid (ChatGPT generated this grid for me to use)


random_search_log1 = RandomizedSearchCV(
    estimator = log_tuned,
    param_distributions = param_grid_log,
    n_iter = 100,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_log1.fit(X_train, y_train)
print('F1', random_search_log1.best_estimator_)
print('F1', random_search_log1.best_params_)

log1_correct = predictions(random_search_log1)[1]

print("Players correct:", log1_correct)
print("Percentage correct:", str(log1_correct / len(stars) * 100) + "%")

156 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
42 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skl

F1 LogisticRegression(C=np.float64(0.1), l1_ratio=np.float64(1.0), max_iter=5000,
                   random_state=42)
F1 {'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 5000, 'l1_ratio': np.float64(1.0), 'class_weight': None, 'C': np.float64(0.1)}
F1 Players correct: 51
F1 Percentage correct: 66.23376623376623%


In [None]:
svm_tuned_rbf = svm.SVC(kernel = 'rbf', probability=True, random_state = 42)  # RBF kernel

# RBF param grid
param_grid_svm_rbf = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma' : ['scale', 'auto', 0.01, 0.1],
    'class_weight': ['balanced', None]
}

# RandomizedSearchCV
random_search_svm_rbf = RandomizedSearchCV(
    estimator = svm_tuned_rbf,
    param_distributions = param_grid_svm_rbf,
    n_iter = 40,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# Fitting best parameters
random_search_svm_rbf.fit(X_train, y_train)
print('RBF F1', random_search_svm_rbf.best_estimator_)
print('RBF F1', random_search_svm_rbf.best_params_)

# Accuracy
svm_rbf_correct = predictions(random_search_svm_rbf)[1]
print("RBF F1 Players correct:", svm_rbf_correct)
print("RBF F1 Percentage correct:", str(svm_rbf_correct / len(stars) * 100) + "%")

RBF F1 SVC(C=100, class_weight='balanced', probability=True, random_state=42)
RBF F1 {'gamma': 'scale', 'class_weight': 'balanced', 'C': 100}
RBF F1 Players correct: 42
RBF F1 Percentage correct: 54.54545454545454%


In [None]:
svm_tuned_sigmoid = svm.SVC(kernel = 'sigmoid', probability=True, random_state = 42)  # Sigmoid kernel

# Sigmoid param grid
param_grid_svm_sigmoid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma' : ['scale', 'auto', 0.01, 0.1],
    'class_weight': ['balanced', None]
}

# RandomizedSearchCV
random_search_svm_sigmoid = RandomizedSearchCV(
    estimator = svm_tuned_sigmoid,
    param_distributions = param_grid_svm_sigmoid,
    n_iter = 40,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# Fitting best parameters
random_search_svm_sigmoid.fit(X_train, y_train)
print('Sigmoid F1', random_search_svm_sigmoid.best_estimator_)
print('Sigmoid F1', random_search_svm_sigmoid.best_params_)

# Accuracy
svm_sigmoid_correct = predictions(random_search_svm_sigmoid)[1]
print("Sigmoid F1 Players correct:", svm_sigmoid_correct)
print("Sigmoid F1 Percentage correct:", str(svm_sigmoid_correct / len(stars) * 100) + "%")

In [None]:
svm_tuned_poly = svm.SVC(kernel = 'poly', probability=True, random_state = 42)  # Poly kernel

# Poly param grid
param_grid_svm_poly = {
    'C': [0.01, 0.1, 1, 10, 100],
    'degree': [2, 3],
    'gamma' : ['scale', 'auto', 0.01, 0.1],
    'class_weight': ['balanced', None]
}

# RandomizedSearchCV
random_search_svm_poly = RandomizedSearchCV(
    estimator = svm_tuned_poly,
    param_distributions = param_grid_svm_poly,
    n_iter = 80,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# Fitting best parameters
random_search_svm_poly.fit(X_train, y_train)
print('Poly F1', random_search_svm_poly.best_estimator_)
print('Poly F1', random_search_svm_poly.best_params_)

# Accuracy
svm_poly_correct = predictions(random_search_svm_poly)[1]
print("Poly F1 Players correct:", svm_poly_correct)
print("Poly F1 Percentage correct:", str(svm_poly_correct / len(stars) * 100) + "%")

KeyboardInterrupt: 

In [None]:
# We are now trying RandomizedSearchCV to see if it changes anything
rf_tuned = RandomForestClassifier(random_state = 42) # base rf model
param_grid_rf = {
    'n_estimators': np.arange(100, 1001, 100),
    'max_depth': [None, 5, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 10],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': ['balanced', 'balanced_subsample', None]
} # parameter grid (ChatGPT recommended this is the grid I use)


random_search_rf1 = RandomizedSearchCV(
    estimator = rf_tuned,
    param_distributions = param_grid_rf,
    n_iter = 100,
    scoring = "f1",
    cv = 3,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# random_search_rf1.fit(X_train, y_train)
print(random_search_rf1.best_estimator_)
print(random_search_rf1.best_params_)

rf1_correct = predictions(random_search_rf1)[1]

print("Players correct:", rf1_correct)
print("Percentage correct:", str(rf1_correct / len(stars) * 100) + "%")

F1 RandomForestClassifier(class_weight='balanced_subsample', min_samples_leaf=10,
                       n_estimators=np.int64(600), random_state=42)
F1 {'n_estimators': np.int64(600), 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': None, 'class_weight': 'balanced_subsample', 'bootstrap': True}
F1 Players correct: 45
F1 Percentage correct: 58.44155844155844%


In [None]:
# Let's test the best log model based on correctness.
best_log = LogisticRegression(solver = 'lbfgs', penalty = 'l2', max_iter = 5000, l1_ratio = np.float64(1.0), class_weight = None, C = np.float64(0.1), random_state=42)
best_log.fit(X_train, y_train)
best_log_rankings, best_log_correct = predictions(best_log)

print("Best Log Players correct:", best_log_correct)
print("Best Log Percentage correct:", str(best_log_correct / len(stars) * 100) + "%")
print(best_log_rankings) # These rankings are not bad

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best Log Players correct: 51
Best Log Percentage correct: 66.23376623376623%
['Mohamed Salah', 'Kylian Mbappé', 'Joshua Kimmich', 'Ousmane Dembélé', 'Lamine Yamal', 'Raphinha', 'Michael Olise', 'Bradley Barcola', 'Omar Marmoush', 'Pierre Højbjerg', 'Harry Kane', 'Vitinha', 'Achraf Hakimi', 'Hugo Ekitike', 'Mateo Retegui', 'Granit Xhaka', 'Robert Lewandowski', 'Alexander Isak', 'Florian Wirtz', 'Mason Greenwood', 'Luis Díaz', 'Cole Palmer', 'Pedri', 'João Neves', 'Joško Gvardiol', 'Khvicha Kvaratskhelia', 'Virgil van Dijk', 'Désiré Doué', 'Pau Cubarsí', 'Leonardo Balerdi', 'Jude Bellingham', 'Mateo Kovačić', 'William Saliba', 'Vinicius Júnior', 'Leroy Sané', 'Kim Min-jae', 'Fabián Ruiz Peña', 'Serhou Guirassy', 'Angelo Stiller', 'Rayan Cherki', 'Matheus Cunha', 'Federico Valverde', 'Bruno Fernandes', 'Maghnes Akliouche', 'Amir Rrahmani', 'Ademola Lookman', 'Jamal Musiala', 'Alexander Sørloth', 'Iñigo Martínez', 'Yoane Wissa', 'Erling Haaland', 'Luka Modrić', 'Ayoze Pérez', 'Rayan Aït-No

In [None]:
# Let's try the best random forest model based on correctness.
best_rf = RandomForestClassifier(n_estimators = np.int64(600), min_samples_split = 2, min_samples_leaf = 10, max_features = 'sqrt', max_depth = None, class_weight = 'balanced_subsample', bootstrap = True, random_state = 42)
best_rf.fit(X_train, y_train)
best_rf_rankings, best_rf_correct = predictions(best_rf)

print("Best RF Players correct:", best_rf_correct)
print("Best RF Percentage correct:", str(best_rf_correct / len(stars) * 100) + "%")
print(best_rf_rankings) # Holy shit, the predicted number one player in 2025 is actually the player who won the Ballon D'Or

Best RF Players correct: 45
Best RF Percentage correct: 58.44155844155844%
['Ousmane Dembélé', 'Cole Palmer', 'Michael Olise', 'Kylian Mbappé', 'Mohamed Salah', 'Raphinha', 'Mason Greenwood', 'Alexander Isak', 'Bradley Barcola', 'Omar Marmoush', 'Hugo Ekitike', 'Lamine Yamal', 'Joshua Kimmich', 'Harry Kane', 'Mateo Retegui', 'Erling Haaland', 'Ollie Watkins', 'Luis Díaz', 'Achraf Hakimi', 'Vinicius Júnior', 'Granit Xhaka', 'Pedri', 'Julián Álvarez', 'Florian Wirtz', 'Pierre Højbjerg', 'Serhou Guirassy', 'Matheus Cunha', 'Rafael Leão', 'Ademola Lookman', 'Maghnes Akliouche', 'Antoine Semenyo', 'Bruno Fernandes', 'Iñigo Martínez', 'Rayan Cherki', 'Jude Bellingham', 'Angelo Stiller', 'Khvicha Kvaratskhelia', 'Moise Kean', 'Federico Valverde', 'Pau Cubarsí', 'Nicolás Paz', 'Virgil van Dijk', 'Nicolò Barella', 'Christian Pulisic', 'Leroy Sané', 'Désiré Doué', 'Joško Gvardiol', 'Martin Ødegaard', 'Luka Modrić', 'Éderson', 'Nicolas Jackson', 'Alex Iwobi', 'Lewis Dunk', 'Andrej Kramarić', 'Ami

In [None]:
# Where to next?

# Ranking of top 100 (1st, 10th, unranked) - Logistic Regression sort of does this, will need to elaborate further
# Above top 25, where could they land
# Transferability (need club name)?
# Team chemistry (combine with KNN)?
# Play-by-play (+ or -)?

# log = LogisticRegression(class_weight = "balanced", max_iter=5000) # Logistic regression model
# log = LogisticRegression(C = 0.25, class_weight = "balanced", solver = 'liblinear') # Logistic regression model