<a href="https://colab.research.google.com/github/shipley7/fifa/blob/main/FIFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv("combined_match_data.csv") # getting the data
df.head() # printing out the data

Unnamed: 0,index,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,...,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist,home,game_id,top100,source_file
0,0,14 Players,,,,,990.0,3.0,2.0,1.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
1,1,Arjen Robben,10.0,nl NED,RW,33-207,30.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
2,2,Arturo Vidal,23.0,cl CHI,LM,30-088,90.0,0.0,1.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
3,3,Corentin Tolisso,24.0,fr FRA,RM,23-015,90.0,1.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
4,4,David Alaba,27.0,at AUT,LB,25-055,90.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv


In [4]:
df = df[df["Player"].str.contains("Players")==False] # removing excess rows that have no player information
df = df.drop(['index', "#"], axis = 1) # index and player number have no impact on anything
df[["Age Years", "Age Days"]] = (df.Age.str.split("-", expand = True)).apply(pd.to_numeric) # splitting age column into two other columns
df[["league", "season"]] = df["source_file"].replace("_player_data.csv", "", regex = True).str.split("_", expand = True) # splitting the source file into a league and season
df["season"] = (df["season"].str[:4].astype(int)) + 1 # season will be an integer representing the year the season ended
df = pd.get_dummies(df, columns=["league"]) # league will be a boolean variable representing which leage we are using
df = df.drop(['Age','source_file', "game_id"], axis = 1) # removing age, source file, and game id as they are no longer needed
df = df.fillna(0) # filling any NA with 0
df # just to check

Unnamed: 0,Player,Nation,Pos,Min,Gls,Ast,PK,PKatt,Sh,SoT,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
1,Arjen Robben,nl NED,RW,30.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1,0,33.0,207.0,2018,True,False,False,False,False
2,Arturo Vidal,cl CHI,LM,90.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,0,30.0,88.0,2018,True,False,False,False,False
3,Corentin Tolisso,fr FRA,RM,90.0,1.0,0.0,0.0,0.0,4.0,3.0,...,1,0,23.0,15.0,2018,True,False,False,False,False
4,David Alaba,at AUT,LB,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,25.0,55.0,2018,True,False,False,False,False
5,Franck Ribéry,fr FRA,LW,76.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,34.0,133.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452035,Ondrej Duda,sk SVK,DM,90.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0,0,30.0,171.0,2025,False,False,False,False,True
452036,Paweł Dawidowicz,pl POL,DM,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,5.0,2025,False,False,False,False,True
452037,Simone Perilli,it ITA,GK,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,138.0,2025,False,False,False,False,True
452038,Suat Serdar,de GER,DM,80.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0,0,28.0,44.0,2025,False,False,False,False,True


In [5]:
# splitting into keepers and outfielders since they all have different statistics
keepers = df[df["Pos"] == "GK"]
outfielders = df[df["Pos"] != "GK"]

In [6]:
# for keepers, removing everything that a keeper doesn't normally do
keepers = keepers.drop(["Pos", 'Gls', 'Ast', 'PK', 'PKatt', 'Sh',
       'SoT', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG',
       'xAG', 'SCA', 'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries',
       'PrgC', 'Att_x.1', 'Succ'], axis = 1)
keepers

Unnamed: 0,Player,Nation,Min,CrdY,CrdR,SoTA,GA,Saves,Save%,PSxG,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
13,Sven Ulreich,de GER,90.0,0.0,0.0,4.0,1.0,3.0,75.0,0.9,...,1,0,29.0,15.0,2018,True,False,False,False,False
19,Bernd Leno,de GER,90.0,0.0,0.0,8.0,3.0,5.0,75.0,3.5,...,0,0,25.0,167.0,2018,True,False,False,False,False
36,Koen Casteels,be BEL,90.0,0.0,0.0,5.0,3.0,2.0,40.0,1.9,...,1,0,25.0,55.0,2018,True,False,False,False,False
56,Roman Bürki,ch SUI,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,26.0,278.0,2018,True,False,False,False,False
70,Rune Jarstein,no NOR,90.0,0.0,0.0,2.0,0.0,2.0,100.0,0.1,...,1,0,32.0,324.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451966,Michele Di Gregorio,it ITA,90.0,0.0,0.0,5.0,2.0,3.0,60.0,1.4,...,0,0,27.0,302.0,2025,False,False,False,False,True
451981,Marco Carnesecchi,it ITA,90.0,0.0,0.0,10.0,3.0,7.0,70.0,3.0,...,1,0,24.0,328.0,2025,False,False,False,False,True
452006,Zion Suzuki,jp JPN,90.0,0.0,0.0,4.0,2.0,2.0,50.0,1.9,...,0,0,22.0,277.0,2025,False,False,False,False,True
452010,Devis Vásquez,co COL,90.0,0.0,0.0,2.0,2.0,0.0,0.0,1.5,...,1,0,27.0,13.0,2025,False,False,False,False,True


In [7]:
position_dummies = outfielders.Pos.str.get_dummies(sep = ",") # since most players play mutliple positions in a match, we will have to multi-hot encode
outfielders = pd.concat([outfielders.drop(columns=["Pos"]), position_dummies], axis = 1) # combining our binary position columns with the remaining dataframe
outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']] = outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']].astype('bool') # change position to boolean (did they play this position in the match or not?)
outfielders = outfielders.drop(['SoTA', 'GA', 'Saves', 'Save%', 'PSxG',
       'Cmp_y', 'Att_y', 'Cmp%_y', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen',
       'Att_y.1', 'Launch%.1', 'AvgLen.1', 'Opp', 'Stp', 'Stp%', '#OPA',
       'AvgDist', 'GK'], axis = 1) # removing keeper statistics
outfielders

Unnamed: 0,Player,Nation,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,DM,FW,LB,LM,LW,MF,RB,RM,RW,WB
1,Arjen Robben,nl NED,30.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,Arturo Vidal,cl CHI,90.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,False,False,False,True,False,False,False,False,False,False
3,Corentin Tolisso,fr FRA,90.0,1.0,0.0,0.0,0.0,4.0,3.0,0.0,...,False,False,False,False,False,False,False,True,False,False
4,David Alaba,at AUT,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,False
5,Franck Ribéry,fr FRA,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452034,Nicolás Valentini,ar ARG,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
452035,Ondrej Duda,sk SVK,90.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452036,Paweł Dawidowicz,pl POL,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452038,Suat Serdar,de GER,80.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,True,False,False,False,False,False,False,False,False,False


In [8]:
# Setup window for predictions
outfielders_pred = outfielders.groupby(["Player", "season"]).agg({
    'Min':'sum', 'Gls':'sum', 'Ast':'sum', 'PK':'sum', 'PKatt':'sum', 'Sh':'sum', 'SoT':'sum', 'CrdY':'sum', 'CrdR':'sum',
       'Touches':'sum', 'Tkl':'sum', 'Int':'sum', 'Blocks':'sum', 'xG':'sum', 'npxG':'sum', 'xAG':'sum', 'SCA':'sum', 'GCA':'sum',
       'Cmp_x':'sum', 'Att_x':'sum', 'Cmp%_x':'mean', 'PrgP':'sum', 'Carries':'sum', 'PrgC':'sum', 'Att_x.1':'sum',
       'Succ':'sum', 'home':'sum', 'top100':'mean', 'Age Years':'mean', 'Age Days':'mean', 'league_bundesliga':'mean',
       'league_la-liga':'mean', 'league_ligue-1':'mean', 'league_premier-league':'mean',
       'league_serie-a':'mean', 'AM':'mean', 'CB':'mean', 'CM':'mean', 'DF':'mean', 'DM':'mean', 'FW':'mean', 'LB':'mean', 'LM':'mean', 'LW':'mean',
       'MF':'mean', 'RB':'mean', 'RM':'mean', 'RW':'mean', 'WB':'mean'
}).reset_index() # how we are aggregating each of the player's statistics

outfielders_pred.top100 = (outfielders_pred.top100 != 0.0).astype(int) # make our target variable binary

train_seasons = [2018, 2019, 2020, 2021, 2022, 2023, 2024] # every season except 2025 is our training data
test_season = [2025] # we will test on the most recent season
train_df = outfielders_pred[outfielders_pred['season'].isin(train_seasons)] # get a dataframe of just training data
test_df = outfielders_pred[outfielders_pred['season'].isin(test_season)] # get a dataframe of just testing data

# Setting up X_train, y_train, X_test, and y_test
# Removing unnessary columns from training set
X_train = train_df.drop(columns = ['Player', 'season', 'home', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_train = train_df['top100']
X_test = test_df.drop(columns = ['Player', 'season', 'home', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_test = test_df['top100']
id_train = train_df.Player.reset_index().drop("index", axis = 1)
id_test = test_df.Player.reset_index().drop("index", axis = 1)

stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # The list of outfielders in the current top 100 who we do have in our dataset (77 of them)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Resuable function
def predictions(model):
  y_pred = model.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
  y_pred_prob = model.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
  potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
  for i in range(len(y_pred)):
    potential_indexes.append(i) # add the index of the future top 100 player into our array
  stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player
  rankings = []

  # highest predicted probabilities will be how we base our rankings
  for i in range(len(stardom)):
    for j in range(i + 1):
      if (stardom[j] < stardom[i]):
        temp = stardom[i]
        temp2 = potential_indexes[i]
        stardom[i] = stardom[j]
        potential_indexes[i] = potential_indexes[j]
        stardom[j] = temp
        potential_indexes[j] = temp2

  correct = 0 # see how many were succesfully identified in top 100

  for i in range(100):
    rankings.append(id_test.iloc[potential_indexes[i], 0]) # append the player's with the highest predicted probabilities into our rankings
    if str(id_test.iloc[potential_indexes[i], 0]) in stars:
      correct += 1 # increase our correct if the player here actually appears in the top 100 somewhere

  return rankings, correct # return our rankings and correctness

In [10]:
log1 = LogisticRegression(class_weight="balanced", random_state = 42) # Logistic regression model
log1.fit(X_train, y_train) # fit the model
log1_rankings, log1_correct = predictions(log1) # get rankings and correctness

print("Balanced Players correct:", log1_correct) # 47 players were correct
print("Balanced Percentage correct:", str(log1_correct / len(stars) * 100) + "%") # 61.04% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Rankings:", log1_rankings)
print(" ")


log2 = LogisticRegression(class_weight=None, random_state = 42) # Logistic regression model
log2.fit(X_train, y_train) # fit the model
log2_rankings, log2_correct = predictions(log2) # get rankings and correctness

print("None Players correct:", log2_correct) # 43 players were correct
print("None Percentage correct:", str(log2_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("None Rankings:", log2_rankings)


Balanced Players correct: 47
Balanced Percentage correct: 61.038961038961034%
Balanced Rankings: ['Kylian Mbappé', 'Raphinha', 'Ousmane Dembélé', 'Joshua Kimmich', 'Mohamed Salah', 'Bradley Barcola', 'Lamine Yamal', 'Michael Olise', 'Pierre Højbjerg', 'Omar Marmoush', 'Harry Kane', 'Vitinha', 'Hugo Ekitike', 'Achraf Hakimi', 'Leonardo Balerdi', 'Florian Wirtz', 'Alexander Isak', 'Luis Díaz', 'Granit Xhaka', 'Désiré Doué', 'João Neves', 'Robert Lewandowski', 'Khvicha Kvaratskhelia', 'Pau Cubarsí', 'Kim Min-jae', 'Pedri', 'Virgil van Dijk', 'Mateo Retegui', 'William Saliba', 'Cole Palmer', 'Joško Gvardiol', 'Mason Greenwood', 'Vinicius Júnior', 'Federico Valverde', 'Angelo Stiller', 'Jude Bellingham', 'Matheus Cunha', 'Leroy Sané', 'Fabián Ruiz Peña', 'Lucas Beraldo', 'Serhou Guirassy', 'Julián Álvarez', 'Mateo Kovačić', 'Amir Rrahmani', 'Iñigo Martínez', 'Levi Colwill', 'Alexander Sørloth', 'Andrey Santos', 'Luka Modrić', 'Ademola Lookman', 'Rúben Dias', 'Rayan Cherki', 'Dayot Upamecano

In [10]:
# We will also try a SVM (rbf kernel)
clf_rbf1 = svm.SVC(kernel = 'rbf', class_weight="balanced", probability=True, random_state = 42) # SVM model
clf_rbf1.fit(X_train, y_train) # fit the model
rbf1_rankings, rbf1_correct = predictions(clf_rbf1) # get the rankings and correctness

print("Balanced Players correct:", rbf1_correct) # 43 players were correct
print("Balanced Percentage correct:", str(rbf1_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Rankings:", rbf1_rankings)
print(" ")


clf_rbf2 = svm.SVC(kernel = 'rbf', class_weight=None, probability=True, random_state = 42) # SVM model
clf_rbf2.fit(X_train, y_train) # fit the model
rbf2_rankings, rbf2_correct = predictions(clf_rbf2) # get the rankings and correctness

print("None Players correct:", rbf2_correct) # 46 players were correct
print("None Percentage correct:", str(rbf2_correct / len(stars) * 100) + "%") # 59.74% of the possible 77 players identified as true top 100s were in the top 100
print("None Rankings:", rbf2_rankings)

Balanced Players correct: 36
Balanced Percentage correct: 46.75324675324675%
Balanced Rankings: ['Alessandro Bastoni', 'Ibrahima Konaté', 'Pau Cubarsí', 'Iñigo Martínez', 'Warren Zaïre-Emery', 'Alexander Sørloth', 'Antonio Rüdiger', 'Leonardo Balerdi', 'Désiré Doué', 'Achraf Hakimi', 'Omar Marmoush', 'Nico Elvedi', 'Luis Díaz', 'Willian Pacho', 'Rúben Dias', 'Waldemar Anton', 'Bafodé Diakité', 'Alexander Isak', 'Pierre Højbjerg', 'Ollie Watkins', 'William Saliba', 'Patrik Schick', 'Rayan Cherki', 'Bradley Barcola', 'Lee Kang-in', 'Mamadou Sarr', 'Marquinhos', 'Matheus Cunha', 'Amir Rrahmani', 'João Neves', 'Jonathan Burkardt', 'Khéphren Thuram', 'Mateo Retegui', 'Rodrigo De Paul', 'Federico Gatti', 'Kim Min-jae', 'Pierre Kalulu', 'Eric Dier', 'Xavi Simons', 'Lucas Beraldo', 'Mika Biereth', 'Fabián Ruiz Peña', 'David López', 'Yann Aurel Bisseck', 'Jonathan Tah', 'Manuel Akanji', 'Lewis Dunk', 'Amine Gouiri', "Obite N'Dicka", 'Gonçalo Ramos', 'Jhon Lucumí', 'Raphinha', 'Stefan de Vrij', 

In [11]:
# And Random Forests too
rf1 = RandomForestClassifier(class_weight='balanced', random_state = 42) # RF model
rf1.fit(X_train, y_train) # fit the model
rf1_rankings, rf1_correct = predictions(rf1) # get the rankings and correctness.

print("Balanced Players correct:", rf1_correct) # 43 players were correct
print("Balanced Percentage correct:", str(rf1_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Rankings:", rf1_rankings)
print(" ")


rf2 = RandomForestClassifier(class_weight=None, random_state = 42) # RF model
rf2.fit(X_train, y_train) # fit the model
rf2_rankings, rf2_correct = predictions(rf2) # get the rankings and correctness.

print("None Players correct:", rf2_correct) # 51 players were correct
print("None Subsample Percentage correct:", str(rf2_correct / len(stars) * 100) + "%") # 66.23% of the possible 77 players identified as true top 100s were in the top 100
print("None Rankings:", rf2_rankings)
print(" ")


rf3 = RandomForestClassifier(class_weight='balanced_subsample', random_state = 42) # RF model
rf3.fit(X_train, y_train) # fit the model
rf3_rankings, rf3_correct = predictions(rf3) # get the rankings and correctness.

print("Balanced Subsample Players correct:", rf3_correct) # 43 players were correct
print("Balanced Subsample Percentage correct:", str(rf3_correct / len(stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100
print("Balanced Subsample Rankings:", rf3_rankings)

Balanced Players correct: 42
Balanced Percentage correct: 54.54545454545454%
Balanced Rankings: ['Kylian Mbappé', 'Ousmane Dembélé', 'Alexander Isak', 'Raphinha', 'Cole Palmer', 'Bradley Barcola', 'Michael Olise', 'Harry Kane', 'Hugo Ekitike', 'Mason Greenwood', 'Omar Marmoush', 'Rafael Leão', 'Achraf Hakimi', 'Vinicius Júnior', 'Nicolò Barella', 'Luis Díaz', 'Serhou Guirassy', 'Maghnes Akliouche', 'Mateo Retegui', 'Mohamed Salah', 'Julián Álvarez', 'Florian Wirtz', 'Lamine Yamal', 'Lewis Dunk', 'Désiré Doué', 'Ollie Watkins', 'Granit Xhaka', 'Matheus Cunha', 'Ademola Lookman', 'Joshua Kimmich', 'Antonio Rüdiger', 'Erling Haaland', 'Jude Bellingham', 'Bernardo Silva', 'Khvicha Kvaratskhelia', 'Zuriko Davitashvili', 'Pierre Højbjerg', 'Nico Schlotterbeck', 'Leroy Sané', 'Nicolas Jackson', 'Angelo Stiller', 'Iñigo Martínez', 'Éderson', 'Bruno Guimarães', 'Christian Pulisic', 'Alessandro Bastoni', 'Bruno Fernandes', 'Sávio', 'Alex Iwobi', 'Jean-Philippe Mateta', 'Federico Valverde', 'Luka

In [11]:
# We are now trying RandomizedSearchCV to see if it changes anything
def custom_best_score(estimator, X, y):
  y_pred = estimator.predict(X)
  y_pred_prob = estimator.predict_proba(X)
  potential_indexes = []
  for i in range(len(y_pred)):
    potential_indexes.append(i) # add the index of the future top 100 player into our array
  stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player

  # highest predicted probabilities will be how we base our rankings
  for i in range(len(stardom)):
    for j in range(i + 1):
      if (stardom[j] < stardom[i]):
        temp = stardom[i]
        temp2 = potential_indexes[i]
        stardom[i] = stardom[j]
        potential_indexes[i] = potential_indexes[j]
        stardom[j] = temp
        potential_indexes[j] = temp2

  correct = 0 # see how many were succesfully identified in top 100

  for i in range(100):
    if (y.iloc[potential_indexes[i]] == 1):
      correct += 1 # increase our correct if the player here actually appears in the top 100 somewhere
  return correct / 100

In [12]:
outfielders_pred2 = outfielders_pred.copy()
outfielders_pred2 = outfielders_pred2.drop(['Player', 'home', 'Age Days', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1)
outfielders_pred2 = outfielders_pred2[outfielders_pred2.season != 2025]
cols = ['season', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY',
       'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA',
       'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries', 'PrgC', 'Att_x.1',
       'Succ', 'top100', 'Age Years', 'AM', 'CB', 'CM', 'DF', 'DM', 'FW', 'LB', 'LM', 'LW',
       'MF', 'RB', 'RM', 'RW', 'WB']
rows_to_add = []
for i in range(231):
  rows_to_add.append([2018, 1.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0, 26.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
for i in range(224):
  rows_to_add.append([2019, 1.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0, 26.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

for i in range(171):
  rows_to_add.append([2020, 1.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0, 26.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

for i in range(78):
  rows_to_add.append([2021, 1.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0, 26.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      1.0, 0.0, 0.0, 0.0, 0.0, 0.0])

for i in range(57):
  rows_to_add.append([2023, 1.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0, 26.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 1.0, 0.0, 0.0, 0.0])

for i in range(68):
  rows_to_add.append([2024, 1.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0, 26.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                      0.0, 0.0, 0.0, 1.0, 0.0, 0.0])

df1 = pd.DataFrame(rows_to_add, columns=cols)
outfielders_pred2 = pd.concat([outfielders_pred2, df1], ignore_index=True)
outfielders_pred2

Unnamed: 0,season,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,CrdR,...,DM,FW,LB,LM,LW,MF,RB,RM,RW,WB
0,2020,1258.0,3.0,1.0,0.0,0.0,38.0,13.0,0.0,0.0,...,0.0,0.916667,0.000000,0.000000,0.041667,0.0,0.0,0.041667,0.083333,0.000000
1,2021,791.0,2.0,1.0,0.0,0.0,23.0,8.0,0.0,0.0,...,0.0,0.705882,0.000000,0.000000,0.235294,0.0,0.0,0.000000,0.000000,0.058824
2,2022,156.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.750000,0.000000,0.250000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
3,2018,3069.0,1.0,7.0,0.0,0.0,21.0,6.0,7.0,0.0,...,0.0,0.000000,0.416667,0.111111,0.000000,0.0,0.0,0.000000,0.000000,0.138889
4,2019,1589.0,0.0,1.0,0.0,0.0,11.0,0.0,1.0,0.0,...,0.0,0.000000,1.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17999,2024,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.000000
18000,2024,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.000000
18001,2024,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.000000
18002,2024,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.000000


In [13]:
train_df2 = outfielders_pred2.set_index('season')
train_df2 = train_df2.sort_index()
seasons = np.sort(train_df2.index.unique())
tscv = TimeSeriesSplit(n_splits = 6)
X_train2 = train_df2.drop(columns = ['top100'])
y_train2 = train_df2["top100"]
for train_index, test_index in tscv.split(X_train2):
  print(train_index, test_index)

[   0    1    2 ... 2569 2570 2571] [2572 2573 2574 ... 5141 5142 5143]
[   0    1    2 ... 5141 5142 5143] [5144 5145 5146 ... 7713 7714 7715]
[   0    1    2 ... 7713 7714 7715] [ 7716  7717  7718 ... 10285 10286 10287]
[    0     1     2 ... 10285 10286 10287] [10288 10289 10290 ... 12857 12858 12859]
[    0     1     2 ... 12857 12858 12859] [12860 12861 12862 ... 15429 15430 15431]
[    0     1     2 ... 15429 15430 15431] [15432 15433 15434 ... 18001 18002 18003]


In [14]:
print("2018", len(train_df2.loc[2018]))
print("2019", len(train_df2.loc[2019]))
print("2020", len(train_df2.loc[2020]))
print("2021", len(train_df2.loc[2021]))
print("2022", len(train_df2.loc[2022]))
print("2023", len(train_df2.loc[2023]))
print("2024", len(train_df2.loc[2024]))

2018 2572
2019 2572
2020 2572
2021 2572
2022 2572
2023 2572
2024 2572


In [16]:
log_pipeline = Pipeline([('std', StandardScaler()),
                         ('log', LogisticRegression(random_state = 42))])

param_grid_log = {
    'log__penalty':['l1','l2','elasticnet', None],
    'log__C' : np.linspace(0.1, 2.0, 10),
    'log__solver': ['lbfgs','liblinear','saga'],
    'log__max_iter'  : [500,1000,2500,5000],
    'log__l1_ratio'  : np.linspace(0.0, 1.0, 5),
    'log__class_weight' : ['balanced', None]
} # parameter grid (ChatGPT generated this grid for me to use)


random_search_log1 = RandomizedSearchCV(
    estimator = log_pipeline,
    param_distributions = param_grid_log,
    n_iter = 100,
    scoring = custom_best_score,
    cv = tscv,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_log1.fit(X_train2, y_train2)
print(random_search_log1.best_estimator_)
print(random_search_log1.best_params_)

log_pipeline_correct = predictions(random_search_log1)[1]
print("Players correct:", log_pipeline_correct)
print("Percentage correct:", str(log_pipeline_correct / len(stars) * 100) + "%")

312 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, i

Pipeline(steps=[('std', StandardScaler()),
                ('log',
                 LogisticRegression(C=np.float64(1.3666666666666667),
                                    l1_ratio=np.float64(0.0), max_iter=500,
                                    penalty='l1', random_state=42,
                                    solver='liblinear'))])
{'log__solver': 'liblinear', 'log__penalty': 'l1', 'log__max_iter': 500, 'log__l1_ratio': np.float64(0.0), 'log__class_weight': None, 'log__C': np.float64(1.3666666666666667)}




Players correct: 19
Percentage correct: 24.675324675324674%


In [17]:
svm_tuned_rbf_pipeline = Pipeline([('std', StandardScaler()),
                                   ('rbf', svm.SVC(kernel = 'rbf', probability=True, random_state = 42))])

# RBF param grid
param_grid_svm_rbf = {
    'rbf__C': [0.01, 0.1, 1, 10, 100],
    'rbf__gamma' : ['scale', 'auto', 0.01, 0.1],
    'rbf__class_weight': ['balanced', None]
}

# RandomizedSearchCV
random_search_svm_rbf = RandomizedSearchCV(
    estimator = svm_tuned_rbf_pipeline,
    param_distributions = param_grid_svm_rbf,
    n_iter = 40,
    scoring = custom_best_score,
    cv = tscv,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# Fitting best parameters
random_search_svm_rbf.fit(X_train2, y_train2)
print(random_search_svm_rbf.best_estimator_)
print(random_search_svm_rbf.best_params_)

# Accuracy
svm_rbf_correct = predictions(random_search_svm_rbf)[1]
print("RBF Players correct:", svm_rbf_correct)
print("RBF Percentage correct:", str(svm_rbf_correct / len(stars) * 100) + "%")

Pipeline(steps=[('std', StandardScaler()),
                ('rbf',
                 SVC(C=10, gamma=0.01, probability=True, random_state=42))])
{'rbf__gamma': 0.01, 'rbf__class_weight': None, 'rbf__C': 10}




RBF Players correct: 16
RBF Percentage correct: 20.77922077922078%


In [18]:
svm_tuned_sigmoid_pipeline = Pipeline([('std', StandardScaler()),
                                       ('sigmoid', svm.SVC(kernel = 'sigmoid', probability=True, random_state = 42))])

# Sigmoid param grid
param_grid_svm_sigmoid = {
    'sigmoid__C': [0.01, 0.1, 1, 10, 100],
    'sigmoid__gamma' : ['scale', 'auto', 0.01, 0.1],
    'sigmoid__class_weight': ['balanced', None]
}

# RandomizedSearchCV
random_search_svm_sigmoid = RandomizedSearchCV(
    estimator = svm_tuned_sigmoid_pipeline,
    param_distributions = param_grid_svm_sigmoid,
    n_iter = 40,
    scoring = custom_best_score,
    cv = tscv,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# Fitting best parameters
random_search_svm_sigmoid.fit(X_train2, y_train2)
print(random_search_svm_sigmoid.best_estimator_)
print(random_search_svm_sigmoid.best_params_)

# Accuracy
svm_sigmoid_correct = predictions(random_search_svm_sigmoid)[1]
print("Sigmoid Players correct:", svm_sigmoid_correct)
print("Sigmoid Percentage correct:", str(svm_sigmoid_correct / len(stars) * 100) + "%")

Pipeline(steps=[('std', StandardScaler()),
                ('sigmoid',
                 SVC(C=0.01, class_weight='balanced', gamma=0.01,
                     kernel='sigmoid', probability=True, random_state=42))])
{'sigmoid__gamma': 0.01, 'sigmoid__class_weight': 'balanced', 'sigmoid__C': 0.01}




Sigmoid Players correct: 26
Sigmoid Percentage correct: 33.76623376623377%


In [19]:
svm_tuned_poly_pipeline = Pipeline([('std', StandardScaler()),
                                    ('poly', svm.SVC(kernel = 'poly', probability=True, random_state = 42))])

# Poly param grid
param_grid_svm_poly = {
    'poly__C': [0.01, 0.1, 1, 10, 100],
    'poly__degree': [2, 3],
    'poly__gamma' : ['scale', 'auto', 0.01, 0.1],
    'poly__class_weight': ['balanced', None]
}

# RandomizedSearchCV
random_search_svm_poly = RandomizedSearchCV(
    estimator = svm_tuned_poly_pipeline,
    param_distributions = param_grid_svm_poly,
    n_iter = 80,
    scoring = custom_best_score,
    cv = tscv,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

# Fitting best parameters
random_search_svm_poly.fit(X_train2, y_train2)
print(random_search_svm_poly.best_estimator_)
print(random_search_svm_poly.best_params_)

# Accuracy
svm_poly_correct = predictions(random_search_svm_poly)[1]
print("Poly Players correct:", svm_poly_correct)
print("Poly Percentage correct:", str(svm_poly_correct / len(stars) * 100) + "%")

Pipeline(steps=[('std', StandardScaler()),
                ('poly',
                 SVC(C=0.1, degree=2, kernel='poly', probability=True,
                     random_state=42))])
{'poly__gamma': 'scale', 'poly__degree': 2, 'poly__class_weight': None, 'poly__C': 0.1}




Poly Players correct: 2
Poly Percentage correct: 2.5974025974025974%


In [20]:
# We are now trying RandomizedSearchCV to see if it changes anything
rf_tuned_pipeline = Pipeline([('std', StandardScaler()),
                              ('rf', RandomForestClassifier(random_state = 42))])

param_grid_rf = {
    'rf__n_estimators': np.arange(100, 1001, 100),
    'rf__max_depth': [None, 5, 10, 20, 30, 50],
    'rf__min_samples_split': [2, 5, 10, 20],
    'rf__min_samples_leaf': [1, 2, 4, 10],
    'rf__max_features': ['sqrt', 'log2', None],
    'rf__bootstrap': [True, False],
    'rf__class_weight': ['balanced', 'balanced_subsample', None]
} # parameter grid (ChatGPT recommended this is the grid I use)


random_search_rf1 = RandomizedSearchCV(
    estimator = rf_tuned_pipeline,
    param_distributions = param_grid_rf,
    n_iter = 100,
    scoring = custom_best_score,
    cv = tscv,
    verbose = 0,
    random_state = 42,
    n_jobs = -1
)

random_search_rf1.fit(X_train2, y_train2)
print(random_search_rf1.best_estimator_)
print(random_search_rf1.best_params_)

rf_pipeline_correct = predictions(random_search_rf1)[1]
print("Players correct:", rf_pipeline_correct)
print("Percentage correct:", str(rf_pipeline_correct / len(stars) * 100) + "%")

Pipeline(steps=[('std', StandardScaler()),
                ('rf',
                 RandomForestClassifier(max_depth=30, max_features=None,
                                        min_samples_leaf=4,
                                        min_samples_split=20,
                                        n_estimators=np.int64(100),
                                        random_state=42))])
{'rf__n_estimators': np.int64(100), 'rf__min_samples_split': 20, 'rf__min_samples_leaf': 4, 'rf__max_features': None, 'rf__max_depth': 30, 'rf__class_weight': None, 'rf__bootstrap': True}




Players correct: 32
Percentage correct: 41.55844155844156%


In [19]:
# Let's test the best log model based on correctness.
best_log = LogisticRegression(C=np.float64(1.3666666666666667),
                                    l1_ratio=np.float64(0.0), max_iter=500,
                                    penalty='l1', random_state=42,
                                    solver='liblinear', class_weight = None)
best_log.fit(X_train, y_train)
best_log_rankings, best_log_correct = predictions(best_log)

print("Best Log Players correct:", best_log_correct)
print("Best Log Percentage correct:", str(best_log_correct / len(stars) * 100) + "%")
print(best_log_rankings) # These rankings are not bad



Best Log Players correct: 49
Best Log Percentage correct: 63.63636363636363%
['Kylian Mbappé', 'Mohamed Salah', 'Ousmane Dembélé', 'Raphinha', 'Bradley Barcola', 'Joshua Kimmich', 'Lamine Yamal', 'Michael Olise', 'Omar Marmoush', 'Harry Kane', 'Pierre Højbjerg', 'Hugo Ekitike', 'Mateo Retegui', 'Achraf Hakimi', 'Vitinha', 'Alexander Isak', 'Désiré Doué', 'Luis Díaz', 'Granit Xhaka', 'Florian Wirtz', 'Khvicha Kvaratskhelia', 'Vinicius Júnior', 'João Neves', 'Robert Lewandowski', 'Mason Greenwood', 'Pedri', 'Pau Cubarsí', 'Leonardo Balerdi', 'Cole Palmer', 'William Saliba', 'Kim Min-jae', 'Serhou Guirassy', 'Virgil van Dijk', 'Joško Gvardiol', 'Angelo Stiller', 'Leroy Sané', 'Jude Bellingham', 'Matheus Cunha', 'Fabián Ruiz Peña', 'Mateo Kovačić', 'Federico Valverde', 'Rayan Cherki', 'Bruno Fernandes', 'Maghnes Akliouche', 'Julián Álvarez', 'Ademola Lookman', 'Ayoze Pérez', 'Son Heung-min', 'Andrey Santos', 'Lucas Beraldo', 'Iñigo Martínez', 'Amir Rrahmani', 'Yoane Wissa', 'Rafael Leão', 

In [20]:
best_svm_rbf = svm.SVC(kernel = 'rbf', class_weight = None, C=10, gamma=0.01, probability=True, random_state=42)
best_svm_rbf.fit(X_train, y_train)
best_svm_rbf_rankings, best_svm_rbf_correct = predictions(best_svm_rbf)

print("Best RBF Players correct:", best_svm_rbf_correct)
print("Best RBF Percentage correct:", str(best_svm_rbf_correct / len(stars) * 100) + "%")
print(best_svm_rbf_rankings)

Best RBF Players correct: 49
Best RBF Percentage correct: 63.63636363636363%
['Omar Marmoush', 'Bradley Barcola', 'Mateo Retegui', 'Cole Palmer', 'Raphinha', 'Alexander Isak', 'Ousmane Dembélé', 'Robert Lewandowski', 'Michael Olise', 'Hugo Ekitike', 'Luis Díaz', 'Florian Wirtz', 'Kylian Mbappé', 'Harry Kane', 'Joshua Kimmich', 'Désiré Doué', 'Vinicius Júnior', 'Maghnes Akliouche', 'Marcos Alonso', 'Serhou Guirassy', 'Rayan Cherki', 'Jude Bellingham', 'Matheus Cunha', 'Pedri', 'Bukayo Saka', 'Martin Ødegaard', 'Achraf Hakimi', 'Mohamed Salah', 'Erling Haaland', 'Mason Greenwood', 'Luka Modrić', 'Bruno Fernandes', 'Jamal Musiala', 'Rafael Leão', 'Nicolás Paz', 'Lamine Yamal', 'Granit Xhaka', 'Khvicha Kvaratskhelia', 'Son Heung-min', 'Ademola Lookman', 'Lee Kang-in', 'Christian Pulisic', 'Iñigo Martínez', 'Álex Grimaldo', 'Riccardo Orsolini', 'Oihan Sancet', 'Federico Dimarco', 'Pau Cubarsí', 'Valentín Castellanos', 'Bernardo Silva', 'Gonçalo Ramos', 'Xavi Simons', 'Leroy Sané', 'Fabián R

In [26]:
best_svm_sigmoid = svm.SVC(C=0.01, class_weight='balanced', gamma=0.01, kernel='sigmoid', probability=True, random_state=42)
best_svm_sigmoid.fit(X_train, y_train)
best_svm_sigmoid_rankings, best_svm_sigmoid_correct = predictions(best_svm_sigmoid)

print("Best Sigmoid Players correct:", best_svm_sigmoid_correct)
print("Best Sigmoid Percentage correct:", str(best_svm_sigmoid_correct / len(stars) * 100) + "%")
print(best_svm_sigmoid_rankings)

Best Sigmoid Players correct: 17
Best Sigmoid Percentage correct: 22.07792207792208%
['Abdoulaye Touré', 'Hakan Çalhanoğlu', 'Marcos Alonso', 'Joshua Kimmich', 'Vitinha', 'Gonçalo Ramos', 'Nadiem Amiri', 'Isco', 'Dušan Vlahović', 'Artem Dovbyk', 'Iago Aspas', 'Mikel Oyarzabal', 'Georges Mikautadze', 'Joško Gvardiol', 'Chris Wood', 'Antoine Griezmann', 'Oihan Sancet', 'Patrik Schick', 'Gaëtan Laborde', 'Kiké', 'Himad Abdelli', 'Pedri', 'Jonathan Burkardt', 'Martin Ødegaard', 'Alexandre Lacazette', 'Vincenzo Grifo', 'Tijjani Reijnders', 'Paulo Dybala', 'Mika Biereth', 'Ermedin Demirović', 'Emanuel Emegha', 'Randal Kolo Muani', 'João Pedro', 'Alexander Sørloth', 'Ange-Yoan Bonny', 'Dani Olmo', 'Shuto Machino', 'Federico Valverde', 'Romelu Lukaku', 'Benjamin Šeško', 'Danny Welbeck', 'Javi Puado', 'Enzo Fernández', 'Nick Woltemade', 'Eliesse Ben Seghir', 'Dominic Solanke', 'Marvin Pieringer', 'Arnaud Kalimuendo', 'Raúl Jiménez', 'Jean-Philippe Mateta', 'Mauro Arambarri', 'Marco Asensio', 'A

In [22]:
best_svm_poly = svm.SVC(C=0.1, degree=2, kernel='poly', probability=True, class_weight = None, gamma = 'scale', random_state=42)
best_svm_poly.fit(X_train, y_train)
best_svm_poly_rankings, best_svm_poly_correct = predictions(best_svm_poly)

print("Best Poly Players correct:", best_svm_poly_correct)
print("Best Poly Percentage correct:", str(best_svm_poly_correct / len(stars) * 100) + "%")
print(best_svm_poly_rankings)

Best Poly Players correct: 51
Best Poly Percentage correct: 66.23376623376623%
['Kylian Mbappé', 'Mohamed Salah', 'Raphinha', 'Ousmane Dembélé', 'Bradley Barcola', 'Lamine Yamal', 'Omar Marmoush', 'Harry Kane', 'Michael Olise', 'Robert Lewandowski', 'Joshua Kimmich', 'Alexander Isak', 'Mason Greenwood', 'Mateo Retegui', 'Cole Palmer', 'Hugo Ekitike', 'Luis Díaz', 'Vinicius Júnior', 'Serhou Guirassy', 'Pedri', 'Rayan Cherki', 'Florian Wirtz', 'Erling Haaland', 'Désiré Doué', 'Khvicha Kvaratskhelia', 'Achraf Hakimi', 'Martin Ødegaard', 'Rafael Leão', 'Matheus Cunha', 'Granit Xhaka', 'Bruno Fernandes', 'Jude Bellingham', 'Maghnes Akliouche', 'Ademola Lookman', 'Leroy Sané', 'Jamal Musiala', 'Son Heung-min', 'Tijjani Reijnders', 'Luis Henrique', 'Angelo Stiller', 'Pierre Højbjerg', 'Julián Álvarez', 'Trent Alexander-Arnold', 'Alex Iwobi', 'Federico Valverde', 'Alexander Sørloth', 'Vincenzo Grifo', 'Fabián Ruiz Peña', 'Jonathan Burkardt', 'James Maddison', 'Ollie Watkins', 'Iñigo Martínez',

In [23]:
# Let's try the best random forest model based on correctness.
best_rf = RandomForestClassifier(max_depth=30, max_features=None,
                                        min_samples_leaf=4,
                                        min_samples_split=20,
                                        n_estimators=np.int64(100),
                                        random_state=42,
                                        class_weight = None,
                                        bootstrap = True)
best_rf.fit(X_train, y_train)
best_rf_rankings, best_rf_correct = predictions(best_rf)

print("Best RF Players correct:", best_rf_correct)
print("Best RF Percentage correct:", str(best_rf_correct / len(stars) * 100) + "%")
print(best_rf_rankings) # Holy shit, the predicted number one player in 2025 is actually the player who won the Ballon D'Or

Best RF Players correct: 49
Best RF Percentage correct: 63.63636363636363%
['Kylian Mbappé', 'Bradley Barcola', 'Ousmane Dembélé', 'Mohamed Salah', 'Raphinha', 'Omar Marmoush', 'Michael Olise', 'Mason Greenwood', 'Alexander Isak', 'Cole Palmer', 'Harry Kane', 'Lamine Yamal', 'Hugo Ekitike', 'Désiré Doué', 'Joshua Kimmich', 'Mateo Retegui', 'Robert Lewandowski', 'Serhou Guirassy', 'Pierre Højbjerg', 'Erling Haaland', 'Luis Díaz', 'Achraf Hakimi', 'Pedri', 'Maghnes Akliouche', 'Iñigo Martínez', 'Matheus Cunha', 'Dominik Szoboszlai', 'Rafael Leão', 'Florian Wirtz', 'Virgil van Dijk', 'Alexander Sørloth', 'Sávio', 'Hakan Çalhanoğlu', 'Moise Kean', 'Jude Bellingham', 'Joško Gvardiol', 'Bruno Fernandes', 'Martin Ødegaard', 'Pau Cubarsí', 'Amir Rrahmani', 'Luka Modrić', 'Granit Xhaka', 'Son Heung-min', 'Khvicha Kvaratskhelia', 'Nicolás Paz', 'Amad Diallo', 'Angelo Stiller', 'Lewis Dunk', 'Patrik Schick', 'Antoine Semenyo', 'Vinicius Júnior', 'Tijjani Reijnders', 'Gonçalo Ramos', 'Emanuel Emeg

In [24]:
# Where to next?

# Ranking of top 100 (1st, 10th, unranked) - Logistic Regression sort of does this, will need to elaborate further
# Above top 25, where could they land
# Transferability (need club name)?
# Team chemistry (combine with KNN)?
# Play-by-play (+ or -)?

# log = LogisticRegression(class_weight = "balanced", max_iter=5000) # Logistic regression model
# log = LogisticRegression(C = 0.25, class_weight = "balanced", solver = 'liblinear') # Logistic regression model