<a href="https://colab.research.google.com/github/shipley7/fifa/blob/main/FIFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("combined_match_data.csv") # getting the data
df.head() # printing out the data

Unnamed: 0,index,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,...,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist,home,game_id,top100,source_file
0,0,14 Players,,,,,990.0,3.0,2.0,1.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
1,1,Arjen Robben,10.0,nl NED,RW,33-207,30.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
2,2,Arturo Vidal,23.0,cl CHI,LM,30-088,90.0,0.0,1.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
3,3,Corentin Tolisso,24.0,fr FRA,RM,23-015,90.0,1.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv
4,4,David Alaba,27.0,at AUT,LB,25-055,90.0,0.0,0.0,0.0,...,,,,,,,1,0,0,bundesliga_2017-2018_player_data.csv


In [3]:
df = df[df["Player"].str.contains("Players")==False] # removing excess rows that have no player information
df = df.drop(['index', "#"], axis = 1) # index and player number have no impact on anything
df[["Age Years", "Age Days"]] = (df.Age.str.split("-", expand = True)).apply(pd.to_numeric) # splitting age column into two other columns
df[["league", "season"]] = df["source_file"].replace("_player_data.csv", "", regex = True).str.split("_", expand = True) # splitting the source file into a league and season
df["season"] = (df["season"].str[:4].astype(int)) + 1 # season will be an integer representing the year the season ended
df = pd.get_dummies(df, columns=["league"]) # league will be a boolean variable representing which leage we are using
df = df.drop(['Age','source_file', "game_id"], axis = 1) # removing age, source file, and game id as they are no longer needed
df = df.fillna(0) # filling any NA with 0
df # just to check

Unnamed: 0,Player,Nation,Pos,Min,Gls,Ast,PK,PKatt,Sh,SoT,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
1,Arjen Robben,nl NED,RW,30.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1,0,33.0,207.0,2018,True,False,False,False,False
2,Arturo Vidal,cl CHI,LM,90.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1,0,30.0,88.0,2018,True,False,False,False,False
3,Corentin Tolisso,fr FRA,RM,90.0,1.0,0.0,0.0,0.0,4.0,3.0,...,1,0,23.0,15.0,2018,True,False,False,False,False
4,David Alaba,at AUT,LB,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,25.0,55.0,2018,True,False,False,False,False
5,Franck Ribéry,fr FRA,LW,76.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,34.0,133.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452035,Ondrej Duda,sk SVK,DM,90.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0,0,30.0,171.0,2025,False,False,False,False,True
452036,Paweł Dawidowicz,pl POL,DM,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,5.0,2025,False,False,False,False,True
452037,Simone Perilli,it ITA,GK,90.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,30.0,138.0,2025,False,False,False,False,True
452038,Suat Serdar,de GER,DM,80.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0,0,28.0,44.0,2025,False,False,False,False,True


In [4]:
# splitting into keepers and outfielders since they all have different statistics
keepers = df[df["Pos"] == "GK"]
outfielders = df[df["Pos"] != "GK"]

In [5]:
# for keepers, removing everything that a keeper doesn't normally do
keepers = keepers.drop(["Pos", 'Gls', 'Ast', 'PK', 'PKatt', 'Sh',
       'SoT', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG',
       'xAG', 'SCA', 'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries',
       'PrgC', 'Att_x.1', 'Succ'], axis = 1)
keepers

Unnamed: 0,Player,Nation,Min,CrdY,CrdR,SoTA,GA,Saves,Save%,PSxG,...,home,top100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
13,Sven Ulreich,de GER,90.0,0.0,0.0,4.0,1.0,3.0,75.0,0.9,...,1,0,29.0,15.0,2018,True,False,False,False,False
19,Bernd Leno,de GER,90.0,0.0,0.0,8.0,3.0,5.0,75.0,3.5,...,0,0,25.0,167.0,2018,True,False,False,False,False
36,Koen Casteels,be BEL,90.0,0.0,0.0,5.0,3.0,2.0,40.0,1.9,...,1,0,25.0,55.0,2018,True,False,False,False,False
56,Roman Bürki,ch SUI,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,26.0,278.0,2018,True,False,False,False,False
70,Rune Jarstein,no NOR,90.0,0.0,0.0,2.0,0.0,2.0,100.0,0.1,...,1,0,32.0,324.0,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451966,Michele Di Gregorio,it ITA,90.0,0.0,0.0,5.0,2.0,3.0,60.0,1.4,...,0,0,27.0,302.0,2025,False,False,False,False,True
451981,Marco Carnesecchi,it ITA,90.0,0.0,0.0,10.0,3.0,7.0,70.0,3.0,...,1,0,24.0,328.0,2025,False,False,False,False,True
452006,Zion Suzuki,jp JPN,90.0,0.0,0.0,4.0,2.0,2.0,50.0,1.9,...,0,0,22.0,277.0,2025,False,False,False,False,True
452010,Devis Vásquez,co COL,90.0,0.0,0.0,2.0,2.0,0.0,0.0,1.5,...,1,0,27.0,13.0,2025,False,False,False,False,True


In [6]:
position_dummies = outfielders.Pos.str.get_dummies(sep = ",") # since most players play mutliple positions in a match, we will have to multi-hot encode
outfielders = pd.concat([outfielders.drop(columns=["Pos"]), position_dummies], axis = 1) # combining our binary position columns with the remaining dataframe
outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']] = outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']].astype('bool') # change position to boolean (did they play this position in the match or not?)
outfielders = outfielders.drop(['SoTA', 'GA', 'Saves', 'Save%', 'PSxG',
       'Cmp_y', 'Att_y', 'Cmp%_y', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen',
       'Att_y.1', 'Launch%.1', 'AvgLen.1', 'Opp', 'Stp', 'Stp%', '#OPA',
       'AvgDist', 'GK'], axis = 1) # removing keeper statistics
outfielders

Unnamed: 0,Player,Nation,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,DM,FW,LB,LM,LW,MF,RB,RM,RW,WB
1,Arjen Robben,nl NED,30.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,Arturo Vidal,cl CHI,90.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,False,False,False,True,False,False,False,False,False,False
3,Corentin Tolisso,fr FRA,90.0,1.0,0.0,0.0,0.0,4.0,3.0,0.0,...,False,False,False,False,False,False,False,True,False,False
4,David Alaba,at AUT,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,False,False,False,False,False,False,False
5,Franck Ribéry,fr FRA,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452034,Nicolás Valentini,ar ARG,90.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
452035,Ondrej Duda,sk SVK,90.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452036,Paweł Dawidowicz,pl POL,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
452038,Suat Serdar,de GER,80.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,True,False,False,False,False,False,False,False,False,False


In [7]:
# # here, I am trying to see if a model could accuartely predict the positions one would play
# pos_prediction = outfielders.drop(["Player", "Nation"], axis = 1) # we cannot use Player, since that is a dead giveaway, and nation is useless

# # since players only get better or worse with time, our testing data must come after our training data
# train_seasons = [2018, 2019, 2020, 2021]
# test_season = [2022]
# train_df = pos_prediction[pos_prediction['season'].isin(train_seasons)]
# test_df = pos_prediction[pos_prediction['season'].isin(test_season)]

# positions = ['AM', 'CB', 'CM', 'DF', 'DM',
#        'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB'] # our target variables

# X_train = train_df.drop(columns = positions)
# y_train = train_df[positions]
# X_test = test_df.drop(columns = positions)
# y_test = test_df[positions]

# model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state = 42, class_weight='balanced')) # ChatGPT recommended this
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# print(classification_report(y_test, y_pred, target_names=positions)) # results (not very good, but can be expected since there are 14 possible positions and not necessarily one right answer)

# # Going to try again but this time will generalize positions so maybe we will only have FW, MID, DEF


In [8]:
# # Let's generalize into FWD, MID, and DEF

# pos_prediction["Pos_Gen_FWD"] = np.where(pos_prediction['FW'] | pos_prediction['LW'] | pos_prediction['RW'], True, False)

# pos_prediction["Pos_Gen_MID"] = np.where(pos_prediction['AM'] | pos_prediction['CM'] | pos_prediction['DM'] | pos_prediction['LM'] | pos_prediction['MF'] | pos_prediction['RM'], True, False)

# pos_prediction["Pos_Gen_DEF"] = np.where(pos_prediction['CB'] | pos_prediction['DF'] | pos_prediction['LB'] | pos_prediction['RB'] | pos_prediction['WB'], True, False)

In [9]:
# pos_prediction = pos_prediction.drop(positions, axis = 1) # we cannot use Player, since that is a dead giveaway, and nation is useless

# # since players only get better or worse with time, our testing data must come after our training data
# train_seasons = [2018, 2019, 2020, 2021]
# test_season = [2022]
# train_df = pos_prediction[pos_prediction['season'].isin(train_seasons)]
# test_df = pos_prediction[pos_prediction['season'].isin(test_season)]

# positions = ["Pos_Gen_FWD", "Pos_Gen_MID", "Pos_Gen_DEF"] # Let's generalize

# # Same as before
# X_train = train_df.drop(columns = positions)
# y_train = train_df[positions]
# X_test = test_df.drop(columns = positions)
# y_test = test_df[positions]

# model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state = 42, class_weight='balanced')) # ChatGPT recommended this
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# print(classification_report(y_test, y_pred, target_names=positions)) # looks better

In [None]:
selected_season = int(input("Please enter the season you would like to investigate:"))

top_100_outfielders = outfielders[(outfielders['top100'] != 0) & (outfielders['season'] == selected_season)] # get players who at some point have been in the top 100
remainders = outfielders[(outfielders["top100"] == 0) & (outfielders['season'] == selected_season)] # players who are not in the top 100

top_100_outfielders_agg = top_100_outfielders.groupby("Player").mean(numeric_only=True).reset_index() # aggregate each top 100 player's stats (mean) into one row
remainders_agg = remainders.groupby("Player").mean(numeric_only=True).reset_index() # aggreate remainding players stats into one row

# Should we keep leagues? 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a' For now we will not
top_agg = top_100_outfielders_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1) # removing columns I do not believe will help
rest_agg = remainders_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1) # removing columns I do not believe will help

# We should standardize our data
scaler = StandardScaler()
top_scaled_agg = scaler.fit_transform(top_agg)
rest_scaled_agg = scaler.fit_transform(rest_agg)

knn_out_agg = NearestNeighbors(n_neighbors=5, metric="euclidean") # KNN
knn_out_agg.fit(top_scaled_agg) # Fit with the top 100 player data

distances, indices = knn_out_agg.kneighbors(rest_scaled_agg) # for all non-top 100 players, find their neighbors (who will be top 100)
player = input("Please enter the name of an outfielder:") # Let's add player input for fun
player_number = remainders_agg[remainders_agg["Player"] == player].index[0] # Find the row where the player is
print("Candidate:", remainders_agg.iloc[player_number]['Player']) # Ensuring we have the right player
print("Similar top 100 players:")

# The top five top 100 players most similar to this rando for that year
for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {top_100_outfielders_agg.iloc[number]['Player']}")

# We will now do the same as above, but this time we will enter a top 100 player
# Then, we find the top five non-top 100 players who are most similar according to KNN
knn_out_agg_2 = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_out_agg_2.fit(rest_scaled_agg)

distances, indices = knn_out_agg_2.kneighbors(top_scaled_agg)
player = input("Please enter the name of an outfielder:")
player_number = top_100_outfielders_agg[top_100_outfielders_agg["Player"] == player].index[0]
print("Candidate:", top_100_outfielders_agg.iloc[player_number]['Player'])
print("Similar normal 100 players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {remainders_agg.iloc[number]['Player']}")

Please enter the season you would like to investigate:2018
Please enter the name of an outfielder:Anthony Martial
Candidate: Anthony Martial
Similar top 100 players:
1. Memphis
2. Lorenzo Insigne
3. Leroy Sané
4. Sadio Mané
5. Philippe Coutinho


In [None]:
# Now, let's do the keepers
selected_season = int(input("Please enter the season you would like to investigate:"))

top_100_keepers = keepers[(keepers['top100'] != 0) & (keepers['season'] == selected_season)]
remainders = keepers[(keepers["top100"] == 0) & (keepers['season'] == selected_season)]

top_100_keepers_agg = top_100_keepers.groupby("Player").mean(numeric_only=True).reset_index()
remainders_agg = remainders.groupby("Player").mean(numeric_only=True).reset_index()

top_agg = top_100_keepers_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1)
rest_agg = remainders_agg.drop(["Player", "home", "Age Years", "Age Days", "top100", "season", 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'], axis = 1)

scaler = StandardScaler()
top_scaled_agg = scaler.fit_transform(top_agg)
rest_scaled_agg = scaler.fit_transform(rest_agg)

knn_keep_agg = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_keep_agg.fit(top_scaled_agg)

distances, indices = knn_keep_agg.kneighbors(rest_scaled_agg)
player = input("Please enter the name of a goalkeeper:")
player_number = remainders_agg[remainders_agg["Player"] == player].index[0]
print("Candidate:", remainders_agg.iloc[player_number]['Player'])
print("Similar top 100 players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {top_100_keepers_agg.iloc[number]['Player']}")

# "Reverse, reverse"
knn_keep_agg_2 = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_keep_agg_2.fit(rest_scaled_agg)

distances, indices = knn_keep_agg_2.kneighbors(top_scaled_agg)
player = input("Please enter the name of a goalkeeper:")
player_number = top_100_keepers_agg[top_100_keepers_agg["Player"] == player].index[0]
print("Candidate:", top_100_keepers_agg.iloc[player_number]['Player'])
print("Similar normal players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {remainders_agg.iloc[number]['Player']}")

Please enter the season you would like to investigate:2018
Please enter the name of a goalkeeper:Nick Pope
Candidate: Nick Pope
Similar top 100 players:
1. David de Gea
2. Jan Oblak
3. Jordan Pickford
4. Thibaut Courtois
5. Hugo Lloris


In [48]:
# Here we will try to predict likelihood of a player being in the top 100 for 2025
outfielders_pred = outfielders.groupby(["Player", "season"]).agg({
    'Min':'sum', 'Gls':'sum', 'Ast':'sum', 'PK':'sum', 'PKatt':'sum', 'Sh':'sum', 'SoT':'sum', 'CrdY':'sum', 'CrdR':'sum',
       'Touches':'sum', 'Tkl':'sum', 'Int':'sum', 'Blocks':'sum', 'xG':'sum', 'npxG':'sum', 'xAG':'sum', 'SCA':'sum', 'GCA':'sum',
       'Cmp_x':'sum', 'Att_x':'sum', 'Cmp%_x':'mean', 'PrgP':'sum', 'Carries':'sum', 'PrgC':'sum', 'Att_x.1':'sum',
       'Succ':'sum', 'home':'sum', 'top100':'mean', 'Age Years':'mean', 'Age Days':'mean', 'league_bundesliga':'mean',
       'league_la-liga':'mean', 'league_ligue-1':'mean', 'league_premier-league':'mean',
       'league_serie-a':'mean', 'AM':'mean', 'CB':'mean', 'CM':'mean', 'DF':'mean', 'DM':'mean', 'FW':'mean', 'LB':'mean', 'LM':'mean', 'LW':'mean',
       'MF':'mean', 'RB':'mean', 'RM':'mean', 'RW':'mean', 'WB':'mean'
}).reset_index()

outfielders_pred.top100 = (outfielders_pred.top100 != 0.0).astype(int)

train_seasons = [2018, 2019, 2020, 2021, 2022, 2023, 2024] # every season except 2025 is our training data
test_season = [2025] # we will test on the most recent season
train_df = outfielders_pred[outfielders_pred['season'].isin(train_seasons)] # get a dataframe of just training data
test_df = outfielders_pred[outfielders_pred['season'].isin(test_season)] # get a dataframe of just testing data

# columns = ['Player', 'season', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT',
      #  'CrdY', 'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG',
      #  'SCA', 'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries', 'PrgC',
      #  'Att_x.1', 'Succ', 'home', 'top100', 'Age Years', 'Age Days',
      #  'league_bundesliga', 'league_la-liga', 'league_ligue-1',
      #  'league_premier-league', 'league_serie-a', 'AM', 'CB', 'CM', 'DF', 'DM',
      #  'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']

X_train = train_df.drop(columns = ['Player', 'season', 'home', 'Age Years', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_train = train_df['top100']
X_test = test_df.drop(columns = ['Player', 'season', 'home', 'Age Years', 'Age Days', 'top100', 'league_bundesliga', 'league_la-liga', 'league_ligue-1', 'league_premier-league', 'league_serie-a'])
y_test = test_df['top100']
id_train = train_df.Player.reset_index().drop("index", axis = 1)
id_test = test_df.Player.reset_index().drop("index", axis = 1)

log = LogisticRegression(class_weight="balanced") # Logistic regression model
log.fit(X_train, y_train)
y_pred = log.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
y_pred_prob = log.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
for i in range(len(y_pred)):
  if y_pred[i] == 1:
    potential_indexes.append(i) # add the index of the future top 100 player into our array
stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player

# highest predicted probabilities will be how we base our rankings
for i in range(len(stardom)):
  for j in range(i + 1):
    if (stardom[j] < stardom[i]):
      temp = stardom[i]
      temp2 = potential_indexes[i]
      stardom[i] = stardom[j]
      potential_indexes[i] = potential_indexes[j]
      stardom[j] = temp
      potential_indexes[j] = temp2

stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # Only 77 players are in here (so 23 played in other leagues)
correct = 0 # see how many were succesfully identified in top 100
print(len(potential_indexes), potential_indexes)

for i in range(100):
  print(id_test.iloc[potential_indexes[i], 0])
  if str(id_test.iloc[potential_indexes[i], 0]) in stars:
    correct += 1

print("Players correct:", correct) # 47 players were correct
print("Percentage correct:", str(correct / len(stars) * 100) + "%") # 61% of the possible 77 players identified as true top 100s were in the top 100

358 [1338, 1174, 1348, 1960, 1881, 2363, 1573, 1712, 1669, 773, 1371, 1865, 1995, 2359, 27, 1197, 453, 740, 863, 1579, 916, 1916, 2392, 1313, 894, 1588, 2092, 153, 353, 1298, 1424, 1930, 1804, 215, 1192, 1522, 292, 2037, 1379, 721, 1213, 1377, 2273, 364, 41, 695, 110, 993, 1445, 1024, 2358, 612, 876, 1835, 1151, 118, 624, 196, 556, 1812, 1844, 149, 2494, 482, 2395, 1454, 50, 625, 1582, 1961, 602, 2357, 90, 2059, 1552, 329, 83, 2469, 112, 179, 1717, 1787, 371, 1214, 139, 1475, 461, 1994, 1227, 2223, 1501, 1343, 2375, 2264, 2370, 1508, 1750, 2002, 1364, 1820, 1498, 541, 679, 1718, 224, 2248, 1957, 1104, 2341, 821, 87, 2003, 1537, 2436, 164, 1092, 1981, 1278, 1381, 1356, 274, 1217, 1370, 1299, 558, 1663, 438, 2170, 1050, 1651, 739, 1714, 102, 936, 1840, 1315, 2444, 228, 2137, 176, 105, 1543, 1862, 379, 690, 471, 2336, 568, 2193, 691, 1260, 669, 287, 2152, 1631, 682, 1912, 80, 2485, 1273, 1426, 493, 2467, 1803, 1440, 1037, 899, 767, 138, 738, 2242, 232, 1900, 1562, 2389, 2010, 1148, 860, 6

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
# We will also try a SVM
clf = svm.SVC(class_weight="balanced", probability=True) # SVM model
clf.fit(X_train, y_train)
svm_pred = clf.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
svm_pred_prob = clf.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
svm_potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
for i in range(len(svm_pred)):
  if svm_pred[i] == 1:
    svm_potential_indexes.append(i) # add the index of the future top 100 player into our array
svm_stardom = svm_pred_prob[svm_potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player

# highest predicted probabilities will be how we base our rankings
for i in range(len(svm_stardom)):
  for j in range(i + 1):
    if (svm_stardom[j] < svm_stardom[i]):
      temp = svm_stardom[i]
      temp2 = svm_potential_indexes[i]
      svm_stardom[i] = svm_stardom[j]
      svm_potential_indexes[i] = svm_potential_indexes[j]
      svm_stardom[j] = temp
      svm_potential_indexes[j] = temp2

svm_stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # Only 77 players are in here (so 23 played in other leagues)
svm_correct = 0 # see how many were succesfully identified in top 100
print(len(svm_potential_indexes), svm_potential_indexes)

for i in range(100):
  print(id_test.iloc[svm_potential_indexes[i], 0])
  if str(id_test.iloc[svm_potential_indexes[i], 0]) in svm_stars:
    svm_correct += 1

print("Players correct:", svm_correct) # 43 players were correct
print("Percentage correct:", str(svm_correct / len(svm_stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100

427 [1338, 1669, 1348, 1881, 1865, 2358, 773, 1579, 1454, 1717, 27, 695, 1552, 916, 876, 1573, 1298, 556, 1804, 721, 353, 1498, 2370, 2003, 453, 2375, 2059, 2494, 1213, 1820, 624, 1562, 1424, 110, 41, 1787, 164, 1192, 740, 1961, 2436, 2037, 1445, 2044, 1364, 493, 863, 224, 200, 2395, 2055, 2469, 83, 821, 1714, 1381, 196, 1024, 364, 2223, 1501, 1712, 1862, 90, 1475, 1808, 669, 329, 87, 1092, 1812, 894, 1588, 1840, 1278, 1343, 2092, 1151, 1387, 2273, 1543, 1522, 1039, 2359, 2069, 993, 292, 1930, 1631, 1030, 541, 2392, 1981, 118, 1197, 81, 1214, 1217, 2363, 1379, 1104, 1537, 739, 1835, 274, 153, 1960, 724, 1651, 1377, 1582, 215, 2485, 1916, 625, 1050, 1313, 1957, 1621, 966, 1371, 428, 1356, 1995, 1426, 1324, 482, 1078, 138, 1174, 602, 936, 76, 1041, 708, 1718, 1844, 2242, 139, 629, 2193, 461, 1900, 1299, 1663, 1554, 1750, 679, 50, 1951, 1490, 2341, 2467, 2444, 573, 989, 707, 1796, 1057, 2248, 905, 643, 2415, 1788, 2486, 690, 438, 371, 379, 2264, 2256, 105, 612, 2416, 1337, 1935, 1227, 127

In [51]:
rf_clf = RandomForestClassifier(class_weight='balanced')
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
rf_pred_prob = rf_clf.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
rf_potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
for i in range(len(rf_pred)):
  if rf_pred[i] == 1:
    rf_potential_indexes.append(i) # add the index of the future top 100 player into our array
rf_stardom = rf_pred_prob[rf_potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player

# highest predicted probabilities will be how we base our rankings
for i in range(len(rf_stardom)):
  for j in range(i + 1):
    if (rf_stardom[j] < rf_stardom[i]):
      temp = rf_stardom[i]
      temp2 = rf_potential_indexes[i]
      rf_stardom[i] = rf_stardom[j]
      rf_potential_indexes[i] = rf_potential_indexes[j]
      rf_stardom[j] = temp
      rf_potential_indexes[j] = temp2

rf_stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # Only 77 players are in here (so 23 played in other leagues)
rf_correct = 0 # see how many were succesfully identified in top 100
print(len(rf_potential_indexes), rf_potential_indexes)

for i in range(100):
  print(id_test.iloc[rf_potential_indexes[i], 0])
  if str(id_test.iloc[rf_potential_indexes[i], 0]) in rf_stars:
    rf_correct += 1

print("Players correct:", rf_correct) # 43 players were correct
print("Percentage correct:", str(rf_correct / len(rf_stars) * 100) + "%") # 55.84% of the possible 77 players identified as true top 100s were in the top 100

16 [1669, 1338, 1881, 453, 1865, 1573, 110, 353, 1995, 916, 1445, 1348, 27, 1475, 773, 1298]
Michael Olise
Kylian Mbappé
Ousmane Dembélé
Cole Palmer
Omar Marmoush
Mason Greenwood
Alexander Isak
Bradley Barcola
Raphinha
Hugo Ekitike
Luis Díaz
Lamine Yamal
Achraf Hakimi
Maghnes Akliouche
Florian Wirtz
Khvicha Kvaratskhelia


IndexError: list index out of range

In [28]:
# Let's try and imporve our logistic regression model with hyperparameter tuning
param_grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : [0.25, 0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 2.00],
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter'  : [100,1000,2500,5000],
    'class_weight' : ['balanced']
}
]
clf = GridSearchCV(log, param_grid = param_grid, cv = 3, verbose=True,n_jobs=-1)
best_clf = clf.fit(X_train, y_train)
best_clf.best_estimator_

Fitting 3 folds for each of 640 candidates, totalling 1920 fits


1248 fits failed out of a total of 1920.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [33]:
# Now let's do everything again but with the best model.

# log = LogisticRegression(class_weight = "balanced", max_iter=5000) # Logistic regression model
# log = LogisticRegression(C = 0.25, class_weight = "balanced", solver = 'liblinear') # Logistic regression model
log = LogisticRegression(C = 0.5, class_weight = 'balanced', max_iter = 5000) # Logistic regression model (this one was the best with all grid search options)
log.fit(X_train, y_train)
y_pred = log.predict(X_test) # prediction on if the player in question will be a top 100 player (1) or not (0)
y_pred_prob = log.predict_proba(X_test) # predicted probabilities of being a top 100 player or not
potential_indexes = [] # list to hold the indexes of our test data who are predicted to be a star
for i in range(len(y_pred)):
  if y_pred[i] == 1:
    potential_indexes.append(i) # add the index of the future top 100 player into our array
stardom = y_pred_prob[potential_indexes, 1] # get the predicted probability of everyone who is predicted to be a future top 100 player

# highest predicted probabilities will be how we base our rankings
for i in range(len(stardom)):
  for j in range(i + 1):
    if (stardom[j] < stardom[i]):
      temp = stardom[i]
      temp2 = potential_indexes[i]
      stardom[i] = stardom[j]
      potential_indexes[i] = potential_indexes[j]
      stardom[j] = temp
      potential_indexes[j] = temp2

stars = list(outfielders_pred[(outfielders_pred['top100'] != 0) & (outfielders_pred['season'] == 2025)]['Player']) # Only 77 players are in here (so 23 played in other leagues)
correct = 0 # see how many were succesfully identified in top 100

for i in range(100):
  print(id_test.iloc[potential_indexes[i], 0])
  if str(id_test.iloc[potential_indexes[i], 0]) in stars:
    correct += 1

print("Players correct:", correct) # 48 players were correct
print("Percentage correct:", str(correct / len(stars) * 100) + "%") # 62.34% of the possible 77 players identified as true top 100s were in the top 100

Kylian Mbappé
Mohamed Salah
Joshua Kimmich
Ousmane Dembélé
Lamine Yamal
Raphinha
Michael Olise
Bradley Barcola
Pierre Højbjerg
Omar Marmoush
Vitinha
Harry Kane
Achraf Hakimi
Leonardo Balerdi
Granit Xhaka
Hugo Ekitike
Virgil van Dijk
Florian Wirtz
Alexander Isak
Robert Lewandowski
Pedri
Pau Cubarsí
Kim Min-jae
Mason Greenwood
João Neves
Cole Palmer
Mateo Retegui
Désiré Doué
William Saliba
Joško Gvardiol
Luis Díaz
Khvicha Kvaratskhelia
Angelo Stiller
Mateo Kovačić
Fabián Ruiz Peña
Vinicius Júnior
Leroy Sané
Iñigo Martínez
Federico Valverde
Amir Rrahmani
Matheus Cunha
Lucas Beraldo
Jude Bellingham
Serhou Guirassy
Luka Modrić
Rayan Cherki
Maghnes Akliouche
Alexander Sørloth
Andrey Santos
Ademola Lookman
Dayot Upamecano
Marcos Alonso
Rúben Dias
Levi Colwill
Jamal Musiala
Bafodé Diakité
Julián Álvarez
Adrien Rabiot
Hakan Çalhanoğlu
Erling Haaland
Rafael Leão
Marquinhos
Willian Pacho
Jonathan Tah
Dominik Szoboszlai
Piero Hincapié
Bruno Fernandes
Corentin Tolisso
Alexsandro Ribeiro
Daley Blind

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Where to next?

# Ranking of top 100 (1st, 10th, unranked) - Logistic Regression sort of does this, will need to elaborate further
# Above top 25, where could they land
# Transferability (need club name)?
# Team chemistry (combine with KNN)?
# Play-by-play (+ or -)?

# log = LogisticRegression(class_weight = "balanced", max_iter=5000) # Logistic regression model
# log = LogisticRegression(C = 0.25, class_weight = "balanced", solver = 'liblinear') # Logistic regression model

                Player        Min       Gls       Ast        PK     PKatt  \
0        Achraf Hakimi  78.454545  0.151515  0.212121  0.000000  0.000000   
1      Ademola Lookman  53.272727  0.136364  0.081818  0.000000  0.009091   
2      Alejandro Balde  37.600000  0.000000  0.000000  0.000000  0.000000   
3   Alessandro Bastoni  76.367521  0.034188  0.042735  0.000000  0.000000   
4       Alexander Isak  56.537037  0.296296  0.046296  0.018519  0.027778   
..                 ...        ...       ...       ...       ...       ...   
67     Vinicius Júnior  58.905983  0.213675  0.119658  0.000000  0.000000   
68     Virgil van Dijk  88.330579  0.107438  0.033058  0.000000  0.000000   
69             Vitinha  27.368421  0.000000  0.052632  0.000000  0.000000   
70      William Saliba  87.011905  0.011905  0.000000  0.000000  0.000000   
71         Xavi Simons  19.142857  0.000000  0.000000  0.000000  0.000000   

          Sh       SoT      CrdY      CrdR  ...        DM        FW        