<a href="https://colab.research.google.com/github/shipley7/fifa/blob/main/FIFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("combined_match_data.csv") # getting the data
df.head()

Unnamed: 0,index,Player,#,Nation,Pos,Age,Min,Gls,Ast,PK,...,AvgLen.1,Opp,Stp,Stp%,#OPA,AvgDist,home,game_id,source_file,top_100
0,0,14 Players,,,,,990.0,3,2,1,...,,,,,,,1,0,bundesliga_2017-2018_player_data.csv,0
1,1,Arjen Robben,10.0,nl NED,RW,33-207,30.0,0,0,0,...,,,,,,,1,0,bundesliga_2017-2018_player_data.csv,0
2,2,Arturo Vidal,23.0,cl CHI,LM,30-088,90.0,0,1,0,...,,,,,,,1,0,bundesliga_2017-2018_player_data.csv,0
3,3,Corentin Tolisso,24.0,fr FRA,RM,23-015,90.0,1,0,0,...,,,,,,,1,0,bundesliga_2017-2018_player_data.csv,0
4,4,David Alaba,27.0,at AUT,LB,25-055,90.0,0,0,0,...,,,,,,,1,0,bundesliga_2017-2018_player_data.csv,0


In [3]:
df = df[df["Player"].str.contains("Players")==False] # removing excess rows that have no player information
df = df.drop(['index', "#"], axis = 1) # index and column have no impact on anything
df[["Age Years", "Age Days"]] = (df.Age.str.split("-", expand = True)).apply(pd.to_numeric) # splitting age column into two other columns
df[["league", "season"]] = df["source_file"].replace("_player_data.csv", "", regex = True).str.split("_", expand = True) # splitting the source file into a league and season
df["season"] = (df["season"].str[:4].astype(int)) + 1 # season will be an integer representing the year the season ended
df = pd.get_dummies(df, columns=["league"]) # league will be a boolean variable representing which leage we are using
df = df.drop(['Age','source_file', "game_id"], axis = 1) # removing age, source file, and game id as they are no longer needed
df = df.fillna(0) # filling any NA with 0
df

Unnamed: 0,Player,Nation,Pos,Min,Gls,Ast,PK,PKatt,Sh,SoT,...,home,top_100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
1,Arjen Robben,nl NED,RW,30.0,0,0,0,0,1,1,...,1,0,33,207,2018,True,False,False,False,False
2,Arturo Vidal,cl CHI,LM,90.0,0,1,0,0,1,0,...,1,0,30,88,2018,True,False,False,False,False
3,Corentin Tolisso,fr FRA,RM,90.0,1,0,0,0,4,3,...,1,0,23,15,2018,True,False,False,False,False
4,David Alaba,at AUT,LB,90.0,0,0,0,0,0,0,...,1,0,25,55,2018,True,False,False,False,False
5,Franck Ribéry,fr FRA,LW,76.0,0,0,0,0,0,0,...,1,0,34,133,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276877,Marko Rog,hr CRO,CM,57.0,0,0,0,0,3,1,...,0,0,26,307,2022,False,False,False,False,True
276878,Matteo Lovato,it ITA,CB,32.0,0,0,0,0,2,0,...,0,0,22,97,2022,False,False,False,False,True
276879,Nahitan Nández,uy URU,"WB,CM",90.0,0,0,0,0,4,0,...,0,0,26,145,2022,False,False,False,False,True
276880,Raoul Bellanova,it ITA,WB,90.0,0,0,0,0,5,1,...,0,0,22,5,2022,False,False,False,False,True


In [4]:
# splitting into keepers and outfielders since they all have different statistics
keepers = df[df["Pos"] == "GK"]
outfielders = df[df["Pos"] != "GK"]

In [5]:
# for keepers, removing everything that a keeper doesn't normally do
keepers = keepers.drop(["Pos", 'Gls', 'Ast', 'PK', 'PKatt', 'Sh',
       'SoT', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG',
       'xAG', 'SCA', 'GCA', 'Cmp_x', 'Att_x', 'Cmp%_x', 'PrgP', 'Carries',
       'PrgC', 'Att_x.1', 'Succ'], axis = 1)
keepers

Unnamed: 0,Player,Nation,Min,CrdY,CrdR,SoTA,GA,Saves,Save%,PSxG,...,home,top_100,Age Years,Age Days,season,league_bundesliga,league_la-liga,league_ligue-1,league_premier-league,league_serie-a
13,Sven Ulreich,de GER,90.0,0,0,4.0,1.0,3.0,75.0,0.9,...,1,0,29,15,2018,True,False,False,False,False
19,Bernd Leno,de GER,90.0,0,0,8.0,3.0,5.0,75.0,3.5,...,0,0,25,167,2018,True,False,False,False,False
36,Koen Casteels,be BEL,90.0,0,0,5.0,3.0,2.0,40.0,1.9,...,1,0,25,55,2018,True,False,False,False,False
56,Roman Bürki,ch SUI,90.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,26,278,2018,True,False,False,False,False
70,Rune Jarstein,no NOR,90.0,0,0,2.0,0.0,2.0,100.0,0.1,...,1,0,32,324,2018,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276807,Mike Maignan,fr FRA,90.0,0,0,3.0,0.0,3.0,100.0,0.6,...,0,1,26,323,2022,False,False,False,False,True
276831,Vid Belec,si SVN,90.0,1,0,11.0,4.0,6.0,63.6,4.1,...,1,0,31,350,2022,False,False,False,False,True
276835,Daniele Padelli,it ITA,90.0,0,0,10.0,0.0,10.0,100.0,1.7,...,0,0,36,209,2022,False,False,False,False,True
276860,Niki Mäenpää,fi FIN,90.0,0,0,7.0,0.0,7.0,100.0,0.5,...,1,0,37,119,2022,False,False,False,False,True


In [6]:
position_dummies = outfielders.Pos.str.get_dummies(sep = ",") # since most players play mutliple positions in a match, we will have to multi-hot encode
outfielders = pd.concat([outfielders.drop(columns=["Pos"]), position_dummies], axis = 1) # combining our binary position columns with the remaining dataframe
outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']] = outfielders[['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB']].astype('bool') # change position to boolean (did they play this position in the match or not?)
outfielders = outfielders.drop(['SoTA', 'GA', 'Saves', 'Save%', 'PSxG',
       'Cmp_y', 'Att_y', 'Cmp%_y', 'Att (GK)', 'Thr', 'Launch%', 'AvgLen',
       'Att_y.1', 'Launch%.1', 'AvgLen.1', 'Opp', 'Stp', 'Stp%', '#OPA',
       'AvgDist', 'GK'], axis = 1) # removing keeper statistics
outfielders

Unnamed: 0,Player,Nation,Min,Gls,Ast,PK,PKatt,Sh,SoT,CrdY,...,DM,FW,LB,LM,LW,MF,RB,RM,RW,WB
1,Arjen Robben,nl NED,30.0,0,0,0,0,1,1,0,...,False,False,False,False,False,False,False,False,True,False
2,Arturo Vidal,cl CHI,90.0,0,1,0,0,1,0,1,...,False,False,False,True,False,False,False,False,False,False
3,Corentin Tolisso,fr FRA,90.0,1,0,0,0,4,3,0,...,False,False,False,False,False,False,False,True,False,False
4,David Alaba,at AUT,90.0,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,False
5,Franck Ribéry,fr FRA,76.0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276877,Marko Rog,hr CRO,57.0,0,0,0,0,3,1,0,...,False,False,False,False,False,False,False,False,False,False
276878,Matteo Lovato,it ITA,32.0,0,0,0,0,2,0,0,...,False,False,False,False,False,False,False,False,False,False
276879,Nahitan Nández,uy URU,90.0,0,0,0,0,4,0,1,...,False,False,False,False,False,False,False,False,False,True
276880,Raoul Bellanova,it ITA,90.0,0,0,0,0,5,1,0,...,False,False,False,False,False,False,False,False,False,True


In [7]:
# here, I am trying to see if a model could accuartely predict the positions one would play
pos_prediction = outfielders.drop(["Player", "Nation"], axis = 1) # we cannot use Player, since that is a dead giveaway, and nation is useless

# since players only get better or worse with time, our testing data must come after our training data
train_seasons = [2018, 2019, 2020, 2021]
test_season = [2022]
train_df = pos_prediction[pos_prediction['season'].isin(train_seasons)]
test_df = pos_prediction[pos_prediction['season'].isin(test_season)]

positions = ['AM', 'CB', 'CM', 'DF', 'DM',
       'FW', 'LB', 'LM', 'LW', 'MF', 'RB', 'RM', 'RW', 'WB'] # our target variables

X_train = train_df.drop(columns = positions)
y_train = train_df[positions]
X_test = test_df.drop(columns = positions)
y_test = test_df[positions]

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state = 42, class_weight='balanced')) # ChatGPT recommended this
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=positions)) # results (not very good, but can be expected since there are 14 possible positions and not necessarily one right answer)

# Going to try again but this time will generalize positions so maybe we will only have FW, MID, DEF


              precision    recall  f1-score   support

          AM       0.33      0.00      0.00      3534
          CB       0.82      0.55      0.66     10047
          CM       0.50      0.01      0.02      8623
          DF       0.00      0.00      0.00         0
          DM       0.00      0.00      0.00      3282
          FW       0.72      0.44      0.54      8962
          LB       0.46      0.01      0.02      3689
          LM       0.00      0.00      0.00      3503
          LW       0.22      0.00      0.00      3061
          MF       0.00      0.00      0.00         0
          RB       0.49      0.00      0.01      3785
          RM       0.67      0.00      0.00      3476
          RW       0.00      0.00      0.00      3051
          WB       0.33      0.00      0.00      2730

   micro avg       0.77      0.17      0.27     57743
   macro avg       0.32      0.07      0.09     57743
weighted avg       0.48      0.17      0.20     57743
 samples avg       0.19   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
pos_prediction["Pos_Gen_FWD"] = np.where(pos_prediction['FW'] | pos_prediction['LW'] | pos_prediction['RW'], True, False)

pos_prediction["Pos_Gen_MID"] = np.where(pos_prediction['AM'] | pos_prediction['CM'] | pos_prediction['DM'] | pos_prediction['LM'] | pos_prediction['MF'] | pos_prediction['RM'], True, False)

pos_prediction["Pos_Gen_DEF"] = np.where(pos_prediction['CB'] | pos_prediction['DF'] | pos_prediction['LB'] | pos_prediction['RB'] | pos_prediction['WB'], True, False)

In [9]:
pos_prediction = pos_prediction.drop(positions, axis = 1) # we cannot use Player, since that is a dead giveaway, and nation is useless

# since players only get better or worse with time, our testing data must come after our training data
train_seasons = [2018, 2019, 2020, 2021]
test_season = [2022]
train_df = pos_prediction[pos_prediction['season'].isin(train_seasons)]
test_df = pos_prediction[pos_prediction['season'].isin(test_season)]

positions = ["Pos_Gen_FWD", "Pos_Gen_MID", "Pos_Gen_DEF"]

X_train = train_df.drop(columns = positions)
y_train = train_df[positions]
X_test = test_df.drop(columns = positions)
y_test = test_df[positions]

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state = 42, class_weight='balanced')) # ChatGPT recommended this
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=positions))

              precision    recall  f1-score   support

 Pos_Gen_FWD       0.71      0.60      0.65     14378
 Pos_Gen_MID       0.62      0.48      0.54     20192
 Pos_Gen_DEF       0.81      0.62      0.70     19370

   micro avg       0.71      0.56      0.63     53940
   macro avg       0.71      0.57      0.63     53940
weighted avg       0.71      0.56      0.63     53940
 samples avg       0.57      0.57      0.57     53940



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
top_85_outfielders = outfielders[outfielders['top_100'] == True]
remainders = outfielders[outfielders["top_100"] == False]

top_85_outfielders_agg = top_85_outfielders.groupby("Player").mean(numeric_only=True).reset_index()
remainders_agg = remainders.groupby("Player").mean(numeric_only=True).reset_index()

top_agg = top_85_outfielders_agg.drop(["Player", "home", "Age Years", "Age Days", "top_100", "season"], axis = 1)
rest_agg = remainders_agg.drop(["Player", "home", "Age Years", "Age Days", "top_100", "season"], axis = 1)

scaler = StandardScaler()
top_scaled_agg = scaler.fit_transform(top_agg)
rest_scaled_agg = scaler.fit_transform(rest_agg)

knn_out_agg = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_out_agg.fit(top_scaled_agg)

distances, indices = knn_out_agg.kneighbors(rest_scaled_agg)
player = input("Please enter the name of an outfielder:")
player_number = remainders_agg[remainders_agg["Player"] == player].index[0]
print("Candidate:", remainders_agg.iloc[player_number]['Player'])
print("Similar top 100 players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {top_85_outfielders_agg.iloc[number]['Player']}")

Please enter the name of an outfielder:Rodri
Candidate: Rodri
Similar top 100 players:
1. Fabián Ruiz Peña
2. Bruno Guimarães
3. Frenkie de Jong
4. Declan Rice
5. Martin Ødegaard


In [12]:
top_15_keepers = keepers[keepers['top_100'] == True]
remainders = keepers[keepers["top_100"] == False]

top_15_keepers_agg = top_15_keepers.groupby("Player").mean(numeric_only=True).reset_index()
remainders_agg = remainders.groupby("Player").mean(numeric_only=True).reset_index()

top_agg = top_15_keepers_agg.drop(["Player", "home", "Age Years", "Age Days", "top_100", "season"], axis = 1)
rest_agg = remainders_agg.drop(["Player", "home", "Age Years", "Age Days", "top_100", "season"], axis = 1)

scaler = StandardScaler()
top_scaled_agg = scaler.fit_transform(top_agg)
rest_scaled_agg = scaler.fit_transform(rest_agg)

knn_keep_agg = NearestNeighbors(n_neighbors=5, metric="euclidean")
knn_keep_agg.fit(top_scaled_agg)

distances, indices = knn_keep_agg.kneighbors(rest_scaled_agg)
player = input("Please enter the name of a goalkeeper:")
player_number = remainders_agg[remainders_agg["Player"] == player].index[0]
print("Candidate:", remainders_agg.iloc[player_number]['Player'])
print("Similar top 100 players:")

for i, number in enumerate(indices[player_number]):
  print(f"{i+1}. {top_15_keepers_agg.iloc[number]['Player']}")

Please enter the name of a goalkeeper:Tim Krul
Candidate: Tim Krul
Similar top 100 players:
1. Emiliano Martínez
2. Yann Sommer
3. Thibaut Courtois
4. David Raya
5. Mike Maignan


In [32]:
outfielders_agg = outfielders.groupby("Player").mean(numeric_only=True).reset_index()
outfielders_agg = outfielders_agg.drop(["home", "Age Years", "Age Days", "season"], axis = 1)
X = outfielders_agg.drop(["top_100", "Player"], axis = 1)
y = outfielders_agg.top_100
ids = outfielders_agg.Player
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(
    X, y, ids, test_size=0.20, stratify = y, random_state=42)
id_train = id_train.reset_index().drop("index", axis = 1)
id_test = id_test.reset_index().drop("index", axis = 1)
log = LogisticRegression(class_weight="balanced")
log.fit(X_train, y_train)
y_pred = log.predict(X_test)
y_pred_prob = log.predict_proba(X_test)
potential = y_pred_prob[:, 1] > 0.7
potential_indexes = [i for i, x in enumerate(potential) if x]
for i in range(len(potential_indexes)):
  print(id_test.iloc[potential_indexes[i], 0])
print(classification_report(y_test, y_pred))

Marquinhos
Joshua Kimmich
Ismaila Sarr
Andy Delort
Ander Herrera
Nicolò Zaniolo
Chiquinho
Seko Fofana
Randal Kolo Muani
Fabian Delph
Radja Nainggolan
Ritsu Doan
Adnan Januzaj
Youcef Belaïli
Riyad Mahrez
Harvey Elliott
Alexander Isak
Allan Saint-Maximin
Nathaniel Phillips
Takefusa Kubo
Pierre Højbjerg
Leandro Trossard
Ivan Rakitić
Theo Hernández
Anthony Martial
Jadon Sancho
Tammy Abraham
Mikel Oyarzabal
Marco Reus
Moussa Diaby
Borja Herrera
Harry Kane
Moussa Dembélé
Lucas Vázquez
Scott McTominay
Rodri
Neymar
Toni Kroos
Rominigue Kouamé
Marcelo
Castello Lukeba
Corentin Tolisso
N'Golo Kanté
Eric Dier
Jamie Vardy
Ademola Lookman
Raphaël Guerreiro
Gerard Moreno
Paul Pogba
Jonathan Bamba
Yannis Salibur
Dejan Lovren
Lassana Diarra
John McGinn
Jordi Alba
Florian Wirtz
André Carrillo
Zinedine Ferhat
Lucas Ocampos
Timo Werner
Wilfried Zaha
Willian
Óscar Trejo
Eddy Gnahoré
Joelinton
David Silva
Lewis Dunk
Eden Hazard
Josip Iličić
Thiago Alcántara
Florian Sotoca
Saïd Benrahma
Kike Hermoso
Jesper L

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
