In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, file_name) for file_name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The first thing to do is to load the two datasets (FIFA 19 and FIFA 21).

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 26 14:13:03 2021

@author: Martin
"""
# Load both player tables.
# FIFA 19: keep CSV columns 1..87 and use the first kept column as the index.
fifa19 = pd.read_csv(
    "../input/fifa19/data.csv",
    index_col=0,
    usecols=list(range(1, 88)),
)
# FIFA 21: semicolon-separated file, indexed by its first column.
fifa21 = pd.read_csv(
    "../input/fifa-2021-complete-player-data/FIFA-21 Complete.csv",
    sep=";",
    index_col=0,
)

The next thing to do is to find the ids of all players that appear in both FIFA 19 and FIFA 21, and to reduce both datasets so that they only contain the player names, overalls and potentials.

In [None]:
# Ids (index labels) of the players present in both games
index_in_both_games = fifa19.index.intersection(fifa21.index)

# Restrict each game to those shared players
shared_rows_19 = fifa19.index.isin(index_in_both_games)
shared_rows_21 = fifa21.index.isin(index_in_both_games)
fifa19_in_fifa21 = fifa19.loc[shared_rows_19]
fifa21_in_fifa19 = fifa21.loc[shared_rows_21]

# Keep only name, overall and potential from each game
fifa19_small = fifa19_in_fifa21.loc[:, ["Name", "Overall", "Potential"]]
fifa21_small = fifa21_in_fifa19.loc[:, ["name", "overall", "potential"]]

# Merged data set: one row per shared player, FIFA 19 columns first, FIFA 21 columns after
fifa_merged = pd.merge(fifa19_small, fifa21_small, left_index=True, right_index=True)

Note that fifa_merged could have been created by explicitly using an inner join. But because I need the indexes for a later task, I'm doing the inner join implicitly, by filtering both datasets to the shared indexes first.

The next thing to do is to find the differences in overall and potential between the two games and show the biggest movers.

In [None]:
# Rating change from FIFA 19 (capitalised columns) to FIFA 21 (lower-case columns).
# A positive value means the player improved between the two games.
fifa_merged["ovr_diff"] = fifa_merged["overall"] - fifa_merged["Overall"]
fifa_merged["pot_diff"] = fifa_merged["potential"] - fifa_merged["Potential"]

# Names and rating differences of the ten biggest movers in each direction.
# Each table is sorted by the difference it reports (the potential tables used to be
# sorted by ovr_diff), and columns are picked by name rather than by position so the
# selection does not depend on the column order of fifa_merged.
biggest_ovr_loss = fifa_merged.sort_values(by="ovr_diff").head(10)[["name", "ovr_diff"]]
biggest_pot_loss = fifa_merged.sort_values(by="pot_diff").head(10)[["name", "pot_diff"]]
biggest_ovr_gain = fifa_merged.sort_values(by="ovr_diff", ascending=False).head(10)[["name", "ovr_diff"]]
biggest_pot_gain = fifa_merged.sort_values(by="pot_diff", ascending=False).head(10)[["name", "pot_diff"]]

# Print them
print("Biggest overall losers")
print(biggest_ovr_loss)
print("Biggest potential losers")
print(biggest_pot_loss)
print("Biggest overall gainers")
print(biggest_ovr_gain)
print("Biggest potential gainers")
print(biggest_pot_gain)



Now, we find the players that are in FIFA 19 but not in FIFA 21, and vice versa.

In [None]:
# Players that appear in only one of the two games

# FIFA 19 rows whose id is not shared with FIFA 21
only_in_19 = ~fifa19.index.isin(index_in_both_games)
players_in_19_not_in_21 = fifa19.loc[only_in_19]
print("In total "+str(len(players_in_19_not_in_21))+" players not in FIFA 21")

# FIFA 21 rows whose id is not shared with FIFA 19
only_in_21 = ~fifa21.index.isin(index_in_both_games)
players_in_21_not_in_19 = fifa21.loc[only_in_21]
print("In total "+str(len(players_in_21_not_in_19))+" new players in FIFA 21")



We can then show all players dropped for FIFA 21 and all players new in FIFA 21 in tables

In [None]:
# Display the FIFA 19 players whose id is not shared with FIFA 21 (dropped players)
players_in_19_not_in_21

In [None]:
# Display the FIFA 21 players whose id is not shared with FIFA 19 (new players)
players_in_21_not_in_19

I am wondering if there's something that predicts players being dropped. My first idea is to look at age, as I am thinking that most of the players removed are retired from football

In [None]:
# Does age predict the probability of being removed from FIFA 21?
# Number of removed players and of all FIFA 19 players at each age
age_of_players_removed = players_in_19_not_in_21.groupby("Age").Age.count()
age_of_players = fifa19.groupby("Age").Age.count()
# Fraction of the FIFA 19 players of each age that are missing from FIFA 21
odds_by_age = age_of_players_removed / age_of_players

# Band of two binomial standard errors around the fraction.
# A fraction cannot leave [0, 1], so both edges are clipped; the lower edge was
# previously left unclipped here, unlike in the overall and potential plots.
margin = 2 * np.sqrt(odds_by_age * (1 - odds_by_age) / age_of_players)
upper_limit = (odds_by_age + margin).clip(upper=1)
lower_limit = (odds_by_age - margin).clip(lower=0)

plt.plot(odds_by_age, color="b", label="Fraction of players removed from FIFA at age x")
plt.plot(upper_limit, "--", color="b", label="95% confidence interval at age x")
plt.plot(lower_limit, "--", color="b")
plt.legend()
plt.title("Does age predict chance of being in FIFA 19 but not FIFA 21?")
plt.xlabel("Age")
plt.ylabel("Fraction of players removed")
fig = plt.gcf()
fig.set_size_inches(10, 10)


Looking at this, it seems like most players that were removed are either young players that didn't break through or old players that retired. Another way to look at this is to look at how good the players are, as that is probably an even better predictor.

In [None]:
# Number of removed players and of all FIFA 19 players at each overall rating
ovr_of_players_removed = players_in_19_not_in_21.groupby("Overall").Overall.count()
ovr_of_players = fifa19.groupby("Overall").Overall.count()
# Fraction of the FIFA 19 players with each overall that are missing from FIFA 21
odds_by_ovr = ovr_of_players_removed / ovr_of_players

# Band of two binomial standard errors around the fraction, kept inside [0, 1]
margin = np.sqrt(odds_by_ovr * (1 - odds_by_ovr) / ovr_of_players) * 2
upper_limit = (odds_by_ovr + margin).clip(upper=1)
lower_limit = (odds_by_ovr - margin).clip(lower=0)

plt.plot(odds_by_ovr, color="b", label="Fraction of players removed from FIFA with overall x")
plt.plot(upper_limit, "--", color="b", label="95% confidence interval at overall x")
plt.plot(lower_limit, "--", color="b")
plt.title("Does overall predict chance of being in FIFA 19 but not FIFA 21?")
plt.xlabel("Overall")
plt.ylabel("Fraction of players removed")
plt.legend()
fig = plt.gcf()
fig.set_size_inches(10, 10)


Looking at this chart, it seems more likely that the players that were removed, rather than being young or old, are simply bad players. Finally, I will look at how potential predicts removal of players.

In [None]:
# Number of removed players and of all FIFA 19 players at each potential rating
pot_of_players_removed = players_in_19_not_in_21.groupby("Potential").Potential.count()
pot_of_players = fifa19.groupby("Potential").Potential.count()
# Fraction of the FIFA 19 players with each potential that are missing from FIFA 21
odds_by_pot = pot_of_players_removed / pot_of_players

# Band of two binomial standard errors around the fraction, kept inside [0, 1].
# The sample size must be the number of players per potential (pot_of_players);
# the previous code divided by the per-overall counts, which are indexed by a
# different rating and therefore gave a wrong, misaligned band.
margin = 2 * np.sqrt(odds_by_pot * (1 - odds_by_pot) / pot_of_players)
upper_limit = (odds_by_pot + margin).clip(upper=1)
lower_limit = (odds_by_pot - margin).clip(lower=0)

plt.plot(odds_by_pot, color="b", label="Fraction of players removed from FIFA at potential x")
plt.plot(upper_limit, "--", color="b", label="95% confidence interval at potential x")
plt.plot(lower_limit, "--", color="b")
plt.legend()
plt.title("Does potential predict chance of being in FIFA 19 but not FIFA 21?")
plt.xlabel("Potential")
plt.ylabel("Fraction of players removed")
fig = plt.gcf()
fig.set_size_inches(10, 10)


As there's likely a high degree of correlation between overall and potential among all but the youngest players, it makes sense that potential also seems to predict the probability that a player in FIFA 19 is removed before FIFA 21. It is a worse predictor than overall or a 2nd degree polynomial fitted to age, but still a good predictor.

In order to see how well these 3 factors (age, overall and potential) predict whether a player is in FIFA 21, one can build a model that predicts whether or not a player is in FIFA 21, based on their statistics in FIFA 19. A simple model is a logistic regression, which is a regression of the form $\frac{1}{1+e^{-(\beta_0+\beta_1 x_1+\dots)}}$. The result of this regression is presented below.

In [None]:
import statsmodels.api as sm

# Predictors: age, overall and potential in FIFA 19, plus a constant (intercept) column
x = sm.add_constant(fifa19[["Age", "Overall", "Potential"]])
# Target: True for the FIFA 19 players that are missing from FIFA 21
y = fifa19.index.isin(players_in_19_not_in_21.index)

# Fit the logistic regression with Newton's method
model = sm.Logit(y, x)
result = model.fit(method='newton')

result.summary()

In [None]:
# Prediction (confusion) table of the fitted logit: per the statsmodels documentation,
# entry [i, j] is the number of observations of class i that the model predicted as
# class j, using the default probability threshold of 0.5.
# Class 1 is "missing from FIFA 21", since y is True for those players.
result.pred_table()

As noted above, age and overall in FIFA 19 seem to be very good predictors of the probability that a player is in FIFA 21, while potential, once you already know age and overall, seems to be quite weak. However, this model has a false positive rate of 60% and a false negative rate of 10%, so there are a lot of variables that this model doesn't consider, and it should be viewed with some scepticism.

Finally, I want to look at some statistics related to the nationality of the players in FIFA 21. First, I want to count how many players of each nationality there are in the dataset and present the countries with the most and the fewest players.

In [None]:
# Nationality distribution in FIFA 21: number of players per nationality, largest first
players_per_nationality = fifa21.groupby(["nationality"])["nationality"].count()
nationality_dist_21 = players_per_nationality.sort_values(ascending=False)
print(nationality_dist_21)


Now, I want to count how many countries have exactly x players, for each value of x.

In [None]:
# For each player count (index), the number of nationalities with exactly that many
# FIFA 21 players (value), ordered by player count
nationality_dist_21.value_counts().sort_index()

As this table shows, the distribution is very right-skewed: most countries have only a handful of players, while a few countries have a very large number, with there being more countries with 5 players than countries with more than 950 players.

Finally, I will create a histogram of how many countries have how many players in FIFA 21

In [None]:
# Histogram of players-per-country: x is the number of players a country has,
# y is how many countries fall into that bin
fig = plt.gcf()
plt.hist(nationality_dist_21, bins=1500)
plt.title("Most countries have very few players in FIFA 21")

# Wide figure so the many narrow bins stay visible
fig.set_size_inches(18.5, 10.5)