In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from math import sqrt
import scipy.stats
import sys

import  matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pylab import rcParams
# Default size (width, height in inches) applied to every figure in this notebook.
rcParams.update({'figure.figsize': (15, 10)})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Overview and cleaning

In [None]:
# Read the FIFA 20 player table and preview its first rows.
players_csv = '/kaggle/input/fifa-20-complete-player-dataset/players_20.csv'
df = pd.read_csv(players_csv)
df.head()

In [None]:
# Display the names of all columns in the raw player table.
df.columns

In [None]:
# Add a BMI column as a rough health/fitness indicator.
# BMI = weight / height^2, with the height column divided by 100 before squaring.
df["BMI"] = df["weight_kg"].div(df["height_cm"].div(100).pow(2))
df.describe()

In [None]:
# Print the per-column summary of the raw table (dtypes and non-null counts).
df.info()

In [None]:
# Keep only the columns that are useful for the analysis below.
analysis_columns = [
    'short_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club',
    'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions',
    'preferred_foot', 'international_reputation', 'skill_moves', 'work_rate',
    "BMI",
]
df1 = df[analysis_columns]
# Report how many missing values each retained column has, then show the frame.
print(df1.isna().sum())
df1

Let's create some functions to make it easier for us to analyze the data.

In [None]:
def plot_top20(dataframe, metric):
    """
    Gives a barplot with top 20 players using the given metric 

        Parameters
        ----------
        dataframe : Pandas.Dataframe
            parent dataframe; must contain a 'short_name' column and the
            column named by `metric`
        
        metric : str
            name of the numeric column to be used as the sorting parameter

        Returns
        -------
        None. Plots a barplot on the current matplotlib axes with the players
        that have the top 20 values of the metric, each bar annotated with its value
    """
    # Use the frame that was passed in (not a notebook-level global) so the
    # helper works for any player table the caller provides.
    metric_df = dataframe[['short_name', metric]].sort_values(by = [metric], ascending = False)[:20]
    plt.title(f"Top 20 highest {metric} players in FIFA20")
    sns.barplot(y = metric_df['short_name'], x = metric_df[metric])
    # Write each value inside its bar, roughly at the horizontal midpoint.
    for index, value in enumerate(metric_df[metric]):
        plt.text(value//2, index, str(round(value,2)), verticalalignment='center')

# Performance and Earnings

Let us plot the top 20 clubs, countries and players in FIFA20

In [None]:
# Left panel: clubs, right panel: countries. Each is ranked by the mean
# overall rating of its players, and the 20 best are kept.
ranking_panels = (
    (121, 'club', "Top 20 Clubs in FIFA20"),
    (122, 'nationality', "Top 20 Countries in FIFA20"),
)
mean_overall_rankings = {}
for subplot_position, group_column, panel_title in ranking_panels:
    plt.subplot(subplot_position)
    ranking = (
        df1.groupby([group_column])['overall']
        .mean()
        .sort_values(ascending=False)
        .head(20)
    )
    plt.title(panel_title)
    sns.barplot(y=ranking.index, x=ranking)
    # Annotate every bar with its rounded mean rating.
    for bar_row, mean_overall in enumerate(ranking):
        plt.text(mean_overall // 2, bar_row, str(round(mean_overall, 2)))
    mean_overall_rankings[group_column] = ranking

top_clubs = mean_overall_rankings['club']
top_countries = mean_overall_rankings['nationality']

plt.tight_layout(pad=3.0)

In [None]:
plt.subplot(121)
# Pick the 20 highest-rated players explicitly instead of trusting that the
# CSV happens to be sorted by 'overall'. nlargest keeps the first occurrences
# on ties, so a pre-sorted file yields the same rows as before.
top_players = df1.nlargest(20, 'overall')[['short_name', 'overall']]
plt.title("Top 20 players in FIFA20")
sns.barplot(y = top_players.short_name, x = top_players.overall)
# Annotate each bar with the player's overall rating.
for index, value in enumerate(top_players.overall):
    plt.text(value//2, index, str(value))

plt.subplot(122)
plot_top20(df1, 'potential')

plt.tight_layout(pad=3.0)

In [None]:
# Side-by-side top-20 charts: market value on the left, wage on the right.
for subplot_position, money_metric in ((121, 'value_eur'), (122, 'wage_eur')):
    plt.subplot(subplot_position)
    plot_top20(df1, money_metric)
plt.tight_layout(pad=3.0)

In [None]:
# Skewness of the wage and value distributions, rounded to four decimals.
money_skewness = df1[['wage_eur', 'value_eur']].skew().round(4)
wage_skew = money_skewness['wage_eur']
value_skew = money_skewness['value_eur']
print(f'Wage skewness value is {wage_skew} and value skewness value is {value_skew}')

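Both skewness values are far greater than 1, which means the wage and value distributions are heavily right-skewed: most players sit at the low end, while a small number of players (typically those with the highest overall ratings) form a long tail of very high wages and values.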

# What makes a good footballer?

In [None]:
# Pairwise correlations between the numeric player attributes, drawn as an
# annotated heatmap on a fixed [-1, 1] colour scale centred at zero.
numeric_attributes = [
    'age', 'height_cm', 'weight_kg', 'overall', 'potential', 'skill_moves',
    'value_eur', 'wage_eur', 'international_reputation',
]
attribute_correlations = df1[numeric_attributes].corr()
sns.heatmap(data=attribute_correlations, annot=True, cmap="vlag",
            vmin=-1, vmax=1, center=0)

Aside from some obvious correlations like height vs weight and wage vs value, we can see some other correlations as well. 

Weirdly enough, taller and heavier players tend to have fewer skill moves. Age is also negatively correlated with potential, which makes sense: older players have already reached their peak (as seen in the positive correlation between age and overall), so they do not have as much potential left as younger players do.

Players with higher overall and potential are paid and valued more, which might come off as obvious, but another thing to notice here is that their international reputation also plays some role in the wage they get and the value they have. Extra pay for the PR they provide, perhaps. The better the player plays, the higher the reputation becomes.

In [None]:
# Line plot of market value against overall rating, coloured by age.
value_plot_options = dict(x='overall', y='value_eur', hue='age', palette='viridis')
sns.relplot(data=df1, kind='line', aspect=2, **value_plot_options)

Here we can see a gradient of age. Young players are valued more than older players of the same overall performance, perhaps because they will serve the club for longer and have greater potential for growth.

In [None]:
# Scatter plot (relplot's default kind) of wage against overall rating, coloured by age.
wage_plot_options = dict(x='overall', y='wage_eur', hue='age', palette='viridis')
sns.relplot(data=df1, aspect=2, **wage_plot_options)

When it comes to wage, we cannot see any such trend. Everyone gets paid on the basis of their overall performance.

In [None]:
# Look the counts up by label rather than by position: value_counts() orders
# by frequency and countplot orders categories by first appearance, so
# positional unpacking only matched the bars by coincidence of the data.
foot_order = ["Left", "Right"]
foot_counts = df1['preferred_foot'].value_counts()
left = int(foot_counts["Left"])
right = int(foot_counts["Right"])
total = right + left  # reused by the z-test cell further down

plt.subplot(121)
plt.pie(x = [left, right], labels = foot_order, autopct='%1.2f%%', explode = (0, 0.1))

plt.subplot(122)
# Pin the bar order so that x=0 is always "Left" and x=1 is always "Right".
sns.countplot(data = df1, x = 'preferred_foot', order = foot_order)
# Write each count in the middle of its bar.
for bar_position, foot_count in enumerate([left, right]):
    plt.text(bar_position, foot_count//2, foot_count, fontsize = 20, horizontalalignment='center')


In [None]:
# Mean overall rating per preferred foot, computed once and read by label.
# Positional [0]/[1] access on a label-indexed Series is deprecated in
# pandas 2.x and depended on the alphabetical group order.
avg_overall_by_foot = df1.groupby("preferred_foot")["overall"].mean().round(2)
avg_left = avg_overall_by_foot["Left"]
avg_right = avg_overall_by_foot["Right"]
print(f'Left footed players have the average overall of {avg_left}, whereas\
 right footed players have the average overall of {avg_right}')

Left-handed people constitute about 10% of the world's population. We can see that left-footed players make up a much larger share of footballers than that. This trend can be seen across various sports, and it is theorized that left-handed people are often better at sports. As a first check, let us use a simple z-test to see whether the share of left-footed players differs significantly from that 10% baseline.

In [None]:
# One-sample z-test for a proportion: is the observed share of left footed
# players different from the ~10% share of left handed people in the general
# population? Under H0 the sample share has standard error sqrt(p0*(1-p0)/n).
# (The previous statistic mixed a difference of counts with a two-sample-style
# standard error, which is not a valid test statistic.)
POPULATION_LEFT_SHARE = 0.10
ALPHA = 0.05

print("""H0 = The share of left footed players equals the 10% share of left handed people in the general population
H1 = The share of left footed players differs from the 10% share of left handed people in the general population
""")
left_share = left / total
standard_error = sqrt(POPULATION_LEFT_SHARE * (1 - POPULATION_LEFT_SHARE) / total)
z_score = (left_share - POPULATION_LEFT_SHARE) / standard_error
# Two-sided p-value from the standard normal survival function.
p_value = scipy.stats.norm.sf(abs(z_score))*2
if (p_value>ALPHA):
    print("Since p-value > α, we cannot reject the null hypothesis")
    print("The share of left footed players is not significantly different from the population share of left handed people")
else:
    print("Since p-value < α, we can reject the null hypothesis")
    print("The share of left footed players is significantly different from the population share of left handed people")

# Distributions

In [None]:
# Histogram of player ages with a KDE curve and a vertical marker at the mean age.
age_axes = sns.histplot(x=df['age'], kde=True, bins=20)
age_axes.axvline(x=df['age'].mean(), c='green', ls='-.', label='Mean Age')
age_axes.legend()
age_axes.set_title('Distribution of Age')

The drops in the number of players at the ages of 20, 25 and 30 might correspond to players leaving college football, leaving professional football for some other job, or retiring from the game, respectively.

In [None]:
# Weight distribution on the left, height distribution on the right.
plt.subplot(121)
sns.histplot(data=df, x='weight_kg', bins=20, kde=True)

plt.subplot(122)
sns.histplot(data=df, x='height_cm', bins=20, kde=True)

Both distributions look approximately normal.

In [None]:
# BMI distribution drawn as one polygon outline per work-rate category.
bmi_plot_options = dict(x='BMI', hue='work_rate', bins=20, element='poly')
sns.histplot(data=df, **bmi_plot_options)

In [None]:
# Mean and standard deviation of BMI, both rounded to two decimals.
bmi_mean = df1['BMI'].mean().round(2)
bmi_std = df1['BMI'].std().round(2)
print(f'The mean BMI {bmi_mean} is with a standard deviation of {bmi_std}')

This is why playing any kind of outdoor sport is great! Although BMI is a controversial statistic for measuring fitness and obesity, almost every footballer here has a healthy BMI and does not deviate much from the mean (the range for a healthy BMI for men is 18.5 – 24.9).

In [None]:
# Compute each extreme once and reuse it for both the lookup and the message.
max_bmi = df['BMI'].max()
min_bmi = df['BMI'].min()
# Build the names directly from the column instead of Series.to_string(),
# which is a display formatter (it may pad values and puts tied players on
# separate lines). Tied players are joined with commas on a single line.
highest_bmi_player = ', '.join(df.loc[df['BMI'] == max_bmi, 'short_name'].astype(str))
lowest_bmi_player = ', '.join(df.loc[df['BMI'] == min_bmi, 'short_name'].astype(str))
print(f'Player with highest BMI is {highest_bmi_player} with an BMI of {max_bmi.round(2)} and\
 the player with the lowest BMI is {lowest_bmi_player} with an BMI of {min_bmi.round(2)}')

![Adebayo Akinfenwa](https://img.republicworld.com/republic-prod/stories/promolarge/xhdpi/ba9nxb7nb7tbskdh_1594727317.jpeg)
Adebayo Akinfenwa

![Jorge Carrascal](https://i.imgur.com/D3Pexui.png)
Jorge Carrascal

In [None]:
# Number of players for the 30 most represented nationalities.
country_count = df1['nationality'].value_counts().head(30)
plt.xticks(rotation=60)
sns.barplot(x=country_count.index, y=country_count)

Despite having a much lower number of players than England, Uruguay has won 2 FIFA World Cups, whereas England has won only 1. Sometimes quality beats quantity.

In [None]:
# Number of players for the 25 clubs with the largest squads in the data.
club_count = df1['club'].value_counts().head(25)
plt.xticks(rotation=60)
sns.barplot(x=club_count.index, y=club_count)

In [None]:
def club_view(dataframe, club_name):
    """
    Selects the rows of a player table that belong to one club

        Parameters
        ----------
        dataframe : Pandas.Dataframe
            player table containing a 'club' column
        
        club_name : str
            exact club name to match against the 'club' column

        Returns
        -------
        dataframe with only the rows whose 'club' equals club_name
        (original index labels are kept; empty if nothing matches)
    """
    plays_for_club = dataframe["club"].eq(club_name)
    return dataframe.loc[plays_for_club]


def nation_view(dataframe, country_name):
    """
    Selects the rows of a player table that belong to one country

        Parameters
        ----------
        dataframe : Pandas.Dataframe
            player table containing a 'nationality' column
        
        country_name : str
            exact country name to match against the 'nationality' column

        Returns
        -------
        dataframe with only the rows whose 'nationality' equals country_name
        (original index labels are kept; empty if nothing matches)
    """
    is_from_country = dataframe["nationality"].eq(country_name)
    return dataframe.loc[is_from_country]

Let us predict the Euro Cup Final through the data

In [None]:
# Summary statistics for the players whose nationality is England.
nation_view(df1, 'England').describe()

In [None]:
# Summary statistics for the players whose nationality is Italy.
nation_view(df1, 'Italy').describe()

In [None]:
# Mean overall rating per nation, rounded to two decimals. A mean does not
# depend on row order, so the rows are not sorted first.
england_avg = nation_view(df1, 'England')['overall'].mean().round(2)
italy_avg = nation_view(df1, 'Italy')['overall'].mean().round(2)
print(f'The average Italian player has an average overall of {italy_avg} and an average English player has an average overall of {england_avg}')

Looks like Italy is going to win!

P.S. They did win!

![The Euro Cup 2020 was won by Italy](https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse1.mm.bing.net%2Fth%3Fid%3DOIF.ir2jQjso4n%252bbRAkeRX6g8Q%26pid%3DApi&f=1)

**<font size = 6>If you liked this notebook, don't forget to upvote it!</font>**