# Analyzing the Chelsea Team

Initial Setup of Environment:

In [None]:
# importing libraries
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Show every column when displaying wide DataFrames (None means no column limit).
# The canonical option name is used rather than a pattern that merely matches it.
pd.set_option("display.max_columns", None)

Importing the FIFA 20 Dataset:

In [None]:
# Load the FIFA 20 player dataset into a DataFrame.
fifa_df = pd.read_csv(
    filepath_or_buffer='../input/fifa-20-complete-player-dataset/players_20.csv'
)

In [None]:
# Preview the first five rows of the full dataset.
fifa_df.head(n=5)

Next I am going to filter the above dataset to output all players currently belonging to Chelsea. I will follow 2 procedures here to ensure we get every player:
* First get all the players who are playing for Chelsea this season.
* Second, get Hakim Ziyech's row of data, since we acquired him recently but the dataset has not been updated to reflect this. Then remove information that we don't yet have on him, i.e. contract expiry, wage and value.

In [None]:
# Keep every player whose club is Chelsea, plus H. Ziyech, who is matched by
# name rather than by club.
chelsea_df = fifa_df[
    fifa_df['club'].eq("Chelsea") | fifa_df['short_name'].eq("H. Ziyech")
]

Now let us see what we are working with:

In [None]:
# (number of rows, number of columns) of the filtered Chelsea frame
chelsea_df.shape

In [None]:
# Preview the first five rows of the Chelsea subset.
chelsea_df.head(n=5)

We will not use every column. I am more interested in these particular ones: Name, Age, Height, Weight, Nationality, Overall, Potential, Value, Wage, Preferred Foot, Contract Expiry.

I select the columns I want, rename them to suitable names, and copy the data over so we can make changes to it. I also apply changes to Hakim Ziyech's row of data.

In [None]:
# Columns kept for the squad analysis.
squad_columns = [
    'short_name', 'age', 'player_positions', 'overall', 'potential',
    'nationality', 'value_eur', 'wage_eur', 'preferred_foot',
    'height_cm', 'weight_kg', 'contract_valid_until',
]
# Fields we do not have yet for H. Ziyech.
ziyech_unknown_columns = ['contract_valid_until', 'wage_eur', 'value_eur']

# Work on an independent copy so later edits never touch chelsea_df / fifa_df.
chelsea_squad = chelsea_df[squad_columns].copy()

# NaN cannot be stored in an integer column, so any integer column that is about
# to be blanked is cast to float explicitly instead of relying on an implicit
# upcast during assignment (recent pandas versions warn about that upcast).
integer_columns = [
    column for column in ziyech_unknown_columns
    if pd.api.types.is_integer_dtype(chelsea_squad[column])
]
chelsea_squad = chelsea_squad.astype({column: 'float64' for column in integer_columns})

# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
chelsea_squad.loc[
    chelsea_squad['short_name'] == 'H. Ziyech', ziyech_unknown_columns
] = np.nan

In [None]:
# Give the name and contract columns shorter, friendlier labels.
column_labels = {"short_name": "name", "contract_valid_until": "contract_expiry"}
chelsea_squad = chelsea_squad.rename(columns=column_labels)

Final Dataset:

In [None]:
# Display the prepared squad table (selected and renamed columns).
chelsea_squad

Statistics involving the Overall rating of players:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the overall rating.
chelsea_squad['overall'].describe()

Let us see the age distribution in the club:

In [None]:
# Age Distribution
# sns.distplot is deprecated; with kde=False it drew a plain count histogram,
# which is what sns.histplot produces by default.
plt.figure(figsize=(18,10))
plt.title('Age Distribution in Club')
sns.histplot(data=chelsea_squad, x='age', bins=10)

As expected, the plot reveals that we have a lot of youngsters at the club, which is good long-term for the club.

Having a lot of youngsters is one thing, but how much potential is in this group of people? I decided to select the players aged 25 or under and compare their current overall with their potential, excluding players who have already hit their potential mark.

In [None]:
# Players aged 25 or under whose overall rating differs from their potential,
# listed from highest to lowest potential.
below_potential = chelsea_squad['overall'].ne(chelsea_squad['potential'])
aged_25_or_under = chelsea_squad['age'].le(25)
prospect_columns = ['name', 'age', 'player_positions', 'overall', 'potential']
(
    chelsea_squad.loc[below_potential & aged_25_or_under]
    .sort_values(by='potential', ascending=False)
    [prospect_columns]
)

If I were playing career mode, some of these players are the ones I would keep a keen eye on for the long-term future of the club.

Another interesting group of players is the older ones, who have already hit their potential. The following information would be useful when considering who to keep longer at the club or who to let go:

In [None]:
# Players whose overall rating already equals their potential, oldest first,
# together with their contract and cost details.
at_potential = chelsea_squad['overall'].eq(chelsea_squad['potential'])
peak_columns = ['name', 'age', 'overall', 'contract_expiry', 'value_eur', 'wage_eur']
chelsea_squad.loc[at_potential, peak_columns].sort_values(by='age', ascending=False)

Let's look at the Nationality representation at the club:

In [None]:
# Number of squad players per nationality, most common first
chelsea_squad['nationality'].value_counts()

In [None]:
# Bar chart of player counts per nationality; bars follow the value_counts()
# order, i.e. most common nationality first.
plt.figure(figsize=(18,13))
plt.title('Nationality Representation at the Club')
sns.countplot(x="nationality", data=chelsea_squad, order = chelsea_squad['nationality'].value_counts().index)

 Height and Weight Distribution at the club:

In [None]:
# Height Distribution
# sns.distplot is deprecated; with kde=False it drew a plain count histogram,
# which is what sns.histplot produces by default. No bin count is given, so
# histplot chooses the bins with its own default rule.
plt.figure(figsize=(18,10))
plt.title('Height Distribution in Club')
sns.histplot(data=chelsea_squad, x='height_cm')

In [None]:
# Mean of the height_cm column across the squad
chelsea_squad['height_cm'].mean()

In [None]:
# Weight Distribution
# sns.distplot is deprecated; with kde=False it drew a plain count histogram,
# which is what sns.histplot produces by default. No bin count is given, so
# histplot chooses the bins with its own default rule.
plt.figure(figsize=(18,10))
plt.title('Weight Distribution in Club')
sns.histplot(data=chelsea_squad, x='weight_kg')

In [None]:
# Mean of the weight_kg column across the squad
chelsea_squad['weight_kg'].mean()

What is the relationship between Age and Potential and Overall Rating of a Player?

In [None]:
# Two side-by-side regression plots: age against overall rating (left) and age
# against potential rating (right).
fig, ax = plt.subplots(ncols=2, figsize=(18,10))
overall_ax, potential_ax = ax
sns.regplot(data=chelsea_squad, x='age', y='overall', ax=overall_ax)
sns.regplot(data=chelsea_squad, x='age', y='potential', ax=potential_ax)
overall_ax.set_title('Age vs Overall')
potential_ax.set_title('Age vs Potential')

Overall Rating seems to improve with age while Potential Rating reduces with increasing age.

Next, we can generate a heatmap for all the numerical values, to see how they correlate to one another.

In [None]:
# Pairwise correlations between the numeric squad columns, drawn as an
# annotated heatmap.
numeric_columns = ['age', 'overall', 'potential', 'height_cm', 'weight_kg', 'value_eur', 'wage_eur']
correlations = chelsea_squad[numeric_columns].corr()
plt.figure(figsize=(18, 10))
plt.title('Heatmap of Numerical Values in the Club')
sns.heatmap(data=correlations, annot=True)

Next, we are going to analyze players by the position they play, so we get an idea of the positions where we lack enough talent and what we should look for when signing players.
I am going to create a new column in the dataframe called 'position' based on these 4 rules: 
* Attackers - ST, RW, LW
* Midfielders - CAM, CM, RM, LM, CDM
* Defenders - CB, LB, RB, LWB, RWB
* Keepers - GK

(For those who can play more than one position, we will group them according to the first position they can play, since I assume that is their main playing position)

In [None]:
# Position codes that make up each group. 'CF' is included with the attackers so
# that a centre-forward is not left unclassified.
attackers = ['ST', 'CF', 'RW', 'LW']
mid = ['CAM', 'CM', 'RM', 'LM', 'CDM']
defenders = ['CB', 'LB', 'RB', 'LWB', 'RWB']
goalkeepers = ['GK']


def player_position(positions: str) -> str:
    """Return the position group of a player's main playing position.

    Parameters
    ----------
    positions : str
        Comma-separated position codes, e.g. ``'RW, LW'``. The first code is
        treated as the main position.

    Returns
    -------
    str
        ``'Attacker'``, ``'Midfielder'``, ``'Defender'`` or ``'Goalkeeper'``;
        ``'Unknown'`` when the first code is not in any of the lists above.
    """
    # Take everything before the first comma instead of a fixed-width slice, so
    # the result does not depend on code length or on spacing after the comma.
    main = positions.split(',')[0].strip()

    if main in attackers:
        return 'Attacker'
    elif main in mid:
        return 'Midfielder'
    elif main in defenders:
        return 'Defender'
    elif main in goalkeepers:
        return 'Goalkeeper'
    # Unrecognised codes are reported as such rather than counted as goalkeepers.
    return 'Unknown'

Creating a New column 'position' in our dataset and grouping each player accordingly

In [None]:
# Label every player with the group of their main position. Series.map applies
# player_position to each 'player_positions' value and keeps the row index, so
# no explicit row loop or temporary list is needed.
chelsea_squad['position'] = chelsea_squad['player_positions'].map(player_position)

In [None]:
# Display the squad table again, now including the 'position' column.
chelsea_squad

Important Statistics involving the different positions:

In [None]:
# Summary statistics of the overall rating within each position group.
chelsea_squad.groupby('position')['overall'].describe()

Let's show the same information on a plot instead:

In [None]:
# Box plot of overall ratings, one box per position group.
plt.figure(figsize=(15, 8))
plt.title("Box Plot: Position vs Overall Rating")
sns.boxplot(data=chelsea_squad, x='position', y='overall')

This information might help when you are considering which area to bolster:

Attacking:

In [None]:
# Attackers, best overall rating first.
is_attacker = chelsea_squad['position'].eq('Attacker')
chelsea_squad.loc[is_attacker].sort_values(by='overall', ascending=False)

Midfield:

In [None]:
# Midfielders, best overall rating first.
is_midfielder = chelsea_squad['position'].eq('Midfielder')
chelsea_squad.loc[is_midfielder].sort_values(by='overall', ascending=False)

Defending:

In [None]:
# Defenders, best overall rating first.
is_defender = chelsea_squad['position'].eq('Defender')
chelsea_squad.loc[is_defender].sort_values(by='overall', ascending=False)

Goalkeepers:

In [None]:
# Goalkeepers, best overall rating first.
is_goalkeeper = chelsea_squad['position'].eq('Goalkeeper')
chelsea_squad.loc[is_goalkeeper].sort_values(by='overall', ascending=False)

As we conclude, let's see the data of our loan players, to see who is worth calling back:

In [None]:
# Players in the full dataset whose 'loaned_from' club is Chelsea, restricted to
# the columns relevant for a recall decision.
loan_columns = ['short_name', 'age', 'club', 'overall', 'potential', 'player_positions', 'contract_valid_until']
loan_df = fifa_df.loc[fifa_df['loaned_from'].eq('Chelsea'), loan_columns]

In [None]:
# Display the table of players loaned out from Chelsea.
loan_df