# ------------English Premier League 2020-21 Analysis------------


## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
import matplotlib

## Importing Dataset

In [None]:
# Read the player-level EPL 2020-21 table and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer="../input/english-premier-league202021/EPL_20_21.csv"
)
df.head(5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Missing-value count for every column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

## Correlation Matrix

In [None]:
# Correlation is only defined for numeric columns. The frame also holds text
# columns (e.g. Name, Club, Position), and pandas >= 2.0 raises when `corr()`
# meets them, so restrict to numeric dtypes explicitly. This form also works
# on older pandas releases.
plt.figure(figsize=(12, 8))
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    cmap='Pastel1',
)

## Distinct Positions

In [None]:
# Distinct values of the Position column, in order of first appearance.
pd.unique(df["Position"])

## Age Distribution in the league.

In [None]:
# Every row whose Age equals the minimum age in the table.
df.loc[df["Age"].eq(df["Age"].min())]

- The **four players above** are the **youngest players** in the English Premier League.

In [None]:
# Every row whose Age equals the maximum age in the table.
df.loc[df["Age"].eq(df["Age"].max())]

- **Willy Caballero** of Chelsea is the **oldest player** in the English Premier League.

In [None]:
# One box per club summarising the age spread of its squad; club names are
# rotated so the x tick labels do not overlap.
plt.figure(figsize=(18, 8))
sns.boxplot(data=df, x="Club", y="Age")
plt.xticks(rotation=90)

- Crystal Palace and West Ham United have some of the oldest squads by age percentiles, while Tottenham Hotspur have the youngest player in their squad and their oldest player is only around 26-27 years old. Manchester United's average player age is also the lowest among all the clubs in the Premier League.

In [None]:
# `sns.displot` is a figure-level function: it always creates its own figure,
# so a preceding `plt.figure(figsize=...)` only leaves an empty canvas in the
# cell output and its size is ignored. Size the plot through `height` and
# `aspect` instead (8 in tall, 8 * 1.5 = 12 in wide, matching the old figsize).
sns.displot(data=df, x="Age", kde=True, height=8, aspect=1.5)

- The density plot above shows that most players in the Premier League are in their mid-20s, which is the prime age for a footballer.

## Player distribution by nationality

In [None]:
# Treemap of squad members per nationality, largest group first.
fig = plt.figure(figsize=(18, 12))

# Count players per nationality once and derive both sizes and labels from it.
nationality_counts = (
    df.groupby('Nationality')['Name']
    .count()
    .sort_values(ascending=False)
)
sizes = nationality_counts.tolist()
label = nationality_counts.index.values.tolist()

# Each tile is captioned "<nationality> <player count>".
final_label = [nation + " " + str(count) for nation, count in zip(label, sizes)]

squarify.plot(sizes=sizes, label=final_label, alpha=0.6)

- **Most players** are from **England (192)** as expected, followed by **France (31)**, **Brazil (27)** and then **Spain (26)**.

In [None]:
# Treemap of squad size per club, largest squad first.
fig = plt.figure(figsize=(18, 12))

# Count players per club once and derive both sizes and labels from it.
club_counts = (
    df.groupby('Club')['Name']
    .count()
    .sort_values(ascending=False)
)
sizes = club_counts.tolist()
label = club_counts.index.values.tolist()

# Each tile is captioned "<club> <player count>".
final_label = [club + " " + str(count) for club, count in zip(label, sizes)]

squarify.plot(sizes=sizes, label=final_label, alpha=0.6)


- **West Brom** has the **largest squad** with 30 players, followed by Arsenal and Man United with 29 each.
  **Leeds United** has the **smallest squad** with 23 players.

## Goals Scored by Clubs

In [None]:
# Sum player goals per club; `as_index=False` already returns a DataFrame with
# 'Club' and 'Goals' columns.
goal_team = df.groupby('Club', as_index=False)['Goals'].sum()

# Bars are ordered from the lowest-scoring to the highest-scoring club.
plt.figure(figsize=(18, 8))
ax = sns.barplot(data=goal_team.sort_values(by="Goals"), x='Club', y='Goals')
plt.xticks(rotation=45)



- **Manchester City** scored the **most goals** in the league with 80+ goals and **Sheffield United** scored the **least goals** scoring just under 20.

## Top Goal Scorer

In [None]:
# Five rows with the highest Goals value, best first.
top_scorers = df.sort_values(by="Goals", ascending=False).head(5)
top_scorers

In [None]:
# Goals of the five leading scorers.
ax = sns.barplot(data=top_scorers, x="Name", y="Goals")
plt.xticks(rotation=45)


- **Harry Kane** is the **top scorer** of the league with 23 goals, followed by **Mohamed Salah** with 22 goals.

## Top Assist Getter

In [None]:
# Five rows with the highest Assists value, restricted to the columns of interest.
top_assists = (
    df.loc[:, ['Name', 'Club', 'Assists']]
    .sort_values(by="Assists", ascending=False)
    .head(5)
)
top_assists

In [None]:
# Assists of the five leading providers.
ax = sns.barplot(data=top_assists, x="Name", y="Assists")
plt.xticks(rotation=45)

- Again, **Harry Kane** provided the **most assists (14)**, followed by **Kevin De Bruyne with 12 assists**.

## Efficiency of Top Players

In [None]:
# Build the attacking-metrics frame on an explicit copy. Assigning new columns
# to a plain column slice of `df` triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous whether `df` itself is being modified.
attack_df = df[['Name', 'Club', 'Mins', 'Goals', 'Assists',
                'Penalty_Goals', 'Penalty_Attempted', 'xG', 'xA']].copy()

# Actual output: goals plus assists, and minutes played per such involvement.
attack_df['Goal_involvement'] = attack_df['Goals'] + attack_df['Assists']
attack_df['min_per_goal_inv'] = attack_df['Mins'] / attack_df['Goal_involvement']

# Expected output: (xG + xA) scaled by the number of full 90-minute matches.
# NOTE(review): this treats xG/xA as per-match rates, as the notebook text
# states - confirm against the dataset description.
attack_df['expected_goal_inv'] = (attack_df['xG'] + attack_df['xA']) * (attack_df['Mins'] / 90)
attack_df['min_per_exp_goal_inv'] = attack_df['Mins'] / attack_df['expected_goal_inv']

# Division by a zero involvement count yields +/-inf; turn those into NaN so
# they sort last and are skipped by plots. Reassign rather than mutate in place.
attack_df = attack_df.replace([np.inf, -np.inf], np.nan)
attack_df.head()

Here we define a new dataframe with the attack-relevant columns, plus the following derived columns:

- Goal_involvement     - Total goal involvement of a player, i.e. the sum of 'Goals' and 'Assists'.
- min_per_goal_inv     - Minutes taken by a player per goal involvement in the league.
- expected_goal_inv    - Expected goal involvement: the sum of expected goals per match (xG) and expected assists per match (xA), multiplied by the number of full matches played, i.e. total minutes played divided by 90.
- min_per_exp_goal_inv - Minutes taken by a player per expected goal involvement in the league.

## Goal Involvement Efficiency

In [None]:
# Ten players with the most goal involvements; reused by the comparison plot
# in a later cell. Only the first five rows are displayed here.
name = (
    attack_df[['Name', 'Club', 'Goals', 'Assists', 'Goal_involvement', 'expected_goal_inv']]
    .sort_values(by="Goal_involvement", ascending=False)
    .head(10)
)
name.head()

In [None]:
# Players ranked by expected goal involvement: take the top ten, show the first five.
(
    attack_df[['Name', 'xG', 'xA', 'Goal_involvement', 'expected_goal_inv']]
    .sort_values(by="expected_goal_inv", ascending=False)
    .head(10)
    .head()
)


In [None]:
# Paired bars per player: actual goal involvement (plain) next to expected
# goal involvement (hatched), drawn on twin axes sharing the same x positions.
plt.figure(figsize=(15, 8))
ax = sns.barplot(x='Name', y='Goal_involvement', data=name)
width_scale = 0.45
plt.xticks(rotation=90)

# Narrow the "actual" bars so there is room for the second series beside them.
for bar in ax.containers[0]:
    bar.set_width(bar.get_width() * width_scale)

ax2 = ax.twinx()
sns.barplot(x='Name', y='expected_goal_inv', data=name, alpha=0.7, hatch='xx', ax=ax2)

# Narrow the "expected" bars too and shift them right into the freed space.
for bar in ax2.containers[0]:
    x = bar.get_x()
    w = bar.get_width()
    bar.set_x(x + w * (1 - width_scale))
    bar.set_width(w * width_scale)

# The twin axes autoscale independently, so without a shared range the two
# bar heights are not comparable. Give both the same limits, as the
# minutes-per-involvement plot later in the notebook does.
y_top = max(ax.get_ylim()[1], ax2.get_ylim()[1])
ax.set_ylim(0, y_top)
ax2.set_ylim(0, y_top)

plt.show()


- Harry Kane, Son Heung-min and Marcus Rashford are the only players who had more goal involvement than expected among the top attacking players. These three players were highly efficient in front of goal.

## Minutes Per Goal Involvement Efficiency

In [None]:
# Among players with more than 1000 minutes, the ten who needed the fewest
# minutes per actual goal involvement. Rebinds `name` for the plot below.
name = (
    attack_df.loc[
        attack_df['Mins'] > 1000,
        ['Name', 'Club', 'Mins', 'min_per_goal_inv', 'min_per_exp_goal_inv'],
    ]
    .sort_values(by="min_per_goal_inv")
    .head(10)
)
name

In [None]:
# Same 1000-minute filter, ranked by minutes per *expected* goal involvement.
(
    attack_df.loc[
        attack_df['Mins'] > 1000,
        ['Name', 'Club', 'Mins', 'min_per_goal_inv', 'min_per_exp_goal_inv'],
    ]
    .sort_values(by="min_per_exp_goal_inv")
    .head(10)
)

In [None]:
# Paired bars per player: minutes per actual goal involvement (plain) next to
# minutes per expected goal involvement (hatched). Both axes are pinned to
# the same 0-250 range so bar heights are directly comparable.
plt.figure(figsize=(15, 10))
width_scale = 0.45

ax = sns.barplot(data=name, x='Name', y='min_per_goal_inv')
plt.xticks(rotation=90)
# Narrow the first series to leave room for the second one.
for bar in ax.containers[0]:
    bar.set_width(width_scale * bar.get_width())
ax.set_ylim(0, 250)

ax2 = ax.twinx()
sns.barplot(data=name, x='Name', y='min_per_exp_goal_inv', alpha=0.7, hatch='xx', ax=ax2)
# Narrow the second series and push it right into the freed space.
for bar in ax2.containers[0]:
    w = bar.get_width()
    x = bar.get_x()
    bar.set_x(x + w * (1 - width_scale))
    bar.set_width(w * width_scale)
ax2.set_ylim(0, 250)

plt.show()

- Jesse Lingard has the biggest difference between the minutes per goal involvement and the minutes per expected goal involvement. He took the fewest minutes per goal involvement in comparison to the expectation.
- So Jesse Lingard was the most efficient attacking player in terms of minutes taken per goal involvement.

## Penalty Goals

- Penalty goals are the easiest goals a player can score, much like free throws in basketball.
- Unlike a basketball free throw, a penalty can be taken by any player on the team, not only by the player who was fouled.
- So penalty goals should not be taken as a measure of a player's goal-scoring ability.

In [None]:
# Five players with the most penalty goals.
attack_df.sort_values(by="Penalty_Goals", ascending=False).head(5)

- **Bruno Fernandes of Manchester United** scored the **most penalty goals (9)**, followed by **Jamie Vardy of Leicester City (8)**.

In [None]:
# Conversion rate as a fraction in [0, 1]: penalties scored / penalties taken.
# Players with no attempts get NaN (0/0) and therefore sort to the bottom.
attack_df['Penalty_perc'] = attack_df['Penalty_Goals'].div(attack_df['Penalty_Attempted'])

# Thirty best conversion rates.
(
    attack_df[['Name', 'Club', 'Penalty_Goals', 'Penalty_Attempted', 'Penalty_perc']]
    .sort_values(by='Penalty_perc', ascending=False)
    .head(30)
)



In [None]:
# Penalty conversion rate of the thirty best-ranked players.
plt.figure(figsize=(18, 8))
ax = sns.barplot(
    data=attack_df.sort_values(by='Penalty_perc', ascending=False).head(30),
    x='Name',
    y='Penalty_perc',
)
plt.xticks(rotation=90)

## Defender Goalscorer

In [None]:
# Ten highest-scoring players whose Position is exactly 'DF'.
(
    df.loc[df['Position'] == 'DF', ['Name', 'Club', 'Position', 'Goals', 'Assists']]
    .sort_values(by='Goals', ascending=False)
    .head(10)
)

In [None]:
# Goals of the ten highest-scoring players listed purely as 'DF'.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=df.loc[df['Position'] == 'DF'].sort_values(by='Goals', ascending=False).head(10),
    x='Name',
    y='Goals',
)
plt.xticks(rotation=90)

- **Lewis Dunk** and **Kurt Zouma** scored the **most goals (5)** playing as **defenders**.

## Most Passes

In [None]:
# Ten players with the most attempted passes.
most_pass_att = (
    df.loc[:, ['Name', 'Club', 'Passes_Attempted', 'Position']]
    .nlargest(n=10, columns='Passes_Attempted')
)
most_pass_att

In [None]:
# Attempted passes of the ten most active passers.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=most_pass_att, x='Name', y='Passes_Attempted')
plt.xticks(rotation=90)

- **Andrew Robertson** followed by **Trent Alexander-Arnold** both of Liverpool FC **attempted most Passes** in the league.

## Most Efficient Passers

In [None]:
# Keep players with at least 1000 attempted passes whose Position is neither
# exactly 'GK' nor exactly 'DF'.
pass_acc = df[~df['Position'].isin(['GK', 'DF']) & (df['Passes_Attempted'] >= 1000)]

# Ten best pass-completion percentages within that group.
(
    pass_acc.loc[:, ['Name', 'Club', 'Passes_Attempted', 'Perc_Passes_Completed']]
    .nlargest(n=10, columns='Perc_Passes_Completed')
)

In [None]:
# Pass-completion percentage of the ten most accurate qualifying passers.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    data=pass_acc.nlargest(n=10, columns='Perc_Passes_Completed'),
    x='Name',
    y='Perc_Passes_Completed',
)
plt.xticks(rotation=90)

- **Mohamed Elneny of Arsenal FC**, followed by **Georginio Wijnaldum of Liverpool FC**, were the **most efficient passers** with at least 1000 passes in the league.

## Most Aggressive Team and Player

In [None]:
# Add a per-player card total (yellow + red) to the main frame, then list the
# ten players with the highest totals.
df['Total_Cards'] = df['Yellow_Cards'].add(df['Red_Cards'])
(
    df.loc[:, ['Name', 'Club', 'Yellow_Cards', 'Red_Cards', 'Total_Cards']]
    .nlargest(n=10, columns='Total_Cards')
)

In [None]:
# Card totals of the ten most-carded players.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=df.nlargest(n=10, columns='Total_Cards'), x='Name', y='Total_Cards')
plt.xticks(rotation=90)

- **John McGinn of Aston Villa**, followed by **Harry Maguire of Man United**, received the most cards for aggressive actions.

In [None]:
# Card totals summed per club, most-carded club first.
team_cards = (
    df.groupby(by='Club', as_index=False)['Total_Cards']
    .sum()
    .sort_values(by='Total_Cards', ascending=False)
)
team_cards

In [None]:
# Total cards per club, in the descending order prepared in `team_cards`.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=team_cards, x='Club', y='Total_Cards')
plt.xticks(rotation=90)

- **Sheffield United** was the **most aggressive team** in the league with the most cards overall, and **Liverpool FC** was the **least aggressive**, as they received the fewest cards in total.

SUMMARY :
- **Crystal Palace** and **West Ham United** have some of the oldest squads by age percentiles, and **Tottenham Hotspur** has the youngest player in their squad.
- **West Brom** has the largest squad with 30 players. **Leeds United** has the smallest squad with 23 players.
- The most goals scored by a team: **Manchester City**.
- The most goals scored by a player: **Harry Kane**.
- **Harry Kane, Son Heung-min and Marcus Rashford** are the most efficient players in terms of goal involvement.
- **Jesse Lingard** is the most efficient player in terms of minutes per goal involvement.
- The most penalty goals scored by a player: **Bruno Fernandes (9)**.
- The most goals by a defender: **Lewis Dunk and Kurt Zouma (5)**.
- The most passes attempted by a player: **Andrew Robertson**.
- The most efficient passer with at least 1000 passes: **Mohamed Elneny**.
- The most total cards received by a player: **John McGinn**.
- The most total cards received by a team: **Sheffield United**.