In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import dataframe_image as dfi

from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the per-squad standings (home/away splits) and the fixture list.
standings_csv = '../input/english-premier-leage-season-202021/EPL_GW4_standard.csv'
fixtures_csv = '../input/english-premier-leage-season-202021/EPL_fixture.csv'

df = pd.read_csv(standings_csv)
df_fix = pd.read_csv(fixtures_csv)

In [None]:
# xG and xGA per match, split by venue.
# Each rate divides a venue's total by the matches played at that same venue:
# the away rates use the away totals (xG_a / xGA_a), not the home totals,
# so the two venues are never mixed.
df['xGpm_h'] = df['xG_h'] / df['MP_h']
df['xGpm_a'] = df['xG_a'] / df['MP_a']
df['xGApm_h'] = df['xGA_h'] / df['MP_h']
df['xGApm_a'] = df['xGA_a'] / df['MP_a']

In [None]:
# Season totals: add the home and away splits of each counting stat.
for stem in ('MP', 'xG', 'xGA'):
    df[stem] = df[f'{stem}_h'] + df[f'{stem}_a']
df['delta_xG'] = df['xG'] - df['xGA']
for stem in ('GF', 'GA'):
    df[stem] = df[f'{stem}_a'] + df[f'{stem}_h']

# Actual minus expected goals scored, and expected minus actual goals
# conceded: positive values mean the squad did better than expected.
df['xaG'] = df['GF'] - df['xG']
df['xaGA'] = df['xGA'] - df['GA']

# Per-match version of every aggregate, stored as '<name>pm'.
for stem in ('xG', 'xGA', 'delta_xG', 'GF', 'GA', 'xaG', 'xaGA'):
    df[f'{stem}pm'] = df[stem] / df['MP']

Going into the international break after Gameweek 4, 38 matches have been played, which is exactly 10% of the 380 matches to be played during the season.
The current season has so far been unpredictable, with last year's top teams dropping points and some mid-table teams and minnows performing remarkably well.
In this post I analyze the performance of the teams and try to predict the results of upcoming fixtures.

# Home Advantage, Does that even exist in Post Covid Era?

Due to the pandemic, matches are currently being played in empty stadiums. Home advantage, in my opinion, is more than familiarity with the playing turf; it is the spirit and encouragement of tens of thousands of die-hard fans rooting for the home team's victory.

That's why even the thought of visiting Anfield or Old Trafford sends shivers down the spine of away teams. Generally, teams perform better in front of their home crowd than in away fixtures.

<img src="https://miro.medium.com/max/335/1*9TooAmJF0ec0w-TOCxSRVA.gif">

In [None]:
# Every match is exactly one squad's home match, so the home-side columns
# alone describe all results: a home loss is an away win.
home_wins = df['W_h'].sum()
draws = df['D_h'].sum()
away_wins = df['L_h'].sum()

print(f"Total Matches : {df['MP_h'].sum()}")
print(f"Home Team Win : {home_wins}")
print(f"Away Team Win : {away_wins}")
print(f"Draw : {draws}")

# Donut chart of the result split.
plt.pie(
    x=[home_wins, draws, away_wins],
    labels=["Home Win", "Draw", "Away Win"],
    explode=[0.01, 0.01, 0.01],
    startangle=90,
    autopct='%1.2f%%',
    wedgeprops={"width": 0.6},
)
plt.savefig('h_vs_a.png')
plt.show()

In the current season, so far there is no evidence of home advantage. Out of the 38 matches played so far, 19 resulted in the away team winning, 3 were draws, and the home team managed to win only 16, which is around 42% of the total matches played.

# Curious case of Home Disadvantage!

In [None]:
# 2x2 grid of donut charts comparing home and away totals.
# Each entry is (panel title, home column, away column).
venue_panels = [
    ("Goals Scored", 'GF_h', 'GF_a'),
    ("Goals Conceded", 'GA_h', 'GA_a'),
    ("xG Scored", 'xG_h', 'xG_a'),
    ("xG Conceded", 'xGA_h', 'xGA_a'),
]
venue_labels = ["Home", "Away"]
venue_explode = [0.01, 0.01]

plt.figure(figsize=(8, 8))
for slot, (panel_title, home_col, away_col) in enumerate(venue_panels, start=1):
    plt.subplot(2, 2, slot)
    plt.title(panel_title)
    plt.pie(
        x=[sum(df[home_col]), sum(df[away_col])],
        labels=venue_labels,
        explode=venue_explode,
        startangle=90,
        autopct='%1.2f%%',
        wedgeprops={"width": 0.6},
    )
plt.savefig('pie_xg.png')
plt.show()

Home teams have been outscored by away teams this season. Moreover, home teams have underperformed in both xG (expected goals scored) and xGA (expected goals conceded).

In [None]:
# League-wide mean of the per-squad per-match rates, by venue.
# The two xGA averages are rounded to 2 decimals; the xG ones are printed raw.
venue_summary = [
    ("Average xG per match(Home) : {}", df['xGpm_h'].mean()),
    ("Average xG per match(Away) : {}", df['xGpm_a'].mean()),
    ("Average xG Conceaded per match(Home) : {}", round(df['xGApm_h'].mean(), 2)),
    ("Average xG Conceaded per match(Away) : {}", round(df['xGApm_a'].mean(), 2)),
]
for template, value in venue_summary:
    print(template.format(value))

# One point per squad and venue: xG conceded (x) against xG created (y).
plt.figure(figsize=(4, 4))
for suffix, colour, venue in (('h', 'blue', 'Home'), ('a', 'red', 'Away')):
    sns.scatterplot(data=df, x='xGApm_' + suffix, y='xGpm_' + suffix,
                    color=colour, label=venue, alpha=0.5)
plt.show()

In [None]:
# Per-squad aggregate xG table used by the plots and the prediction below.
agg_columns = [
    'Squad', 'MP',
    'xG', 'xGA', 'delta_xG',
    'xGpm', 'xGApm', 'delta_xGpm',
]
df_agg = df[agg_columns]

### Expected Goals Scored and Conceded

In [None]:
# Side-by-side horizontal bar charts, each sorted best-first:
# highest xG created on the left, lowest xG conceded on the right.
# Each entry is (title, column, sort ascending?).
bar_panels = [
    ("xG Scored per match", 'xGpm', False),
    ("xG Conceaded per match", 'xGApm', True),
]

plt.figure(figsize=(8, 5))
for slot, (panel_title, column, ascending) in enumerate(bar_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(panel_title)
    ranked = df_agg.sort_values(by=column, ascending=ascending)
    sns.barplot(orient='h', x=column, y='Squad', data=ranked)
    plt.grid(which='both', axis='x')

plt.tight_layout()
plt.savefig('xg_xa.png')
plt.show()

The defending champions Liverpool are leading in expected goals per game with around 2.5 xG per game, including penalties. Spurs, Aston Villa and Everton have also managed an xG greater than 2.

West Brom, currently the lowest-ranked team, are struggling to create chances, with a very low xG.
Teams with low xG lack creativity; they should reinforce the squad with creative players.

xG can also be low for teams that employ a defensive style of play and rely on counter-attacks or mistakes from the opposition to score goals.

Everton, Southampton and West Ham seem to have an impenetrable defense, registering xGA below 1. Sheffield, Aston Villa and Brighton have also displayed solid defense.

Three of last season’s top four teams (Liverpool, Manchester City and Manchester United) feature among the five worst defensive teams by xG conceded. Manchester United recorded the highest expected goals conceded, averaging around 2.5 goals per game. Leeds and West Brom also have an xGA above 2.

Teams with high xGA should immediately address their problems in defense.

<img src="https://miro.medium.com/max/625/1*UQ4iicvVVl5mDNKKuAvWGw.gif">
> Woodward, Give that man a centre-defender

In [None]:
# Squads ranked by per-match xG difference (created minus conceded).
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("xG Scored - xG Conceded")
ranked_by_delta = df_agg.sort_values(by='delta_xGpm', ascending=False)
sns.barplot(orient='h', x='delta_xGpm', y='Squad', data=ranked_by_delta, ax=ax)
ax.grid(which='both', axis='x')
ax.set_xlabel('Delta xG')
fig.tight_layout()
fig.savefig('delta_xg.png')
plt.show()

Delta xG is the difference between expected goals scored and expected goals conceded which can be used to evaluate the form of a team. Everton seems to dominate the league with delta xG per game almost 1.5. 

Tottenham and Aston Villa also have delta xG more than 1. 

Despite the 7–2 humiliation at the hands of Aston Villa, the Reds are close behind in 4th position. 

At the other end, Manchester United and West Brom have a negative delta xG of more than 1.5 in magnitude.

In [None]:
# xG created against xG conceded, one labelled point per squad.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("xG Scored Vs xG Conceded")
sns.scatterplot(data=df_agg, x='xGApm', y='xGpm', ax=ax)

# Nudge each label slightly up and right so it does not cover its marker.
for conceded, created, squad in zip(df_agg['xGApm'], df_agg['xGpm'], df_agg['Squad']):
    ax.text(conceded + 0.01, created + 0.01, squad, fontdict={'fontsize': 8})

ax.set_xlabel("xG conceded Per match")
ax.set_ylabel("xG Scored Per match")
# Parity line: squads above it create more xG than they concede.
ax.plot([0, 3], [0, 3], 'r--')
ax.set_xlim(df_agg['xGApm'].min() - 0.2, df_agg['xGApm'].max() + 0.2)
ax.set_ylim(df_agg['xGpm'].min() - 0.2, df_agg['xGpm'].max() + 0.2)
fig.savefig('scatter_xg_xa.png')
plt.show()

The teams above the red line have a better xG than xGA. Last season's top scorers Manchester City, who usually dominate the league, have failed to produce more chances than their opponents. Their city rivals Manchester United have had their worst start in decades.

# Expectation Vs Reality

In [None]:
# Goals actually scored per match against expected goals per match.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='xGpm', y='GFpm', ax=ax)

# Label every point with its squad, offset slightly from the marker.
for expected, actual, squad in zip(df['xGpm'], df['GFpm'], df['Squad']):
    ax.text(x=expected + 0.01, y=actual + 0.01, s=squad, fontsize=8)

# Parity line: squads above it score more than their xG.
ax.plot([-3, 3], [-3, 3], 'r--')
ax.set_xlabel("Expected Goals per match")
ax.set_ylabel("Goals per match")
ax.set_xlim(df['xGpm'].min() - 0.2, df['xGpm'].max() + 0.2)
ax.set_ylim(df['GFpm'].min() - 0.2, df['GFpm'].max() + 0.2)
fig.tight_layout()
# Saving is disabled; enable to export: fig.savefig('expvsact_g.png')
plt.show()

A team can outperform its xG either when it has prolific strikers who can convert half-chances or when it has been lucky with some of its goals. 

In [None]:
# Squads ranked by goals scored above expectation per match (GF - xG).
fig, ax = plt.subplots(figsize=(4, 5))
ranked_by_overperformance = df.sort_values(by='xaGpm', ascending=False)
sns.barplot(orient='h', y='Squad', x='xaGpm', data=ranked_by_overperformance, ax=ax)
ax.grid(which='both', axis='x')
# Saving is disabled; enable to export: fig.savefig('expvsact_gbar.png')
plt.show()

Aston Villa is outperforming xG by 1.5 goals per match which they are not likely to carry forward.

Leicester City also seems to outperform the xG by huge margin, this can be attributed to 3 penalties scored by Vardy. Unless Leicester is awarded penalties every weekend(which is highly unlikely even with the new lenient handball policy), their margin would come down.

2 out of Manchester United’s 5 goals are also from penalties, so the Red Devils' xG from open play would be much lower as well.

In [None]:
# Goals actually conceded per match against expected goals conceded.
plt.figure(figsize=(8, 5))
sns.scatterplot(data=df, x='xGApm', y='GApm')

# Label each point with its squad name (no offset in this chart).
squad_points = zip(df['xGApm'], df['GApm'], df['Squad'])
for expected_conceded, actual_conceded, squad in squad_points:
    plt.text(x=expected_conceded, y=actual_conceded, s=squad, fontsize=7)

# Parity line: squads above it concede more than their xGA.
plt.plot([0, 3], [0, 3], 'r--')
plt.xlabel('Expected Goals Conceded per match')
plt.ylabel('Goals conceded per match')
plt.tight_layout()
# Saving is disabled; enable to export: plt.savefig('xaGA.png')
plt.show()

In [None]:
# Squads ranked by goals conceded below expectation per match (xGA - GA).
fig, ax = plt.subplots(figsize=(4, 5))
ranked_by_defence = df.sort_values(by='xaGApm', ascending=False)
sns.barplot(orient='h', y='Squad', x='xaGApm', data=ranked_by_defence, ax=ax)
ax.grid(which='both', axis='x')
# Saving is disabled; enable to export: fig.savefig('xaGA_b.png')
plt.show()

Aston Villa, Newcastle, Arsenal and Leeds are the only 4 teams out of 20 that managed to outperform their expected goals conceded. It seems that goal scoring has been very easy this season.

Liverpool, Manchester United, Fulham, Burnley and Brighton were unlucky, conceding over one goal per match more than expected.

# Quadrant Analysis

In [None]:
# Quadrant chart: over/under-performance in attack (y) against defence (x).
#   x = xaGApm = (xGA - GA) per match -> positive: conceded fewer than expected
#   y = xaGpm  = (GF - xG) per match  -> positive: scored more than expected
# Marker colour and size both encode delta_xGpm (xG - xGA per match).
plt.figure(figsize=(8,5))
sns.scatterplot(x='xaGApm', y='xaGpm', data=df, hue='delta_xGpm',
                size='delta_xGpm')
plt.grid(which='both')
plt.legend()

# Zero lines split the plane into the four quadrants.
plt.plot([5,-5],[0,0], 'k--')
plt.plot([0,0],[5,-5], 'k--')
plt.xlim(df.xaGApm.min()-0.2,df.xaGApm.max()+0.3)
plt.ylim(df.xaGpm.min()-0.3,df.xaGpm.max()+0.2)

# Label each squad; iterating the columns directly avoids depending on the
# frame having a default 0..n-1 index.
for x_pos, y_pos, squad in zip(df['xaGApm'], df['xaGpm'], df['Squad']):
    plt.text(x=x_pos+0.02, y=y_pos+0.02, s=squad, fontsize=7)

# Quadrant captions. The defensive metric on the x axis is xGA (expected
# goals against), so the captions name xGA rather than xA.
quadrant_captions = [
    (0.1, 1, "Q1\nOverperformed xG\nOverperformed xGA"),
    (0.1, -0.5, "Q4\nUnder performed xG\nOverperformed xGA"),
    (-1, 1, "Q2\nOverperformed xG\nUnderperformed xGA"),
    (-1, -0.5, "Q3\nUnder performed xG\nUnderperformed xGA"),
]
for caption_x, caption_y, caption in quadrant_captions:
    plt.text(x=caption_x, y=caption_y, s=caption,
             alpha=0.7, fontsize=9, color='red')

plt.xlabel("Goals Conceded(Expected-Actual) per match")
plt.ylabel("Goals Scored(Actual-Expected) per match")
plt.tight_layout()
#plt.savefig('op_up.png')
plt.show()

The best scenario is over-performing xG and xGA by a healthy margin.

Over-performing xG by a high margin is generally not sustainable in the long run.

Falling behind xG is not a big problem if the team manages to create more chances(xG).

The size of the bubble indicates the expected net goals scored (xG − xGA) per match. The first aim of the teams should be to improve their net goals scored.



1st Quadrant: Over-performed xG and Over-performed xGA

Teams in Q1 are doing well, over-performing their expected goals, and have to carry on the momentum. Teams like Leeds and Newcastle should try to improve their net goals scored.


2nd Quadrant:Over-performed xG and Under-performed xGA

Teams in Q2 should try to move towards Q1, that is improve defense.


3rd Quadrant: Under-performed xG and Under-performed xGA

Teams in Q3 should improve both defense and attack.

4th Quadrant: Under-performed xG and Over-performed xGA

There are no teams in Q4.

# Prediction

The logic used to predict the scores of the match is based on xG per game and xGA per game for the previous period.
Home team’s score is calculated as average of xG of home team and xGA of away team.

G_home=(xG_home+xGA_away)/2<br>
Away team’s score is calculated as average of xG of away team and xGA of home team.

G_away=(xG_away+xGA_home)/2

Goal Difference(GD) is calculated as difference between home team’s score and away team’s score.

GD=G_home-G_away

Total goals scored(GS) is calculated as sum of home team’s score and away team’s score.

GS=G_home+G_away

Result of Gameweek 5 matches predicted is as below.

In [None]:
# Predicted score for every fixture:
#   G_home = (home squad's xG per match + away squad's xGA per match) / 2
#   G_away = (away squad's xG per match + home squad's xGA per match) / 2
#
# The values are looked up with a vectorised map instead of writing through
# `df_fix.G_home[i] = ...`. That form is chained assignment: it may write to a
# temporary copy, and with pandas Copy-on-Write it never updates df_fix.
# groupby(...).sum() plus fillna(0.0) keeps the result of the per-row
# `.sum()` lookup: a squad missing from df_agg contributes 0.
xg_for = df_agg.groupby('Squad')['xGpm'].sum()
xg_against = df_agg.groupby('Squad')['xGApm'].sum()

home_attack = df_fix['Home'].map(xg_for).fillna(0.0)
home_defence = df_fix['Home'].map(xg_against).fillna(0.0)
away_attack = df_fix['Away'].map(xg_for).fillna(0.0)
away_defence = df_fix['Away'].map(xg_against).fillna(0.0)

df_fix['G_home'] = (home_attack + away_defence) / 2
df_fix['G_away'] = (away_attack + home_defence) / 2

# GD > 0 favours the home side; GS is the expected total goals in the match.
df_fix['GD'] = df_fix['G_home'] - df_fix['G_away']
df_fix['GS'] = df_fix['G_home'] + df_fix['G_away']
df_fix = df_fix.sort_values(by='GD', ascending=False)

# Gameweek 5 fixtures without the first column, coloured by GD and GS.
gw5_fixtures = df_fix.iloc[:, 1:][df_fix.GW == 5]
gw5_styler = gw5_fixtures.style.background_gradient(cmap='RdYlGn',
                                                    subset=['GD', 'GS'])
# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis='index') is
# its replacement (pandas >= 1.4). Fall back for older installations.
if hasattr(gw5_styler, 'hide'):
    df_styled = gw5_styler.hide(axis='index')
else:
    df_styled = gw5_styler.hide_index()
#dfi.export(df_styled,"mytable.png")
df_styled

The higher the absolute value of Goal Difference, the more one-sided the match is expected to be; a value close to zero means an evenly contested match can be expected.<br>
A negative value of GD implies that the away team is likely to win, and a positive value favors the home team.<br>

Higher value of Goals Scored implies that we can expect a goal fest from the fixture and lower value indicates a low scoring match.<br>

Based on the model, Burnley has the best fixture away against West Brom with goal difference 0.77 in favor of the Clarets.<br>
Brighton’s game vs Crystal Palace is also expected to be a walk in the (Selhurst) park for the Seagulls.<br>
Man United’s visit to St. James’ Park is estimated to result in yet another heart break for the Red Devils.<br>

Manchester City vs Arsenal match is the most evenly contested match of the gameweek with xG slightly in favor of the Gunners.<br>
Chelsea vs Saints and Leeds vs Wolves are also going to be closely contested.<br>

The Merseyside derby is expected to be a goal shower, with the hosts having an edge over the defending champions.<br>

# Can’t wait for the next gameweek to see how close the predictions are!!!

# Fixture Difficulty Rating

In [None]:
# Column label -> gameweek number for the fixtures being rated.
gw_dict = {'GW5': 5, 'GW6': 6, 'GW7': 7, 'GW8': 8}

# One row per squad; each gameweek column holds the predicted goal difference
# of that squad's fixture, seen from the squad's own side.
df_fdr = pd.DataFrame({'Squad': df.Squad})

for gw_label, gw_number in gw_dict.items():
    gw_fixtures = df_fix[df_fix.GW == gw_number]
    gw_home_squads = gw_fixtures.Home.unique()

    def squad_margin(squad, fixtures=gw_fixtures, home_squads=gw_home_squads):
        """Return the squad's predicted goal difference for this gameweek.

        GD is stored from the home side's perspective, so it is negated when
        the squad is not one of the gameweek's home sides.
        """
        if squad in home_squads:
            return fixtures[fixtures.Home == squad].GD.sum()
        return -fixtures[fixtures.Away == squad].GD.sum()

    df_fdr[gw_label] = df_fdr.Squad.apply(squad_margin)

In [None]:
# Rank squads by their mean predicted goal difference over the rated
# gameweeks, then min-max scale each gameweek column to [0, 1] for colouring.
gw_cols = list(gw_dict.keys())

# Average only the gameweek columns. df_fdr also holds the string 'Squad'
# column, which makes a whole-frame mean(axis=1) raise on pandas >= 2.0, and
# on a re-run of this cell it would fold the 'Mean' column into the mean.
df_fdr['Mean'] = df_fdr[gw_cols].mean(axis=1)

sc = MinMaxScaler()
for col in gw_cols:
    # fit_transform expects a 2-D input and returns an (n, 1) array; flatten
    # it back to one value per row before storing it as a column.
    df_fdr[col] = sc.fit_transform(df_fdr[[col]].to_numpy()).ravel()

# Highest mean predicted goal difference first.
df_fdr = df_fdr.sort_values(by='Mean', ascending=False)
df_fdr.style.background_gradient(cmap='RdYlGn',
                                 subset=gw_cols)

Based on the current performance of the teams, the teams are arranged in the order of increasing difficulty rating over the next four matches.