In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import dataframe_image as dfi
import os
# Print the full path of every file under the Kaggle input directory so the
# data set location can be checked before it is read.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Match list for the season: one row per fixture, with goals (G_*) and
# expected goals (xG_*) for the home and away side.
df = pd.read_csv('../input/english-premier-league-xg-data/EPL_result.csv')

# Full team name -> three-letter abbreviation used in every table and plot.
team_abb = {
    'Everton': 'EVE', 'Aston Villa': 'AVL',
    'Leicester City': 'LEI', 'Arsenal': 'ARS',
    'Liverpool': 'LIV', 'Tottenham': 'TOT',
    'Chelsea': 'CHE', 'Leeds United': 'LEE',
    'Newcastle Utd': 'NEW', 'West Ham': 'WHU',
    'Southampton': 'SOU', 'Crystal Palace': 'CRY',
    'Wolves': 'WOL', 'Manchester City': 'MCI',
    'Brighton': 'BHA', 'Manchester Utd': 'MUN',
    'West Brom': 'WBA', 'Burnley': 'BUR',
    'Sheffield Utd': 'SHU', 'Fulham': 'FUL',
}
# Index the dict directly so an unknown team name fails loudly with a
# KeyError instead of silently turning into NaN.
df['Home'] = df['Home'].apply(lambda name: team_abb[name])
df['Away'] = df['Away'].apply(lambda name: team_abb[name])

# Goal difference from the home side's point of view.
df['GD'] = df['G_Home'] - df['G_Away']

# League points: 3 for a win, 1 for a draw, 0 for a defeat.
# A row without a result has a NaN goal difference (presumably fixtures that
# have not been played yet - confirm against the CSV). NaN is neither > 0 nor
# < 0, so it used to fall through to the "draw" branch and hand both sides a
# phantom point. Such rows now get a missing value instead; the nullable
# integer dtype keeps the points whole numbers when they are summed.
has_result = df['GD'].notna()
df['Pts_Home'] = (
    df['GD'].apply(lambda gd: 3 if gd > 0 else (0 if gd < 0 else 1))
    .where(has_result)
    .astype('Int64')
)
df['Pts_Away'] = (
    df['GD'].apply(lambda gd: 0 if gd > 0 else (3 if gd < 0 else 1))
    .where(has_result)
    .astype('Int64')
)
df.head()

In [None]:
# Most recent gameweek whose results are included in the analysis.
# Update this value after each round of matches.
gw_last = 7
# First gameweek that is treated as "upcoming" (predicted rather than observed).
gw_next = 1 + gw_last

In [None]:
# Per-team summary table built from every match with GW < gw_next.
# Suffix convention: _h = as home side, _a = as away side, no suffix = both;
# "pm" = per match; a trailing "A" on a metric = conceded ("against").
played = df[df.GW < gw_next]
df_temp = pd.DataFrame({'Team': list(team_abb.values())})


def _matches(side):
    """Number of matches each team in df_temp has played as `side` ('Home'/'Away')."""
    # Same value as the former `.count()[0]` (non-null entries of the first
    # column), but looked up with .iloc: an integer key on a label-indexed
    # Series is treated as a position only through a deprecated fallback.
    return df_temp.Team.apply(
        lambda team: played[played[side] == team].iloc[:, 0].count())


def _total(side, column):
    """Sum of `column` over the matches each team in df_temp played as `side`."""
    # One combined row filter instead of df[mask_a][mask_b], which applied a
    # mask whose index no longer matched the already-filtered frame.
    return df_temp.Team.apply(
        lambda team: played.loc[played[side] == team, column].sum())


# Matches played
df_temp['M_h'] = _matches('Home')
df_temp['M_a'] = _matches('Away')
df_temp['M'] = df_temp.M_h + df_temp.M_a

# Expected goals created
df_temp['xG_h'] = _total('Home', 'xG_Home')
df_temp['xG_a'] = _total('Away', 'xG_Away')
df_temp['xG'] = df_temp.xG_a + df_temp.xG_h

df_temp['xGpm_h'] = df_temp.xG_h / df_temp.M_h
df_temp['xGpm_a'] = df_temp.xG_a / df_temp.M_a
df_temp['xGpm'] = df_temp.xG / df_temp.M

# Expected goals conceded: the opponent's xG in the same matches
df_temp['xGA_h'] = _total('Home', 'xG_Away')
df_temp['xGA_a'] = _total('Away', 'xG_Home')
df_temp['xGA'] = df_temp.xGA_a + df_temp.xGA_h

df_temp['xGApm_h'] = df_temp.xGA_h / df_temp.M_h
df_temp['xGApm_a'] = df_temp.xGA_a / df_temp.M_a
df_temp['xGApm'] = df_temp.xGA / df_temp.M

# Net expected goals per match (created minus conceded)
df_temp['delta_xGpm'] = df_temp.xGpm - df_temp.xGApm

# League points
df_temp['P_h'] = _total('Home', 'Pts_Home')
df_temp['P_a'] = _total('Away', 'Pts_Away')
df_temp['P'] = df_temp.P_a + df_temp.P_h

df_temp['Ppm_h'] = df_temp.P_h / df_temp.M_h
df_temp['Ppm_a'] = df_temp.P_a / df_temp.M_a
df_temp['Ppm'] = df_temp.P / df_temp.M

# Actual goals scored
df_temp['G_h'] = _total('Home', 'G_Home')
df_temp['G_a'] = _total('Away', 'G_Away')
df_temp['G'] = df_temp.G_a + df_temp.G_h

# Actual goals conceded
df_temp['GA_h'] = _total('Home', 'G_Away')
df_temp['GA_a'] = _total('Away', 'G_Home')
df_temp['GA'] = df_temp.GA_a + df_temp.GA_h

# Goal difference, and home-minus-away splits of the xG totals
df_temp['GD'] = df_temp.G - df_temp.GA
df_temp['delta_xG_ha'] = df_temp.xG_h - df_temp.xG_a
df_temp['delta_xGA_ha'] = df_temp.xGA_h - df_temp.xGA_a

### EPL TABLE

In [None]:
# League table: ranked by points, then goal difference, then goals scored.
# Goals for / against are shaded so high and low values stand out.
(
    df_temp[['Team', 'M', 'P', 'GD', 'G', 'GA']]
    .sort_values(by=['P', 'GD', 'G'], ascending=False)
    .style
    .background_gradient(cmap='RdYlGn', subset=['G', 'GA'])
)

### xG Scored vs Conceded

In [None]:
# Two rankings side by side: xG created per match (best attack on top) and
# xG conceded per match (best defence on top).
fig, (ax_scored, ax_conceded) = plt.subplots(1, 2, figsize=(8, 5))

sns.barplot(orient='h', x='xGpm', y='Team',
            data=df_temp.sort_values(by='xGpm', ascending=False),
            ax=ax_scored)
ax_scored.set_title("xG Scored per match")
ax_scored.grid(which='both', axis='x')

sns.barplot(orient='h', x='xGApm', y='Team',
            data=df_temp.sort_values(by='xGApm', ascending=True),
            ax=ax_conceded)
# Title previously read "Conceaded" (typo visible in the saved figure).
ax_conceded.set_title("xG Conceded per match")
ax_conceded.grid(which='both', axis='x')

fig.tight_layout()
fig.savefig('xg_xa.png')
plt.show()

In [None]:
# Attack vs defence map: x = xG conceded per match, y = xG created per match.
# Dashed lines mark the league averages and split the plot into quadrants.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("xG Scored Vs xG Conceded")
sns.scatterplot(data=df_temp, x='xGApm', y='xGpm', ax=ax)

# Label each point with the team abbreviation, nudged off the marker.
# zip() avoids relying on df_temp having a 0..n-1 index.
for conceded, scored, team in zip(df_temp.xGApm, df_temp.xGpm, df_temp.Team):
    ax.text(conceded + 0.01, scored + 0.01, team, fontdict={'fontsize': 8})

ax.set_xlabel("xG conceded Per match")
ax.set_ylabel("xG Scored Per match")
ax.set_xlim(df_temp.xGApm.min() - 0.2, df_temp.xGApm.max() + 0.2)
ax.set_ylim(df_temp.xGpm.min() - 0.2, df_temp.xGpm.max() + 0.2)
ax.axhline(y=df_temp.xGpm.mean(), ls='--', color='k')
ax.axvline(x=df_temp.xGApm.mean(), ls='--', color='k')

# Quadrant captions are anchored to the corners of the axes (axes
# coordinates, 0..1) instead of fixed data coordinates, so they stay inside
# the plot whatever the data range is for the current gameweek.
# The bottom-right caption used to be labelled "Q3" a second time.
quadrant_captions = [
    (0.98, 0.98, 'right', 'top', "Q1\nStrong Attack\nWeak Defence"),
    (0.02, 0.98, 'left', 'top', "Q2\nStrong Attack\nStrong Defence"),
    (0.02, 0.02, 'left', 'bottom', "Q3\nWeak Attack\nStrong Defence"),
    (0.98, 0.02, 'right', 'bottom', "Q4\nWeak Attack\nWeak Defence"),
]
for x_pos, y_pos, h_align, v_align, caption in quadrant_captions:
    ax.text(x_pos, y_pos, caption, transform=ax.transAxes,
            ha=h_align, va=v_align, alpha=0.7, fontsize=9, color='red')

#fig.savefig('scatter_xg_xa.png')
plt.show()

In [None]:
# Net xG per match (created minus conceded), strongest team at the top.
fig, ax = plt.subplots(figsize=(6, 4))
ranked_by_delta = df_temp.sort_values(by='delta_xGpm', ascending=False)
sns.barplot(orient='h', x='delta_xGpm', y='Team', data=ranked_by_delta, ax=ax)
ax.set_title("xG Scored - xG Conceded")
ax.grid(which='both', axis='x')
ax.set_xlabel('Delta xG')
fig.tight_layout()
#fig.savefig('delta_xg.png')
plt.show()

### Home Advantage

In [None]:
# xG created per match, split by venue (home on the left, away on the right).
# The per-match columns are plotted so the bars agree with the titles; the
# season totals (xG_h / xG_a) used previously are not comparable between
# teams that have played a different number of home or away matches.
fig, (ax_home, ax_away) = plt.subplots(1, 2, figsize=(8, 5))

sns.barplot(orient='h', x='xGpm_h', y='Team',
            data=df_temp.sort_values(by='xGpm_h', ascending=False),
            ax=ax_home)
ax_home.set_title("xG Scored at home per match")
ax_home.grid(which='both', axis='x')

sns.barplot(orient='h', x='xGpm_a', y='Team',
            data=df_temp.sort_values(by='xGpm_a', ascending=False),
            ax=ax_away)
ax_away.set_title("xG Scored at away per match")
ax_away.grid(which='both', axis='x')

fig.tight_layout()
plt.show()

In [None]:
# xG conceded per match, split by venue; lowest (best defence) at the top.
# The per-match columns are plotted so the bars agree with the titles; the
# season totals (xGA_h / xGA_a) used previously depend on how many home or
# away matches each team has played so far.
fig, (ax_home, ax_away) = plt.subplots(1, 2, figsize=(8, 5))

sns.barplot(orient='h', x='xGApm_h', y='Team',
            data=df_temp.sort_values(by='xGApm_h', ascending=True),
            ax=ax_home)
ax_home.set_title("xG Conceded at home per match")
ax_home.grid(which='both', axis='x')

sns.barplot(orient='h', x='xGApm_a', y='Team',
            data=df_temp.sort_values(by='xGApm_a', ascending=True),
            ax=ax_away)
ax_away.set_title("xG Conceded at away per match")
ax_away.grid(which='both', axis='x')

fig.tight_layout()
plt.show()

In [None]:
# Home-minus-away xG total for each team; teams are listed in order of
# their overall xG (highest first), not in order of the plotted value.
teams_by_total_xg = df_temp.sort_values(by='xG', ascending=False)
sns.barplot(x='delta_xG_ha', y='Team', data=teams_by_total_xg)
plt.show()

### Prediction

In [None]:
# Fixtures after the last completed gameweek. Only the first three columns of
# the match list are kept; reset_index() keeps the original row label as an
# 'index' column and renumbers the rows from 0.
df_fix = (
    df.loc[df.GW > gw_last]
    .iloc[:, :3]
    .reset_index()
)

def predict_xg(df_fix, team_stats=None):
    """Predict expected goals for every fixture in `df_fix`.

    A side's predicted xG is the average of its own xG created per match and
    its opponent's xG conceded per match.

    Parameters
    ----------
    df_fix : DataFrame
        Fixtures with 'Home' and 'Away' team columns. The prediction columns
        are added to this frame in place.
    team_stats : DataFrame, optional
        Per-team table with 'Team', 'xGpm' and 'xGApm' columns. Defaults to
        the notebook-level `df_temp`.

    Returns
    -------
    DataFrame
        `df_fix` with 'G_home', 'G_away', 'GD' (home minus away) and 'GS'
        (home plus away) added, sorted by 'GD' in descending order.
    """
    if team_stats is None:
        team_stats = df_temp

    # One value per team. Summing reproduces the previous per-row lookup,
    # including its treatment of a missing (NaN) rate as 0.
    rates = team_stats.groupby('Team')[['xGpm', 'xGApm']].sum()
    attack, defence = rates['xGpm'], rates['xGApm']

    def _rate(teams, table):
        # A team missing from the table contributes 0, as before.
        return teams.map(table).fillna(0.0)

    # Whole-column assignment replaces the old `df_fix.G_home[i] = ...` loop:
    # that chained assignment writes to a temporary object and leaves the
    # frame untouched when pandas Copy-on-Write is active.
    df_fix['G_home'] = (_rate(df_fix.Home, attack) + _rate(df_fix.Away, defence)) / 2
    df_fix['G_away'] = (_rate(df_fix.Away, attack) + _rate(df_fix.Home, defence)) / 2

    df_fix['GD'] = df_fix['G_home'] - df_fix['G_away']
    df_fix['GS'] = df_fix['G_home'] + df_fix['G_away']
    return df_fix.sort_values(by='GD', ascending=False)

df_fix = predict_xg(df_fix)

# Predictions for the next gameweek only, without the leading 'index' column,
# with predicted goal difference and total goals shaded.
next_gw_fixtures = df_fix.loc[df_fix.GW == gw_next].iloc[:, 1:]
styler = next_gw_fixtures.style.background_gradient(cmap='RdYlGn',
                                                    subset=['GD', 'GS'])
# Styler.hide_index() no longer exists in pandas 2.x; Styler.hide() is its
# replacement. Fall back to the old method on versions that predate hide().
if hasattr(styler, 'hide'):
    df_styled = styler.hide(axis='index')
else:
    df_styled = styler.hide_index()
#dfi.export(df_styled,"mytable.png")
df_styled

### Fixture Difficulty Rating

In [None]:
# Fixture difficulty rating: for each of the next `weeks_for_fdr` gameweeks,
# a team's score is the predicted goal difference of its fixture seen from
# that team's side (positive = favourable fixture).
weeks_for_fdr = 3

# Column label -> gameweek number, e.g. 'GW8' -> 8 when gw_last is 7.
gw_dict = {"GW" + str(gw_last + offset): gw_last + offset
           for offset in range(1, weeks_for_fdr + 1)}
gw_cols = list(gw_dict.keys())

df_fdr = pd.DataFrame({'Team': df_temp.Team})

for gw_label, gw_number in gw_dict.items():
    gw_fixtures = df_fix[df_fix.GW == gw_number]
    gw_home_teams = gw_fixtures.Home.unique()

    # Home side: predicted GD as is. Otherwise: negated GD of the fixture in
    # which the team is the away side (0 when it has no fixture that week).
    df_fdr[gw_label] = df_fdr.Team.apply(
        lambda team: gw_fixtures[gw_fixtures.Home == team].GD.sum()
        if team in gw_home_teams
        else -gw_fixtures[gw_fixtures.Away == team].GD.sum())

# Average over the gameweek columns only. A bare df_fdr.mean(axis=1) also
# sees the string 'Team' column, which raises on pandas 2.x.
# The mean is taken on the raw scores, before the per-column rescaling below.
df_fdr['Mean'] = df_fdr[gw_cols].mean(axis=1)

# Rescale every gameweek column to [0, 1] independently for the colour map.
sc = MinMaxScaler()
for col in gw_cols:
    scaled = sc.fit_transform(np.array(df_fdr[col]).reshape(-1, 1))
    df_fdr[col] = scaled.ravel()  # assign a 1-D array, not an (n, 1) matrix

df_fdr = df_fdr.sort_values(by='Mean', ascending=False)
df_fdr.style.background_gradient(cmap='RdYlGn', subset=gw_cols)

### Actual Vs Expected Goals For and Against (After updating GW8 results)

In [None]:
# Re-read the match list (expected to contain the results of gameweek
# `gw_next` by now) and compare them with the predictions in df_fix.
df_result = pd.read_csv("../input/english-premier-league-xg-data/EPL_result.csv")
# .copy() makes this an independent frame: the columns added below would
# otherwise be assigned on a filtered slice of the frame that was just read.
df_result = df_result[df_result.GW == gw_next].copy()

# Same name -> abbreviation mapping as the main match list.
df_result['Home'] = df_result.Home.apply(lambda name: team_abb[name])
df_result['Away'] = df_result.Away.apply(lambda name: team_abb[name])

df_result['Match'] = df_result.Home + " Vs " + df_result.Away
df_result["act_GD"] = df_result.G_Home - df_result.G_Away
df_result["act_xGD"] = df_result.xG_Home - df_result.xG_Away

# Predictions for the same gameweek, looked up by home team.
next_fix = df_fix[df_fix.GW == gw_next]


def _predicted(column):
    """Predicted `column` for each result row, matched on the home team (0 if none)."""
    return df_result.Home.apply(
        lambda team: next_fix.loc[next_fix.Home == team, column].sum())


df_result["pred_xG_Home"] = _predicted('G_home')
df_result["pred_xG_Away"] = _predicted('G_away')
df_result["pred_xGD"] = _predicted('GD')

# Absolute error of the predicted xG difference; best predictions first.
df_result['xG_diff'] = (df_result.pred_xGD - df_result.act_xGD).abs()
df_result = df_result.sort_values(by='xG_diff', ascending=True)
df_result

In [None]:
# Predicted and actual xG difference per match, drawn on the same axes with
# translucent bars so that overlapping values remain visible.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Predicted xG difference vs Actual xG difference")

bar_series = (('pred_xGD', 'red', "Predicted"),
              ('act_xGD', 'blue', "Actual"))
for value_col, bar_colour, legend_name in bar_series:
    sns.barplot(y='Match', x=value_col, orient='h', data=df_result,
                color=bar_colour, alpha=0.8, label=legend_name, ax=ax)

ax.set_xlabel("Expected Goal Difference")
ax.legend(loc="upper left")
ax.grid(axis='x')
fig.savefig('pred_vs_act.png')
plt.show()