![Image](https://e0.365dm.com/20/05/2048x1152/skysports-bundesliga-dortmund_4983488.jpg)
# Are teams playing worse?  
A friend commented that the quality of play has seemed worse since the league resumed, and suggested that one cause might be the lack of fans watching live in the stadium (i.e. the 12th player). An example of how this can manifest is the crowd encouraging the home team, cheering passes etc., while booing and creating a generally hostile environment when the away team has the ball.

Another aspect of home-field advantage is the impact a crowd can have on a referee. A considerable body of academic research has long suggested that “all or part of home advantage” is down to “refereeing decisions being subconsciously in favor of the home team” (Gleave). 

This notebook will be updated as more games are played.

Factors to be explored:
1. Change in Home win rate.
2. Change in Home team penalisation by referee.

In [None]:
import sklearn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Match-results file (D1.csv) from the Kaggle European football dataset.
MATCHES_CSV = '/kaggle/input/european-football-database-20192020/D1.csv'

game_data = pd.read_csv(MATCHES_CSV)
game_data

In [None]:
# Keep only the columns needed for the win-rate analysis; the remaining match
# statistics will be looked at later.
# FTHG / FTAG / FTR are presumably full-time home goals, full-time away goals
# and full-time result ('H', 'D' or 'A') -- confirm against the dataset notes.
RESULT_COLUMNS = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']

# .copy() makes win_data an independent frame, so adding columns to it later
# is unambiguous and does not raise SettingWithCopyWarning.
win_data = game_data[RESULT_COLUMNS].copy()
win_data

In [None]:
# Turn the single result column (FTR: 'H', 'A' or 'D') into 0/1 indicator
# columns, seen from both the home and the away side, to simplify the
# aggregations that follow.
#
# The flags are built with an equality test. The previous `x is 'H'` form
# compared object identity rather than value: it only appeared to work because
# CPython happens to reuse some short string objects, and Python 3.8+ emits a
# SyntaxWarning for it.
#
# .assign() returns a new frame, so no chained assignment takes place and the
# global `mode.chained_assignment` warning no longer needs to be silenced.
full_time_result = win_data['FTR']
home_won = (full_time_result == 'H').astype(int)
away_won = (full_time_result == 'A').astype(int)
drawn = (full_time_result == 'D').astype(int)

win_data = (
    win_data
    .assign(
        HomeWin=home_won,
        HomeLoss=away_won,   # a home loss is an away win
        HomeDraw=drawn,
        AwayWin=away_won,
        AwayLoss=home_won,   # an away loss is a home win
        AwayDraw=drawn,
    )
    .drop(columns='FTR')
)
win_data

In [None]:
# Split the season into the matches before and after the break.
# The split is positional: the first N_PRE_BREAK_MATCHES rows count as
# pre-break, everything after as post-break.
# NOTE(review): assumes the rows are in chronological order -- confirm.
N_PRE_BREAK_MATCHES = 224

preCOVID = win_data.iloc[:N_PRE_BREAK_MATCHES]
postCOVID = win_data.iloc[N_PRE_BREAK_MATCHES:]

# Current Table  
 (as of 28/06/2020)

In [None]:
# Build the current league table from every match played so far.
pd.options.display.float_format = '{:,.1f}'.format

# Season totals per team from its home fixtures. The goal columns are renamed
# straight away so that the home and away halves never share a column label;
# the earlier version had duplicate FTHG/FTAG labels and had to pick them out
# by position.
home_totals = (
    win_data.groupby('HomeTeam')
    .agg({'HomeTeam': 'count', 'FTHG': 'sum', 'FTAG': 'sum',
          'HomeWin': 'sum', 'HomeDraw': 'sum', 'HomeLoss': 'sum'})
    .rename(columns={'HomeTeam': 'HomeGames', 'FTHG': 'HomeGF', 'FTAG': 'HomeGA'})
)

# Same totals from the team's away fixtures: here the away goals (FTAG) are
# the goals scored and the home goals (FTHG) are the goals conceded.
away = (
    win_data.groupby('AwayTeam')
    .agg({'AwayTeam': 'count', 'FTAG': 'sum', 'FTHG': 'sum',
          'AwayWin': 'sum', 'AwayDraw': 'sum', 'AwayLoss': 'sum'})
    .rename(columns={'AwayTeam': 'AwayGames', 'FTAG': 'AwayGF', 'FTHG': 'AwayGA'})
)

# Rows are aligned on the team name (the index of both frames).
table = pd.concat([home_totals, away], axis=1)
table['MP'] = table['HomeGames'] + table['AwayGames']
table['W'] = table['HomeWin'] + table['AwayWin']
table['D'] = table['HomeDraw'] + table['AwayDraw']
table['L'] = table['HomeLoss'] + table['AwayLoss']
table['GF'] = table['HomeGF'] + table['AwayGF']
table['GA'] = table['HomeGA'] + table['AwayGA']
table['GD'] = table['GF'] - table['GA']
table['Points'] = 3 * table['W'] + table['D']   # 3 for a win, 1 for a draw

# Rank by points; ties are broken by goal difference and then goals scored,
# the usual league-table convention.
table = (
    table[['MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Points']]
    .sort_values(['Points', 'GD', 'GF'], ascending=False)
)
table

## A table showing more detailed information

In [None]:
# Per-team breakdown: games played, average goals in those games and the
# win/draw/loss counts, separately for home and away fixtures.
home_agg = {'HomeTeam': 'count', 'FTHG': 'mean', 'FTAG': 'mean',
            'HomeWin': 'sum', 'HomeDraw': 'sum', 'HomeLoss': 'sum'}
away_agg = {'AwayTeam': 'count', 'FTAG': 'mean', 'FTHG': 'mean',
            'AwayWin': 'sum', 'AwayDraw': 'sum', 'AwayLoss': 'sum'}

home_side = win_data.groupby('HomeTeam').agg(home_agg)
away = win_data.groupby('AwayTeam').agg(away_agg)
both_sides = pd.concat([home_side, away], axis=1)

# 3 points per win, 1 per draw, home and away combined.
points = (
    3 * (both_sides['HomeWin'] + both_sides['AwayWin'])
    + both_sides['HomeDraw']
    + both_sides['AwayDraw']
)

table = (
    both_sides
    .assign(Points=points)
    .rename(columns={
        'HomeTeam': 'HomeGames',
        'AwayTeam': 'AwayGames',
        'FTHG': 'AveHomeGoals',
        'FTAG': 'AveAwayGoals',
    })
    .sort_values('Points', ascending=False)
)
table

# Home Stats

The hypothesis is that the lack of fans in the stadium may result in a drop in both the home win rate and the number of home goals scored.  

The final three columns of the following dataframe give a good insight into the claim.

In [None]:
def color_negative_red(value):
    """
    Return the CSS colour rule used to style one dataframe cell.

    Negative values are shown in red and positive values in green.
    Anything else (zero, or NaN, which is neither below nor above
    zero) is left in plain black.
    """
    if value < 0:
        return 'color: red'
    if value > 0:
        return 'color: green'
    return 'color: black'

In [None]:
# Home-fixture performance of every team before vs. after the COVID break.
pd.options.display.float_format = '{:,.1f}'.format

# Aggregations applied to each team's home fixtures: number of matches,
# average goals scored (FTHG) and conceded (FTAG), and win/draw/loss counts.
home_agg = {'HomeTeam': 'count', 'FTHG': 'mean', 'FTAG': 'mean',
            'HomeWin': 'sum', 'HomeDraw': 'sum', 'HomeLoss': 'sum'}

# pre-COVID break
data = preCOVID.groupby('HomeTeam').agg(home_agg)
data.columns = ['# pre-COVID matches', 'AveGF1', 'AveGA1', 'W', 'D', 'L']
data['WinRate1'] = data['W'] / data['# pre-COVID matches']
data = data.sort_values('WinRate1', ascending=False)

# post-COVID break
data2 = postCOVID.groupby('HomeTeam').agg(home_agg)
data2.columns = ['# post-COVID matches', 'AveGF2', 'AveGA2', 'W2', 'D2', 'L2']
data2['WinRate2'] = data2['W2'] / data2['# post-COVID matches']

# Side-by-side comparison (rows aligned on team name) plus post-minus-pre deltas.
home = pd.concat([data, data2], axis=1)
home['ΔGF'] = home['AveGF2'] - home['AveGF1']
home['ΔGA'] = home['AveGA2'] - home['AveGA1']
home['ΔWinRate'] = home['WinRate2'] - home['WinRate1']

# One complete format mapping: whole numbers by default, then the overrides.
# Two chained .format() calls are not reliable for this, because on newer
# pandas releases a dict formatter resets every column it does not list,
# which would discard the whole-number format on the count columns.
column_formats = {column: "{:.0f}" for column in home.columns}
column_formats.update({
    'WinRate1': "{:.1%}", 'WinRate2': "{:.1%}", 'ΔWinRate': "{:.1%}",
    'AveGF1': "{:.1f}", 'AveGA1': "{:.1f}",
    'AveGF2': "{:.1f}", 'AveGA2': "{:.1f}",
    'ΔGF': "{:.1f}", 'ΔGA': "{:.1f}",
})

# Colour only the delta columns: red for a drop, green for a rise.
s = (
    home.style
    .applymap(color_negative_red, subset=['ΔWinRate', 'ΔGF', 'ΔGA'])
    .format(column_formats)
)
s

In [None]:
# Bar charts of the league-wide mean (across teams) of home goals scored and
# home win rate, before vs. after the break, followed by the mean changes.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

periods = ['Pre-COVID', 'Post-COVID']
panels = [
    ('Home Goals Scored', ('AveGF1', 'AveGF2')),
    ('Home Win Rate', ('WinRate1', 'WinRate2')),
]
for ax, (title, columns) in zip(axs, panels):
    ax.bar(periods, [home[column].mean() for column in columns])
    ax.set_title(title)

print(f"Change in Mean Home Goals: {home['ΔGF'].mean():.1f}")
print(f"Change in Home Win Rate: {home['ΔWinRate'].mean():.1%}")

### statistical significance of the change in win rate
This isn't finished yet

In [None]:
# TODO: assess the statistical significance of the change in win rate.
# For now, just print the mean and standard deviation (across teams) of the
# home win rate before and after the break.
#
# NOTE: `home` must still be the home-fixture frame from the Home Stats cell.
# The Away Stats cell further down rebinds `home`, so run this cell before it.
for period, column in (('Pre-COVID', 'WinRate1'), ('Post-COVID', 'WinRate2')):
    print(f"{period} Win Rate: {home[column].mean():.3f}")
    # The post-COVID standard deviation used to be printed under the plain
    # "Win Rate" label; both periods now label it as "sd".
    print(f"{period} Win Rate sd: {home[column].std():.3f}")

# Whole-season home win rate per team, for reference. The columns describe
# the full season, so they are no longer labelled "pre-COVID".
data = win_data.groupby('HomeTeam').agg(
    {'HomeTeam': 'count', 'FTHG': 'mean', 'FTAG': 'mean',
     'HomeWin': 'sum', 'HomeDraw': 'sum', 'HomeLoss': 'sum'}
)
data.columns = ['# matches', 'AveGF', 'AveGA', 'W', 'D', 'L']
data['WinRate'] = data['W'] / data['# matches']
print(f"Season Average Win Rate: {data['WinRate'].mean():.3f}")




## Conclusions
1. There is a small decrease in home goals scored after the COVID break, which gives support to the hypothesis.
2. There is also a decent reduction in Home win rate, again supporting the hypothesis.
3. A likely contributing factor to the big drop for Dortmund is that Bayern had already won the league, so there was less drive for the players to win.
4. There is also a noticeable post-COVID improvement for Bayern and Hertha; it would be interesting to understand why. (Just checked: Hertha got a new manager, which would explain the big performance difference.)

# Away Stats  
We can use the away stats to further interrogate the significance of the Home game stats.  
According to our hypothesis, the lack of home team advantage from fans should result in improved Away team performance.  

The final three columns of the following dataframe give a good insight into the claim.

In [None]:
# Away-fixture performance of every team before vs. after the COVID break.
pd.options.display.float_format = '{:,.1f}'.format

# Aggregations applied to each team's away fixtures: number of matches,
# average goals scored (FTAG) and conceded (FTHG), and win/draw/loss counts.
away_agg = {'AwayTeam': 'count', 'FTAG': 'mean', 'FTHG': 'mean',
            'AwayWin': 'sum', 'AwayDraw': 'sum', 'AwayLoss': 'sum'}

# pre-COVID break
data3 = preCOVID.groupby('AwayTeam').agg(away_agg)
data3.columns = ['# pre-COVID matches', 'AveGF1', 'AveGA1', 'W', 'D', 'L']
data3['WinRate1'] = data3['W'] / data3['# pre-COVID matches']
data3 = data3.sort_values('WinRate1', ascending=False)

# post-COVID break
data4 = postCOVID.groupby('AwayTeam').agg(away_agg)
data4.columns = ['# post-COVID matches', 'AveGF2', 'AveGA2', 'W2', 'D2', 'L2']
# The away win rate has to come from the away frame itself. It was previously
# computed from the home post-COVID frame, so the "away" win rate shown after
# the break was really the home win rate.
data4['WinRate2'] = data4['W2'] / data4['# post-COVID matches']

# NOTE: the result is still bound to `home` (although it now holds away
# figures) because the plotting cell that follows reads it under that name.
home = pd.concat([data3, data4], axis=1)
home['ΔGF'] = home['AveGF2'] - home['AveGF1']
home['ΔGA'] = home['AveGA2'] - home['AveGA1']
home['ΔWinRate'] = home['WinRate2'] - home['WinRate1']

# One complete format mapping: whole numbers by default, then the overrides.
# Two chained .format() calls are not reliable for this, because on newer
# pandas releases a dict formatter resets every column it does not list.
column_formats = {column: "{:.0f}" for column in home.columns}
column_formats.update({
    'WinRate1': "{:.1%}", 'WinRate2': "{:.1%}", 'ΔWinRate': "{:.1%}",
    'AveGF1': "{:.1f}", 'AveGA1': "{:.1f}",
    'AveGF2': "{:.1f}", 'AveGA2': "{:.1f}",
    'ΔGF': "{:.1f}", 'ΔGA': "{:.1f}",
})

# Colour only the delta columns: red for a drop, green for a rise.
s = (
    home.style
    .applymap(color_negative_red, subset=['ΔWinRate', 'ΔGF', 'ΔGA'])
    .format(column_formats)
)
s

In [None]:
# Bar charts of the league-wide mean (across teams) of away goals scored and
# away win rate, before vs. after the break, followed by the mean changes.
# `home` holds the away-fixture frame at this point in the notebook.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

periods = ['Pre-COVID', 'Post-COVID']
panels = [
    ('Away Goals Scored', ('AveGF1', 'AveGF2')),
    ('Away Win Rate', ('WinRate1', 'WinRate2')),
]
for ax, (title, columns) in zip(axs, panels):
    ax.bar(periods, [home[column].mean() for column in columns])
    ax.set_title(title)

print(f"Change in Mean Away Goals: {home['ΔGF'].mean():.1f}")
print(f"Change in Away Win Rate: {home['ΔWinRate'].mean():.1%}")

## Conclusions  
1. Slight increase in average goals scored by away team again supports the hypothesis that the removal of fans has led to a reduction in home advantage.
2. There is still a reduction in away team win rate, though this is much smaller than the reduction in win rate for home teams. This means that perhaps there was a negative effect on team performance in general from the break, due to other factors such as disrupted training.

# Home and Away Combined  
Overall stats (home and away fixtures combined) for before and after the COVID break.

In [None]:
def _league_table(matches):
    """
    Build a league table from a frame of matches.

    Parameters
    ----------
    matches : pandas.DataFrame
        One row per match, with the columns HomeTeam, AwayTeam, FTHG, FTAG and
        the 0/1 indicator columns HomeWin/HomeDraw/HomeLoss and
        AwayWin/AwayDraw/AwayLoss.

    Returns
    -------
    pandas.DataFrame
        Indexed by team, with the columns MP, W, D, L, GF, GA, GD and Points,
        ranked by points, then goal difference, then goals scored.
    """
    # Totals from home fixtures; goal columns are renamed so that the home and
    # away halves never share a label and can be combined by name.
    home_totals = (
        matches.groupby('HomeTeam')
        .agg({'HomeTeam': 'count', 'FTHG': 'sum', 'FTAG': 'sum',
              'HomeWin': 'sum', 'HomeDraw': 'sum', 'HomeLoss': 'sum'})
        .rename(columns={'HomeTeam': 'HomeGames', 'FTHG': 'HomeGF', 'FTAG': 'HomeGA'})
    )
    # Totals from away fixtures: away goals are scored, home goals conceded.
    away_totals = (
        matches.groupby('AwayTeam')
        .agg({'AwayTeam': 'count', 'FTAG': 'sum', 'FTHG': 'sum',
              'AwayWin': 'sum', 'AwayDraw': 'sum', 'AwayLoss': 'sum'})
        .rename(columns={'AwayTeam': 'AwayGames', 'FTAG': 'AwayGF', 'FTHG': 'AwayGA'})
    )

    standings = pd.concat([home_totals, away_totals], axis=1)
    standings['MP'] = standings['HomeGames'] + standings['AwayGames']
    standings['W'] = standings['HomeWin'] + standings['AwayWin']
    standings['D'] = standings['HomeDraw'] + standings['AwayDraw']
    standings['L'] = standings['HomeLoss'] + standings['AwayLoss']
    standings['GF'] = standings['HomeGF'] + standings['AwayGF']
    standings['GA'] = standings['HomeGA'] + standings['AwayGA']
    standings['GD'] = standings['GF'] - standings['GA']
    standings['Points'] = 3 * standings['W'] + standings['D']

    return (
        standings[['MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Points']]
        .sort_values(['Points', 'GD', 'GF'], ascending=False)
    )


# Same table for each period, shown side by side (rows aligned on team name).
# `keys` adds a top column level so the two otherwise identically named sets
# of columns can be told apart.
table_pre = _league_table(preCOVID)
table_post = _league_table(postCOVID)

table = pd.concat([table_pre, table_post], axis=1, keys=['Pre-COVID', 'Post-COVID'])
table

# Referee Home Bias  
Will explore this next