In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 23)


# Load the match results CSV into `pl_df` and preview the first rows.
pl_df = pd.read_csv('../input/english-premier-league-results/results.csv')
pl_df.head()

In [None]:
# Exclude the 1993-94 and 1994-95 seasons: the analysis below relies on half-time
# results and other match events that are not accounted for in those seasons.
excluded_seasons = ['1993-94', '1994-95']
pl_df = pl_df[~pl_df.Season.isin(excluded_seasons)]

In [None]:
# Some data is still missing, but we can analyze what is present even in these earlier years.
# For the sake of graphing, convert seasons from strings such as '1995-96' to integers,
# using the starting year (1995) to keep things simple.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: return a Series rather than a one-column DataFrame
# - astype(str) + assign: the cell can be re-run after Season is already an int, and
#   no column is set on a filtered view of the original frame
pl_df = pl_df.assign(
    Season=pl_df['Season'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
)
pl_df.head()

# First analysis: in how many games did the half-time result match the end in general? How has this changed over the years?

In [None]:
# Percentage of matches whose half-time result equals the full-time result.
# The original run reported 60.4%: the half-time result predicts the full-time
# result a majority of the time.
half_equals_full = pl_df['HTR'] == pl_df['FTR']
len(pl_df.loc[half_equals_full]) / len(pl_df) * 100

In [None]:
# Per-season share of games in which the half-time result matched the full-time result.
# Author's reading: no discernible pattern; half-time results have become neither more
# nor less reliable as a predictor of full-time results over this period.
half_equals_full = pl_df['FTR'] == pl_df['HTR']
gp = pl_df.groupby('Season')['FTR'].count().rename('GP')
htr = pl_df.loc[half_equals_full].groupby('Season')['FTR'].count().rename("HalfPredictsFull")
results = (
    pd.concat([htr, gp], axis=1)
    .assign(PredictionRate=lambda df: df['HalfPredictsFull'] / df['GP'] * 100)
    .reset_index()
)
results

In [None]:
# We do see less spread as time goes on, though
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=results, x="Season", y="PredictionRate")
# Derive the ticks from the data so the last season gets a tick too
# (a fixed range(1995, 2020) stops at 2019 because the end is exclusive).
plot.set_xticks(range(results["Season"].min(), results["Season"].max() + 1));

# Second analysis: Are some refs disproportionately biased by home crowds?

In [None]:
# Drop the 1995-1999 seasons, where referee data isn't found, and renumber the rows.
seasons_without_referee = range(1995, 2000)
rest = pl_df[~pl_df.Season.isin(seasons_without_referee)].reset_index(drop=True)
rest.head()

Let's first look at teams' general home trends

In [None]:
# Home record per (season, home team): games played (HP), wins (HW), losses (HL), draws (HD).
team_keys = ['Season', 'HomeTeam']
hp = rest.groupby(team_keys)['FTR'].count().rename("HP")
hw = rest.loc[rest['FTR'].eq('H')].groupby(team_keys)['FTR'].count().rename("HW")
hl = rest.loc[rest['FTR'].eq('A')].groupby(team_keys)['FTR'].count().rename("HL")
hd = rest.loc[rest['FTR'].eq('D')].groupby(team_keys)['FTR'].count().rename("HD")
# A team with no home wins/losses/draws in a season is absent from that series, so the
# concat leaves NaN there; those gaps mean "zero games" and are filled with 0.
home_games = pd.concat([hp, hw, hl, hd], axis=1).fillna(0).reset_index()
home_games

In [None]:
# League-wide home record per season. Only the numeric count columns are summed:
# a bare groupby(...).sum() would also try to aggregate the string HomeTeam column
# (recent pandas versions concatenate the team names into one long string).
home_records = home_games.groupby('Season')[['HP', 'HW', 'HL', 'HD']].sum()
home_records

In [None]:
# Home-team record per (season, referee): games (HP), home wins (HW),
# home losses (HL) and draws (HD) in the matches each referee took charge of.
ref_keys = ['Season', 'Referee']
hp = rest.groupby(ref_keys)['FTR'].count().rename("HP")
hw = rest.loc[rest['FTR'].eq('H')].groupby(ref_keys)['FTR'].count().rename("HW")
hl = rest.loc[rest['FTR'].eq('A')].groupby(ref_keys)['FTR'].count().rename("HL")
hd = rest.loc[rest['FTR'].eq('D')].groupby(ref_keys)['FTR'].count().rename("HD")

# NaN after the concat means the referee had no game with that outcome in that season.
refs = pd.concat([hp, hw, hl, hd], axis=1).fillna(0).reset_index()
refs

In [None]:
# Points won at home per season (3 per win, 1 per draw) and the average per home game.
home_records = home_records.assign(
    HWPoints=lambda df: df['HW'] * 3,
    HDPoints=lambda df: df['HD'].astype(int),
    AverageHomePoints=lambda df: (df['HDPoints'] + df['HWPoints']) / df['HP'],
)
home_records

In [None]:
# Average points won by the home team per game, for each referee and season
# (3 points per home win, 1 per draw).
refs = refs.assign(
    HWPoints=lambda df: df['HW'].astype(int) * 3,
    HDPoints=lambda df: df['HD'].astype(int),
    AverageHomePoints=lambda df: (df['HDPoints'] + df['HWPoints']) / df['HP'],
)
refs

Let's do an example analysis of just the most recent season, 2020-21

In [None]:
# Referee table for the 2020-21 season only (copy, so edits do not touch `refs`).
refs21 = refs.loc[refs['Season'] == 2020].copy()

# A Moss has one game; some research shows J Moss has one game too few, and there is no
# A Moss on the Premier League's site, suggesting A Moss is actually J Moss.
# Fold A Moss's counts into J Moss, locating both rows by name instead of by hard-coded
# row labels (which silently break whenever the upstream row order changes).
# assumes the names appear exactly as 'A Moss' / 'J Moss' in this season — TODO confirm
is_a_moss = refs21['Referee'] == 'A Moss'
is_j_moss = refs21['Referee'] == 'J Moss'
if is_a_moss.any() and is_j_moss.any():
    for col in ['HP', 'HW', 'HL', 'HD', 'HWPoints', 'HDPoints']:
        refs21.loc[is_j_moss, col] += refs21.loc[is_a_moss, col].sum()
    # Recalculate the rate from the merged totals.
    refs21.loc[is_j_moss, 'AverageHomePoints'] = (
        refs21.loc[is_j_moss, 'HDPoints'] + refs21.loc[is_j_moss, 'HWPoints']
    ) / refs21.loc[is_j_moss, 'HP']
    refs21 = refs21.loc[~is_a_moss]

# Keep the original row labels in an 'index' column, as before.
refs21 = refs21.reset_index()
refs21

In [None]:
# Per-referee average home points for 2020-21; the horizontal line is the
# league-wide average home points for that season.
sns.set(rc={'figure.figsize': (25, 10)})
season_avg_2020 = home_records.loc[2020, 'AverageHomePoints']
plt.axhline(y=season_avg_2020, color='blue', linewidth=2)
sns.swarmplot(data=refs21, x="Referee", y="AverageHomePoints")

The 2020-21 season had the lowest home advantage in general, due to a lack of crowds caused by the COVID pandemic; this is represented in the data by the lowest AverageHomePoints of any of the seasons we are examining. For the remainder of this analysis, we shall drop that season, as well as any refs with under 20 games for the remaining period, and calculate the average over the entire period for teams in general and for specific referees.

In [None]:
# Remove the 2020-21 row. errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because the 2020 label is already gone.
home_records = home_records.drop(2020, errors='ignore')
home_records

In [None]:
# Every season has the same number of games, so each season's average carries the
# same weight and a plain mean of the per-season averages is enough.
home_records['AverageHomePoints'].mean()

In [None]:
# The referee names follow inconsistent conventions: some are full first-and-last names,
# some use only the first initial, and some are written 'last name, initials'.
# Rewrite the 'last name, initials' form as 'initials last name' so that, when splitting
# on spaces, the last token is always the referee's last name.
# Last names are not unique on their own: see 'Taylor' in the output, for example.
has_comma = refs["Referee"].str.contains(",")
swapped = refs.loc[has_comma, "Referee"].str.split(", ").str[::-1].str.join(" ")
refs.loc[has_comma, "Referee"] = swapped
refs.Referee.unique()

In [None]:
# Before reducing names to 'first initial + last name', remove the stray
# replacement characters (�) that appear in some referee names.
refs["Referee"] = refs["Referee"].str.replace("�", "", regex=False)

refs.Referee.unique()

In [None]:
# Build a canonical key per referee: first initial + last name, upper-cased
# (some names differ only in casing).
# Surrounding whitespace is stripped and the name is split on any run of whitespace, so
# a stray leading/trailing/double space cannot yield a blank initial or an empty last
# name, which would split one referee across several keys.
referee_clean = refs["Referee"].str.strip()
refs["Name"] = (referee_clean.str[0] + " " + referee_clean.str.split().str[-1]).str.upper()

refs = refs.drop('Referee', axis=1)
refs

In [None]:
# Collapse each referee's per-season rows into a single row of totals, then recompute
# the average home points from the summed points and games (summed averages are meaningless).
refNames = (
    refs.groupby('Name')
    .sum()
    .assign(AverageHomePoints=lambda df: (df['HDPoints'] + df['HWPoints']) / df['HP'])
)
refNames

In [None]:
# Drop referees who managed under 20 games: with such a small sample their
# averages are outliers that carry little meaning.
min_games = 20
refNames = refNames.loc[refNames['HP'].ge(min_games)]
refNames

In [None]:
# Compare each referee's average home points with the overall average
# (horizontal line = mean of the per-season league averages).
sns.set(rc={'figure.figsize': (75, 10)})
overall_avg = home_records.AverageHomePoints.mean()
plt.axhline(y=overall_avg, color='blue', linewidth=2)
sns.swarmplot(data=refNames, x=refNames.index, y="AverageHomePoints")

In [None]:
# Looking at the graph, we can see that the likes of S Tanner and P Jones tend to manage games
# where home teams do better than average; analysis of who played in games they were in charge of,
# and how successful those teams were, could explain this, rather than simply assuming they are
# just easily swayed by home crowds.

# Third Analysis: How has shot conversion changed over time?

In [None]:
# Preview the columns available for the shot analysis rather than dumping the whole frame.
rest.head()

In [None]:
# Season totals of shots (HS/AS), shots on target (HST/AST) and goals (FTHG/FTAG),
# for home and away sides; the conversion rates are derived from these in the next cell.
shot_columns = ['HS', 'AS', 'HST', 'AST', 'FTHG', 'FTAG']
shot_conv = rest.groupby('Season')[shot_columns].sum().reset_index()

In [None]:
# Rates computed for the whole game ("Total") and for home/away sides separately:
#   Pure       = goals / shots
#   Target     = shots on target / shots
#   GoalTarget = goals / shots on target
shot_conv['Shots_Total'] = shot_conv['HS'].add(shot_conv['AS'])
shot_conv['Shots_OnTarget_Total'] = shot_conv['HST'].add(shot_conv['AST'])
shot_conv['Goals_Total'] = shot_conv['FTHG'].add(shot_conv['FTAG'])

# Column holding each quantity, per side.
side_columns = {
    'Total': {'shots': 'Shots_Total', 'on_target': 'Shots_OnTarget_Total', 'goals': 'Goals_Total'},
    'Home': {'shots': 'HS', 'on_target': 'HST', 'goals': 'FTHG'},
    'Away': {'shots': 'AS', 'on_target': 'AST', 'goals': 'FTAG'},
}
# (numerator, denominator) for each rate.
rate_definitions = {
    'Pure': ('goals', 'shots'),
    'Target': ('on_target', 'shots'),
    'GoalTarget': ('goals', 'on_target'),
}
for rate_name, (numerator, denominator) in rate_definitions.items():
    for side, cols in side_columns.items():
        shot_conv[f'{side}_{rate_name}_Rate'] = shot_conv[cols[numerator]] / shot_conv[cols[denominator]]
shot_conv

In [None]:
# Of the years examined, we can see that total shot conversion peaked in 2001-02, and while it has been
# up-and-down after that, the general trend is that conversion rates are decreasing over time.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Total_Pure_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# We see here a peak in home conversion rate in 2009-10, while the rate in 2001-02 is nowhere near as far off
# from the regression as the total rate. Generally, home conversion rate has followed the overall rate in
# decreasing over time.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Home_Pure_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# The high peak in 2001-02 seems to stem from the away conversion rate, as seen here. Again, this rate is
# following the general trend of decreasing over time.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Away_Pure_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# There is a drastic drop-off in 2013-14; prior numbers are well above what we'd expect.
# Perhaps pre-2012 numbers are inaccurate?
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Total_Target_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# Before we dissect this, though, let's look at how the number of shots taken changed over time.
# We see a general increase over time. Let's do the same for both home and away teams.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Shots_Total")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# Home shots per season.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="HS")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# Away shots per season. Both home and away shots seem to follow the general trend shown.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="AS")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# Author's observation from this table: shots on target drop drastically in 2013,
# which produces the drop-off seen in the Total_Target_Rate plot.
# Since the remainder of the analysis in this section relies on shots on target,
# the rows before 2013 are dropped in the next cell.
shot_conv

In [None]:
# Keep only the seasons from 2013 onwards, then redo the plots looking for concrete trends.
from_2013 = shot_conv['Season'].ge(2013)
shot_conv = shot_conv.loc[from_2013]
shot_conv

In [None]:
# Here we can see that from 2013 onwards, shots have tended to be more accurate.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Total_Target_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# The correlation for shots-on-target to goals also trends positively, albeit loosely.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Total_GoalTarget_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# We can see that home teams specifically have followed the general trend of taking more accurate shots over time.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Home_Target_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# The away teams also follow the trend, and actually seem to be improving in this regard faster than home teams.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Away_Target_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# We can see here that in terms of converting shots-on-target, home teams have generally improved,
# although the 2020-21 season weighs things down and causes the regression to be nearly horizontal.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Home_GoalTarget_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# With away teams we see the inverse; it's hard to see a real pattern in away teams' conversion
# of shots on target, but the 2020-21 season brings the regression slope up
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Away_GoalTarget_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# Suspicions for home goals are confirmed; 2020-21 was such an outlier it caused the general trend to be ignored.
# Exclude the 2020-21 season by its Season value rather than by row label 20, which only
# matches that season while the frame keeps its original positional index.
no20 = shot_conv[shot_conv["Season"] != 2020]
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=no20, x="Season", y="Home_GoalTarget_Rate")
# Ticks come from the data so the last remaining season is labelled too (a range end is exclusive).
plot.set_xticks(range(no20["Season"].min(), no20["Season"].max() + 1));

In [None]:
# 2020 was indeed making the relationship between time and away goals from shots on target
# more interesting; the slope of the regression is much lower now
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=no20, x="Season", y="Away_GoalTarget_Rate")
# Ticks come from the data so the last remaining season is labelled too (a range end is exclusive).
plot.set_xticks(range(no20["Season"].min(), no20["Season"].max() + 1));

In [None]:
# Since we've been analyzing a smaller sample size, let's look at the general goals/shot rates for this period.
# We can see more shots are tending to end in goals from 2013-14 on.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Total_Pure_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# We see a generally positive trend over time for home teams, with the 2020-21 season being an exception
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Home_Pure_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# We see the same for away teams, except here 2020-21 is more of a positive outlier.
# Given the reduced home-field advantage for that season mentioned earlier, let's omit that and look at the trends again.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Away_Pure_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

In [None]:
# General trend is still positive
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=no20, x="Season", y="Total_Pure_Rate")
# Ticks come from the data so the last remaining season is labelled too (a range end is exclusive).
plot.set_xticks(range(no20["Season"].min(), no20["Season"].max() + 1));

In [None]:
# The home trend is indeed more positive than before, and the regression seems to fit better now
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=no20, x="Season", y="Home_Pure_Rate")
# Ticks come from the data so the last remaining season is labelled too (a range end is exclusive).
plot.set_xticks(range(no20["Season"].min(), no20["Season"].max() + 1));

In [None]:
# The away trend is less positive than before, as anticipated.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=no20, x="Season", y="Away_Pure_Rate")
# Ticks come from the data so the last remaining season is labelled too (a range end is exclusive).
plot.set_xticks(range(no20["Season"].min(), no20["Season"].max() + 1));

In [None]:
# Earlier we looked at general shot numbers and saw they tended to increase; let's look at that for this period.
# We actually see a decrease in shots taken over the last 7 seasons, despite the ratios of goals/shots improving.
# This suggests strikers are taking lower-risk efforts, explaining the reduced volume and increased success rates of their shots.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=shot_conv, x="Season", y="Shots_Total")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(shot_conv["Season"].min(), shot_conv["Season"].max() + 1));

Trends over the past 20 seasons suggest shots are less likely to be goals, but over the past 7 seasons, shots are becoming more accurate, and both shots in general and shots on target are becoming more likely to result in goals, suggesting either an improvement in strikers or a decline in the abilities of defenders and goalkeepers. The reduced volume of total shots over the more recent period, in conjunction with this, implies forwards are taking shots less frequently but more accurately. This suggests to me that players are not being as risky with their efforts over the past few years; they are preferring to wait for better chances before taking their shots.

# Fourth analysis: Can corners predict goals or results?

Generally, teams win corners from attacking moves. Teams tend to win corners when they are on the front foot. So let's see if there are patterns we can find with them. Do teams with more corners tend to win? Do they tend to score more goals? Do they tend to have more shots? Intuitively, one would think there's some relationship here.

In [None]:
# Take the columns relevant to the corner analysis. The explicit copy makes `corners`
# an independent frame, so the derived columns added to it later are not written to a
# slice of `rest` (which triggers pandas' SettingWithCopyWarning).
corners = rest[['FTHG','FTAG','FTR','HS','AS','HC','AC']].copy()
corners

In [None]:
# Score a "corners predict the result" rule with a 3-corner margin. A match counts as
# correctly called when:
#   * it ended in a draw and the corner counts differ by at most 3, or
#   * the home side won with at least 3 more corners than the away side, or
#   * the away side won with at least 3 more corners than the home side.
# NOTE(review): a win by the side that leads by only 1-2 corners counts as a miss here,
# although the "leeway" idea may have meant it to count as accurate — confirm intent.
# The original run reported this rule being right only about 38.6% of the time.
corner_diff = corners['HC'] - corners['AC']
draw_called = (corner_diff.abs() <= 3) & (corners['FTR'] == 'D')
home_called = (corner_diff >= 3) & (corners['FTR'] == 'H')
away_called = (corner_diff <= -3) & (corners['FTR'] == 'A')
corner_res = corners[draw_called | home_called | away_called]
corner_res.shape[0]/corners.shape[0]

In [None]:
# How about goals, though? Let's plot goals against corners and see what we find.
# In general, we see that games with many corners aren't often high-scoring games.
# assign() builds a new frame instead of setting columns on what may be a slice of
# `rest`, and makes the cell safe to re-run.
corners = corners.assign(
    TotGoals=corners['FTHG'] + corners['FTAG'],
    TotCorners=corners['HC'] + corners['AC'],
)

sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=corners, x="TotCorners", y="TotGoals", scatter=False)

In [None]:
# Regression of home goals on home corners (regression line only, no scatter points).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="HC", y="FTHG", data=corners, scatter=False)

In [None]:
# When we plot each team's goals against their own corners, though, we see a different pattern.
# Individually, teams with more corners tend to have more goals, even if games with more total corners don't.
# This cell: away goals against away corners (regression line only).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="AC", y="FTAG", data=corners, scatter=False)

In [None]:
# Let's do the same, this time looking for a relationship with shots, rather than goals.
# Here, in all variations, we see a direct relationship between corners and shots.
# assign() builds a new frame instead of setting a column on what may be a slice of `rest`.
corners = corners.assign(TotShots=corners['HS'] + corners['AS'])

sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=corners, x="TotCorners", y="TotShots", scatter=False)

In [None]:
# Regression of home shots on home corners (regression line only).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="HC", y="HS", data=corners, scatter=False)

In [None]:
# Regression of away shots on away corners (regression line only).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="AC", y="AS", data=corners, scatter=False)

So, in conclusion: corners have a strong direct relationship with shots taken, both in total for the game and for each team. They have a clear relationship with goals as well, when looking at specific teams, but not at the overall game (that is, looking at away goals and away corners, not total goals and total corners). There isn't a strong relationship between final results and corners, however.

# Fifth analysis: How have bookings-per-foul changed over time?

There is a common complaint that the sport has 'gone soft', referring to the idea that 'lighter' fouls are punished more harshly than before. We can't analyze the severity of the fouls themselves, but we can see if more fouls have been given over time (which would suggest more incidents are being called fouls), and if more yellow or red cards are being given relative to the number of fouls a team commits. We're going to look at this by season, rather than by individual game.

In [None]:
# Season totals of fouls, yellow cards and red cards for home and away sides,
# plus combined totals for each.
discipline_columns = {
    'HF': 'Home_Fouls',
    'AF': 'Away_Fouls',
    'HY': 'Home_Yellows',
    'AY': 'Away_Yellows',
    'HR': 'Home_Reds',
    'AR': 'Away_Reds',
}
fouls = (
    rest.groupby('Season')[list(discipline_columns)]
    .sum()
    .rename(columns=discipline_columns)
    .reset_index()
)
for measure in ('Fouls', 'Yellows', 'Reds'):
    fouls[f'Total_{measure}'] = fouls[f'Home_{measure}'] + fouls[f'Away_{measure}']
fouls

In [None]:
# Let's first look at how fouls have changed over time.
# Interestingly, we actually see fewer fouls being given over time. Does this hold true for both home and away teams?

sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Total_Fouls")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Home fouls per season.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Home_Fouls")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Yep, for both home and away teams, the trend seems to be that fewer fouls are being given.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Away_Fouls")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Cards per foul: yellow and red card rates for the whole game and for each side.
for card, card_total in (('Yellow', 'Yellows'), ('Red', 'Reds')):
    for side in ('Total', 'Home', 'Away'):
        fouls[f'{side}_{card}_Rate'] = fouls[f'{side}_{card_total}'] / fouls[f'{side}_Fouls']

In [None]:
# We do see that a greater percentage of fouls are receiving yellow cards recently.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Total_Yellow_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Again, both home and away teams seem to follow this trend.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Home_Yellow_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Away yellow cards per foul, by season.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Away_Yellow_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# While more yellow cards are given per foul, it seems red cards are actually given less often now.
# The correlation is not great, though.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Total_Red_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Again, the trend holds for both home and away teams, and the correlation is just as weak for them,
# if not weaker, than when looking at the total numbers.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Home_Red_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

In [None]:
# Away red cards per foul, by season.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(data=fouls, x="Season", y="Away_Red_Rate")
# Ticks come from the data so the last season is labelled too (a range end is exclusive).
plot.set_xticks(range(fouls["Season"].min(), fouls["Season"].max() + 1));

So, it seems complaints about the game having 'gone soft', in the sense that more incidents are being given as fouls, aren't supported by the data: the number of fouls given has actually been decreasing. When it comes to what counts as a bookable offense, the complaint has a point, since a greater share of fouls now leads to a yellow card; red cards, however, are not being handed out more frequently now than twenty years ago.

# Sixth Analysis: Is there a relationship between fouls and goals conceded?

Intuitively, fouls tend to come from defensive actions. More fouls would imply more defending. More defending would imply the opposition are getting more chances, and thus goals. Let's look for a correlation.

In [None]:
# Goals, shots and fouls for both sides: the columns needed for the fouls-vs-goals analysis.
fouls_conceded_columns = ['FTHG', 'FTAG', 'HS', 'AS', 'HF', 'AF']
foulsConc = rest[fouls_conceded_columns]
foulsConc

In [None]:
# What we see in this plot is that there is a direct relationship between home fouls and away goals.
# This implies that as home teams have to do more defending, they are more likely to concede.
# Note that the correlation doesn't seem particularly strong.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="HF", y="FTAG", data=foulsConc, scatter=False)

In [None]:
# For away teams, however, we see the inverse. The more fouls an away team commits, the fewer goals they concede.
# The correlation here also appears to be much stronger than in the previous plot.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="AF", y="FTHG", data=foulsConc, scatter=False)

In [None]:
# We see the same trend in terms of shots; away teams shoot more when home teams commit more fouls,
# while home teams shoot less as away teams foul more.
# This cell: away shots against home fouls.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="HF", y="AS", data=foulsConc, scatter=False)

In [None]:
# Home shots against away fouls (regression line only).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="AF", y="HS", data=foulsConc, scatter=False)

In [None]:
# In these next couple of graphs, we see both home and away teams are less likely to score the more they foul.
# This follows from the earlier intuition, that higher-fouling teams are on the defensive more often.
# This cell: home goals against home fouls.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="HF", y="FTHG", data=foulsConc, scatter=False)

In [None]:
# Away goals against away fouls (regression line only).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="AF", y="FTAG", data=foulsConc, scatter=False)

In [None]:
# We see here that the same holds true for shots, as one would expect.
# This cell: home shots against home fouls.
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="HF", y="HS", data=foulsConc, scatter=False)

In [None]:
# Away shots against away fouls (regression line only).
sns.set(rc={'figure.figsize': (25, 10)})
plot = sns.regplot(x="AF", y="AS", data=foulsConc, scatter=False)

We see here that home teams score less the more an away team fouls them, while away teams score more the more fouls a home team makes. The same trends persist for shots. Both, however, score/shoot less as they themselves make more fouls.