In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px

In [None]:
# Load the Premier League matches CSV from the Kaggle input directory into `df`.
df = pd.read_csv('../input/all-premier-league-matches-20102021/df_full_premierleague.csv')

In [None]:
# Names of the columns whose dtype is `object` (typically strings).
obj_cols = df.dtypes[df.dtypes == 'object'].index

In [None]:
# Preview the first rows of the object-typed columns.
df[obj_cols].head()

In [None]:
# Parse the `date` column (ISO `YYYY-MM-DD` strings) into datetimes so it can be compared against timestamps later.
df['date'] = pd.to_datetime(df['date'],format= '%Y-%m-%d')

In [None]:
# Split the full-time score string on the first '-' into home and away goals.
# `expand=True` returns a two-column frame directly. The previous form
# (`x, y = series.str.split('-', 1).str`) relied on iterating the `.str`
# accessor and on a positional `n`, both of which are rejected by pandas 2.x.
score_parts = df['result_full'].str.split('-', n=1, expand=True)
df['home_goals'] = score_parts[0].astype(int)
df['away_goals'] = score_parts[1].astype(int)
df['total_goals'] = df['home_goals'] + df['away_goals']

In [None]:
# Derive 0/1 outcome flags from the home side's goal margin;
# a draw (margin of zero) leaves both flags at 0.
goal_margin = df['home_goals'] - df['away_goals']
df['home_win'] = (goal_margin > 0).astype(int)
df['away_win'] = (goal_margin < 0).astype(int)

In [None]:
# Preview the first rows of `df`, including the goal and win columns added above.
df.head()

In [None]:
# Total goals each club has scored in its home matches, highest first.
overall_home_goals = (
    df.groupby('home_team')['home_goals']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'home_team': 'Home Team', 'home_goals': 'Goals at Home'})
)

### Most Goals scored at Home

In [None]:
# Bar chart of the ten clubs with the most goals scored at home.
top_home_scorers = overall_home_goals.head(10)
fig = px.bar(
    top_home_scorers,
    x='Home Team',
    y='Goals at Home',
    color='Home Team',
)
fig.update_layout(
    title='Most Home Goals since 10/11 Season',
    xaxis_title='Team',
    yaxis_title='Goals',
)
fig.show()

### Most Goals Scored Away

In [None]:
# Total goals each club has scored in its away matches, highest first.
overall_away_goals = (
    df.groupby('away_team')['away_goals']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Bar chart of the ten clubs with the most away goals; the figure returned by
# `update_layout` is the cell's last expression, so the notebook renders it.
top_away_scorers = overall_away_goals.head(10)
fig = px.bar(
    top_away_scorers,
    x='away_team',
    y='away_goals',
    color='away_team',
)
fig.update_layout(
    title='Most Away Goals Since 10/11 Season',
    xaxis_title='Team',
    yaxis_title='Goals',
)

In [None]:
# Number of home, away and total matches per club.
# The two counts are joined on the team name (the groupby index) rather than
# by row position, so a club that appears on only one side, or a different
# ordering of the two groupbys, cannot attach a count to the wrong club.
home_games = df.groupby('home_team')['season'].count().rename('Home Games')
away_games = df.groupby('away_team')['season'].count().rename('Away Games')
game_count = (
    pd.concat([home_games, away_games], axis=1)
    .fillna(0)          # a club missing on one side has played 0 such games
    .astype(int)
    .sort_index()
    .rename_axis('Team')
    .reset_index()
)
game_count['Total Games'] = game_count['Home Games'] + game_count['Away Games']

### Overall table of Home/Away games (sorted by Total Goals)

In [None]:
# Per-club goal totals (home, away, combined) with per-game rates.
# Home and away sums are aligned on the team name instead of being pasted
# together positionally via `list(...)`, which silently mislabels rows (or
# raises on a length mismatch) if the two groupbys ever differ.
home_totals = df.groupby('home_team')['home_goals'].sum().rename('Home Goals')
away_totals = df.groupby('away_team')['away_goals'].sum().rename('Away Goals')
overall_goals = (
    pd.concat([home_totals, away_totals], axis=1)
    .fillna(0)          # no matches on one side means 0 goals there
    .astype(int)
    .sort_index()
    .rename_axis('Team')
    .reset_index()
)
overall_goals['Difference'] = overall_goals['Home Goals'] - overall_goals['Away Goals']
overall_goals['Total Goals'] = overall_goals['Home Goals'] + overall_goals['Away Goals']
overall_goals = overall_goals.sort_values('Total Goals', ascending=False)

# Attach the match counts so the totals can be normalised per game.
overall_goals = overall_goals.merge(game_count, how='inner', on='Team')
overall_goals['Goals/Game'] = overall_goals['Total Goals'] / overall_goals['Total Games']
overall_goals['Home Goals/Game'] = overall_goals['Home Goals'] / overall_goals['Home Games']
overall_goals['Away Goals/Game'] = overall_goals['Away Goals'] / overall_goals['Away Games']
overall_goals

### Which team has scored the largest share of all goals since the 10/11 season?

In [None]:
# Pie chart of each club's share of the goals in `overall_goals`.
px.pie(
    overall_goals,
    values='Total Goals',
    names='Team',
    title='Goals scored as % of total',
)

In [None]:
def get_value(x):
    """Return the bar colour used for team ``x`` in the per-team charts.

    Some clubs map to a plain CSS colour name; the others map to one entry of
    a plotly qualitative palette (``px.colors.qualitative.<palette>[index]``).
    A team with no entry yields ``None``.
    """
    # Clubs drawn with a named CSS colour.
    named_colours = {
        'Arsenal': 'red',
        'Leeds United': 'white',
        'Liverpool': 'red',
        'Manchester United': 'red',
    }
    if x in named_colours:
        return named_colours[x]

    # Clubs drawn with a palette entry: (palette name, position in palette).
    # The palette is resolved only when needed.
    palette_slots = {
        'Aston Villa': ('Safe', 1),
        'Burnley': ('Safe', 8),
        'Chelsea': ('G10', 0),
        'Everton': ('G10', 0),
        'Manchester City': ('Light24', 13),
        'Southampton': ('Set1', 0),
        'Tottenham Hotspur': ('Bold', 2),
        'West Ham United': ('Safe', 8),
    }
    if x in palette_slots:
        palette_name, slot = palette_slots[x]
        return getattr(px.colors.qualitative, palette_name)[slot]

    # Unmapped team: no colour override.
    return None

### How often has each of these teams scored x goals at home?

In [None]:
# Count how many home matches each club finished with each goal tally.
home_goals_freq = (
    df.groupby(['home_team', 'home_goals'])['season']
    .count()
    .reset_index()
)

featured_teams = [
    'Arsenal', 'Aston Villa', 'Burnley', 'Chelsea', 'Everton', 'Leeds United',
    'Leicester City', 'Liverpool', 'Manchester City', 'Manchester United',
    'Southampton', 'Tottenham Hotspur', 'West Ham United',
]

# One bar chart per featured club, coloured via `get_value`.
for team in featured_teams:
    print("The distribution of goals for {}'s home games are: -".format(team))
    team_freq = home_goals_freq[home_goals_freq['home_team'] == team]
    fig = px.bar(team_freq, x='home_goals', y='season')
    fig.update_layout(
        title=team,
        xaxis_title='Number of Goals',
        yaxis_title='Frequency',
    )
    fig.update_traces(marker_color=get_value(team))
    fig.show()

### Top 20 Highest Scoring games since 10/11 season

In [None]:
# Keep matches with more than five goals, then show the 20 highest-scoring ones.
high_scoring_games = df[df['total_goals'] > 5]
summary_columns = ['home_team', 'away_team', 'date', 'result_full', 'result_ht', 'total_goals']
ranked_games = high_scoring_games.sort_values('total_goals', ascending=False)
ranked_games[summary_columns].head(20)

In [None]:
# Average goals per match for each club at home and away.
# The two averages are aligned on the team name rather than pasted together
# positionally via `list(...)`, so a club can never receive another club's
# away average if the two groupbys differ in membership or order.
home_average = df.groupby('home_team')['home_goals'].mean().rename('Home Average')
away_average = df.groupby('away_team')['away_goals'].mean().rename('Away Average')
avg_goals = (
    pd.concat([home_average, away_average], axis=1)
    .sort_index()
    .rename_axis('Team')
    .reset_index()
)

### How do teams play at home compared to away?

In [None]:
# Scatter of each club's home vs away scoring average, split into quadrants
# by two dashed reference lines.
# NOTE(review): the reference values are hard-coded; presumably they are the
# means of the plotted 'Away Average' / 'Home Average' values. TODO: confirm
# and derive them from `avg_goals` instead.
AWAY_REFERENCE = 1.099552708794958
HOME_REFERENCE = 1.3823472204742036

fig = px.scatter(
    avg_goals,
    x='Home Average',
    y='Away Average',
    color='Team',
    title='Home/Away Performance by PL Teams',
)
fig.add_hline(y=AWAY_REFERENCE, line_dash='dash', line_width=1, line_color='red')
fig.add_vline(x=HOME_REFERENCE, line_dash='dash', line_width=1, line_color='red')

# (x, y, text) for one label per quadrant. The bottom-left quadrant is below
# both reference lines, so its label is "Home and Away" (it previously read
# "Home than Away").
quadrant_labels = [
    (2, 1.9, 'These teams score well at<br>Home and Away'),
    (2, 0.8, 'These teams score better at<br> Home than Away'),
    (0.78, 1.9, 'These teams score better at<br> Away than Home'),
    (0.78, 0.8, 'These teams score poorly at<br> Home and Away'),
]
for x_pos, y_pos, label in quadrant_labels:
    fig.add_annotation(x=x_pos, y=y_pos, text=label, showarrow=False)
fig.show()


While teams are usually expected to play better in their own stadium than on the road, Crystal Palace, Leeds and Blackburn have shown they are better at Away games.

# United's Managers Performance by Home/Away Record

In [None]:
# Manchester United's home matches, split into one frame per manager.
# Every window is (start, end]: the lower bound is exclusive and the upper
# bound inclusive. With strict `<` / `>` on both sides of a shared boundary,
# a match played exactly on that date landed in no manager's frame.
united = df[df['home_team'] == 'Manchester United']
played_on = united['date']

saf_end = pd.to_datetime('2013-05-19')
moyes_end = pd.to_datetime('2014-04-22')
giggs_end = pd.to_datetime('2014-05-11')
gaal_start, gaal_end = pd.to_datetime('2014-07-16'), pd.to_datetime('2016-05-23')
mou_start, mou_end = pd.to_datetime('2016-05-27'), pd.to_datetime('2018-12-18')

saf = united[played_on <= saf_end]
moyes = united[(played_on > saf_end) & (played_on <= moyes_end)]
giggs = united[(played_on > moyes_end) & (played_on <= giggs_end)]
gaal = united[(played_on > gaal_start) & (played_on <= gaal_end)]
mou = united[(played_on > mou_start) & (played_on <= mou_end)]
ole = united[played_on > mou_end]

In [None]:
# Manchester United's away matches, split into one frame per manager.
# Every window is (start, end]: the lower bound is exclusive and the upper
# bound inclusive. With strict `<` / `>` on both sides of a shared boundary,
# a match played exactly on that date (e.g. 2013-05-19 or 2014-05-11) landed
# in no manager's frame.
united_away = df[df['away_team'] == 'Manchester United']
played_on = united_away['date']

saf_end = pd.to_datetime('2013-05-19')
moyes_end = pd.to_datetime('2014-04-22')
giggs_end = pd.to_datetime('2014-05-11')
gaal_start, gaal_end = pd.to_datetime('2014-07-16'), pd.to_datetime('2016-05-23')
mou_start, mou_end = pd.to_datetime('2016-05-27'), pd.to_datetime('2018-12-18')

saf2 = united_away[played_on <= saf_end]
moyes2 = united_away[(played_on > saf_end) & (played_on <= moyes_end)]
giggs2 = united_away[(played_on > moyes_end) & (played_on <= giggs_end)]
gaal2 = united_away[(played_on > gaal_start) & (played_on <= gaal_end)]
mou2 = united_away[(played_on > mou_start) & (played_on <= mou_end)]
ole2 = united_away[played_on > mou_end]

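Explanation: -
1. I filtered the full dataframe down to United's home games (rows where home_team is Manchester United) and, separately, to their away games (rows where away_team is Manchester United)
1. I looked up the date range of each manager's tenure and split both United dataframes by those dates
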
#### Records of different United Managers

In [None]:
# Win/loss/draw record of each United manager, at home and away.
united_managers = ['SAF', 'Moyes', 'Giggs', 'Van Gaal', 'Mou', 'Ole']
home_spells = [saf, moyes, giggs, gaal, mou, ole]
away_spells = [saf2, moyes2, giggs2, gaal2, mou2, ole2]

# At home a United win is a `home_win` and a defeat an `away_win`;
# in away matches the two flags swap roles.
home_wins = pd.DataFrame({
    'Managers': united_managers,
    'Wins': [spell['home_win'].sum() for spell in home_spells],
    'Losses': [spell['away_win'].sum() for spell in home_spells],
    'Total': [spell['season'].count() for spell in home_spells],
})
away_wins = pd.DataFrame({
    'Managers': united_managers,
    'Wins': [spell['away_win'].sum() for spell in away_spells],
    'Losses': [spell['home_win'].sum() for spell in away_spells],
    'Total': [spell['season'].count() for spell in away_spells],
})

# Derived columns are the same for both tables. The points column keeps the
# name 'Home Points' in the away table too, so the two frames concatenate
# column-for-column.
for record in (home_wins, away_wins):
    record['Draws'] = record['Total'] - (record['Wins'] + record['Losses'])
    record['Home Points'] = record['Wins'] * 3 + record['Draws'] * 1
    record['Win %'] = ((record['Wins'] / record['Total']) * 100).round(2)
    record['Point %'] = (record['Home Points'] / (record['Total'] * 3) * 100).round(2)

print('Home Table:')
display(home_wins)
print('Away Table:')
display(away_wins)

# Stack both tables and tag each row so the chart can group bars by venue.
total = pd.concat([home_wins, away_wins], axis=0)
total['Home/Away'] = ['Home'] * len(home_wins) + ['Away'] * len(away_wins)
px.bar(
    total,
    x='Managers',
    y='Win %',
    color='Home/Away',
    barmode='group',
    text='Win %',
    title='Home/Away Win % by Manager (Wins/Total Games)',
    hover_data=['Wins', 'Losses', 'Total'],
    color_discrete_sequence=["red", "black"],
)

In [None]:
# Share of available points won by each United manager, home vs away.
px.bar(
    total,
    x='Managers',
    y='Point %',
    color='Home/Away',
    barmode='group',
    text='Point %',
    title='Home/Away Point % by Manager (Points/Total Possible Points)',
    hover_data=['Wins', 'Losses', 'Total'],
    color_discrete_sequence=["red", "black"],
)

#### Manchester United's Home/Away Goal Averages

In [None]:
# Average goals United scored per match under each manager, home and away.
united_managers = ['SAF', 'Moyes', 'Giggs', 'Van Gaal', 'Mou', 'Ole']
home_spells = [saf, moyes, giggs, gaal, mou, ole]
away_spells = [saf2, moyes2, giggs2, gaal2, mou2, ole2]

home = pd.DataFrame({
    'Managers': united_managers,
    'Avg Goals': [spell['home_goals'].mean() for spell in home_spells],
    'Games Managed': [spell['home_goals'].count() for spell in home_spells],
})
away = pd.DataFrame({
    'Managers': united_managers,
    'Avg Goals': [spell['away_goals'].mean() for spell in away_spells],
    'Games Managed': [spell['away_goals'].count() for spell in away_spells],
})

# Long form (one row per manager and venue) feeds the chart;
# wide form (one row per manager) is the table that gets displayed.
total = pd.concat([home, away], axis=0).round(2)
total['Home/Away'] = ['Home'] * len(home) + ['Away'] * len(away)
total2 = home.merge(away, how='inner', on='Managers', suffixes=('_home', '_away')).round(2)

display(total2)
px.bar(
    total,
    x='Managers',
    y='Avg Goals',
    color='Home/Away',
    barmode='group',
    text='Avg Goals',
    title='Home/Away Goals For Avg by Manager',
    hover_data=['Home/Away', 'Games Managed'],
    color_discrete_sequence=["red", "black"],
)

#### * Giggs' average cannot be taken too seriously, since he was an interim manager for only a handful of games at the end of the 2013/14 season.

In [None]:
# Goals scored in each United home match over time, one line per manager.
fig, ax = plt.subplots(figsize=(20, 8))

home_spells = [
    ('SAF', saf),
    ('Moyes', moyes),
    ('Giggs', giggs),
    ('Van Gaal', gaal),
    ('Mourinho', mou),
    ('Ole', ole),
]
for manager, spell in home_spells:
    # x and y are passed by keyword for every line: seaborn >= 0.12 rejects
    # positional x/y, which the first call previously used.
    sns.lineplot(x=spell['date'], y=spell['home_goals'], ax=ax, label=manager)

ax.set_title('Home Game Performance by United Manager')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Goals')
plt.show()

# Analyzing City's Managers

In [None]:
# Manchester City's home and away matches, split into one frame per manager.
# All comparisons are strict, so a match on a boundary date joins no frame;
# matches between Mancini's end and Pellegrini's start are left out.
mancini_end = pd.to_datetime('2013-05-13')
pellegrini_start = pd.to_datetime('2013-06-14')
pellegrini_end = pd.to_datetime('2016-06-30')

city = df.loc[df['home_team'] == 'Manchester City']
home_dates = city['date']
man = city.loc[home_dates < mancini_end]
pelle = city.loc[(home_dates > pellegrini_start) & (home_dates < pellegrini_end)]
pep = city.loc[home_dates > pellegrini_end]

city_away = df.loc[df['away_team'] == 'Manchester City']
away_dates = city_away['date']
man2 = city_away.loc[away_dates < mancini_end]
pelle2 = city_away.loc[(away_dates > pellegrini_start) & (away_dates < pellegrini_end)]
pep2 = city_away.loc[away_dates > pellegrini_end]

In [None]:
# Win/loss/draw record of each City manager, at home and away.
city_managers = ['Mancini', 'Pellegrini', 'Guardiola']
home_spells = [man, pelle, pep]
away_spells = [man2, pelle2, pep2]

# At home a City win is a `home_win` and a defeat an `away_win`;
# in away matches the two flags swap roles.
home_wins = pd.DataFrame({
    'Managers': city_managers,
    'Wins': [spell['home_win'].sum() for spell in home_spells],
    'Losses': [spell['away_win'].sum() for spell in home_spells],
    'Total': [spell['season'].count() for spell in home_spells],
})
away_wins = pd.DataFrame({
    'Managers': city_managers,
    'Wins': [spell['away_win'].sum() for spell in away_spells],
    'Losses': [spell['home_win'].sum() for spell in away_spells],
    'Total': [spell['season'].count() for spell in away_spells],
})

# Derived columns are the same for both tables. The points column keeps the
# name 'Home Points' in the away table too, so the two frames concatenate
# column-for-column.
for record in (home_wins, away_wins):
    record['Draws'] = record['Total'] - (record['Wins'] + record['Losses'])
    record['Home Points'] = record['Wins'] * 3 + record['Draws'] * 1
    record['Win %'] = ((record['Wins'] / record['Total']) * 100).round(2)
    record['Point %'] = (record['Home Points'] / (record['Total'] * 3) * 100).round(2)
    record['PPG'] = (record['Home Points'] / record['Total']).round(2)

print('Home Table:')
display(home_wins)
print('Away Table:')
display(away_wins)

# Stack both tables and tag each row so the chart can group bars by venue.
total = pd.concat([home_wins, away_wins], axis=0)
total['Home/Away'] = ['Home'] * len(home_wins) + ['Away'] * len(away_wins)

px.bar(
    total,
    x='Managers',
    y='Win %',
    color='Home/Away',
    barmode='group',
    text='Win %',
    title='Home/Away Win % by Manager (Wins/Total Games)',
    hover_data=['Wins', 'Losses', 'Total'],
    color_discrete_sequence=[px.colors.qualitative.Light24[13], 'black'],
)

In [None]:
# Share of available points won by each City manager, home vs away.
px.bar(
    total,
    x='Managers',
    y='Point %',
    color='Home/Away',
    barmode='group',
    text='Point %',
    title='Home/Away Point % by Manager (Points/Total Possible Points)',
    hover_data=['Wins', 'Losses', 'Total'],
    color_discrete_sequence=[px.colors.qualitative.Light24[13], 'black'],
)

In [None]:
# Points per game for each City manager, home vs away.
px.bar(
    total,
    x='Managers',
    y='PPG',
    color='Home/Away',
    barmode='group',
    text='PPG',
    title='PPG by Manager (Points/Total Games)',
    hover_data=['Wins', 'Losses', 'Total'],
    color_discrete_sequence=[px.colors.qualitative.Light24[13], 'black'],
)

In [None]:
# Average goals City scored per match under each manager, home and away.
city_managers = ['Mancini', 'Pellegrini', 'Guardiola']
home_spells = [man, pelle, pep]
away_spells = [man2, pelle2, pep2]

home = pd.DataFrame({
    'Managers': city_managers,
    'Avg Goals': [spell['home_goals'].mean() for spell in home_spells],
    'Games Managed': [spell['home_goals'].count() for spell in home_spells],
})
away = pd.DataFrame({
    'Managers': city_managers,
    'Avg Goals': [spell['away_goals'].mean() for spell in away_spells],
    'Games Managed': [spell['away_goals'].count() for spell in away_spells],
})

# Long form (one row per manager and venue) feeds the chart;
# wide form (one row per manager) is the table that gets displayed.
total = pd.concat([home, away], axis=0).round(2)
total['Home/Away'] = ['Home'] * len(home) + ['Away'] * len(away)
total2 = home.merge(away, how='inner', on='Managers', suffixes=('_home', '_away')).round(2)

display(total2)
px.bar(
    total,
    x='Managers',
    y='Avg Goals',
    color='Home/Away',
    barmode='group',
    text='Avg Goals',
    title='Home/Away Goal Avg by Manager',
    hover_data=['Home/Away', 'Games Managed'],
    color_discrete_sequence=[px.colors.qualitative.Light24[13], 'black'],
)