In [None]:
Here's the meaning or definition of each column or expression in the context of football statistics:

league: The football league or competition in which the match was played.

year: The year in which the match took place.

h_a: Indicates whether the match was played at home ('h') or away ('a').

xG: Expected goals, a statistical measure of the quality of scoring chances created by a team.

xGA: Expected goals against, a statistical measure of the quality of scoring chances conceded by a team.

npxG: Non-penalty expected goals, expected goals excluding penalty kicks.

npxGA: Non-penalty expected goals against, expected goals against excluding penalty kicks.

deep: Number of passes completed within an estimated 20 yards of the opponent's goal (crosses excluded).

deep_allowed: Number of opponent passes completed within an estimated 20 yards of the team's own goal (crosses excluded).

scored: Number of goals scored by the team.

missed: Number of goals conceded by the team.

xpts: Expected points, calculated based on expected goals for and against.

result: The result of the match (win, draw, or loss).

date: The date of the match.

wins: Number of matches won by the team.

draws: Number of matches drawn by the team.

loses: Number of matches lost by the team.

pts: Total points earned by the team.

npxGD: Non-penalty expected goals difference, the difference between non-penalty expected goals scored and conceded.

ppda_coef: Passes allowed per defensive action coefficient, a measure of defensive pressure.

ppda_att: Passes allowed per defensive action by the team.

ppda_def: Passes allowed per defensive action by the opponent.

oppda_coef: Opponent passes allowed per defensive action coefficient, a measure of offensive pressure faced by the team.

oppda_att: Opponent passes allowed per defensive action by the opponent.

oppda_def: Opponent passes allowed per defensive action by the team.

team: The name of the team.

xG_diff: Difference between actual goals scored and expected goals.

xGA_diff: Difference between actual goals conceded and expected goals against.

xpts_diff: Difference between actual points earned and expected points.

## Variables explanation
● xg - expected goals metric; a statistical measure of the quality of chances created and conceded. More at understat.com

● xg_diff - difference between actual goals scored and expected goals.

● npxg - expected goals without penalties and own goals.

● xga - expected goals against.

● xga_diff - difference between actual goals missed and expected goals against.

● npxga - expected goals against without penalties and own goals.

● npxgd - difference between "for" and "against" expected goals without penalties and own goals.

● ppda_coef - passes allowed per defensive action in the opposition half (power of pressure)

● oppda_coef - opponent passes allowed per defensive action in the opposition half (power of opponent's pressure)

● deep - passes completed within an estimated 20 yards of goal (crosses excluded)

● deep_allowed - opponent passes completed within an estimated 20 yards of goal (crosses excluded)

● xpts - expected points

● xpts_diff - difference between actual and expected points

In [None]:
 # xpts_diff - difference between actual and expected points

# A larger average xpts_diff in a league suggests two things: first, a league with a greater average xpts_diff has more exciting and competitive games than other leagues, because matches have more goals than expected (like the EPL); second, leagues with a greater average xpts_diff in home matches show the power of their fans and their effect (like the EPL and La Liga).
            



# Uses the notebook-level DataFrame `df`.
# Average xpts_diff for every (league, venue) pair; reset_index turns the
# group keys back into ordinary columns so they can be filtered and plotted.
mean_xpts_diff = (
    df.groupby(['league', 'h_a'])['xpts_diff']
    .mean()
    .reset_index()
)

# Split the averages by venue flag: 'h' rows and 'a' rows
played_at_home = mean_xpts_diff['h_a'] == 'h'
played_away = mean_xpts_diff['h_a'] == 'a'
mean_xpts_diff_home = mean_xpts_diff[played_at_home]
mean_xpts_diff_away = mean_xpts_diff[played_away]

# Rank leagues from highest to lowest average within each venue
mean_xpts_diff_home_sorted = mean_xpts_diff_home.sort_values(by='xpts_diff', ascending=False)
mean_xpts_diff_away_sorted = mean_xpts_diff_away.sort_values(by='xpts_diff', ascending=False)

# One panel per venue, sharing the y-axis so bar heights are comparable
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

panels = (
    ('Home', mean_xpts_diff_home_sorted),
    ('Away', mean_xpts_diff_away_sorted),
)
for ax, (venue, ranked) in zip(axes, panels):
    # `order` pins the bars to the ranking computed above
    sns.barplot(data=ranked, x='league', y='xpts_diff', order=ranked['league'], ax=ax)
    ax.set_title(f'Average xpts_diff by League ({venue})')
    ax.set_xlabel('League')
    ax.set_ylabel('Average xpts_diff')

plt.tight_layout()
plt.show()

In [None]:
## Difference between xg and actual scored goals for each league

## Difference between xg and actual scored goals for each league
def plot_top_clubs(league, top_n=20):
    """Show a bar chart of total points per club in one league, coloured by mean xg.

    Parameters
    ----------
    league : str
        Value of the ``league`` column to keep (e.g. ``'EPL'``).
    top_n : int, default 20
        Maximum number of clubs to display, ranked by total points.

    Reads the notebook-level ``df``. Displays the figure and returns ``None``.
    """
    league_rows = df.loc[df['league'] == league, ['team', 'xg', 'pts']]
    club_summary = (
        league_rows.groupby('team')
        .agg({'xg': 'mean', 'pts': 'sum'})
        .reset_index()
        .sort_values(by='pts', ascending=False)
        .head(top_n)
    )

    # The figure title is the league name. The previous "Top 10 clubs with most
    # goals" title described neither the metric (points) nor the limit, and was
    # overwritten by update_layout before the figure was ever shown.
    fig = px.bar(club_summary, x='team', y='pts', color='xg', color_continuous_scale='inferno',
                 hover_data=['team', 'xg', 'pts'], title=league)
    fig.update_layout(title={'text': league, 'x': 0.5, 'y': 0.95})
    fig.show()

# Draw the points chart once for each of the five leagues
leagues = [
    'EPL',
    'Bundesliga',
    'La_liga',
    'Serie_A',
    'Ligue_1',
]
for league in leagues:
    plot_top_clubs(league)


In [None]:
# the percentage of total scored goals in every league

# Donut chart: each league's share of the summed 'scored' column
fig = px.pie(data_frame=df, names='league', values='scored', hole=0.3)
fig.update_traces(textposition='inside', textinfo='percent+label')
# Title and legend styling applied in a single layout update
fig.update_layout(
    title={'text': 'Total goals in every league', 'x': 0.5, 'y': 0.95},
    showlegend=True,
    legend={'title': 'League Name', 'x': 0.8, 'y': 0.5, 'bgcolor': 'rgb(246,228,129)'},
)
fig.show()

In [None]:
## How the total number of scored goals is distributed in every league

This shows the gap between the teams of every league and indicates whether there is competition in the league (like the EPL and Serie A) or a wide gap between big teams and small teams (like the Bundesliga and La Liga).

def plot_top_goals(league, top_n=20):
    """Show a bar chart of total goals scored per club in one league.

    Parameters
    ----------
    league : str
        Value of the ``league`` column to keep (e.g. ``'EPL'``).
    top_n : int, default 20
        Maximum number of clubs to display, ranked by total goals scored.

    Reads the notebook-level ``df``. Displays the figure and returns ``None``.
    """
    team_stats = df.loc[df['league'] == league, ['team', 'scored']]
    top_teams = (
        team_stats.groupby('team')
        .agg({'scored': 'sum'})
        .reset_index()
        .sort_values(by='scored', ascending=False)
        .head(top_n)
    )

    # The figure title is the league name. The previous "Top 10 clubs ..." title
    # contradicted the 20-club limit and was overwritten by update_layout anyway.
    fig = px.bar(top_teams, x='team', y='scored', color='scored', color_continuous_scale='inferno',
                 hover_data=['team', 'scored'], title=league)
    fig.update_layout(title={'text': league, 'x': 0.5, 'y': 0.95})
    fig.show()

# Draw the goals chart once for each of the five leagues
leagues = [
    'EPL',
    'Bundesliga',
    'La_liga',
    'Serie_A',
    'Ligue_1',
]
for league in leagues:
    plot_top_goals(league)


In [None]:
## How the total number of scored goals is distributed in every league

This shows the gap between the teams of every league and indicates whether there is competition in the league (like the EPL and Serie A) or a wide gap between big teams and small teams (like the Bundesliga and La Liga).



def _plot_league_goal_totals(league_name, heading, top_n=20):
    """Chart and return the top goal-scoring clubs of one league.

    Parameters
    ----------
    league_name : str
        Value of the ``league`` column to keep.
    heading : str
        Text shown as the figure title.
    top_n : int, default 20
        Maximum number of clubs to display, ranked by total goals scored.

    Returns
    -------
    The per-team totals that were plotted (columns ``team`` and ``scored``),
    sorted by ``scored`` in descending order.

    Reads the notebook-level ``df`` and displays the figure as a side effect.
    """
    league_rows = df[df['league'] == league_name][['team', 'scored']]
    totals = (
        league_rows.groupby('team')
        .agg({'scored': 'sum'})
        .reset_index()
        .sort_values(by='scored', ascending=False)
        .head(top_n)
    )

    fig = px.bar(totals, x='team', y='scored', color='scored', color_continuous_scale='inferno',
                 hover_data=['team', 'scored'], title=heading)
    fig.update_layout(title={'text': heading, 'x': 0.5, 'y': 0.95})
    fig.show()
    return totals


# One chart per league. The per-league result names are kept so that any later
# cell referring to them continues to work.
bundesliga = _plot_league_goal_totals('Bundesliga', 'bundesliga')
La_liga = _plot_league_goal_totals('La_liga', 'La_liga')
# Heading was previously misspelled as 'BSerie_A'.
Serie_A = _plot_league_goal_totals('Serie_A', 'Serie_A')
Ligue_1 = _plot_league_goal_totals('Ligue_1', 'Ligue_1')
EPL = _plot_league_goal_totals('EPL', 'EPL')

# Despite having the largest number of goals scored, the EPL has the best distribution of scored goals and the smallest gap between its teams, which helps explain why it is considered the best league by most football coaches, journalists and, of course, fans.
# On the other hand, the Bundesliga has the smallest number of scored goals and also the worst distribution of goals between its teams.
# We conclude that small gaps between teams and a good distribution of goals play an important role in attracting fans.



In [None]:
# Effect of deep on the number of scored goals, and the relation between xg and scored goals
#


def plot_deep_xg_vs_scored(league, x_var):
    """Build a scatter of ``x_var`` against goals scored for one league.

    Rows of the notebook-level ``df`` whose ``league`` column equals ``league``
    are plotted with an OLS trendline. The figure is returned, not shown.
    """
    in_league = df['league'] == league
    scatter = px.scatter(df[in_league], x=x_var, y='scored', trendline='ols', hover_data=[x_var])

    # The capitalised column name doubles as the x-axis label and title prefix
    label = x_var.capitalize()
    scatter.update_layout(
        title=f'{label} vs Scored Goals in {league}',
        xaxis_title=label,
        yaxis_title='Scored Goals',
    )
    return scatter

leagues = [
    'EPL',
    'Bundesliga',
    'La_liga',
    'Serie_A',
    'Ligue_1',
]
x_vars = ['deep', 'xg']

# Visit every (league, metric) pair, league by league, and show its figure
for league, x_var in [(lg, metric) for lg in leagues for metric in x_vars]:
    fig = plot_deep_xg_vs_scored(league, x_var)
    fig.show()


In [None]:
# Looking at the effect of "deep" (passes completed within an estimated 20 yards of goal, crosses excluded) on the points achieved, we can say that deep
# is a good indicator of a team's ability to score goals and gain points in the EPL (Premier League), whose teams such as Man City and Arsenal rely mainly on ball control.
# On the other hand, Bundesliga teams seem not to depend on ball control as much, which makes sense since its big teams such as Bayern Munich and Bayer Leverkusen rely on
# counter-pressing and forward runs from inside; there is no obsession with possession in the Bundesliga.



# One stacked scatter panel per league: 'deep' on x, 'pts' on y
league_names = ['EPL', 'Bundesliga', 'La_liga', 'Serie_A', 'Ligue_1']
fig = make_subplots(
    rows=5,
    cols=1,
    shared_xaxes=True,
    shared_yaxes=True,
    subplot_titles=league_names,
)
for row, level in enumerate(league_names, start=1):
    league_rows = df[df.league == level]
    points_trace = go.Scatter(x=league_rows['deep'], y=league_rows['pts'], name=level, mode='markers')
    fig.add_trace(points_trace, row=row, col=1)
fig.update_layout(height=1000, title={'text': "deep VS points for each league", 'x': 0.48, 'y': 0.97})
fig.show()

In [None]:
# deep does not have a strong correlation with scored goals, but it correlates more with pts, because the more time you have the ball (with no crosses), not only may you
# score a goal, but you are also keeping the ball away from the other team; and because deep concerns the last 20 yards before the opponent's goal, even if the team loses the ball, the opponent doesn't
# have a great opportunity to score a goal.


def plot_deep_vs_scored(league, y_var):
    """Show mean ``y_var`` per goals-scored value for one league.

    Rows of the notebook-level ``df`` for ``league`` are grouped by ``scored``;
    ``xg`` and ``deep`` are averaged within each group. At most the 20 groups
    with the highest ``scored`` are plotted, with an OLS trendline.
    Displays the figure and returns ``None``.
    """
    wanted = ['team', 'xg', 'pts', 'scored', 'deep']
    league_rows = df.loc[df['league'] == league, wanted]

    by_goals = league_rows.groupby('scored').agg({'xg': 'mean', 'deep': 'mean'}).reset_index()
    by_goals = by_goals.sort_values(by='scored', ascending=False).head(20)

    chart = px.scatter(by_goals, x='scored', y=y_var, trendline='ols', hover_data=['xg'])
    heading = f'{y_var.capitalize()} vs Scored Goals in {league}'
    chart.update_layout(title={'text': heading, 'x': 0.5, 'y': 0.95})
    chart.show()

leagues = [
    'EPL',
    'Bundesliga',
    'La_liga',
    'Serie_A',
    'Ligue_1',
]
y_vars = ['deep', 'xg']

# Visit every (league, metric) pair, league by league
for league, y_var in [(lg, metric) for lg in leagues for metric in y_vars]:
    plot_deep_vs_scored(league, y_var)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of the 'xg' column against the 'scored' column of the notebook-level
# `df`, with one colour per value of 'league'.
xg_fig, xg_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='xg', y='scored', hue='league', ax=xg_ax)
xg_ax.set_title('Relationship between Expected Goals (xG) and Actual Goals')
xg_ax.set_xlabel('Expected Goals (xG)')
xg_ax.set_ylabel('Actual Goals')
# The hue variable is 'league', so the legend lists leagues (it was previously
# titled 'Player'). The legend is anchored outside the axes to keep points visible.
xg_ax.legend(title='League', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
