# Analyze Premier League offensive stats by player: 1993-present

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup
from selenium import webdriver
import altair as alt

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [4]:
today = pd.Timestamp("today").strftime("%Y-%m-%d")

---

## Get data

#### First, get the top scorers for each season before this one

In [5]:
df_past = pd.read_csv("data/raw/all_premiere_league_players_all_year_w_goals.csv")

In [6]:
df_past = df_past.query('season != "2022-2023"')

---

#### Get most current season

In [7]:
urls = []

for f, l in zip(range(2022, 2023), range(2023, 2024)):
    urls_dict = {
        "season": f"{f}-{l}",
        "url": f"https://fbref.com/en/comps/9/{f}-{l}/stats/{f}-{l}-Premier-League-Stats",
    }
    urls.append(urls_dict)

In [8]:
dfs = []

for u in urls:

    print(u["season"])
    driver = webdriver.Chrome()
    driver.get(u["url"])
    soup = BeautifulSoup(driver.page_source, "lxml")
    # driver.quit()

    try:
        table = soup.find_all("table")[11]
    except:
        continue

    src = (pd.read_html(str(table))[0]).droplevel(0, axis=1)
    src.columns = src.columns.str.lower()

    src = src[src["player"] != "Player"].fillna(0).reset_index(drop=True)
    src["gls"] = src["gls"].astype(float)

    df_slim = src.iloc[:, :14][
        [
            "player",
            "pos",
            "squad",
            "age",
            "born",
            "mp",
            "starts",
            "min",
            "gls",
            "ast",
            "g+a",
        ]
    ].copy()
    df_slim = (
        df_slim[df_slim["gls"] > 0]
        .sort_values("gls", ascending=False)
        .reset_index(drop=True)
        .copy()
    )
    df_slim["season"] = u["season"]
    dfs.append(df_slim)

2022-2023


In [9]:
df_current = pd.concat(dfs)

In [10]:
df = pd.concat([df_current, df_past]).reset_index(drop=True)

---

In [11]:
df["season_end"] = df["season"].str.split("-", expand=True)[0].astype(int)

In [12]:
df = df.sort_values(by=["player", "season_end"]).reset_index(drop=True)

In [13]:
df.head()

Unnamed: 0,player,pos,squad,age,born,mp,starts,min,gls,ast,g+a,season,season_end
0,Aaron Connolly,FW,Brighton,19,2000,24,14,1258,3.0,1,4,2019-2020,2019
1,Aaron Connolly,FW,Brighton,20,2000,17,9,791,2.0,1,3,2020-2021,2020
2,Aaron Cresswell,"DF,MF",West Ham,24,1989,38,38,3420,2.0,4,6,2014-2015,2014
3,Aaron Cresswell,DF,West Ham,25,1989,37,37,3314,2.0,4,6,2015-2016,2015
4,Aaron Cresswell,DF,West Ham,27,1989,36,35,3069,1.0,3,4,2017-2018,2017


---

In [14]:
df["cumulative_goals"] = df.groupby("player")["gls"].cumsum()

In [15]:
df["season_number"] = df.groupby("player")["season"].rank(method="dense").astype(int)

In [16]:
pivot_df = df.pivot_table(
    index="player", columns="season_number", values="cumulative_goals", fill_value=0
).reset_index()

In [17]:
df.query('player == "Harry Kane"')

Unnamed: 0,player,pos,squad,age,born,mp,starts,min,gls,ast,g+a,season,season_end,cumulative_goals,season_number
2974,Harry Kane,FW,Tottenham,20,1993,10,6,499,3.0,0,3,2013-2014,2013,3.0,1
2975,Harry Kane,FW,Tottenham,21,1993,34,28,2581,21.0,4,25,2014-2015,2014,24.0,2
2976,Harry Kane,FW,Tottenham,22,1993,38,38,3361,25.0,1,26,2015-2016,2015,49.0,3
2977,Harry Kane,FW,Tottenham,23,1993,30,29,2521,29.0,5,34,2016-2017,2016,78.0,4
2978,Harry Kane,FW,Tottenham,24,1993,37,35,3076,30.0,2,32,2017-2018,2017,108.0,5
2979,Harry Kane,FW,Tottenham,25,1993,28,27,2424,17.0,4,21,2018-2019,2018,125.0,6
2980,Harry Kane,FW,Tottenham,26,1993,29,29,2587,18.0,2,20,2019-2020,2019,143.0,7
2981,Harry Kane,FW,Tottenham,27,1993,35,35,3082,23.0,14,37,2020-2021,2020,166.0,8
2982,Harry Kane,FW,Tottenham,28,1993,37,36,3232,17.0,9,26,2021-2022,2021,183.0,9
2983,Harry Kane,FW,Tottenham,29-277,1993,34,34,3045,25.0,3,28,2022-2023,2022,208.0,10


In [18]:
##

In [19]:
df.dtypes

player               object
pos                  object
squad                object
age                  object
born                 object
mp                   object
starts               object
min                  object
gls                 float64
ast                  object
g+a                  object
season               object
season_end            int64
cumulative_goals    float64
season_number         int64
dtype: object

---

#### Players with 10 or more seasons

In [20]:
season_count = df.groupby("player")["cumulative_goals"].count()
players_with_10_or_more_seasons = season_count[season_count >= 10].index

#### Players with 100 or more goals

In [21]:
max_goals = df.groupby("player")["cumulative_goals"].max()
players_with_100_or_more_goals = max_goals[max_goals >= 100].index

In [22]:
filtered_df_100_goals = df[df["player"].isin(players_with_100_or_more_goals)].copy()

In [23]:
players = list(filtered_df_100_goals.player.unique())

In [24]:
filtered_df_100_goals["color"] = filtered_df_100_goals["player"] == "Harry Kane"

In [25]:
filtered_df_100_goals.loc[
    filtered_df_100_goals.player != "Harry Kane", "color"
] = "#e6e6e6"
filtered_df_100_goals.loc[
    filtered_df_100_goals.player == "Harry Kane", "color"
] = "#cc0000"

In [26]:
filtered_df_100_goals["sort"] = filtered_df_100_goals["player"] == "Harry Kane"

In [27]:
colors = (
    filtered_df_100_goals.groupby(["player"])["color"]
    .max()
    .reset_index()["color"]
    .to_list()
)

In [28]:
domain = players
range_ = colors

background_lines = (
    alt.Chart(
        filtered_df_100_goals.query('player != "Harry Kane"').sort_values(
            ["sort", "season"], ascending=False
        ),
        # padding={"left": 10},
    )
    .mark_line(size=2)
    .encode(
        x=alt.X("season_number:O", title="Season number"),
        y=alt.Y(
            "cumulative_goals:Q", axis=alt.Axis(tickCount=6), title="Cumulative goals"
        ),
        color=alt.Color("player", scale=alt.Scale(domain=domain, range=range_)),
    )
)

foreground_lines = (
    alt.Chart(
        filtered_df_100_goals.query('player == "Harry Kane"').sort_values(
            ["sort", "season_number"], ascending=False
        ),
    )
    .mark_line(size=2)
    .encode(
        x=alt.X("season_number:O", title="Season number"),
        y=alt.Y(
            "cumulative_goals:Q", axis=alt.Axis(tickCount=6), title="Cumulative goals"
        ),
        color=alt.Color(
            "player", scale=alt.Scale(domain=domain, range=range_), legend=None
        ),
    )
)

(background_lines + foreground_lines).properties(
    height=400,
    width=780,
    title="Harry Kane cumulative goals vs. other top strikers",
    padding={"left": 10},
)

  for col_name, dtype in df.dtypes.iteritems():
