# Scrape Premier League offensive stats by player: 1992-93 season to present

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import json
from io import StringIO

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [3]:
# Let pandas show up to 100 columns and 100 rows before truncating output.
for option in ("display.max_columns", "display.max_rows"):
    pd.set_option(option, 100)

In [4]:
# Current local date as a "YYYY-MM-DD" string.
today = pd.Timestamp.today().date().isoformat()

---

## Get data

#### First, get the top scorers for each season

In [5]:
# One "<start>-<end>" label per season, from 1992-1993 through 2022-2023.
seasons = [f"{year}-{year + 1}" for year in range(1992, 2023)]

# Pair each season label with the fbref page URL built from it.
urls = [
    {
        "season": season,
        "url": f"https://fbref.com/en/comps/9/{season}/stats/{season}-Premier-League-Stats",
    }
    for season in seasons
]

In [6]:
# Collects one DataFrame of goal scorers per season; concatenated in a later cell.
dfs = []

# Columns kept from each season's table, named as they appear after the top
# header level is dropped and the labels are lower-cased.
keep_cols = [
    "player",
    "pos",
    "squad",
    "age",
    "born",
    "mp",
    "starts",
    "min",
    "gls",
    "ast",
    "g+a",
]

# Use a single browser for every season and always close it. Launching a new
# Chrome per URL without quitting leaves one browser process behind per
# season; the `finally` also covers an interrupted run (e.g. KeyboardInterrupt).
driver = webdriver.Chrome()
try:
    for u in urls:
        print(u["season"])
        driver.get(u["url"])
        soup = BeautifulSoup(driver.page_source, "lxml")

        # The table is chosen by position: the 12th <table> in the rendered page.
        # NOTE(review): positional lookup is fragile -- confirm the index still
        # points at the player table if the page layout changes.
        try:
            table = soup.find_all("table")[11]
        except IndexError:
            # Fewer tables than expected: skip the season, but say so instead
            # of dropping it silently.
            print(f"  skipped {u['season']}: fewer than 12 tables on the page")
            continue

        # read_html wants a file-like object; passing literal HTML text is
        # deprecated in recent pandas. droplevel removes the top level of the
        # two-level column header.
        src = pd.read_html(StringIO(str(table)))[0].droplevel(0, axis=1)
        src.columns = src.columns.str.lower()

        # Remove rows whose player cell is the literal "Player" (apparently
        # header rows repeated inside the table body), then zero-fill gaps.
        src = src[src["player"] != "Player"].fillna(0).reset_index(drop=True)
        src["gls"] = src["gls"].astype(float)

        # Only the first 14 columns are considered before selecting by name.
        # NOTE(review): presumably because later columns reuse names such as
        # "gls" -- confirm against the page.
        df_slim = src.iloc[:, :14][keep_cols]

        # Keep players with at least one goal, highest scorers first.
        df_slim = (
            df_slim[df_slim["gls"] > 0]
            .sort_values("gls", ascending=False)
            .reset_index(drop=True)
        )
        df_slim["season"] = u["season"]
        dfs.append(df_slim)
finally:
    driver.quit()

1992-1993
1993-1994
1994-1995
1995-1996
1996-1997
1997-1998
1998-1999
1999-2000
2000-2001
2001-2002
2002-2003
2003-2004


KeyboardInterrupt: 

In [None]:
# Stack the per-season frames into one table. Each season frame is indexed
# from 0, so without ignore_index the combined frame would carry the same
# index labels once per season; ignore_index gives it a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# For each season keep every row whose goal count equals that season's
# maximum, so players tied for the lead are all retained.
#
# A transform-based mask is used instead of groupby(...).apply(...): apply
# operating on the grouping column is deprecated in pandas 2.2 and the column
# is excluded in pandas 3, which would drop "season" from the result.
season_best = df.groupby("season")["gls"].transform("max")
df_max = (
    df[df["gls"] == season_best]
    # Stable sort keeps the within-season row order while ordering seasons by
    # label, which is the order the grouped apply produced.
    .sort_values("season", kind="stable")
    .reset_index(drop=True)
)

In [None]:
# Write every scorer row except those labelled season "2022-2023" to CSV,
# without the index column.
without_2022_2023 = df[df["season"] != "2022-2023"]
without_2022_2023.to_csv(
    "data/raw/all_premiere_league_players_all_year_w_goals.csv",
    index=False,
)

In [None]:
# Write the per-season top-scorer rows to CSV, without the index column.
top_scorers_path = "data/raw/top_premiere_league_goals_scorers_all_year.csv"
df_max.to_csv(top_scorers_path, index=False)

In [None]:
# Show the per-season top-scorer table as this cell's output.
df_max