In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from io import StringIO


In [2]:
# FBref Premier League stats page; the URL carries no season, so it presumably
# resolves to the current one.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
# Years to scrape, newest first: 2025 down to 2021.
years = list(reversed(range(2021, 2026)))
# Accumulator for per-season results, filled by a later cell.
all_matches = []

In [3]:
def _fetch_match_log(links, url_fragment, table_title, label, timeout=30):
    """Download one all-competitions match-log page and return its stats table.

    Args:
        links: hrefs collected from the team page; entries may be None.
        url_fragment: substring identifying the wanted match-log link,
            e.g. 'all_comps/shooting/'.
        table_title: text passed to ``pd.read_html(match=...)`` to pick the table.
        label: human-readable name used in the progress output.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The first table on the page that matches ``table_title``, with its
        header exactly as parsed (the caller flattens it).

    Raises:
        IndexError: no link on the team page contains ``url_fragment``.
        requests.HTTPError: the match-log page returned an error status.
        ValueError: no table matching ``table_title`` was found.
    """
    matching_links = [link for link in links if link and url_fragment in link]
    print(f"{label} link: {matching_links}")
    response = requests.get(f"https://fbref.com{matching_links[0]}", timeout=timeout)
    print(f"Response status code: {response.status_code}")
    # Pause after every request, even a failed one, so the next request is
    # still spaced out (presumably to stay under the site's rate limit).
    sleep(5)
    # Fail with the real HTTP error instead of a later "No tables found".
    response.raise_for_status()
    return pd.read_html(StringIO(response.text), match=table_title)[0]


def get_team_data(team_url):
    """Scrape one team's Premier League match data from its FBref squad page.

    Reads the "Scores & Fixtures" table from ``team_url``, then follows the
    team's shooting, goalkeeping, passing and GCA match-log links and
    inner-merges selected columns from each onto the fixtures by 'Date'.
    Only rows whose 'Comp' is "Premier League" are kept, and the 'Notes' and
    'Match Report' columns are removed when present.

    This function is deliberately best-effort: any exception raised while
    scraping is printed and an empty DataFrame is returned, so a caller
    looping over many teams keeps going after one team fails.

    Args:
        team_url: absolute URL of the team's stats page. The team name is
            derived from its last path segment (e.g. "Manchester-City-Stats"
            becomes "Manchester City").

    Returns:
        A DataFrame of the team's Premier League matches with the merged
        stats columns and a 'Team' column, or an empty DataFrame on failure.
    """
    try:
        print(f"Accessing URL: {team_url}")
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        # requests has no default timeout; without one a stalled connection
        # would hang the notebook indefinitely.
        team_page = requests.get(team_url, timeout=30)
        print(f"Response status code: {team_page.status_code}")
        sleep(5)
        team_page.raise_for_status()

        match_data = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]
        match_data["Team"] = team_name

        # Name the parser explicitly so link extraction does not depend on
        # which optional parsers happen to be installed.
        team_soup = BeautifulSoup(team_page.text, "html.parser")
        links = [anchor.get("href") for anchor in team_soup.find_all('a')]

        # (link fragment, table title, log label, columns to merge, leading columns to keep)
        match_logs = (
            ('all_comps/shooting/', "Shooting", "Shooting",
             ['Date', 'Sh', 'SoT', 'Dist'], None),
            ('all_comps/keeper/', "Goalkeeping", "Goalkeeping",
             ['Date', 'SoTA'], None),
            # Only the first 15 columns are kept before the header is flattened,
            # presumably so that sub-column names repeated in later column
            # groups do not collide - TODO confirm against the live table.
            ('all_comps/passing/', "Passing", "Passing",
             ['Date', 'Cmp', 'Att', 'TotDist', 'PrgDist'], 15),
            ('all_comps/gca/', "GCA", "GSC",
             ['Date', 'SCA', 'GCA'], None),
        )
        for url_fragment, table_title, label, wanted_columns, keep_first in match_logs:
            stats = _fetch_match_log(links, url_fragment, table_title, label)
            if keep_first is not None:
                stats = stats.iloc[:, :keep_first]
            # Drop the top level of the two-level header so columns can be
            # addressed by their plain names.
            stats.columns = stats.columns.droplevel()
            # Default (inner) merge: fixtures without a matching match-log row
            # are dropped.
            match_data = match_data.merge(stats[wanted_columns], on='Date')

        match_data = match_data[match_data["Comp"] == "Premier League"]
    except Exception as e:
        # Best-effort by design: report the problem and hand back an empty
        # frame instead of aborting the caller's loop.
        print(f"Error: {e}")
        match_data = pd.DataFrame()

    # errors='ignore' makes this a no-op for columns that are absent
    # (including the empty failure frame).
    return match_data.drop(columns=['Notes', 'Match Report'], errors='ignore')


In [4]:
def get_season_data(season_url):
    """Scrape match data for every team listed on a season's standings page.

    Downloads ``season_url``, takes the first ``table.stats_table`` on the
    page, collects every link in it whose href contains '/squads/', and calls
    ``get_team_data`` on each resulting team URL.

    Args:
        season_url: absolute URL of an FBref season stats page.

    Returns:
        A single DataFrame with the rows of all teams that returned data
        (index reset), or an empty DataFrame if no team returned any rows.

    Raises:
        requests.HTTPError: the season page returned an error status.
        IndexError: the page contains no ``table.stats_table``.
    """
    print(f"Accessing URL: {season_url}")
    # requests has no default timeout; without one a stalled connection
    # would hang the notebook indefinitely.
    season_page = requests.get(season_url, timeout=30)
    print(f"Response status code: {season_page.status_code}")
    sleep(5)
    # Surface HTTP errors directly instead of failing later on a missing table.
    season_page.raise_for_status()

    # Name the parser explicitly so parsing does not depend on which optional
    # parsers happen to be installed.
    season_soup = BeautifulSoup(season_page.text, "html.parser")
    standings_tables = season_soup.select('table.stats_table')
    if not standings_tables:
        # Same exception type the bare [0] lookup raised, with a usable message.
        raise IndexError(f"No 'table.stats_table' found at {season_url}")

    hrefs = [anchor.get("href") for anchor in standings_tables[0].find_all('a')]
    # Anchors without an href yield None; skip them rather than raising TypeError.
    team_urls = [f"https://fbref.com{href}" for href in hrefs if href and '/squads/' in href]

    # Collect per-team frames and concatenate once; growing a DataFrame with
    # concat inside the loop copies all accumulated rows on every iteration.
    team_frames = []
    for team_url in team_urls:
        team_data = get_team_data(team_url)
        # A failed team comes back as an empty frame; leave it out.
        if not team_data.empty:
            team_frames.append(team_data)

    if not team_frames:
        return pd.DataFrame()
    return pd.concat(team_frames, ignore_index=True)

In [11]:
# One-off scrape of the 2017-2018 season.
# A dedicated variable is used so that this run does not overwrite the
# `standings_url` set in the configuration cell, which other cells read.
season_2017_2018_url = "https://fbref.com/en/comps/9/2017-2018/2017-2018-Premier-League-Stats"

season_data = get_season_data(season_2017_2018_url)

Accessing URL: https://fbref.com/en/comps/9/2017-2018/2017-2018-Premier-League-Stats
Response status code: 200
Accessing URL: https://fbref.com/en/squads/b8fd03ef/2017-2018/Manchester-City-Stats
Response status code: 200
Shooting link: ['/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions', '/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions', '/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions', '/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions']
Response status code: 200
Goalkeeping link: ['/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/keeper/Manchester-City-Match-Logs-All-Competitions', '/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/keeper/Manchester-City-Match-Logs-All-Competitions', '/en/squads/b8fd03ef/2017-2018/matchlogs/all_comps/keeper/Manchester-City-Match-Logs-A

In [12]:
# Write the scraped season to disk; index=False leaves the row index out of the file.
season_data.to_csv(path_or_buf="season_data_2017_2018.csv", index=False)

In [None]:
# Scrape every season listed in `years` and write the combined result to CSV.
#
# get_season_data returns a single DataFrame, so each season's URL is built
# from `year` rather than taken from a second return value.
# Frames are collected in a local list so the cell can be re-run safely.
season_frames = []
for year in years:
    # Assumes `year` is the calendar year in which the season ends
    # (2025 -> "2024-2025") - TODO confirm this matches the intended seasons.
    season_label = f"{year - 1}-{year}"
    season_url = f"https://fbref.com/en/comps/9/{season_label}/{season_label}-Premier-League-Stats"
    season_data = get_season_data(season_url)
    season_frames.append(season_data)
# ignore_index gives the combined frame a clean 0..n-1 index; the CSV is
# unaffected because it is written without the index.
all_matches = pd.concat(season_frames, ignore_index=True)
all_matches.to_csv("all_matches.csv", index=False)