## Football web scraping

The aim of this notebook is to download and save match statistics for the Premier League 2018–2022 seasons from the https://fbref.com/ website.

In [1]:
# standard library
import io
import os
import re
import string
import time

# third-party
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.common.exceptions import TimeoutException as SeleniumTimeout
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


In [2]:
# Seasons to scrape, each identified by the year in which it ends
# (scrape_season turns 2018 into the "2017-2018" season label).
SEASONS = list(range(2018, 2023))

In [3]:
# Root folder for everything the notebook saves, with one sub-folder per page type:
# season fixture tables and individual match reports.
DATA_DIR = "data"
FIXTURES_DIR, SCORES_DIR = (
    os.path.join(DATA_DIR, subfolder) for subfolder in ("fixtures", "scores")
)

In [4]:
def get_html(url, selector, sleep=5, retries=3):
    """Load a page in Firefox and return the inner HTML of one element.

    url: address of the page to open.
    selector: XPath of the element whose inner HTML is wanted.
    sleep: base delay in seconds; attempt number i waits sleep * i seconds
        before touching the site, to avoid being banned by the website.
    retries: maximum number of page loading attempts.

    Returns the element's innerHTML, or None when every attempt timed out.
    Selenium errors other than a timeout are not caught and propagate.
    """
    html = None
    for attempt in range(1, retries + 1):
        # The wait grows with the attempt number and also precedes the first request.
        time.sleep(sleep * attempt)

        # Start the browser *outside* the try block: if it fails to launch there is
        # no driver to quit, and the `finally` clause must not hide that error
        # behind an unbound `driver` name.
        driver = webdriver.Firefox()
        try:
            driver.get(url)

            # Wait up to 10 seconds for the requested element to appear in the DOM.
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, selector))
            )
            print(driver.title)
            html = element.get_attribute('innerHTML')
        except SeleniumTimeout:
            print(f"Timeout error on {url}")
        else:
            # Success: no further attempts needed.
            break
        finally:
            # Always close the browser opened for this attempt.
            driver.quit()

    return html

In [5]:
def scrape_season(season):
    """Save the HTML of a full season's fixtures table to a file in FIXTURES_DIR.

    season: year in which the season ends (2022 means the "2021-2022" season).

    The file is named after the last segment of the page URL. Nothing is
    downloaded when that file already exists. When the page cannot be loaded,
    a message is printed and no file is created, so a later run retries it.
    """
    full_season = f'{season - 1}-{season}'
    selector = f'//*[@id="sched_{full_season}_9_1"]'
    url = f'https://fbref.com/en/comps/9/{full_season}/schedule/{full_season}-Premier-League-Scores-and-Fixtures'
    save_path = os.path.join(FIXTURES_DIR, url.split("/")[-1])

    if os.path.exists(save_path):
        return

    html = get_html(url, selector)

    # get_html returns None after exhausting its retries. Check *before* opening
    # the file: opening first would leave an empty file behind that makes every
    # later run skip this season.
    if html is None:
        print(f"Skipping season {full_season}: page could not be loaded")
        return

    os.makedirs(FIXTURES_DIR, exist_ok=True)

    # Default text encoding is kept on purpose: scrape_season_games reads these
    # files back without specifying an encoding either.
    with open(save_path, "w") as f:
        f.write(html)

In [6]:
def get_match_reports(html):
    """Return absolute links to the match reports found in a fixtures table.

    html: HTML code of a season fixtures table.

    Returns a list of "https://fbref.com..." URLs, one per <a> element whose
    text matches "Match Report", in document order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed and emits a GuessedAtParserWarning.
    page = BeautifulSoup(html, "html.parser")
    links = page.find_all("a", string=re.compile("Match Report"))

    # The hrefs are site-relative, so prefix them with the site address.
    return [f"https://fbref.com{link['href']}" for link in links]

In [7]:
def scrape_season_games(fixture_file):
    """Download every match report of a season and save each one as HTML in SCORES_DIR.

    fixture_file: name of a fixtures file inside FIXTURES_DIR.

    Each report is saved under the last segment of its URL with "_<season>"
    appended, where <season> is the second "-"-separated part of fixture_file.
    Reports whose file already exists are skipped. A report that cannot be
    loaded is reported and skipped without creating a file, so a later run
    retries it instead of the whole season aborting.
    """
    with open(os.path.join(FIXTURES_DIR, fixture_file), "r") as f:
        fixture = f.read()

    # generate links to match reports
    match_reports = get_match_reports(fixture)
    if not match_reports:
        return

    # These values are the same for every report, so compute them once.
    selector = '//*[@id="content"]'
    # add season year to file name for later reference
    season = fixture_file.split('-')[1]
    os.makedirs(SCORES_DIR, exist_ok=True)

    for report in match_reports:
        save_path = os.path.join(SCORES_DIR, report.split("/")[-1] + f'_{season}')

        if os.path.exists(save_path):
            continue

        html = get_html(report, selector)

        # get_html returns None after exhausting its retries; do not create the
        # file in that case, otherwise the match would be skipped forever.
        if html is None:
            print(f"Skipping {report}: page could not be loaded")
            continue

        with open(save_path, "w", encoding="utf-8") as f:
            f.write(html)

In [8]:
def get_teams_id(soup):
    """Return the team id hashes found in a match report.

    soup: parsed HTML code with match statistics.

    The hashes are needed to locate each team's statistics tables. They are
    taken as the text after the last underscore in the id of every <div>
    whose id starts with "switcher_player".
    """
    switcher_pattern = re.compile('^switcher_player')

    team_ids = []
    for div in soup.find_all('div', id=switcher_pattern):
        team_ids.append(div['id'].rsplit('_', 1)[-1])

    return team_ids

In [9]:
def read_stats(soup, team_id):
    """Return the last row of a team's player statistics as a one-row DataFrame.

    soup: parsed HTML code with match statistics.
    team_id: team id hash, as returned by get_teams_id.

    All tables with class "stats_table" inside the team's
    "switcher_player_stats_<team_id>" div are read and joined side by side;
    only the last row is returned (presumably the team totals row - confirm
    against the page layout).

    Raises ValueError when the team's statistics div is not present.
    """
    stats_soup = soup.find('div', id=re.compile(f'switcher_player_stats_{team_id}'))
    if stats_soup is None:
        # Fail with a clear message instead of asking pandas to parse the text "None".
        raise ValueError(f"No player statistics block found for team id {team_id!r}")

    # Passing literal HTML to read_html is deprecated since pandas 2.1;
    # wrapping it in a file-like object works on every pandas version.
    stats_tables = pd.read_html(io.StringIO(str(stats_soup)), attrs={'class': 'stats_table'})
    stats_df = pd.concat(stats_tables, axis=1)

    return stats_df.iloc[-1:]

In [10]:
def get_column_names(columns):
    """Return flat, lower-case column names for a data frame.

    columns: iterable of two-level column labels (group, name).

    A label whose first level contains "Unnamed" is reduced to its second
    level; any other label has its levels joined with "_". Spaces are then
    replaced by underscores and the result is lower-cased.
    """
    flat_names = []
    for label in columns:
        if 'Unnamed' in label[0]:
            name = label[1]
        else:
            name = '_'.join(label)
        flat_names.append(name.replace(' ', '_').lower())

    return flat_names

In [11]:
# Scrape the fixtures page of each season listed in SEASONS.
# scrape_season returns early for a season whose fixtures file already exists,
# so re-running this cell only downloads what is still missing.
for season in SEASONS:
    scrape_season(season)

In [67]:
# Scrape the pages with match statistics for every saved fixtures file.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (e.g. notebook checkpoint folders), which cannot be opened as fixture files:
# keep regular files only and sort them for a reproducible order.
fixture_files = sorted(
    name for name in os.listdir(FIXTURES_DIR)
    if os.path.isfile(os.path.join(FIXTURES_DIR, name))
)

for file in fixture_files:
    scrape_season_games(file)

Brentford vs. Brighton & Hove Albion Match Report – Saturday September 11, 2021 | FBref.com


In [11]:
# Parse every saved match report into two rows (one per team) that hold the team's
# own statistics followed by the opponent's statistics (suffix "_opp").

# os.listdir returns entries in arbitrary order and also lists sub-directories.
# Keep regular files only and sort them, so that the row order and the file that
# defines base_cols are the same on every run.
score_files = sorted(
    name for name in os.listdir(SCORES_DIR)
    if os.path.isfile(os.path.join(SCORES_DIR, name))
)
games = []
base_cols = None

# Columns that are not used in the final data frame.
remove_cols = ['index', 'player', '#', 'nation', 'pos', 'age', 'min', 'performance_touches', 'performance_tkl',
               'performance_int', 'performance_blocks', 'passes_cmp', 'passes_att', 'passes_cmp%', 'passes_prog']

for score_file in score_files:

    # The with-statement closes the file; no explicit close() is needed afterwards.
    with open(os.path.join(SCORES_DIR, score_file), "r", encoding="utf8") as f:
        score_html = f.read()

    # NOTE(review): no parser is named here, so BeautifulSoup picks the best one
    # installed. Left unchanged because the code below depends on the parsed tree.
    soup = BeautifulSoup(score_html)
    teams_id = get_teams_id(soup)

    summaries = []
    for team in teams_id:
        summary = read_stats(soup, team)
        summary.columns = get_column_names(summary.columns)

        # generate columns to use in final df (taken from the first team parsed)
        if base_cols is None:
            base_cols = summary.columns.drop_duplicates(keep='first')
            base_cols = [c for c in base_cols if c not in remove_cols]

        summary = summary[base_cols]
        # remove performance_crdy and performance_crdr columns which are duplicated
        summary = summary.loc[:, ~summary.columns.duplicated()]
        summaries.append(summary)

    game = pd.concat(summaries, axis=0, ignore_index=True)

    # list with info about team names and date of the match, taken from the page
    # heading: text before "Match Report" holds "<team> vs. <team>", text after it the date
    info = soup.h1.contents[0].split('Match Report')

    game['team'] = [s.strip() for s in info[0].split('vs.')]
    # first listed team is flagged as the home side
    game['home'] = [1, 0]

    # reverse game df to generate statistics for opponent
    # NOTE(review): reset_index() without drop=True also keeps the old index as a
    # column, which ends up as "index_opp" in the result. Kept as-is so the saved
    # column layout does not change - confirm whether that column is wanted.
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += '_opp'

    full_game = pd.concat([game, game_opp], axis=1)
    # the file name ends with the season year appended by scrape_season_games
    full_game['season'] = score_file[-4:]
    # the last three words of the heading are parsed as "<month> <day>, <year>"
    full_game['date'] = pd.to_datetime(' '.join(info[-1].split()[-3:]), errors='coerce', format='%B %d, %Y')
    full_game['result'] = np.select([(full_game['performance_gls'] > full_game['performance_gls_opp']),
                                     (full_game['performance_gls'] < full_game['performance_gls_opp'])],
                                    ["W", "L"], "D")

    games.append(full_game)

    # progress: fraction of files processed, printed every 100 matches
    if len(games) % 100 == 0:
        print(len(games) / len(score_files))


0.06578947368421052
0.13157894736842105
0.19736842105263158
0.2631578947368421
0.32894736842105265
0.39473684210526316
0.4605263157894737
0.5263157894736842
0.5921052631578947
0.6578947368421053
0.7236842105263158
0.7894736842105263
0.8552631578947368
0.9210526315789473
0.9868421052631579


In [12]:
# Stack the per-match frames on top of each other into one table,
# replacing their individual indexes with a single running index.
games_df = pd.concat(games, ignore_index=True)
games_df

Unnamed: 0,performance_gls,performance_ast,performance_pk,performance_pkatt,performance_sh,performance_sot,performance_crdy,performance_crdr,expected_xg,expected_npxg,...,performance_og_opp,performance_recov_opp,aerial_duels_won_opp,aerial_duels_lost_opp,aerial_duels_won%_opp,team_opp,home_opp,season,date,result
0,0,0,0,0,13,2,0,0,1.5,1.5,...,0,49.0,15.0,11.0,57.7,Aston Villa,0,2021,2020-11-08,L
1,2,2,0,0,15,5,0,0,1.8,1.8,...,1,51.0,11.0,15.0,42.3,Arsenal,1,2021,2020-11-08,W
2,3,2,0,1,21,8,2,0,3.1,2.2,...,0,49.0,15.0,11.0,57.7,Aston Villa,0,2022,2021-10-22,W
3,1,1,0,0,10,4,5,0,1.4,1.4,...,0,60.0,11.0,15.0,42.3,Arsenal,1,2022,2021-10-22,L
4,3,0,1,1,20,5,7,1,2.5,1.7,...,0,44.0,5.0,8.0,38.5,Aston Villa,0,2020,2019-09-22,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,3,3,0,0,9,6,4,0,1.3,1.3,...,0,63.0,19.0,25.0,43.2,Wolverhampton Wanderers,1,2021,2021-04-05,W
3036,2,2,0,0,13,7,1,0,1.5,1.5,...,0,63.0,16.0,27.0,37.2,West Ham United,0,2020,2019-12-04,W
3037,0,0,0,0,6,3,2,0,0.6,0.6,...,0,58.0,27.0,16.0,62.8,Wolverhampton Wanderers,1,2020,2019-12-04,L
3038,1,1,0,0,15,5,1,0,1.1,1.1,...,0,45.0,11.0,11.0,50.0,West Ham United,0,2022,2021-11-20,W


In [15]:
# Save the combined match table as CSV; the relative path resolves against the
# current working directory.
# NOTE(review): no `index` argument is passed, so pandas' default applies and the
# row index is presumably written as an extra unnamed first column - confirm
# whether that column is wanted.
games_df.to_csv('football_games.csv')