# Crawling [https://fbref.com](https://fbref.com) pages

Crawl pages from [https://fbref.com](https://fbref.com) and save them locally.

Imports

In [2]:
from time import sleep

import requests
from bs4 import BeautifulSoup
from requests import Response
from tqdm.notebook import trange

General crawler settings

In [3]:
# --- Where to crawl -------------------------------------------------------
# Site root; page hrefs are appended to it to build the request URLs.
base_url = 'https://fbref.com'
# Href of the competition stats page the crawl starts from.
stats_href = '/en/comps/20/Bundesliga-Stats'

# --- How much and how fast ------------------------------------------------
# Number of iterations of the season loop (one stats page per iteration).
seasons_to_crawl = 7
# Pause, in seconds, passed to sleep() after each saved page.
seconds_to_sleep = 7

# --- Where to store the result --------------------------------------------
# Directory the downloaded pages are written to (relative path).
pages_path = '../data/pages'

Utility functions

In [4]:
def save_page(href: str) -> Response:
    """Download one page, write its HTML into ``pages_path`` and return the response.

    The request URL is ``base_url`` followed by ``href``. The file name is
    that URL with ``/`` replaced by ``(`` and ``:`` replaced by ``_``.
    After the file is written the function sleeps for ``seconds_to_sleep``
    seconds before returning.

    Args:
        href: Site-relative href of the page, e.g. ``/en/comps/...``.

    Returns:
        The ``requests`` response of the downloaded page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond in time.
    """
    url = f'{base_url}{href}'
    # Without a timeout a stalled connection would hang the crawl forever.
    response = requests.get(url, headers={'User-agent': 'bot123'}, timeout=30)
    # Fail loudly instead of storing an error page as if it were real data.
    response.raise_for_status()
    # File names are not allowed to contain / and :
    file_name = url.replace('/', '(').replace(':', '_')

    # Explicit encoding so the written file does not depend on the platform
    # default, which may be unable to encode some characters in the page.
    with open(f'{pages_path}/{file_name}', 'w', encoding='utf-8') as f:
        f.write(response.text)

    sleep(seconds_to_sleep)
    return response

def get_category_href(soup, category):
    """Return the href of the first filter link whose href contains ``all_comps/<category>``.

    Args:
        soup: Parsed page offering a ``select(css_selector)`` method.
        category: Category name inserted into the link selector.

    Raises:
        IndexError: If no link matches the selector.
    """
    selector = f'div.filter div a[href*="all_comps/{category}"]'
    matching_anchors = soup.select(selector)
    first_anchor = matching_anchors[0]
    return first_anchor['href']

Crawling functionality

In [None]:
# Category sub-pages saved for every team, in the order they are requested.
stat_categories = (
    'shooting',
    'keeper',
    'passing',
    'passing_types',
    'gca',
    'defense',
    'possession',
    'misc',
)

# Start at the configured stats page and walk backwards one season per
# iteration by following the "Previous Season" link of each stats page.
current_stats_href = stats_href

for season_no in trange(seasons_to_crawl, desc='Seasons crawled'):
    stats_page_html = save_page(current_stats_href)
    # Name the parser explicitly so parsing does not depend on which optional
    # parser libraries happen to be installed.
    stats_page_soup = BeautifulSoup(stats_page_html.text, 'html.parser')

    # The first stats table on the page is used as the standings table; the
    # anchors in the first <td> of its rows are taken as the team links.
    standings_table = stats_page_soup.select('table.stats_table')[0]
    teams_anchors = standings_table.select('tr td:nth-of-type(1) a')
    team_hrefs = [anchor['href'] for anchor in teams_anchors]

    for team_href in team_hrefs:
        team_page_html = save_page(team_href)
        team_page_soup = BeautifulSoup(team_page_html.text, 'html.parser')

        # Save each category page linked from the team page.
        for category in stat_categories:
            save_page(get_category_href(team_page_soup, category))

    previous_season_links = stats_page_soup.select('div.prevnext a:-soup-contains("Previous Season")')
    if not previous_season_links:
        # No earlier season is linked: stop instead of failing on an index
        # error after a whole season has already been crawled.
        break
    href_to_previous_season = previous_season_links[0]['href']
    current_stats_href = href_to_previous_season
    # No extra sleep here: save_page already pauses after every request.