In [1]:
from io import StringIO
from pathlib import Path
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Site root; hrefs scraped from the pages are appended to it to build full URLs.
base_url = 'https://fbref.com'
# Competition stats page the scrape starts from.
stats_url = base_url + '/en/comps/20/Bundesliga-Stats'

In [3]:
seasons_to_scrape = 7
all_matches = []
current_stats_url = stats_url

# One header set for every request, including the stats tables that were
# previously downloaded by pandas itself without it.
REQUEST_HEADERS = {'User-agent': 'fontys4'}
# Give up on a request that hangs instead of blocking the notebook forever.
REQUEST_TIMEOUT_SECONDS = 30
# Pause after every request so consecutive hits on the site stay spaced out.
REQUEST_DELAY_SECONDS = 5

# (progress label, URL slug, table caption, columns merged into the match log)
STATS_TABLES = [
    ('Shooting stats', 'shooting', 'Shooting', ['Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']),
    ('Goalkeeping stats', 'keeper', 'Goalkeeping', ['SoTA', 'Saves', 'CS', 'PKA', 'PKsv']),
    ('Passing stats', 'passing', 'Passing', ['TotDist']),
    ('Passing types stats', 'passing_types', 'Pass Types', ['Live', 'Dead', 'Att']),
    ('Possession stats', 'possession', 'Possession', ['Touches', 'Succ']),
]


def fetch_html(url):
    """Download ``url`` and return the response body as text.

    Raises ``requests.HTTPError`` on a 4xx/5xx answer so a blocked or
    rate-limited request fails here, not later as a confusing parsing error.
    Sleeps ``REQUEST_DELAY_SECONDS`` after the request before returning.
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    sleep(REQUEST_DELAY_SECONDS)
    return response.text


def scrape_team(team_url):
    """Build one team's match log enriched with the tables in ``STATS_TABLES``.

    Returns the merged DataFrame with an added ``Team`` column, or ``None``
    when merging one of the stats tables raises ``ValueError`` (the team is
    then skipped, and a message says which table caused it).
    """
    team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ')
    print(f' Scraping {team_name}')

    team_html = fetch_html(team_url)
    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    team_matches_df = pd.read_html(StringIO(team_html), match='Scores & Fixtures')[0]
    team_soup = BeautifulSoup(team_html)

    for label, slug, caption, columns in STATS_TABLES:
        print(f'  {label}')
        # The trailing slash keeps "passing" from also matching "passing_types".
        stats_link = team_soup.select(f'div.filter div a[href*="all_comps/{slug}/"]')[0]['href']
        stats_html = fetch_html(f'{base_url}{stats_link}')
        stats_df = pd.read_html(StringIO(stats_html), match=caption)[0]
        # Drop the top header level so columns can be addressed by short name.
        stats_df.columns = stats_df.columns.droplevel(0)

        try:
            team_matches_df = team_matches_df.merge(stats_df[['Date', *columns]], on='Date')
        except ValueError as error:
            print(f'  Skipping {team_name}: could not merge {label} ({error})')
            return None

    team_matches_df['Team'] = team_name
    return team_matches_df


for season_no in range(seasons_to_scrape):
    print(f'Fetching {current_stats_url}')
    stats_soup = BeautifulSoup(fetch_html(current_stats_url))

    # The first stats table on the page is used as the standings table; the
    # first cell of each row links to the team's page.
    standings_table = stats_soup.select('table.stats_table')[0]
    teams_anchors = standings_table.select('tr td:nth-of-type(1) a')
    teams_urls = [f'{base_url}{anchor["href"]}' for anchor in teams_anchors]

    for team_url in teams_urls:
        team_matches_df = scrape_team(team_url)
        if team_matches_df is not None:
            all_matches.append(team_matches_df)

    # Step back one season; stop cleanly if the page offers no such link.
    previous_season_links = stats_soup.select('div.prevnext a:-soup-contains("Previous Season")')
    if not previous_season_links:
        print('No "Previous Season" link found; stopping early')
        break
    current_stats_url = f'{base_url}{previous_season_links[0]["href"]}'

Fetching https://fbref.com/en/comps/20/Bundesliga-Stats
 Scraping Bayer Leverkusen
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Bayern Munich
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping RB Leipzig
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Stuttgart
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Hoffenheim
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Wolfsburg
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Dortmund
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Union Berlin
  Shooting stats
  Goalkeeping stats
  Passing stats
  Passing types stats
  Possession stats
 Scraping Eintrac

In [4]:
# Stack the per-team frames into one table. ignore_index=True assigns a fresh
# 0..n-1 row index; without it the row labels of the individual frames are
# kept and repeat across teams.
all_matches_df = pd.concat(all_matches, ignore_index=True)
# Lowercase every column name.
all_matches_df.columns = [c.lower() for c in all_matches_df.columns]

In [5]:
# Check the size of the combined table as (rows, columns).
all_matches_df.shape

(4516, 37)

In [6]:
# Preview the first rows of the combined table.
all_matches_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,cs,pka,pksv,totdist,live,dead,att,touches,succ,team
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,1.0,0.0,0.0,,,,,,,Bayer Leverkusen
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,0.0,0.0,0.0,6130.0,438.0,35.0,475.0,574.0,11.0,Bayer Leverkusen
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,1.0,0.0,0.0,9491.0,693.0,45.0,740.0,854.0,7.0,Bayer Leverkusen
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,0.0,0.0,0.0,10511.0,775.0,36.0,812.0,913.0,15.0,Bayer Leverkusen
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,0.0,0.0,0.0,8322.0,561.0,37.0,601.0,744.0,16.0,Bayer Leverkusen


In [7]:
# Write the combined table to CSV without the row index. The target folder is
# created first so the export cannot fail just because ./data is missing.
output_path = Path('./data/bundesliga_matches_2.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
all_matches_df.to_csv(output_path, index=False)