In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep

In [10]:
base_url = 'https://fbref.com'
stats_url = f'{base_url}/en/comps/20/Bundesliga-Stats'

In [11]:
seasons_to_scrape = 7
all_matches = []
current_stats_url = stats_url

for season_no in range(seasons_to_scrape):
    print(f'Fetching {current_stats_url}')

    stats_html = requests.get(current_stats_url, headers = {'User-agent': 'fontys4'})
    stats_soup = BeautifulSoup(stats_html.text)

    standings_table = stats_soup.select('table.stats_table')[0]
    teams_anchors = standings_table.select('tr td:nth-of-type(1) a')
    teams_urls = [f'https://fbref.com{anchor["href"]}' for anchor in teams_anchors]

    for team_url in teams_urls:
        team_name = team_url\
                        .split('/')[-1]\
                        .replace('-Stats', '')\
                        .replace('-', ' ')
        print(f' Scraping {team_name}')

        team_html = requests.get(team_url, headers = {'User-agent': 'fontys5'})
        team_matches_df = pd.read_html(team_html.text, match='Scores & Fixtures')[0]
        team_soup = BeautifulSoup(team_html.text)

        shooting_stats_link = team_soup.select('div.filter div a[href*="all_comps/shooting"]')[0]['href']
        shooting_stats_df = pd.read_html(f'https://fbref.com{shooting_stats_link}', match='Shooting')[0]
        shooting_stats_df.columns = shooting_stats_df.columns.droplevel(0)


        try:
            team_matches_df = team_matches_df\
                                .merge(
                shooting_stats_df[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                on="Date"
            )
        except ValueError:
            continue

        team_matches_df['Team'] = team_name
        all_matches.append(team_matches_df)
        sleep(5)
        
    href_to_previous_season = stats_soup.select('div.prevnext a:-soup-contains("Previous Season")')[0]['href']
    current_stats_url = f'{base_url}{href_to_previous_season}'
    sleep(5)

Fetching https://fbref.com/en/comps/20/Bundesliga-Stats
 Scraping Bayer Leverkusen
 Scraping Bayern Munich
 Scraping RB Leipzig
 Scraping Stuttgart
 Scraping Hoffenheim
 Scraping Wolfsburg
 Scraping Dortmund
 Scraping Union Berlin
 Scraping Eintracht Frankfurt
 Scraping Freiburg
 Scraping Heidenheim
 Scraping Werder Bremen
 Scraping Bochum
 Scraping Monchengladbach
 Scraping Augsburg
 Scraping Koln
 Scraping Darmstadt 98
 Scraping Mainz 05
Fetching https://fbref.com/en/comps/20/2022-2023/2022-2023-Bundesliga-Stats
 Scraping Bayern Munich
 Scraping Dortmund
 Scraping RB Leipzig
 Scraping Union Berlin
 Scraping Freiburg
 Scraping Bayer Leverkusen
 Scraping Eintracht Frankfurt
 Scraping Wolfsburg
 Scraping Mainz 05
 Scraping Monchengladbach
 Scraping Koln
 Scraping Hoffenheim
 Scraping Werder Bremen
 Scraping Bochum
 Scraping Augsburg
 Scraping Stuttgart
 Scraping Schalke 04
 Scraping Hertha BSC
Fetching https://fbref.com/en/comps/20/2021-2022/2021-2022-Bundesliga-Stats
 Scraping Bayern M

KeyError: "['FK'] not in index"

In [12]:
all_matches_df = pd.concat(all_matches)
all_matches_df.columns = [c.lower() for c in all_matches_df.columns]

In [14]:
all_matches_df.shape

(4511, 26)

In [20]:
all_matches_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,team
0,2023-08-12,15:30,DFB-Pokal,Round of 64,Sat,Away,W,8.0,0.0,FC Teutonia Ottensen,...,Tom Bauer,Match Report,,22.0,11.0,,,1.0,1.0,Bayer Leverkusen
1,2023-08-19,15:30,Bundesliga,Matchweek 1,Sat,Home,W,3.0,2.0,RB Leipzig,...,Felix Brych,Match Report,,11.0,7.0,19.0,0.0,0.0,0.0,Bayer Leverkusen
2,2023-08-26,18:30,Bundesliga,Matchweek 2,Sat,Away,W,3.0,0.0,M'Gladbach,...,Christian Dingert,Match Report,,24.0,11.0,15.8,0.0,0.0,0.0,Bayer Leverkusen
3,2023-09-02,15:30,Bundesliga,Matchweek 3,Sat,Home,W,5.0,1.0,Darmstadt 98,...,Sven Jablonski,Match Report,,25.0,13.0,17.3,1.0,0.0,0.0,Bayer Leverkusen
4,2023-09-15,20:30,Bundesliga,Matchweek 4,Fri,Away,D,2.0,2.0,Bayern Munich,...,Daniel Schlager,Match Report,,12.0,4.0,20.7,1.0,1.0,1.0,Bayer Leverkusen


In [17]:
all_matches_df.to_csv('./data/bundesliga_matches.csv', index=False)