# Scrape English Premier League stats from [FB Reference](https://fbref.com/en/comps/9/Premier-League-Stats)

#### Python tools

In [1]:
%load_ext lab_black

In [2]:
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

---

## Get data

#### Create a list of seasons

In [3]:
# Season labels to scrape, newest first; the range counts down and excludes 2020.
years = [*range(2022, 2020, -1)]

In [4]:
# Show the list to confirm the seasons and their order.
years

[2022, 2021]

#### Define the URL we'll scrape

In [5]:
# Starting page for the scrape (the league's main stats page; no season in the path).
# The scraping loop below reassigns this name to the previous season's page on
# every pass, so re-run this cell before re-running the loop.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

#### Loop over the years and scrape team pages and shooting stats

In [22]:
%%time

all_matches = []

for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select("table.stats_table")[0]

    links = [l.get("href") for l in standings_table.find_all("a")]
    links = [l for l in links if "/squads/" in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

In [23]:
# Check where the loop left `standings_url`: it now holds the "previous season"
# link taken from the last standings page fetched.
standings_url

'https://fbref.com/en/comps/9/1889/2018-2019-Premier-League-Stats'

In [26]:
    # For each team page: read its "Scores & Fixtures" table, follow the first
    # shooting link, join the shooting columns on match date, and keep only the
    # Premier League rows. Relies on `team_urls`, `year` and `all_matches`
    # already being defined by the season loop.
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [a.get("href") for a in soup.find_all("a")]
        links = [href for href in links if href and "/shooting/" in href]
        data = requests.get(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        # The columns come back as a MultiIndex; drop its top level so plain
        # names such as "Sh" can be selected below.
        shooting.columns = shooting.columns.droplevel()
        try:
            team_data = matches.merge(
                shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date"
            )
        except ValueError:
            # Skip this team, but still pause so the next request is throttled.
            time.sleep(1)
            continue
        # .copy() gives an independent frame, so adding columns below does not
        # trigger SettingWithCopyWarning on a filtered slice.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(1)

AttributeError: 'list' object has no attribute 'columns'

In [25]:
# Inspect the "Scores & Fixtures" table from the most recent team page read.
matches

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2019-08-04,15:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (5),1 (4),Liverpool,,,47,77565.0,David Silva,4-3-3,Martin Atkinson,Match Report,
1,2019-08-10,12:30,Premier League,Matchweek 1,Sat,Away,W,5,0,West Ham,3.1,0.8,57,59870.0,David Silva,4-3-3,Mike Dean,Match Report,
2,2019-08-17,17:30,Premier League,Matchweek 2,Sat,Home,D,2,2,Tottenham,2.9,0.3,56,54503.0,Kevin De Bruyne,4-3-3,Michael Oliver,Match Report,
3,2019-08-25,14:00,Premier League,Matchweek 3,Sun,Away,W,3,1,Bournemouth,2.4,1.5,74,10486.0,David Silva,4-3-3,Andre Marriner,Match Report,
4,2019-08-31,15:00,Premier League,Matchweek 4,Sat,Home,W,4,0,Brighton,2.1,0.5,54,54386.0,David Silva,4-3-3,Jonathan Moss,Match Report,
5,2019-09-14,17:30,Premier League,Matchweek 5,Sat,Away,L,2,3,Norwich City,2.2,1.5,69,27035.0,David Silva,4-2-3-1,Kevin Friend,Match Report,
6,2019-09-18,22:00,Champions Lg,Group stage,Wed,Away,W,3,0,ua Shakhtar,3.0,0.8,53,36675.0,Fernandinho,4-2-3-1,Artur Soares Dias,Match Report,
7,2019-09-21,15:00,Premier League,Matchweek 6,Sat,Home,W,8,0,Watford,5.8,0.3,68,54273.0,David Silva,4-3-3,Mike Dean,Match Report,
8,2019-09-24,19:45,EFL Cup,Third round,Tue,Away,W,3,0,Preston,,,65,22025.0,David Silva,4-3-3,Lee Mason,Match Report,
9,2019-09-28,17:30,Premier League,Matchweek 7,Sat,Away,W,3,1,Everton,3.5,2.5,63,39222.0,Fernandinho,4-3-3,Michael Oliver,Match Report,


#### Concatenate our list of dataframes 

In [7]:
# Stack the per-team frames collected in `all_matches` into one frame.
match_df = pd.concat(objs=all_matches)

#### Clean up column headers

In [8]:
# Lower-case every column label so later cells can use names like "date".
match_df.columns = list(map(str.lower, match_df.columns))

In [13]:
# Order the rows by the "date" column, latest first.
match_df = match_df.sort_values(by="date", ascending=False)

---

## Export

In [14]:
# Create the output folder first so the export cannot fail on a fresh checkout
# after the long scrape has already finished.
output_path = Path("data/raw/matches.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
match_df.to_csv(output_path, index=False)