In [48]:
#importing libraries
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [49]:
#FBREF Premier League URL
#You can change the URL to any league you want
premier_league_url = "https://fbref.com/en/comps/9/2022-2023/schedule/2022-2023-Premier-League-Scores-and-Fixtures"

In [50]:
#Getting the data from the URL with requests
premier_league_data = requests.get(premier_league_url)

In [51]:
#Using BeautifulSoup to parse the data
soup = BeautifulSoup(premier_league_data.text)

premier_league_scores_table = soup.select("table.stats_table")[0]

In [52]:
#Getting data with pandas
premier_league_scores = pd.read_html(premier_league_data.text, match= "Scores & Fixtures")[0]

premier_league_scores.head()

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Match Report,
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,Match Report,
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,61732.0,Tottenham Hotspur Stadium,Andre Marriner,Match Report,
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,52245.0,St James' Park,Simon Hooper,Match Report,
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,Match Report,


In [53]:
#Data shape for 2022-2023 Season
premier_league_scores.shape

(430, 14)

In [54]:
premier_league_match_scores = premier_league_scores
premier_league_match_scores.head()

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor,Match Report,
1,1.0,Sat,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley,Match Report,
2,1.0,Sat,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,61732.0,Tottenham Hotspur Stadium,Andre Marriner,Match Report,
3,1.0,Sat,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,52245.0,St James' Park,Simon Hooper,Match Report,
4,1.0,Sat,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones,Match Report,


In [55]:
#Last 5 years list
years = list(range(2023, 2018, -1))
years

[2023, 2022, 2021, 2020, 2019]

In [56]:
#Creating a list to store all the data from the last 5 years
#This will take a while to run
#You can change the range to get more data

all_matches = []

for year in years:
    data = requests.get(premier_league_url)
    soup = BeautifulSoup(premier_league_data.text)
    premier_league_scores_table = soup.select("table.stats_table")[0]
    
    previous_seasons = soup.select("a.prev")[0].get("href")
    premier_league_scores_url = f"https://fbref.com/{previous_seasons}"
    
    premier_league_scores = pd.read_html(premier_league_data.text, match= "Scores & Fixtures")[0]

    all_matches.append(premier_league_scores)
    time.sleep(1)


In [57]:
#Concatenating all the data into one dataframe
premier_league_all_matches = pd.concat(all_matches)

In [58]:
#Data shape for all the data
premier_league_all_matches.shape

(2150, 14)

In [59]:
#Dropping unnecessary columns
premier_league_all_matches = premier_league_all_matches.drop(["Match Report", "Notes", "Day"], axis=1)
premier_league_all_matches.head()

Unnamed: 0,Wk,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
0,1.0,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
1,1.0,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley
2,1.0,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,61732.0,Tottenham Hotspur Stadium,Andre Marriner
3,1.0,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,52245.0,St James' Park,Simon Hooper
4,1.0,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones


In [60]:
#Renaming columns
premier_league_all_matches.columns = ["Week", "Date", "Time", "Home_Team", "Home_xG", "Score", "Away_xG", "Away_Team", "Attendance", "Venue", "Referee"]

In [61]:
#Dropping rows with no data
premier_league_all_matches = premier_league_all_matches.dropna(how="all")

In [62]:
#Changing the data type of the Week column
premier_league_all_matches["Week"] = premier_league_all_matches["Week"].astype(int)

In [63]:
#Resetting the index
premier_league_all_matches = premier_league_all_matches.reset_index(drop=True)
premier_league_all_matches.head()

Unnamed: 0,Week,Date,Time,Home_Team,Home_xG,Score,Away_xG,Away_Team,Attendance,Venue,Referee
0,1,2022-08-05,20:00,Crystal Palace,1.2,0–2,1.0,Arsenal,25286.0,Selhurst Park,Anthony Taylor
1,1,2022-08-06,12:30,Fulham,1.2,2–2,1.2,Liverpool,22207.0,Craven Cottage,Andy Madley
2,1,2022-08-06,15:00,Tottenham,1.5,4–1,0.5,Southampton,61732.0,Tottenham Hotspur Stadium,Andre Marriner
3,1,2022-08-06,15:00,Newcastle Utd,1.7,2–0,0.3,Nott'ham Forest,52245.0,St James' Park,Simon Hooper
4,1,2022-08-06,15:00,Leeds United,0.8,2–1,1.3,Wolves,36347.0,Elland Road,Robert Jones


In [64]:
#Saving the data to a csv file
premier_league_all_matches.to_csv("premier_league_all_matches.csv", index=False)