<a href="https://colab.research.google.com/github/shusritavenugopal/Football-Match-Prediction/blob/main/sheInnovates_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping Football Matches from Official EPL with Python

Scraping our first page with requests

In [None]:
import requests

In [None]:
# Premier League standings page on FBref; the squad links scraped in the
# following cells are read from this page.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Download the standings page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() reports an HTTP error response
# here rather than as a confusing parsing failure in a later cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

Parsing HTML links with BeautifulSoup

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parse can differ
# between environments. "html.parser" ships with Python.
soup = BeautifulSoup(data.text, "html.parser")

In [None]:
# Take the first table carrying the "stats_table" class from the standings page.
standings_table = soup.select('table.stats_table')[0]

# Collect the href of every anchor in that table. Anchors without an href
# yield None, so filter on truthiness before the substring test to avoid a
# TypeError, then keep only the links that point at squad pages.
hrefs = [anchor.get("href") for anchor in standings_table.find_all('a')]
links = [href for href in hrefs if href and '/squads/' in href]

In [None]:
# Build absolute URLs from the site-relative squad links.
team_urls = [f"https://fbref.com{path}" for path in links]

# Use the first linked team as the worked example for the following cells.
team_url = team_urls[0]

# Time-bound the request and fail loudly on an HTTP error response instead of
# passing an error page on to the parsing cells.
data = requests.get(team_url, timeout=30)
data.raise_for_status()

Extract Match Stats Using Pandas and Requests

In [None]:
import pandas as pd

In [None]:
from io import StringIO

# read_html returns a list of the tables whose text matches `match`.
# The HTML is wrapped in StringIO because recent pandas versions deprecate
# passing a literal HTML string; a file-like object works on all versions.
matches_table = pd.read_html(StringIO(data.text), match="Scores & Fixtures")
matches_table[0].head()

# Get Match Shooting Stats With Requests and Pandas

In [None]:
# Parse the team page with an explicitly named parser so the result does not
# depend on which optional parser happens to be installed.
soup = BeautifulSoup(data.text, "html.parser")

# Gather every anchor's href; anchors without one yield None, hence the
# truthiness check before looking for the shooting-stats path.
hrefs = [anchor.get("href") for anchor in soup.find_all('a')]
links = [href for href in hrefs if href and 'all_comps/shooting/' in href]
links[0]

In [None]:
from io import StringIO

# Fetch the shooting-stats page found above; bound the wait and raise on an
# HTTP error response so a blocked request is reported as such.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

# Wrap the HTML in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas versions. Keep the first matching table.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

Cleaning and Merging Scraped Data With Pandas

We will scrape the standings, download the data for a single team, and combine that team's data for a single season into one DataFrame.

In [None]:
# Drop the outer header level so columns can be selected by their plain names.
# The guard makes the cell safe to re-run: once the columns are single-level,
# calling droplevel() again would raise.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()
shooting.head()

In [None]:
# Join the fixtures table with the per-match shooting columns, matching rows
# on the shared "Date" column.
fixtures = matches_table[0]
shooting_subset = shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]]

team_data = fixtures.merge(shooting_subset, on="Date")
team_data.head()

In [None]:
# Show the (rows, columns) size of the shooting table as the cell output.
shooting.shape

Scraping Data for Multiple Seasons and Teams With a Loop

In [None]:
# Season labels to scrape, newest first (2024, then 2023).
years = [*range(2024, 2022, -1)]

# Collects one DataFrame per scraped team and season.
all_matches = list()

In [None]:
# Starting standings page for the multi-season scrape.
standings_url = (
    "https://fbref.com/en/comps/9/Premier-League-Stats"
)

In [None]:
import time
from io import StringIO


def _fetch_html(url):
    """Download `url` and return the response body as text.

    The request is time-bounded, and an HTTP error status raises
    `requests.HTTPError` instead of handing an error page to the parsers.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text


# Walk the seasons in the order given by `years`, following each standings
# page's "previous season" link. A separate cursor is used so `standings_url`
# is not overwritten and re-running this cell starts from the same page.
# NOTE(review): `year` is only a label attached to the rows; it is assumed
# that `standings_url` points at the season matching years[0] -- confirm.
season_url = standings_url
for year in years:
    if season_url is None:
        print(f"No previous-season link found; stopping before {year}")
        break

    standings_soup = BeautifulSoup(_fetch_html(season_url), "html.parser")
    standings_table = standings_soup.select('table.stats_table')[0]

    # Anchors without an href yield None, so test truthiness before `in`.
    squad_links = [a.get("href") for a in standings_table.find_all('a')]
    squad_links = [href for href in squad_links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in squad_links]

    # Remember where the next iteration should start; a page without the
    # link ends the walk instead of raising IndexError.
    prev_links = standings_soup.select("a.prev")
    season_url = f"https://fbref.com{prev_links[0].get('href')}" if prev_links else None

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        try:
            team_html = _fetch_html(team_url)
            # StringIO: literal HTML strings are deprecated input for read_html.
            matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

            team_soup = BeautifulSoup(team_html, "html.parser")
            shooting_links = [a.get("href") for a in team_soup.find_all('a')]
            shooting_links = [href for href in shooting_links if href and 'all_comps/shooting/' in href]
            if not shooting_links:
                print(f"No shooting page linked for {team_name} ({year}); skipping")
                continue

            shooting_html = _fetch_html(f"https://fbref.com{shooting_links[0]}")
            shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
            shooting.columns = shooting.columns.droplevel()

            try:
                team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
            except ValueError:
                # Best-effort: a team whose tables cannot be merged is skipped.
                continue

            # assign() returns a new frame, so labelling the filtered rows does
            # not trigger pandas' SettingWithCopyWarning.
            team_data = team_data[team_data["Comp"] == "Premier League"].assign(
                Season=year, Team=team_name
            )
            all_matches.append(team_data)
        finally:
            # Pause after every team, including skipped ones, to space out
            # requests to the site.
            time.sleep(1)

In [None]:
# Number of per-team DataFrames collected by the scraping loop.
len(all_matches)

In [None]:
# Stack the per-team frames collected in all_matches into a single DataFrame.
# ignore_index is not set, so each frame keeps its own row labels.
match_df = pd.concat(all_matches)

In [None]:
# Normalise every column name to lower case.
match_df.columns = list(map(str.lower, match_df.columns))

In [None]:
# Display the combined frame as the cell output.
match_df

In [None]:
# Write the combined matches to "matches.csv" (relative path).
# index=False is not passed, so the row index is written as the first column.
match_df.to_csv("matches.csv")