# Web Scraping Premier League Data 

In [1]:
# Importing required libraries 
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping data for multiple seasons and multiple teams 

In [19]:
# Seasons to scrape, listed newest first so they line up with walking
# backwards through the "previous season" links.
years = sorted(range(2022, 2024), reverse=True)

In [20]:
# Accumulator: the scraping loop appends one DataFrame per team per season here.
all_matches = list()

In [21]:
# Starting page for the scrape. This assignment must be live (not commented
# out): the scraping loop reads `standings_url`, and on a fresh kernel the
# name would otherwise be undefined.
# NOTE(review): this URL has no season in its path, so it presumably serves
# whichever season is current - confirm that it matches the first entry of `years`.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [23]:
# Scrape every team of every requested season and append one DataFrame per
# team/season to `all_matches`.
#
# Reads:  `years`, `standings_url`, `all_matches` (defined in earlier cells).
# Writes: appends to `all_matches`. `standings_url` itself is NOT modified, so
#         re-running this cell always restarts from the same page.

# Pause (in seconds) taken before each team's requests to avoid hammering the site.
request_delay = 10

# Columns kept from the shooting table: date, shots, shots on target, distance
# of shots, free kicks, penalties and penalties attempted (in that order).
shooting_columns = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# Local cursor that walks backwards through the seasons. Using a separate name
# keeps the cell idempotent instead of overwriting `standings_url`.
season_url = standings_url

for year in years:
    if season_url is None:
        # The previous page had no "previous season" link, so there is
        # nothing further back to scrape.
        break

    # Download and parse the standings page for this season
    standings_page = requests.get(season_url)
    standings_page.raise_for_status()
    soup = BeautifulSoup(standings_page.text, "html.parser")

    # Select the league standings container. Its id embeds the season
    # (e.g. "switcher_results2022-202391"), so match on the fixed prefix
    # rather than hard-coding a single season.
    standings_table = soup.select_one('[id^="switcher_results"]')
    if standings_table is None:
        raise ValueError(f"No league standings table found at {season_url}")

    # Collect the href of every anchor in the standings container. This
    # includes links to top scorers, goalkeepers etc., not just squads.
    all_links = [a.get("href") for a in standings_table.find_all("a")]

    # Keep only squad links. Anchors without an href are skipped, and
    # dict.fromkeys drops repeats (while preserving order) so a team is never
    # scraped twice when it is linked more than once.
    squad_links = list(dict.fromkeys(l for l in all_links if l and "/squads/" in l))

    # Turn the relative squad links into full URLs
    team_urls = [f"https://fbref.com{l}" for l in squad_links]

    # Remember where the previous season lives for the next iteration.
    # The href already starts with "/", so no extra slash is inserted.
    prev_link = soup.select_one("a.prev")
    season_url = f"https://fbref.com{prev_link.get('href')}" if prev_link else None

    # Loop through each of the team URLs
    for team_url in team_urls:
        # Throttle first, so the delay is honoured even when a team is
        # skipped further down with `continue`.
        time.sleep(request_delay)

        # Derive the team name from the last URL segment
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", "")

        # Download the team page
        team_page = requests.get(team_url)
        team_page.raise_for_status()

        # Read the scores and fixtures table. The HTML is wrapped in StringIO
        # because passing a literal HTML string to read_html is deprecated.
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

        # Find the link to the team's all-competitions shooting page
        team_soup = BeautifulSoup(team_page.text, "html.parser")
        team_links = [a.get("href") for a in team_soup.find_all("a")]
        shooting_links = [l for l in team_links if l and "all_comps/shooting/" in l]
        if not shooting_links:
            # No shooting page for this team: skip it instead of crashing
            continue

        shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}")
        shooting_page.raise_for_status()
        shoot_data = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        # Drop the outer level of the two-level column header
        shoot_data.columns = shoot_data.columns.droplevel()

        # Skip teams whose shooting data cannot be merged. Selecting a missing
        # column raises KeyError, so it is caught alongside ValueError.
        try:
            team_data = matches.merge(shoot_data[shooting_columns], on="Date")
        except (KeyError, ValueError):
            continue

        # Keep only Premier League matches and tag them with team and season.
        # `.assign` builds a new frame, which avoids writing into a filtered
        # slice (SettingWithCopyWarning).
        team_data = team_data[team_data["Comp"] == "Premier League"].assign(
            Team=team_name,
            Season=year,
        )

        # Append this team's matches to the overall list
        all_matches.append(team_data)