<a href="https://colab.research.google.com/github/ryanma5/PL-Project/blob/main/PL_5_year_league_table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard library
import time

# Third-party: tabular data, HTTP client, HTML parser
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Colab helper used to expose Google Drive inside the runtime
from google.colab import drive

# Mount Google Drive at /content/drive (the CSV export at the end writes there)
drive.mount('/content/drive')

Mounted at /content/drive


# Function for scraping each year's league table

In [None]:
# Define the function to scrape the league table for each season
def scrape_league_table(season_url, timeout=30):
    """Download a season page and return its first ``<table>`` element.

    Args:
        season_url: URL of the season's stats page.
        timeout: Seconds to wait for the server before giving up
            (forwarded to ``requests.get``).

    Returns:
        The first ``<table>`` element on the page (a BeautifulSoup tag), or
        ``None`` if the request failed, the server answered with a non-200
        status code, or the page contains no table. A message is printed in
        each failure case.
    """
    try:
        # Without an explicit timeout, requests can wait indefinitely on a
        # stalled connection and hang the whole notebook.
        response = requests.get(season_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network-level failures like a bad status code: report and
        # return None so one bad season does not abort the caller's loop.
        print(f"Failed to retrieve data for {season_url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data for {season_url}")
        return None

    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # The league table is taken to be the first table on the page;
    # find() returns that first match, or None when there is no table.
    league_table = soup.find('table')
    if league_table is None:
        print(f"No tables found on the page for {season_url}")
        return None

    return league_table


# Function for parsing each year's league table into a list of team names and their points for that season

In [None]:
def parse_league_table(league_table):
    """Extract each team's name and points from a league table element.

    Args:
        league_table: A table element exposing BeautifulSoup's ``find_all``
            and ``find`` methods (as returned by ``scrape_league_table``).

    Returns:
        A list of ``{'Team': str, 'Points': int}`` dictionaries, one per data
        row, in table order. Rows without a team cell are skipped; a missing,
        blank or non-numeric points cell counts as 0 points.
    """
    # List to store parsed data
    table_data = []

    # Skip the first row, which is the header row
    for row in league_table.find_all('tr')[1:]:
        # Rows with at most one <td> are not data rows
        if len(row.find_all('td')) <= 1:
            continue

        # Extract team name using the 'team' data-stat
        team_cell = row.find('td', {'data-stat': 'team'})
        if team_cell is None:
            # No team to attribute points to; skip the row instead of
            # failing with AttributeError on a missing cell.
            continue

        # Prefer the link text; fall back to the cell's own text
        team_link = team_cell.find('a')
        team = (team_link if team_link is not None else team_cell).text.strip()

        # Extract points using the 'points' data-stat
        points_cell = row.find('td', {'data-stat': 'points'})
        points_text = points_cell.text.strip() if points_cell is not None else ''
        try:
            points = int(points_text)
        except ValueError:
            # Blank or non-numeric cell: same default as a missing cell
            points = 0

        # Add the extracted data to the list
        table_data.append({
            'Team': team,
            'Points': points,
        })

    # Return the extracted data as a list of dictionaries
    return table_data



# Aggregating points for each team

In [None]:
# Define the URLs for the different seasons
urls = {
    '20-21': 'https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats',
    '21-22': 'https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats',
    '22-23': 'https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats',
    '23-24': 'https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats',
    # NOTE(review): this entry previously used the season-less URL
    # (.../comps/9/Premier-League-Stats), which presumably tracks whichever
    # season is current, so the '24-25' label would drift over time. It is now
    # pinned with the same URL pattern as the seasons above -- confirm the
    # page resolves.
    '24-25': 'https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats',
}

# Pause between requests (seconds) to avoid overloading the server
REQUEST_DELAY_SECONDS = 2

# Total points for each team, accumulated across all seasons
team_points = {}

# Seasons that produced no data, so incomplete totals are not silently reported
failed_seasons = []

# Loop through each season and scrape the league table data
for season, url in urls.items():
    print(f"Scraping data for {season}...")

    # Returns None when the page could not be fetched or has no table
    season_table = scrape_league_table(url)

    # Compare against None explicitly rather than relying on the truthiness
    # of the returned table object
    season_data = parse_league_table(season_table) if season_table is not None else []

    if not season_data:
        # Either the scrape failed or the table yielded no team rows
        failed_seasons.append(season)

    # Accumulate points for each team across seasons
    for data in season_data:
        team = data['Team']
        team_points[team] = team_points.get(team, 0) + data['Points']

    # Sleep to avoid overloading the server
    time.sleep(REQUEST_DELAY_SECONDS)

if failed_seasons:
    print(f"WARNING: no data for {', '.join(failed_seasons)}; totals exclude these seasons.")


Scraping data for 20-21...
Scraping data for 21-22...
Scraping data for 22-23...
Scraping data for 23-24...
Scraping data for 24-25...


In [None]:
# Convert the accumulated team points into a DataFrame ranked by total points.
# Ties are broken alphabetically by team name so the row order does not depend
# on the order in which teams were scraped, and the index is reset so it
# follows the ranking instead of keeping the pre-sort positions.
team_points_df = (
    pd.DataFrame(list(team_points.items()), columns=['Team', 'Total Points'])
    .sort_values(by=['Total Points', 'Team'], ascending=[False, True])
    .reset_index(drop=True)
)

# Display the final DataFrame (a bare last expression renders as a rich table)
team_points_df

               Team  Total Points
0   Manchester City           390
2         Liverpool           355
7           Arsenal           342
1    Manchester Utd           289
3           Chelsea           283
6         Tottenham           283
10      Aston Villa           258
11    Newcastle Utd           257
5          West Ham           236
15         Brighton           229
13   Crystal Palace           206
12           Wolves           199
9           Everton           193
20        Brentford           168
4    Leicester City           166
17           Fulham           156
8      Leeds United           128
23      Bournemouth           117
14      Southampton           114
24  Nott'ham Forest           107
16          Burnley            98
19    Sheffield Utd            39
18        West Brom            26
25       Luton Town            26
21          Watford            23
22     Norwich City            22
26     Ipswich Town            15


# Export as CSV file

In [None]:
# Write the team totals to Google Drive as a CSV file, omitting the index column
team_points_df.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/PL_5years_points.csv',
    index=False,
)