# Webscraping Player Stats

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import undetected_chromedriver as uc

from collections import OrderedDict
import time

### Function to retrieve soup object of webpage

Since we will need to do a lot of webscraping to collect our player stat data, we will first create a function that retrieves and parses the HTML content of a given URL, returning a Beautiful Soup object for further processing.

In [2]:
def get_soup_object(url):
    """
    Fetches and parses an HTML page from a given URL using BeautifulSoup.

    Parameters:
    - url (str): The web address of the page to be retrieved.

    Returns:
    - soup (BeautifulSoup object): Parsed HTML content of the webpage.
    """

    # Define a user-agent string to mimic a real browser request
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'

    # Set up HTTP headers to include the user-agent
    headers = {
        'User-Agent': user_agent
    }

    # Make an HTTP GET request to the specified URL with the custom headers
    response = requests.get(url, headers=headers)

    # Parse the HTML content of the response using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Return the BeautifulSoup object for further processing
    return soup


### Function to webscrape individual win/loss stats for an individual player

We will collect out player stats from the website matchstat.com which contains extensive information on a player's stats across multiple areas such as various breakdowns of their yearly win/loss stats as well as match stats such as serve, return and break point stats.

First we will create a function that extracts the below win-loss statistics

<div style="text-align: center;">
    <img src="screenshots/win_loss_tables.png" width="600">
</div>


The function will take a specified index name and the Beautiful Soup object as arguments, and return the wins and losses from that index name, i.e. for the above example if the index_name was 'Overall', the function would return 351 and 122 representing the wins and the losses respectively. 


In [3]:
def get_win_loss_stats(soup, index_name):
    """
    Extracts win-loss statistics from an HTML table.

    Parameters:
    - soup (BeautifulSoup object): Parsed HTML content.
    - index_name (str): The text identifier to locate the specific row in the table.

    Returns:
    - wins (int or NaN): The number of wins extracted from the stats.
    - losses (int or NaN): The number of losses extracted from the stats.
    """

    # Iterate through all table rows in the HTML
    for row in soup.find_all('tr'):
        # Check if the specified index_name is present in the row's text
        # and that the row contains exactly three table data (<td>) cells
        if (index_name in row.text) & (len(row.find_all('td')) == 3):
            # Ensure the first cell exactly matches index_name
            if row.find('td').text == index_name:
                # Extract the win-loss statistics from the third cell (expected format: "wins/losses")
                stats = row.find_all('td')[2].text

                # Extract the percent from the second column
                pct_col = row.find_all('td')[1].text

                # Extract and convert wins and losses by splitting on '/'
                wins = int(stats.split('/')[0].strip())   # First value (number of wins)
                losses = int(stats.split('/')[1].strip()) # Second value (number of losses)

                # Extract percentage
                try:
                    pct = float(pct_col.strip().replace('%', ''))
                except:
                    pct = 0
                
                break  # Stop iterating once the correct row is found
            else:
                # If the row does not exactly match, return 0 for all values
                wins = 0
                losses = 0
                pct = 0
        else:
            # If no matching row is found, return 0 for all values
            wins = 0
            losses = 0
            pct = 0

    # Return the extracted win and loss values
    return wins, losses, pct


### Function to webscrape individual serve/return stats for an individual player

Similarly to above, we also need to extract the serve and returns stats which are in a similar format:

<div style="text-align: center;">
    <img src="screenshots/serve_return_tables.png" width="600">
</div>

Thereofore, we will build another similar function that takes a specified index name and the Beautiful Soup object as arguments, and return the serves/returns etc and the attempts from that index name.

In [4]:
def get_serve_return_stats(soup, index_name):
    """
    Extracts serve return statistics from an HTML table.

    Parameters:
    - soup (BeautifulSoup object): Parsed HTML content.
    - index_name (str): The text identifier to locate the specific row in the table.

    Returns:
    - num (int): The first number in the stats (successful returns).
    - total (int): The second number in the stats (total attempts).
    """

    # Iterate over all table rows in the parsed HTML
    for row in soup.find_all('tr'):
        # Check if the row contains the target index_name and has exactly 3 columns
        if (index_name in row.text) & (len(row.find_all('td')) == 3):
            # Extract the statistics from the third column (assumed format: "num/total")
            stats = row.find_all('td')[2].text

            # Extract the percent from the second column
            pct_col = row.find_all('td')[1].text

            # Extract and convert numbers from the "num/total" format
            try:
                num = int(stats.split('/')[0].strip())   # First value (successful returns)
                total = int(stats.split('/')[1].strip()) # Second value (total attempts)
            except: 
                num = 0
                total = 0

            # Extract percentage
            try:
                pct = float(pct_col.strip().replace('%', ''))
            except:
                pct = 0
            
            break  # Stop iterating once the correct row is found

        else:
            num = 0
            total = 0
            pct = 0

    return num, total, pct  # Return extracted values as a tuple


### Function to webscrape all win/loss stats for an individual player for a specified year

Now that we have functions to extract the win and loss stats from each row of the win/loss tables, we now need to build a function that runs this function on all the rows and stores all the values in a dictionary. Therefore, the following function takes the soup object for a certain player's stat page, a dictionary to store the stat values, and a specified year (since we will inevitably want to run this on multiple years), and the returns the dictionary now with new populated fields for each stat and year: 

In [5]:
def get_win_loss_stats_by_year(soup, player_dict, year):
    """
    Extracts and stores win-loss statistics for a given player and year from an HTML table.

    Parameters:
    - soup (BeautifulSoup object): Parsed HTML content containing the player's match statistics.
    - player_dict (dict): A dictionary where extracted statistics will be stored.
    - year (int or str): The year for which statistics are being retrieved.

    Returns:
    - player_dict (dict): Updated dictionary containing win-loss stats for various match categories.
    """

    # Extract overall win-loss statistics
    player_dict['wins_' + str(year)] = get_win_loss_stats(soup, 'Overall')[0]
    player_dict['losses_' + str(year)] = get_win_loss_stats(soup, 'Overall')[1]
    player_dict['win_pct_' + str(year)] = get_win_loss_stats(soup, 'Overall')[2]

    # Extract win-loss stats by surface type
    player_dict['hard_wins_' + str(year)] = get_win_loss_stats(soup, 'Hard')[0]
    player_dict['hard_losses_' + str(year)] = get_win_loss_stats(soup, 'Hard')[1]
    player_dict['hard_pct_' + str(year)] = get_win_loss_stats(soup, 'Hard')[2]
    player_dict['clay_wins_' + str(year)] = get_win_loss_stats(soup, 'Clay')[0]
    player_dict['clay_losses_' + str(year)] = get_win_loss_stats(soup, 'Clay')[1]   
    player_dict['clay_pct_' + str(year)] = get_win_loss_stats(soup, 'Clay')[2]
    player_dict['grass_wins_' + str(year)] = get_win_loss_stats(soup, 'Grass')[0]
    player_dict['grass_losses_' + str(year)] = get_win_loss_stats(soup, 'Grass')[1]
    player_dict['grass_pct_' + str(year)] = get_win_loss_stats(soup, 'Grass')[2]

    # Extract win-loss stats by tournament stage
    player_dict['final_wins_' + str(year)] = get_win_loss_stats(soup, 'Final')[0]
    player_dict['final_losses_' + str(year)] = get_win_loss_stats(soup, 'Final')[1]    
    player_dict['final_pct_' + str(year)] = get_win_loss_stats(soup, 'Final')[2]
    player_dict['semifinal_wins_' + str(year)] = get_win_loss_stats(soup, 'Semi-final')[0]
    player_dict['semifinal_losses_' + str(year)] = get_win_loss_stats(soup, 'Semi-final')[1]
    player_dict['semifinal_pct_' + str(year)] = get_win_loss_stats(soup, 'Semi-final')[2]
    player_dict['quarterfinal_wins_' + str(year)] = get_win_loss_stats(soup, 'Quarter-final')[0]
    player_dict['quarterfinal_losses_' + str(year)] = get_win_loss_stats(soup, 'Quarter-final')[1]
    player_dict['quarterfinal_pct_' + str(year)] = get_win_loss_stats(soup, 'Quarter-final')[2]
    player_dict['round4_wins_' + str(year)] = get_win_loss_stats(soup, '4th Round')[0]
    player_dict['round4_losses_' + str(year)] = get_win_loss_stats(soup, '4th Round')[1]
    player_dict['round4_pct_' + str(year)] = get_win_loss_stats(soup, '4th Round')[2]
    player_dict['round3_wins_' + str(year)] = get_win_loss_stats(soup, '3rd Round')[0]
    player_dict['round3_losses_' + str(year)] = get_win_loss_stats(soup, '3rd Round')[1]
    player_dict['round3_pct_' + str(year)] = get_win_loss_stats(soup, '3rd Round')[2]
    player_dict['round2_wins_' + str(year)] = get_win_loss_stats(soup, '2nd Round')[0]
    player_dict['round2_losses_' + str(year)] = get_win_loss_stats(soup, '2nd Round')[1]
    player_dict['round2_pct_' + str(year)] = get_win_loss_stats(soup, '2nd Round')[2]
    player_dict['round1_wins_' + str(year)] = get_win_loss_stats(soup, '1st Round')[0]
    player_dict['round1_losses_' + str(year)] = get_win_loss_stats(soup, '1st Round')[1]
    player_dict['round1_pct_' + str(year)] = get_win_loss_stats(soup, '1st Round')[2]

    # Extract win-loss stats by tournament type
    player_dict['grand_slam_wins_' + str(year)] = get_win_loss_stats(soup, 'Grand Slam')[0]
    player_dict['grand_slam_losses_' + str(year)] = get_win_loss_stats(soup, 'Grand Slam')[1]
    player_dict['grand_slam_pct_' + str(year)] = get_win_loss_stats(soup, 'Grand Slam')[2]
    player_dict['masters_wins_' + str(year)] = get_win_loss_stats(soup, 'Masters')[0]
    player_dict['masters_losses_' + str(year)] = get_win_loss_stats(soup, 'Masters')[1]
    player_dict['masters_pct_' + str(year)] = get_win_loss_stats(soup, 'Masters')[2]
    player_dict['main_tour_wins_' + str(year)] = get_win_loss_stats(soup, 'Main Tour')[0]
    player_dict['main_tour_losses_' + str(year)] = get_win_loss_stats(soup, 'Main Tour')[1]
    player_dict['main_tour_pct_' + str(year)] = get_win_loss_stats(soup, 'Main Tour')[2]
    player_dict['cup_wins_' + str(year)] = get_win_loss_stats(soup, 'Cups')[0]
    player_dict['cup_losses_' + str(year)] = get_win_loss_stats(soup, 'Cups')[1]
    player_dict['cup_pct_' + str(year)] = get_win_loss_stats(soup, 'Cups')[2]

    # Extract win-loss stats against top-ranked players
    player_dict['vsTop5_wins_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.5')[0]
    player_dict['vsTop5_losses_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.5')[1]
    player_dict['vsTop5_pct_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.5')[2]
    player_dict['vsTop10_wins_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.10')[0]
    player_dict['vsTop10_losses_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.10')[1]
    player_dict['vsTop10_pct_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.10')[2]
    player_dict['vsTop20_wins_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.20')[0]
    player_dict['vsTop20_losses_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.20')[1]
    player_dict['vsTop20_pct_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.20')[2]
    player_dict['vsTop50_wins_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.50')[0]
    player_dict['vsTop50_losses_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.50')[1]
    player_dict['vsTop50_pct_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.50')[2]
    player_dict['vsTop100_wins_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.100')[0]
    player_dict['vsTop100_losses_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.100')[1]
    player_dict['vsTop100_pct_' + str(year)] = get_win_loss_stats(soup, 'Vs Top.100')[2]

    return player_dict


### Function to webscrape all serve/return stats for an individual player for a specified year

We can then build a similar function for the serve and return stats:

In [6]:
def get_serve_return_stats_by_year(soup, player_dict, year):
    """
    Extracts and stores serve and return statistics for a given player and year from an HTML table.

    Parameters:
    - soup (BeautifulSoup object): Parsed HTML content containing the player's match statistics.
    - player_dict (dict): A dictionary where extracted statistics will be stored.
    - year (int or str): The year for which statistics are being retrieved.

    Returns:
    - player_dict (dict): Updated dictionary containing serve and return stats for the given year.
    """

    # Extract serve-related statistics
    player_dict['total_games_' + str(year)] = get_serve_return_stats(soup, 'Aces per Game')[1]
    player_dict['aces_' + str(year)] = get_serve_return_stats(soup, 'Aces per Game')[0]
    player_dict['aces_pct_' + str(year)] = get_serve_return_stats(soup, 'Aces per Game')[2]    
    player_dict['opp_aces_' + str(year)] = get_serve_return_stats(soup, 'Opponent Aces / Game')[0]
    player_dict['opp_aces_pct_' + str(year)] = get_serve_return_stats(soup, 'Opponent Aces / Game')[2]
    player_dict['DFs_' + str(year)] = get_serve_return_stats(soup, 'Double Faults Per Game')[0]
    player_dict['DFs_pct_' + str(year)] = get_serve_return_stats(soup, 'Double Faults Per Game')[2]
    player_dict['opp_DFs_' + str(year)] = get_serve_return_stats(soup, 'Opponent Double Faults / Game')[0]
    player_dict['opp_DFs_pct_' + str(year)] = get_serve_return_stats(soup, 'Opponent Double Faults / Game')[2]

    # Extract 1st and 2nd serve percentage and win percentage
    player_dict['1st_serve_' + str(year)] = get_serve_return_stats(soup, '1st Serve %')[0]
    player_dict['1st_serve_total_' + str(year)] = get_serve_return_stats(soup, '1st Serve %')[1]
    player_dict['1st_serve_pct_' + str(year)] = get_serve_return_stats(soup, '1st Serve %')[2]
    player_dict['1st_serve_win_' + str(year)] = get_serve_return_stats(soup, '1st Serve Win%')[0]
    player_dict['1st_serve_win_total_' + str(year)] = get_serve_return_stats(soup, '1st Serve Win%')[1]
    player_dict['1st_serve_win_pct_' + str(year)] = get_serve_return_stats(soup, '1st Serve Win%')[2]
    player_dict['2nd_serve_win_' + str(year)] = get_serve_return_stats(soup, '2nd Serve Win%')[0]
    player_dict['2nd_serve_win_total_' + str(year)] = get_serve_return_stats(soup, '2nd Serve Win%')[1]
    player_dict['2nd_serve_win_pct_' + str(year)] = get_serve_return_stats(soup, '2nd Serve Win%')[2]
    player_dict['serve_pts_win_' + str(year)] = get_serve_return_stats(soup, 'Serve Points Win%')[0]
    player_dict['serve_pts_win_total_' + str(year)] = get_serve_return_stats(soup, 'Serve Points Win%')[1]
    player_dict['serve_pts_win_pct_' + str(year)] = get_serve_return_stats(soup, 'Serve Points Win%')[2]

    # Extract opponent serve statistics
    player_dict['opp_1st_serve_' + str(year)] = get_serve_return_stats(soup, 'Opponent 1st Serve %')[0]
    player_dict['opp_1st_serve_total_' + str(year)] = get_serve_return_stats(soup, 'Opponent 1st Serve %')[1]
    player_dict['opp_1st_serve_pct_' + str(year)] = get_serve_return_stats(soup, 'Opponent 1st Serve %')[2]

    # Extract return statistics
    player_dict['1st_rtn_win_' + str(year)] = get_serve_return_stats(soup, '1st Return Win %')[0]
    player_dict['1st_rtn_win_total_' + str(year)] = get_serve_return_stats(soup, '1st Return Win %')[1]
    player_dict['1st_rtn_win_pct_' + str(year)] = get_serve_return_stats(soup, '1st Return Win %')[2]
    player_dict['2nd_rtn_win_' + str(year)] = get_serve_return_stats(soup, '2nd Return Win %')[0]
    player_dict['2nd_rtn_win_total_' + str(year)] = get_serve_return_stats(soup, '2nd Return Win %')[1]
    player_dict['2nd_rtn_win_pct_' + str(year)] = get_serve_return_stats(soup, '2nd Return Win %')[2]
    player_dict['rtn_pts_win_' + str(year)] = get_serve_return_stats(soup, 'Return Points Win%')[0]
    player_dict['rtn_pts_win_total_' + str(year)] = get_serve_return_stats(soup, 'Return Points Win%')[1]
    player_dict['rtn_pts_win_pct_' + str(year)] = get_serve_return_stats(soup, 'Return Points Win%')[2]

    # Extract break point statistics
    player_dict['bps_saved_' + str(year)] = get_serve_return_stats(soup, 'Break Points Saved / Game')[0]  
    player_dict['bps_saved_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Saved / Game')[2]       
    player_dict['bps_faced_' + str(year)] = get_serve_return_stats(soup, 'Break Points Faced / Game')[0]
    player_dict['bps_faced_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Faced / Game')[2]
    player_dict['bp_save_' + str(year)] = get_serve_return_stats(soup, 'Break Points Save %')[0]
    player_dict['bp_save_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Save %')[2]
    player_dict['bp_save_total_' + str(year)] = get_serve_return_stats(soup, 'Break Points Save %')[1]
    player_dict['bp_save_total_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Save %')[2]

    # Extract service hold and opponent break statistics
    player_dict['service_hold_' + str(year)] = get_serve_return_stats(soup, 'Service Hold %')[0]    
    player_dict['service_hold_pct_' + str(year)] = get_serve_return_stats(soup, 'Service Hold %')[2]  
    player_dict['bps_won_' + str(year)] = get_serve_return_stats(soup, 'Break Points Won / Game')[0]   
    player_dict['bps_won_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Won / Game')[2]       
    player_dict['bps_opps_' + str(year)] = get_serve_return_stats(soup, 'Opponent Break Points / Game')[0] 
    player_dict['bps_opps_pct_' + str(year)] = get_serve_return_stats(soup, 'Opponent Break Points / Game')[2]         
    player_dict['bps_won_' + str(year)] = get_serve_return_stats(soup, 'Break Points Won %')[0]
    player_dict['bps_won_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Won %')[2]
    player_dict['bps_won_total_' + str(year)] = get_serve_return_stats(soup, 'Break Points Won %')[1] 
    player_dict['bps_won_total_pct_' + str(year)] = get_serve_return_stats(soup, 'Break Points Won %')[2]       
    player_dict['opp_hold_' + str(year)] = get_serve_return_stats(soup, 'Opponent Hold %')[0]
    player_dict['opp_hold_pct_' + str(year)] = get_serve_return_stats(soup, 'Opponent Hold %')[2]

    return player_dict


### Function to webscrape all win/loss stats for an individual player for all years

Now we need to use Selenium WebDriver to manually change the year options on the page so that we can scrape the stats for all years for each player. This is done by gettting the xpath of the dropdown menu below, and cycling through and clicking the options, starting at the second to avoid 'Career' and clicking the next 24 options to cover all years back to 2001 to ensure we cover all our match data. Since some retired players will have their most recent year's data as before the current year, we will include a break clause once we reach 2000. 

<div style="text-align: center;">
    <img src="screenshots/win_loss_dropdown.png" width="500">
</div>

Once we have loaded a page, we wait a second for the page to load, and then get the soup object of the page and update the player dictionary with the stat fields for that year. The function then finally returns the fully populated dictionary with all win/loss stats for evey year for that player 

In [7]:
def perform_win_loss_scraping(driver, player_dict):
    """
    Scrapes win-loss statistics for a player from a webpage using Selenium.

    Parameters:
    - driver (WebDriver): Selenium WebDriver instance controlling the browser.
    - player_dict (dict): A dictionary where extracted statistics will be stored.

    Returns:
    - player_dict (dict): Updated dictionary containing win-loss stats for multiple years.
    """

    dropdown_xpath = '/html/body/app-root/div/app-profile/div/div/div[1]/div[2]/app-profile-performance-breakdown/div[1]/div/h2/app-profile-performance-year-picker/div/a'

    previous_stats = None

    for index in range(2, 26):
        delay = 0.5
        max_delay = 7.5
        success = False

        while delay <= max_delay:
            try:
                wait = WebDriverWait(driver, delay)
                dropdown_link = wait.until(EC.element_to_be_clickable((By.XPATH, dropdown_xpath)))
                driver.execute_script("arguments[0].click();", dropdown_link)

                year_xpath = f'/html/body/app-root/div/app-profile/div/div/div[1]/div[2]/app-profile-performance-breakdown/div[1]/div/h2/app-profile-performance-year-picker/div/div/div/div/ul/li[{index}]/a'
                year_option = driver.find_element(By.XPATH, year_xpath)

                success = True
                break  # Element found and clickable, proceed
            except:
                delay += 1.0  # Increase delay and try again

        if not success:
            continue  # Skip to next index if still unsuccessful after max_delay


        year = year_option.text

        if int(year) <= 2003:
            break

        driver.execute_script("arguments[0].scrollIntoView(true);", year_option)
        driver.execute_script("arguments[0].click();", year_option)

        delay = 0.5
        max_delay = 7.5
        stats_changed = False

        while delay <= max_delay:
            time.sleep(delay)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            current_stats = {}
            get_win_loss_stats_by_year(soup, current_stats, year)

            if current_stats != previous_stats:
                player_dict.update(current_stats)
                previous_stats = current_stats
                stats_changed = True
                break
            else:
                delay += 1.0

        if not stats_changed:
            player_dict.update(current_stats)
            previous_stats = current_stats

    return player_dict


### Function to webscrape all serve/return stats for an individual player for all years

Similarly to above, we can create a function that uses Selenium to cycle through the years on the dropdown menu in the serve and returns section, subsquently use our previous function to scrape the data from here, and finally return a dictionary containing all the stats information for each year

<div style="text-align: center;">
    <img src="screenshots/serve_return_dropdown.png" width="500">
</div>



In [8]:
def perform_serve_return_scraping(driver, player_dict):
    """
    Scrapes serve and return statistics for a player from a webpage using Selenium.

    Parameters:
    - driver (WebDriver): Selenium WebDriver instance controlling the browser.
    - player_dict (dict): A dictionary where extracted statistics will be stored.

    Returns:
    - player_dict (dict): Updated dictionary containing serve and return stats for multiple years.
    """

    parent_div = driver.find_element(By.XPATH, '//div[@class="col col-md-2 p-1 p-md-2"]')
    dropdown = Select(parent_div.find_element(By.CLASS_NAME, 'bg-white'))
    options = dropdown.options

    previous_values = None

    for option in options:
        year = option.text

        if int(year) == 2025:
            continue

        if int(year) <= 2003:
            break

        dropdown.select_by_visible_text(year)

        delay = 0.5
        max_delay = 7.5
        stats_changed = False

        while delay <= max_delay:
            time.sleep(delay)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            current_stats = {}
            get_serve_return_stats_by_year(soup, current_stats, year)

            current_values = list(current_stats.values())

            if current_values != previous_values:
                player_dict.update(current_stats)
                previous_values = current_values
                stats_changed = True
                break
            else:
                delay += 1.0  # Increase delay if stats haven't changed

        if not stats_changed:
            # Stats remained the same even after max_delay, so keep them
            player_dict.update(current_stats)
            previous_values = list(current_stats.values())

    return player_dict


### Function to webscrape all stats for an individual player for all years

The following function then ties all the above together by taking a player's url, starting the Chrome webdriver, creating a soup object to scrape and store the player's name, then running the above functions to run through the year options on the win/loss and serve/return tables to scrape and store all stat data for each year. 

Given that the website may ocassionally deny requests, or the pages may take longer than expected to load, the function will wait three minutes should it run into an error with the webscraping. If this error persists three times then it will exit the function.

In [9]:
def get_player_data(url):
    """
    Scrapes player performance data from a given URL using Selenium and BeautifulSoup.

    Parameters:
    - url (str): The webpage URL containing the player's data.

    Returns:
    - player_dict (dict): A dictionary containing the player's name, win/loss stats, and serve/return stats.
    - None: Returns None if all retries fail due to errors.
    """

    max_retries = 5  # Maximum number of retry attempts in case of errors
    attempt = 0  # Track the number of attempts

    while attempt < max_retries:
        # Initialise an empty dictionary to store player data
        player_dict = {}

        # Set Chrome options to disable notifications
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-notifications")

        # Start an undetected Chrome WebDriver instance with the specified options
        driver = uc.Chrome(options=chrome_options)

        try:
            # Navigate to the given URL
            driver.get(url)

            # Wait briefly to allow the page to load
            time.sleep(2)

            # Extract the webpage content using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Handle cases where the page does not load properly due to a gateway timeout or connection issue
            if ('505 Gateway Timeout' in soup.text) or ("This site can't be reached" in soup.text) or ('Your connection was interrupted' in soup.text):
                time.sleep(120)  # Wait 2 minutes before retrying
                driver.get(url)  # Reload the page
                soup = BeautifulSoup(driver.page_source, 'html.parser')  # Extract content again

            # Extract and store the player's name
            player_dict['name'] = soup.find('h4').text

            # Scrape win-loss statistics and update the dictionary
            perform_win_loss_scraping(driver, player_dict)

            # Scrape serve and return statistics and update the dictionary
            perform_serve_return_scraping(driver, player_dict)

            # Close the WebDriver after successful data extraction
            driver.quit()

            return player_dict  # Return the collected player data on success

        except Exception as e:
            attempt += 1
            driver.quit()
            # Wait time logic: 5 seconds for the first three attempts, 120 seconds for the next two
            if attempt < 3:
                time.sleep(5)
            else:
                time.sleep(120)

    # If all retry attempts fail, print a message and return None
    print(f'Maximum retries reached for {url}. Exiting function.')
    return None


### Function to webscrape all player urls

The next step is to get the above function to run on all players, and to do this we need to get a list of all player urls. We can gather this by using BeautifulSoup to scrape the urls from the links contained within the player names from the tennis rankings page from matchstat.com:

<div style="text-align: center;">
    <img src="screenshots/player_rankings.png" width="500">
</div>

Since some players would have retired and not be in the latest rankings, or some players may have dropped out the current rankings, we will have to cycle through previous rankings by using a webdriver to select the options from the dropdown menu similar to what we did above. However, in this dropdown menu we have weekly rankings, so to save time we will only select the last week of each year (stopping at 2001).

<div style="text-align: center;">
    <img src="screenshots/rankings_dropdown.png" width="500">
</div>

Once we load each yearly rankings page, we will have to use the webdriver to then repeatedly click 'View More' until this option is no longer available to ensure that the whole ranking list is available to scrape Ih the year is 2011 or lower we will only click 'View More' four times as these years only have 500 players available and clicking it an additional time causes the list to disappear.

<div style="text-align: center;">
    <img src="screenshots/rankings_view_more.png" width="500">
</div>

Once the rankings page is loaded, we will webscrape the urls by finding the 'href' links contained within the players names, and add these to a list if the url does not already exist in the list. The function then finally saves the list as a csv file and then returns this list of player urls.

In [21]:
def get_player_urls(url):
    """
    Scrapes player profile URLs from a rankings webpage using Selenium.

    Parameters:
    - url (str): The webpage URL containing player rankings.

    Returns:
    - player_urls (list): A list of extracted player profile URLs.
    """

    player_urls = []  # List to store player profile URLs

    # Set Chrome options to disable notifications
    chrome_options = uc.ChromeOptions()
    chrome_options.add_argument("--disable-notifications")

    # Start an undetected Chrome WebDriver instance with the specified options
    driver = uc.Chrome(options=chrome_options)
    driver.get(url)

    # Locate the rankings dropdown element and initialize a Select object
    dropdown_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/app-root/div/app-rankings/div/div/div[1]/div[5]/form/div[1]/app-selector/select'))
    )
    dropdown = Select(dropdown_element)

    # Retrieve all available options from the dropdown
    options = dropdown.options

    # Dictionary to store the first occurrence of each year in the dropdown
    first_of_each_year = OrderedDict()
    first_option_texts = []  # Store option TEXT instead of elements

    # Iterate through dropdown options and extract unique years
    for option in options:
        date_text = option.text  # Extract the option text (e.g., "March 2024")
        year = date_text.split()[-1]  # Extract the year from the text

        # Store the first occurrence of each year
        if year not in first_of_each_year:
            first_of_each_year[year] = date_text
            first_option_texts.append(date_text)

        # Stop collecting data if the year is 2003 (since the rankings are missing between 2003 and 1989)
        if int(year) == 3:
            break
    first_option_texts.reverse()
    
    # Iterate through the selected year options to scrape player URLs
    for text in first_option_texts:
        print(f"Processing: {text}")

        # Re-locate the dropdown each time (elements become stale after refresh)
        dropdown_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/app-root/div/app-rankings/div/div/div[1]/div[5]/form/div[1]/app-selector/select'))
        )
        dropdown = Select(dropdown_element)

        # Select the dropdown option by its visible text
        dropdown.select_by_visible_text(text)

        # Ensure the selection triggers a table update
        driver.execute_script("arguments[0].dispatchEvent(new Event('change'))", dropdown_element)
        
        # Wait for the rankings table to update
        time.sleep(5)

        # Extract the year from the text
        year = int(text.split()[-1][-2:])  # Extract last two digits of the year
        max_clicks = 4 if year <= 11 else float('inf')  # Limit clicks if year is 11 or lower
        click_count = 0

        # Click the "Show More" button to reveal additional players, if available
        while click_count < max_clicks:
            try:
                show_more_button = driver.find_elements(By.XPATH, '/html/body/app-root/div/app-rankings/div/div/div[1]/div[8]/div/button')
                if show_more_button:
                    driver.execute_script("arguments[0].click();", show_more_button[0])
                    time.sleep(2)  # Allow time for more results to load
                    click_count += 1
                else:
                    break  # Exit loop if no button is found
            except:
                print('No "Show More" button found')
                break

        # Extract the updated table data from the webpage
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find("table")  # Locate the rankings table

        if table:
            rows = table.find_all("tr")  # Get all table rows

            # Iterate through table rows and extract player profile URLs
            for row in rows[1:]:  # Skip the header row
                url_suffix = row.find('a')['href']
                player_url = 'https://matchstat.com/' + url_suffix  # Construct full URL

                # Avoid duplicate URLs
                if player_url not in player_urls:
                    player_urls.append(player_url)

        # Instead of refreshing the page entirely, reload only the relevant parts
        driver.execute_script("window.location.reload();")
        time.sleep(5)  # Allow time for the page to reload
    
    # Close the WebDriver after all data has been collected
    driver.quit()

    # Print the number or URLs collected
    print(len(player_urls), 'player urls collected')

    # Save the list to a CSV file
    pd.DataFrame(player_urls, columns=['url']).to_csv('player_stats/player_urls.csv', index=False)
    
    return player_urls


We can then run the function and observe the outputs to ensure that we have successfully covered every year from the current year to 2001:

In [22]:
player_urls = get_player_urls('https://matchstat.com/tennis/atp-wta-rankings/atp')

Processing: 15 Dec 03
Processing: 20 Dec 04
Processing: 19 Dec 05
Processing: 18 Dec 06
Processing: 24 Dec 07
Processing: 22 Dec 08
Processing: 28 Dec 09
Processing: 27 Dec 10
Processing: 26 Dec 11
Processing: 24 Dec 12
Processing: 23 Dec 13
Processing: 22 Dec 14
Processing: 28 Dec 15
Processing: 26 Dec 16
Processing: 25 Dec 17
Processing: 24 Dec 18
Processing: 23 Dec 19
Processing: 28 Dec 20
Processing: 27 Dec 21
Processing: 26 Dec 22
Processing: 25 Dec 23
Processing: 23 Dec 24
Processing: 07 Apr 25
3099 player urls collected


### Function to webscrape all stats for all players for all years

We can finally create a function that combines all of the above functions to perform the webscraping for all player stats for all years for all player urls. This function takes the player urls list as an argument, and performs all webscraping on each url adding each dictionary to a list. The function keeps a log of every 50 players scraped,  and finally saves the list of dictionaries as a DataFrame in a specified folder. 

In [24]:
def scrape_and_save_player_data(player_urls, output_file='player_stats/all_player_atts.csv'):
    """
    Scrapes player data for a list of player URLs and saves the results to a CSV file inside a specified folder.

    Parameters:
    - player_urls (list): List of URLs to scrape player data from.
    - output_file (str): File path where the CSV will be saved (default: 'player_stats/all_player_atts.csv').

    Returns:
    - None: Saves the data to the specified CSV file.
    """

    all_player_atts = []
    total_players = len(player_urls)

    for i, url in enumerate(player_urls, start=1):
        player_atts = get_player_data(url)

        # Only add valid player data to the list
        if player_atts is not None:
            all_player_atts.append(player_atts)

        remaining = total_players - i
        if i % 50 == 0 or remaining == 0:
            print(f'{i} players checked, {remaining} remaining')

    # Save DataFrame only if valid data exists
    try:
        if all_player_atts:
            pd.DataFrame(all_player_atts).to_csv(output_file, index=False)
            print(f"Data successfully saved to {output_file}")
        else:
            print("No valid player data collected. CSV file was not created.")
    except:
        return all_player_atts

We can load a new 'new_player_urls' file which was based on player_urls but reduced during a previous iteration of the model building where players that did not exist in the match dataset were removed:

In [None]:
# Load player urls
player_urls = pd.read_csv('player_urls/new_player_urls.csv')['url'].to_list()

In [25]:
# Run scraping function
scrape_and_save_player_data(player_urls)

50 players checked, 3049 remaining
100 players checked, 2999 remaining
150 players checked, 2949 remaining
200 players checked, 2899 remaining
250 players checked, 2849 remaining
300 players checked, 2799 remaining
350 players checked, 2749 remaining
400 players checked, 2699 remaining
450 players checked, 2649 remaining
500 players checked, 2599 remaining
550 players checked, 2549 remaining
600 players checked, 2499 remaining
650 players checked, 2449 remaining
700 players checked, 2399 remaining
750 players checked, 2349 remaining
800 players checked, 2299 remaining
850 players checked, 2249 remaining
900 players checked, 2199 remaining
Maximum retries reached for https://matchstat.com//tennis/player/Nocolas%20Alberto%20Jara%20Lozano. Exiting function.
950 players checked, 2149 remaining
1000 players checked, 2099 remaining
1050 players checked, 2049 remaining
1100 players checked, 1999 remaining
1150 players checked, 1949 remaining
1200 players checked, 1899 remaining
1250 players c

We have now successfully created our player dataset which contains multiple stats for each player across multiple years. This should give us a lot of data to work with to engineer features when building our betting model.