### Web scraper for the required data

A web scraper that collects all relevant data points for each dive spot.

In [13]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm  # For progress display

# Scrape the PADI dive-site listing pages, visit every dive site linked from
# each page, and append the extracted details to an Excel workbook one listing
# page at a time, so an interrupted run keeps the pages already saved.

# Options for headless operation
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Enable headless mode
options.add_argument("--no-sandbox")  # (optional) to improve stability
options.add_argument("--disable-dev-shm-usage")  # (optional) to avoid memory issues

# Base URL for the PADI dive site page
base_url = "https://www.padi.com/dive-sites/all/"

# Excel file path
excel_file_path = "dive_sites_details.xlsx"

# Listing pages to scrape (both bounds inclusive)
first_page = 1
last_page = 221

# Seconds to wait for the expected element to appear after a navigation
wait_timeout = 10


def _dismiss_cookie_banner(driver):
    """Click the cookie-consent button inside the first iframe of the page.

    Returns True when the button was clicked, False when the iframe or the
    button could not be found or clicked.  The driver is always switched back
    to the main document, even on failure; otherwise later element lookups
    would run inside the iframe.
    """
    try:
        driver.switch_to.frame(driver.find_element(By.TAG_NAME, "iframe"))
        driver.find_element(By.CLASS_NAME, "call").click()
        time.sleep(1)  # Give the banner a moment to close
        return True
    except WebDriverException:
        return False
    finally:
        driver.switch_to.default_content()


def _text_or_default(driver, class_name, default):
    """Return the stripped text of the first element with *class_name*.

    Returns *default* when the element cannot be located or read.
    """
    try:
        return driver.find_element(By.CLASS_NAME, class_name).text.strip()
    except WebDriverException:
        return default


def _extract_image_link(driver):
    """Return the ``src`` of the first collage image, or None if unavailable.

    Two markup variants are tried in order: an ``img`` nested in
    ``div[data-index='0']``, then an ``img`` carrying ``data-index='0'``
    itself.
    """
    try:
        collage = driver.find_element(By.CLASS_NAME, "collage-photos")
    except WebDriverException:
        # No photo collage on this page: the image is optional, so the rest
        # of the record must still be kept.
        return None

    for selector in ("div[data-index='0'] img", "img[data-index='0']"):
        try:
            return collage.find_element(By.CSS_SELECTOR, selector).get_attribute("src")
        except WebDriverException:
            continue
    return None


def _extract_metrics(driver):
    """Read the overview metrics of the currently loaded dive-site page.

    Returns a tuple ``(location, dive_types, common_sightings, max_depth)``.
    ``location`` and ``max_depth`` are None when absent; ``dive_types`` and
    ``common_sightings`` are comma-separated strings ('' when absent).  Empty
    subtitle texts are left out of the joined strings.
    """
    location, max_depth = None, None
    dive_types, common_sightings = [], []
    try:
        metrics = driver.find_elements(
            By.CLASS_NAME, "dive-site-overview__content-metric__description"
        )
        for metric in metrics:
            heading = metric.find_element(
                By.CLASS_NAME, "dive-site-overview__content-metric__title"
            ).text.strip()
            subtitles = metric.find_elements(
                By.CLASS_NAME, "dive-site-overview__content-metric__subtitle"
            )
            for subtitle in subtitles:
                value = subtitle.text.strip()
                if heading == "Location":
                    location = value
                elif heading == "Dive Types":
                    if value:
                        dive_types.append(value)
                elif heading == "Common Sightings":
                    if value:
                        common_sightings.append(value)
                elif heading == "Maximum Depth":
                    max_depth = value
    except WebDriverException:
        # Best effort: keep whatever was collected before the failure.
        pass
    return location, ", ".join(dive_types), ", ".join(common_sightings), max_depth


def _scrape_dive_site(driver, link):
    """Load the dive-site page at *link* and return its details as a dict.

    The title is mandatory: any error while loading the page or reading the
    title propagates to the caller.  Description, rating, image and metrics
    are optional and fall back to '' / None when missing.
    """
    driver.get(link)
    WebDriverWait(driver, wait_timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "dive-site-header__title"))
    )

    title = driver.find_element(By.CLASS_NAME, "dive-site-header__title").text.strip()
    description = _text_or_default(driver, "dive-site-overview__content-description", '')
    rating = _text_or_default(driver, "rating__percentage", None)
    image_link = _extract_image_link(driver)
    location, dive_types, common_sightings, max_depth = _extract_metrics(driver)

    return {
        "URL": link,
        "Title": title,
        "Description": description,
        "Location": location,
        "Dive Types": dive_types,
        "Common Sightings": common_sightings,
        "Maximum Depth": max_depth,
        "Rating": rating,
        "Image": image_link,
    }


def _append_to_excel(rows, path):
    """Append *rows* (a list of dicts) to the workbook at *path*.

    The workbook is created with a header row on the first write; later
    writes are appended below the last used row of 'Sheet1' without a header.
    Returns False and writes nothing when *rows* is empty, because an empty
    first write would create the workbook without its header row.
    """
    if not rows:
        return False

    df = pd.DataFrame(rows)
    if os.path.exists(path):
        with pd.ExcelWriter(path, mode='a', if_sheet_exists="overlay", engine="openpyxl") as writer:
            df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
    else:
        df.to_excel(path, index=False)
    return True


# Initialize WebDriver
driver = webdriver.Chrome(options=options)

# The cookie banner only has to be dismissed once per browser session
cookie_banner_dismissed = False

try:
    driver.maximize_window()

    # Loop through the listing pages
    for page in range(first_page, last_page + 1):
        url = f"{base_url}?page={page}"
        try:
            driver.get(url)
            WebDriverWait(driver, wait_timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container--vQkgc"))
            )
        except WebDriverException as e:
            # One listing page failing to load should not abort the whole run
            print(f"Error loading page {page}: {e}")
            continue

        # Close cookie banner if present (retried on later pages until it succeeds)
        if not cookie_banner_dismissed:
            cookie_banner_dismissed = _dismiss_cookie_banner(driver)

        # Collect links to dive sites, ignoring elements without an href
        dive_site_links = driver.find_elements(By.CLASS_NAME, "container--vQkgc")
        links = [
            href
            for href in (element.get_attribute("href") for element in dive_site_links)
            if href
        ]

        # Temporary list to store data for the current page
        current_page_data = []

        for link in tqdm(links, desc=f"Scraping Page {page} ", position=0, unit="dive site"):
            try:
                current_page_data.append(_scrape_dive_site(driver, link))
            except Exception as e:
                # Per-site boundary: report the failure and move on to the next site
                print(f"Error retrieving data: {e}")

        # Append the current page's data to the Excel file
        if _append_to_excel(current_page_data, excel_file_path):
            print(f"Data from page {page} successfully saved.")
        else:
            print(f"No data collected from page {page}; nothing saved.")
finally:
    # Close the WebDriver even when the run is interrupted or fails
    driver.quit()

print("Scraping complete.")

Scraping Page 1 : 100%|██████████| 20/20 [00:39<00:00,  1.97s/dive site]


Data from page 1 successfully saved.


Scraping Page 2 : 100%|██████████| 20/20 [00:38<00:00,  1.92s/dive site]


Data from page 2 successfully saved.


Scraping Page 3 : 100%|██████████| 20/20 [00:41<00:00,  2.09s/dive site]


Data from page 3 successfully saved.


Scraping Page 4 :  70%|███████   | 14/20 [00:27<00:11,  1.96s/dive site]


KeyboardInterrupt: 