### Web scraper for the necessary data

A web scraper that collects all relevant data points for each dive spot.

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm  # For progress display
import pandas as pd
import time
import os

# --- Run configuration -------------------------------------------------------

# Listing page of all PADI dive sites; the page number is appended as ?page=N
base_url = "https://www.padi.com/dive-sites/all/"

# Workbook that receives the scraped rows
excel_file_path = "dive_sites_details.xlsx"

# Cookie-banner flag: 0 until the consent button has been clicked once, then 1
initial = 0

# --- Browser -----------------------------------------------------------------

# Chrome is started without a visible window
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # headless mode
options.add_argument("--no-sandbox")  # optional, intended to improve stability
options.add_argument("--disable-dev-shm-usage")  # optional, intended to avoid memory issues

# Start the WebDriver session and maximise its window
driver = webdriver.Chrome(options=options)
driver.maximize_window()

# Inclusive range of listing pages handled by this run. Rows are appended to
# the workbook page by page, so an interrupted run can be resumed by raising
# first_page to the first page that was not saved.
first_page, last_page = 43, 221


def _dismiss_cookie_banner(driver):
    """Click the cookie-consent button inside the page's first iframe.

    The banner is assumed to live in the first <iframe> of the document and
    its accept button to carry the class "call" (as the page did when this
    scraper was written -- verify if the site layout changes).

    Returns True when a button was found and clicked, False when the iframe
    or the button is absent. The driver is always switched back to the main
    document, even when clicking fails; otherwise every later lookup on the
    page would search inside the iframe and find nothing.
    """
    iframes = driver.find_elements(By.TAG_NAME, "iframe")
    if not iframes:
        return False
    try:
        driver.switch_to.frame(iframes[0])
        buttons = driver.find_elements(By.CLASS_NAME, "call")
        if not buttons:
            return False
        buttons[0].click()
        time.sleep(1)  # give the banner a moment to disappear
        return True
    finally:
        driver.switch_to.default_content()


def _scrape_dive_site(driver, link):
    """Open one dive-site page and return its data as a dict.

    The title is mandatory: if it does not appear within 10 seconds the
    WebDriverWait timeout propagates to the caller. Rating, image and the
    overview metrics are optional; a missing element leaves the field at its
    default (None, or '' for the two list-like fields) and an unexpected
    error while reading them is reported but does not discard the site.
    """
    driver.get(link)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "dive-site-header__title"))
    )
    title = driver.find_element(By.CLASS_NAME, "dive-site-header__title").text.strip()

    # find_elements returns an empty list instead of raising, so the expected
    # "element not on this page" case needs no exception handling.
    rating = None
    try:
        rating_elements = driver.find_elements(By.CLASS_NAME, "rating__percentage")
        if rating_elements:
            rating = rating_elements[0].text.strip()
    except Exception as e:
        tqdm.write(f"Could not read rating for {link}: {e}")

    # First picture of the photo collage, if the page has one
    image_link = None
    try:
        collages = driver.find_elements(By.CLASS_NAME, "collage-photos")
        if collages:
            first_images = collages[0].find_elements(By.CSS_SELECTOR, "div[data-index='0'] img")
            if first_images:
                image_link = first_images[0].get_attribute("src")
    except Exception as e:
        tqdm.write(f"Could not read image for {link}: {e}")

    # Overview metrics: each metric block has one title and one or more subtitles
    location, max_depth = None, None
    dive_types, common_sightings = [], []
    try:
        metrics = driver.find_elements(By.CLASS_NAME, "dive-site-overview__content-metric__description")
        for metric in metrics:
            titles = metric.find_elements(By.CLASS_NAME, "dive-site-overview__content-metric__title")
            if not titles:
                continue
            title_text = titles[0].text.strip()
            subtitles = metric.find_elements(By.CLASS_NAME, "dive-site-overview__content-metric__subtitle")
            values = [text for text in (subtitle.text.strip() for subtitle in subtitles) if text]
            if not values:
                continue

            if title_text == "Location":
                location = values[-1]
            elif title_text == "Dive Types":
                dive_types.extend(values)
            elif title_text == "Common Sightings":
                common_sightings.extend(values)
            elif title_text == "Maximum Depth":
                max_depth = values[-1]
    except Exception as e:
        # Keep whatever was collected before the failure
        tqdm.write(f"Could not read all metrics for {link}: {e}")

    return {
        "URL": link,
        "Title": title,
        "Location": location,
        "Dive Types": ", ".join(dive_types),
        "Common Sightings": ", ".join(common_sightings),
        "Maximum Depth": max_depth,
        "Rating": rating,
        "Image": image_link,
    }


def _append_page_to_excel(rows, path):
    """Append the row dicts of one listing page to the workbook at `path`.

    The first write creates the file including the header row; later writes
    add the rows below the existing data of 'Sheet1' without a header.
    Nothing is written for an empty page, because creating the workbook from
    an empty frame would leave it without a header row for all later appends.

    Returns True when rows were written, False when `rows` was empty.
    """
    if not rows:
        return False
    df = pd.DataFrame(rows)
    if os.path.exists(path):
        with pd.ExcelWriter(path, mode='a', if_sheet_exists="overlay", engine="openpyxl") as writer:
            # startrow is 0-based while max_row counts used rows, so this is
            # the first empty row below the existing data.
            df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
    else:
        df.to_excel(path, index=False)
    return True


# Loop through the listing pages; the finally clause guarantees the browser is
# shut down even when the run is interrupted or a page fails to load.
try:
    for page in range(first_page, last_page + 1):
        url = f"{base_url}?page={page}"
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "container--vQkgc"))
        )

        # Close the cookie banner once; retry on later pages until it worked
        if initial == 0:
            try:
                if _dismiss_cookie_banner(driver):
                    initial = 1
            except Exception as e:
                print(f"Could not dismiss cookie banner on page {page}: {e}")

        # Collect links to dive sites (elements without an href are skipped)
        dive_site_links = driver.find_elements(By.CLASS_NAME, "container--vQkgc")
        links = [
            href
            for href in (element.get_attribute("href") for element in dive_site_links)
            if href
        ]

        # Rows gathered for the current listing page
        current_page_data = []

        for link in tqdm(links, desc=f"Scraping Page {page} ", position=0, unit="dive site"):
            try:
                current_page_data.append(_scrape_dive_site(driver, link))
            except Exception as e:
                # One broken site must not abort the page; report and move on.
                # tqdm.write keeps the progress bar intact.
                tqdm.write(f"Error retrieving data from {link}: {e}")

        # Append the current page's data to the Excel file
        if _append_page_to_excel(current_page_data, excel_file_path):
            print(f"Data from page {page} successfully saved.")
        else:
            print(f"No data collected from page {page}; nothing written.")
finally:
    # Close the WebDriver
    driver.quit()

print("Scraping complete.")

Scraping Page 43 : 100%|██████████| 20/20 [00:48<00:00,  2.45s/dive site]


Data from page 43 successfully saved.


Scraping Page 44 : 100%|██████████| 20/20 [00:47<00:00,  2.37s/dive site]


Data from page 44 successfully saved.


Scraping Page 45 : 100%|██████████| 20/20 [00:53<00:00,  2.66s/dive site]


Data from page 45 successfully saved.


Scraping Page 46 : 100%|██████████| 20/20 [00:50<00:00,  2.53s/dive site]


Data from page 46 successfully saved.


Scraping Page 47 : 100%|██████████| 20/20 [00:52<00:00,  2.60s/dive site]


Data from page 47 successfully saved.


Scraping Page 48 : 100%|██████████| 20/20 [01:02<00:00,  3.13s/dive site]


Data from page 48 successfully saved.


Scraping Page 49 : 100%|██████████| 20/20 [00:56<00:00,  2.84s/dive site]


Data from page 49 successfully saved.


Scraping Page 50 : 100%|██████████| 20/20 [00:57<00:00,  2.88s/dive site]


Data from page 50 successfully saved.


Scraping Page 51 : 100%|██████████| 20/20 [01:01<00:00,  3.09s/dive site]


Data from page 51 successfully saved.


Scraping Page 52 : 100%|██████████| 20/20 [00:57<00:00,  2.89s/dive site]


Data from page 52 successfully saved.


Scraping Page 53 : 100%|██████████| 20/20 [01:00<00:00,  3.02s/dive site]


Data from page 53 successfully saved.


Scraping Page 54 : 100%|██████████| 20/20 [00:54<00:00,  2.75s/dive site]


Data from page 54 successfully saved.


Scraping Page 55 : 100%|██████████| 20/20 [00:45<00:00,  2.26s/dive site]


Data from page 55 successfully saved.


Scraping Page 56 : 100%|██████████| 20/20 [00:56<00:00,  2.84s/dive site]


Data from page 56 successfully saved.


Scraping Page 57 : 100%|██████████| 20/20 [00:48<00:00,  2.40s/dive site]


Data from page 57 successfully saved.


Scraping Page 58 : 100%|██████████| 20/20 [00:54<00:00,  2.70s/dive site]


Data from page 58 successfully saved.


Scraping Page 59 : 100%|██████████| 20/20 [00:53<00:00,  2.65s/dive site]


Data from page 59 successfully saved.


Scraping Page 60 : 100%|██████████| 20/20 [00:41<00:00,  2.08s/dive site]


Data from page 60 successfully saved.


Scraping Page 61 : 100%|██████████| 20/20 [00:50<00:00,  2.51s/dive site]


Data from page 61 successfully saved.


Scraping Page 62 : 100%|██████████| 20/20 [00:48<00:00,  2.43s/dive site]


Data from page 62 successfully saved.


Scraping Page 63 : 100%|██████████| 20/20 [00:51<00:00,  2.57s/dive site]


Data from page 63 successfully saved.


Scraping Page 64 : 100%|██████████| 20/20 [00:50<00:00,  2.54s/dive site]


Data from page 64 successfully saved.


Scraping Page 65 : 100%|██████████| 20/20 [00:45<00:00,  2.28s/dive site]


Data from page 65 successfully saved.


Scraping Page 66 : 100%|██████████| 20/20 [00:58<00:00,  2.90s/dive site]


Data from page 66 successfully saved.


Scraping Page 67 : 100%|██████████| 20/20 [00:56<00:00,  2.84s/dive site]


Data from page 67 successfully saved.


Scraping Page 68 : 100%|██████████| 20/20 [00:48<00:00,  2.44s/dive site]


Data from page 68 successfully saved.


Scraping Page 69 : 100%|██████████| 20/20 [00:38<00:00,  1.95s/dive site]


Data from page 69 successfully saved.


Scraping Page 70 : 100%|██████████| 20/20 [00:42<00:00,  2.13s/dive site]


Data from page 70 successfully saved.


Scraping Page 71 : 100%|██████████| 20/20 [00:54<00:00,  2.74s/dive site]


Data from page 71 successfully saved.


Scraping Page 72 : 100%|██████████| 20/20 [00:46<00:00,  2.32s/dive site]


Data from page 72 successfully saved.


Scraping Page 73 : 100%|██████████| 20/20 [00:42<00:00,  2.10s/dive site]


Data from page 73 successfully saved.


Scraping Page 74 : 100%|██████████| 20/20 [00:55<00:00,  2.76s/dive site]


Data from page 74 successfully saved.


Scraping Page 75 : 100%|██████████| 20/20 [00:54<00:00,  2.70s/dive site]


Data from page 75 successfully saved.


Scraping Page 76 : 100%|██████████| 20/20 [00:51<00:00,  2.57s/dive site]


Data from page 76 successfully saved.


Scraping Page 77 : 100%|██████████| 20/20 [00:38<00:00,  1.90s/dive site]


Data from page 77 successfully saved.


Scraping Page 78 : 100%|██████████| 20/20 [00:58<00:00,  2.90s/dive site]


Data from page 78 successfully saved.


Scraping Page 79 : 100%|██████████| 20/20 [00:50<00:00,  2.54s/dive site]


Data from page 79 successfully saved.


Scraping Page 80 : 100%|██████████| 20/20 [00:38<00:00,  1.94s/dive site]


Data from page 80 successfully saved.


Scraping Page 81 : 100%|██████████| 20/20 [01:05<00:00,  3.25s/dive site]


Data from page 81 successfully saved.


Scraping Page 82 : 100%|██████████| 20/20 [00:55<00:00,  2.79s/dive site]


Data from page 82 successfully saved.


Scraping Page 83 : 100%|██████████| 20/20 [01:18<00:00,  3.91s/dive site]


Data from page 83 successfully saved.


Scraping Page 84 : 100%|██████████| 20/20 [00:51<00:00,  2.57s/dive site]


Data from page 84 successfully saved.


Scraping Page 85 : 100%|██████████| 20/20 [00:40<00:00,  2.02s/dive site]


Data from page 85 successfully saved.


Scraping Page 86 : 100%|██████████| 20/20 [01:05<00:00,  3.25s/dive site]


Data from page 86 successfully saved.


Scraping Page 87 : 100%|██████████| 20/20 [01:02<00:00,  3.11s/dive site]


Data from page 87 successfully saved.


Scraping Page 88 : 100%|██████████| 20/20 [00:39<00:00,  1.95s/dive site]


Data from page 88 successfully saved.


Scraping Page 89 : 100%|██████████| 20/20 [00:47<00:00,  2.39s/dive site]


Data from page 89 successfully saved.


Scraping Page 90 : 100%|██████████| 20/20 [00:55<00:00,  2.75s/dive site]


Data from page 90 successfully saved.


Scraping Page 91 : 100%|██████████| 20/20 [00:40<00:00,  2.01s/dive site]


Data from page 91 successfully saved.


Scraping Page 92 : 100%|██████████| 20/20 [00:42<00:00,  2.12s/dive site]


Data from page 92 successfully saved.


Scraping Page 93 : 100%|██████████| 20/20 [00:56<00:00,  2.83s/dive site]


Data from page 93 successfully saved.


Scraping Page 94 : 100%|██████████| 20/20 [00:48<00:00,  2.44s/dive site]


Data from page 94 successfully saved.


Scraping Page 95 : 100%|██████████| 20/20 [01:15<00:00,  3.77s/dive site]


Data from page 95 successfully saved.


Scraping Page 96 : 100%|██████████| 20/20 [00:37<00:00,  1.88s/dive site]


Data from page 96 successfully saved.


Scraping Page 97 : 100%|██████████| 20/20 [00:41<00:00,  2.09s/dive site]


Data from page 97 successfully saved.


Scraping Page 98 : 100%|██████████| 20/20 [01:19<00:00,  3.98s/dive site]


Data from page 98 successfully saved.


Scraping Page 99 : 100%|██████████| 20/20 [01:25<00:00,  4.30s/dive site]


Data from page 99 successfully saved.


Scraping Page 100 : 100%|██████████| 20/20 [00:42<00:00,  2.15s/dive site]


Data from page 100 successfully saved.


Scraping Page 101 :  30%|███       | 6/20 [00:27<01:05,  4.65s/dive site]


KeyboardInterrupt: 