In [1]:
pip install requests beautifulsoup4 pandas


Note: you may need to restart the kernel to use updated packages.


### Scraping the available courses frim the SHL website

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
import time

BASE_URL = "https://www.shl.com"
CATALOG_URL = "https://www.shl.com/en/solutions/products/product-catalog/?page={}"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Lists to hold scraped links, metadata, and any failed catalog pages
all_links = []
metadata = []
failed_pages = []

# ----------------------------
# Extract links and Adaptive/IRT status from each catalog page
# ----------------------------
def extract_links_and_adaptive(page_num):
    try:
        res = requests.get(CATALOG_URL.format(page_num), headers=HEADERS, timeout=15)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        rows = soup.select("div.custom__table-wrapper table tr[data-course-id]")

        for row in rows:
            # Extract course link
            name_cell = row.select_one("td.custom__table-heading__title a")
            if name_cell:
                link = urljoin(BASE_URL, name_cell["href"])

                # Extract Adaptive/IRT Support
                adaptive_cell = row.select("td.custom__table-heading__general")[1]  # Adaptive column
                adaptive = "Yes" if adaptive_cell.select_one(".-yes") else "No"

                # Append both link and adaptive status
                all_links.append({"link": link, "adaptive": adaptive})

        return True
    except Exception as e:
        print(f"‚ùå Failed to access catalog page {page_num}: {e}")
        failed_pages.append(page_num)
        return False

# ----------------------------
# Extract details from each course page
# ----------------------------
def extract_details_from_course_page(entry):
    try:
        url = entry["link"]
        adaptive = entry["adaptive"]

        res = requests.get(url, headers=HEADERS, timeout=15)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Extract course name from URL after /view/
        path = urlparse(url).path
        name = path.split("/view/")[-1].replace("-", " ").title()

        # Extract course description
        desc_section = soup.find("h4", string="Description")
        description = desc_section.find_next_sibling("p").text.strip() if desc_section else None

        # Extract duration
        duration_section = soup.find("h4", string="Assessment length")
        duration = duration_section.find_next_sibling("p").text.strip() if duration_section else None

        # Extract test type
        type_section = soup.select("span.product-catalogue__key")
        test_type = ", ".join([span.text.strip() for span in type_section]) if type_section else None

        # Extract remote testing status
        remote = "Yes" if soup.select_one("span.catalogue__circle.-yes") else "No"

        return {
            "Assessment Name": name,
            "URL": url,
            "Description": description,
            "Assessment Duration": duration,
            "Test Type": test_type,
            "Remote Testing Support": remote,
            "Adaptive/IRT Support": adaptive
        }

    except Exception as e:
        print(f"‚ùå Failed to extract data from {entry['link']}: {e}")
        return None

# ----------------------------
# Main scraping loop
# ----------------------------

# Step 1: Collect all course page links and Adaptive/IRT status
print("üîç Collecting product links and Adaptive/IRT status from catalog pages...")
for i in range(1, 33):
    extract_links_and_adaptive(i)
    time.sleep(1)

# Step 2: Visit each link and extract metadata
print("üì¶ Scraping individual product pages...")
for idx, entry in enumerate(all_links, 1):
    print(f"[{idx}/{len(all_links)}] Scraping: {entry['link']}")
    details = extract_details_from_course_page(entry)
    if details:
        metadata.append(details)
    time.sleep(0.5)

# Step 3: Save to CSV and show results
df = pd.DataFrame(metadata)
df.to_csv("shl_product_catalog_data.csv", index=False)

print("\n‚úÖ Scraping complete.")
if failed_pages:
    print("‚ö†Ô∏è The following catalog pages failed and need manual checking:", failed_pages)

# Display DataFrame
print("\nüìÑ Preview of scraped data:")
# print(df.head(10))
df.head()


üîç Collecting product links and Adaptive/IRT status from catalog pages...
üì¶ Scraping individual product pages...
[1/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/account-manager-solution/
[2/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/administrative-professional-short-form/
[3/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/agency-manager-solution/
[4/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/apprentice-8-0-job-focused-assessment-4261/
[5/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/apprentice-8-0-job-focused-assessment/
[6/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/bank-administrative-assistant-short-form/
[7/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/bank-collections-agent-short-form/
[8/384] Scraping: https://www.shl.com/solutions/products/product-catalog/view/bank-op

Unnamed: 0,Assessment Name,URL,Description,Assessment Duration,Test Type,Remote Testing Support,Adaptive/IRT Support
0,Account Manager Solution/,https://www.shl.com/solutions/products/product...,The Account Manager solution is an assessment ...,Approximate Completion Time in minutes = 49,"C, P, A, B, A, B, C, D, E, K, P, S",Yes,Yes
1,Administrative Professional Short Form/,https://www.shl.com/solutions/products/product...,The Administrative Professional solution is fo...,Approximate Completion Time in minutes = 36,"A, K, P, A, B, C, D, E, K, P, S",Yes,Yes
2,Agency Manager Solution/,https://www.shl.com/solutions/products/product...,The Agency Manager solution is for mid-level s...,Approximate Completion Time in minutes = 51,"A, B, P, S, A, B, C, D, E, K, P, S",Yes,Yes
3,Apprentice 8 0 Job Focused Assessment 4261/,https://www.shl.com/solutions/products/product...,The Apprentice + 8.0 Job-Focused Assessment is...,Approximate Completion Time in minutes = 30,"B, P, A, B, C, D, E, K, P, S",Yes,No
4,Apprentice 8 0 Job Focused Assessment/,https://www.shl.com/solutions/products/product...,The Apprentice 8.0 Job-Focused Assessment is a...,Approximate Completion Time in minutes = 20,"B, P, A, B, C, D, E, K, P, S",Yes,No
