In [None]:
import time
import math
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [2]:
# Site root; relative catalog links are resolved against this.
BASE_URL = "https://www.shl.com"

# Landing page of the product catalog (also the endpoint that takes the paging params).
CATALOG_URL = BASE_URL + "/products/product-catalog/"

# Headers passed to requests.get by get_soup: identifies the crawler and gives a contact address.
HEADERS = {"User-Agent": "Siddharth/1.0 (contact: siddharth11.sarkar@gmail.com)"}

In [3]:
def get_soup(url, params=None):
    """
    Fetch `url` and return its body parsed with BeautifulSoup's "html.parser".

    `params` is forwarded to `requests.get` as the query parameters. The
    request sends the module-level HEADERS and uses a 15-second timeout.
    An error status propagates as the exception raised by `raise_for_status()`.
    """
    response = requests.get(url, headers=HEADERS, params=params, timeout=15)
    # Fail fast on HTTP error statuses instead of parsing an error page.
    response.raise_for_status()
    html = response.text
    return BeautifulSoup(html, "html.parser")


In [4]:
def extract_last_page_for_individual_tests(soup):
    """
    Return the highest page number in the pagination that follows the
    'Individual Test Solutions' header on the first catalog page.

    Only links that come *after* the header in document order are inspected,
    so numeric links belonging to sections earlier on the page cannot inflate
    the result.

    Raises:
        RuntimeError: if the header text is not found, or if no link with a
            purely numeric label follows it.
    """
    # Find the text node / heading that contains "Individual Test Solutions"
    header_text = soup.find(string=lambda t: t and "Individual Test Solutions" in t)
    if not header_text:
        raise RuntimeError("Couldn't find 'Individual Test Solutions' header")

    # The pagination block sits below the header: walk forward from the header
    # and keep every link whose label is a number.
    page_nums = []
    for a in header_text.find_all_next("a"):
        text = (a.get_text() or "").strip()
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which would make int() raise.
        if text.isdecimal():
            page_nums.append(int(text))

    if not page_nums:
        raise RuntimeError("No pagination numbers found")

    return max(page_nums)

In [5]:
def extract_tests_from_page(soup):
    """
    Collect the catalog entries linked from a type=1 (Individual Test
    Solutions) listing page.

    Every anchor whose href contains '/products/product-catalog/view/' and
    whose text is non-empty yields a dict with:
      - 'name':  the anchor text, stripped
      - 'url':   the href resolved against BASE_URL
      - 'codes': the stripped text node directly after the anchor, or ''
                 when the next sibling is missing or is not text
    """
    view_marker = "/products/product-catalog/view/"
    results = []

    for link in soup.find_all("a", href=True):
        target = link["href"]
        if view_marker not in target:
            continue

        label = link.get_text(strip=True)
        if not label:
            continue

        # A text node right after the anchor may carry category codes like "K P S".
        sibling = link.next_sibling
        has_text_sibling = bool(sibling) and isinstance(sibling, str)

        results.append({
            "name": label,
            "url": urljoin(BASE_URL, target),
            "codes": sibling.strip() if has_text_sibling else "",
        })

    return results


In [6]:
def crawl_individual_tests(page_size=12, delay=1):
    """
    Crawl every page of the Individual Test Solutions listing.

    The last page number is read from the main catalog page. Each page is then
    requested from CATALOG_URL with type=1 and, for every page after the
    first, a `start` offset of (page - 1) * page_size. Entries are
    de-duplicated by URL; a later occurrence replaces an earlier one.

    Args:
        page_size: number of entries per listing page, used to compute the
            `start` offset. Defaults to 12.
        delay: seconds to sleep between consecutive page requests.
            Defaults to 1.

    Returns:
        A list of the dicts produced by extract_tests_from_page, one per
        unique URL.
    """
    # 1. Fetch main page to infer last page
    soup_first = get_soup(CATALOG_URL)
    last_page = extract_last_page_for_individual_tests(soup_first)
    print(f"Detected {last_page} pages of Individual Test Solutions")

    all_tests = {}
    for page in range(1, last_page + 1):
        start = (page - 1) * page_size
        # The first page is requested without a `start` parameter.
        params = {"type": 1, "start": start} if start > 0 else {"type": 1}

        print(f"Crawling page {page}/{last_page} with params={params}...")
        soup = get_soup(CATALOG_URL, params=params)
        tests = extract_tests_from_page(soup)

        for t in tests:
            # Deduplicate by URL
            all_tests[t["url"]] = t

        # Be polite between requests; nothing follows the final page, so
        # there is no reason to wait after it.
        if page < last_page:
            time.sleep(delay)

    tests_list = list(all_tests.values())
    print(f"Collected {len(tests_list)} unique Individual Test Solutions")
    return tests_list

In [7]:
# Smallest number of Individual Test Solutions a complete crawl must yield;
# anything below this is treated as an incomplete crawl.
MIN_EXPECTED_TESTS = 377

if __name__ == "__main__":
    # `tests` is deliberately left at module level: the CSV-export cell reads it.
    tests = crawl_individual_tests()
    if len(tests) < MIN_EXPECTED_TESTS:
        # Threshold and message share one constant so they cannot drift apart.
        raise RuntimeError(
            f"Only found {len(tests)} tests, need at least {MIN_EXPECTED_TESTS}"
        )

Detected 32 pages of Individual Test Solutions
Crawling page 1/32 with params={'type': 1}...
Crawling page 2/32 with params={'type': 1, 'start': 12}...
Crawling page 3/32 with params={'type': 1, 'start': 24}...
Crawling page 4/32 with params={'type': 1, 'start': 36}...
Crawling page 5/32 with params={'type': 1, 'start': 48}...
Crawling page 6/32 with params={'type': 1, 'start': 60}...
Crawling page 7/32 with params={'type': 1, 'start': 72}...
Crawling page 8/32 with params={'type': 1, 'start': 84}...
Crawling page 9/32 with params={'type': 1, 'start': 96}...
Crawling page 10/32 with params={'type': 1, 'start': 108}...
Crawling page 11/32 with params={'type': 1, 'start': 120}...
Crawling page 12/32 with params={'type': 1, 'start': 132}...
Crawling page 13/32 with params={'type': 1, 'start': 144}...
Crawling page 14/32 with params={'type': 1, 'start': 156}...
Crawling page 15/32 with params={'type': 1, 'start': 168}...
Crawling page 16/32 with params={'type': 1, 'start': 180}...
Crawling

In [None]:
import csv

# Export the crawl results to CSV, one row per test.
# Relies on `tests` (a list of dicts with 'name', 'url', 'codes') left in the
# namespace by the crawl cell; run that cell first.
# newline="" is the setting the csv module documentation asks for when opening
# the output file, so the writer controls line endings itself.
with open("shl_individual_tests.csv", "w", newline="", encoding="utf-8") as f:
    # Column order matches the keys built in extract_tests_from_page.
    writer = csv.DictWriter(f, fieldnames=["name", "url", "codes"])
    writer.writeheader()
    writer.writerows(tests)