In [2]:
# notebooks/01_build_catalog.ipynb (convertible to .py if preferred)

"""
This notebook crawls the SHL product catalog and builds a structured CSV containing:
 - name
 - url
 - description
 - duration_minutes (if available)
 - inferred test_type (K, P, or Unknown)

Output: data/catalog.csv
"""

import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.shl.com/solutions/products/product-catalog/"
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; SHLRecommenderBot/1.0)"}

# Download the catalog landing page. requests applies no timeout by default, so
# one is passed explicitly to keep the cell from hanging if the server never
# answers; 10 s matches the timeout used for the per-product requests.
print("Fetching main catalog page...")
resp = requests.get(BASE_URL, headers=HEADERS, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Extract assessment links: every anchor whose href contains '/view/'
# (presumably the product detail pages), in first-seen order without repeats.
# NOTE(review): only this single page is scanned; if the catalog is paginated,
# later pages are never crawled — confirm against the live site.
links = [a['href'] for a in soup.find_all('a', href=True) if '/view/' in a['href']]
links = list(dict.fromkeys(links))  # deduplicate, preserving order
print(f"Found {len(links)} raw links.")

# "<number> <unit>" in lower-cased text. The trailing \b stops "5 minimum" from
# being read as "5 min"; the optional fraction keeps "1.5 hours" from being read
# as "5 hours".
DURATION_RE = re.compile(r'(\d+(?:\.\d+)?)\s*(hours?|minutes?|mins?)\b')

# Visit each product link and collect one record (dict) per successfully
# fetched page. Failures are reported and skipped so one bad page does not
# abort the whole crawl.
data = []
for i, link in enumerate(links, 1):
    # Resolve the href against the catalog URL: absolute URLs pass through,
    # root-relative paths land on the same host, and plain relative paths are
    # resolved instead of being glued onto the host name.
    full_url = urljoin(BASE_URL, link)
    try:
        page = requests.get(full_url, headers=HEADERS, timeout=10)
        if page.status_code != 200:
            print(f"[{i}] Skipped {full_url} (status {page.status_code})")
            continue
        sp = BeautifulSoup(page.text, "html.parser")

        # Extract title and description
        title_tag = sp.find(['h1', 'h2'])
        # NOTE(review): the recorded run shows every description equal to
        # "We recommend upgrading to a modern browser.", which suggests the
        # 'product-description' div is not found and the first <p> fallback is
        # picking up boilerplate. Confirm the right selector on the live page.
        desc_tag = sp.find('div', class_='product-description') or sp.find('p')
        name = title_tag.get_text(strip=True) if title_tag else None
        desc = desc_tag.get_text(' ', strip=True) if desc_tag else ''

        # Duration parsing: first "<number> <unit>" mention, normalised to
        # whole minutes; None when the description mentions no duration.
        duration = None
        duration_match = DURATION_RE.search(desc.lower())
        if duration_match:
            val = float(duration_match.group(1))
            unit = duration_match.group(2)
            minutes = val * 60 if unit.startswith('hour') else val
            duration = int(round(minutes))

        data.append({
            'name': name,
            'url': full_url,
            'description': desc,
            'duration_minutes': duration
        })
        print(f"[{i}/{len(links)}] Parsed: {name}")
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next link.
        print(f"Error parsing {link}: {e}")
    finally:
        # Politeness delay after every request, including skipped and failed
        # ones, so error responses do not turn into rapid-fire retries.
        time.sleep(0.5)
# Create dataframe. Columns are named explicitly so the frame keeps its schema
# even when the crawl produced no records; a column-less empty frame would make
# the later df['description'] lookup raise KeyError.
df = pd.DataFrame(data, columns=['name', 'url', 'description', 'duration_minutes'])

# Heuristic test_type inference
# Both patterns are plain alternations searched anywhere in the text, so they
# match substrings too (e.g. "excel" also matches inside "excellent").
TECH_KEYWORDS = r"python|java|excel|sql|developer|data|analysis|programming|code|engineer|technical|digital"
BEHAV_KEYWORDS = r"team|leader|behavior|communication|collaboration|personality|emotional|situational|style"

def infer_test_type(text):
    """Guess an assessment's test type from free text.

    Args:
        text: Description text. Non-string values (None, or the float NaN that
            pandas uses for missing cells) are treated as having no text.

    Returns:
        'K' if any TECH_KEYWORDS alternative occurs in the lower-cased text,
        otherwise 'P' if any BEHAV_KEYWORDS alternative occurs, otherwise
        'Unknown'. Technical keywords take precedence when both are present.
    """
    # Missing descriptions arrive as None/NaN and have no .lower(); without
    # this guard a single empty cell would abort the whole .apply().
    if not isinstance(text, str):
        return 'Unknown'
    text = text.lower()
    if re.search(TECH_KEYWORDS, text):
        return 'K'
    elif re.search(BEHAV_KEYWORDS, text):
        return 'P'
    return 'Unknown'

# Label every row from its description text.
df['test_type'] = df['description'].apply(infer_test_type)

# Save results. to_csv does not create missing directories, so make sure the
# target folder exists before writing.
out_path = '../data/catalog.csv'
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print(f"Saved {len(df)} assessments to {out_path}")

# Preview the first rows as the cell output.
df.head()


Fetching main catalog page...
Found 24 raw links.
[1/24] Parsed: Account Manager Solution
[2/24] Parsed: Administrative Professional - Short Form
[3/24] Parsed: Agency Manager Solution
[4/24] Parsed: Apprentice + 8.0 Job Focused Assessment
[5/24] Parsed: Apprentice 8.0 Job Focused Assessment
[6/24] Parsed: Bank Administrative Assistant - Short Form
[7/24] Parsed: Bank Collections Agent - Short Form
[8/24] Parsed: Bank Operations Supervisor - Short Form
[9/24] Parsed: Bilingual Spanish Reservation Agent Solution
[10/24] Parsed: Bookkeeping, Accounting, Auditing Clerk Short Form
[11/24] Parsed: Branch Manager - Short Form
[12/24] Parsed: Cashier Solution
[13/24] Parsed: Global Skills Development Report
[14/24] Parsed: .NET Framework 4.5
[15/24] Parsed: .NET MVC (New)
[16/24] Parsed: .NET MVVM (New)
[17/24] Parsed: .NET WCF (New)
[18/24] Parsed: .NET WPF (New)
[19/24] Parsed: .NET XAML (New)
[20/24] Parsed: Accounts Payable (New)
[21/24] Parsed: Accounts Payable Simulation (New)
[22/24] P

Unnamed: 0,name,url,description,duration_minutes,test_type
0,Account Manager Solution,https://www.shl.com/products/product-catalog/v...,We recommend upgrading to a modern browser.,,Unknown
1,Administrative Professional - Short Form,https://www.shl.com/products/product-catalog/v...,We recommend upgrading to a modern browser.,,Unknown
2,Agency Manager Solution,https://www.shl.com/products/product-catalog/v...,We recommend upgrading to a modern browser.,,Unknown
3,Apprentice + 8.0 Job Focused Assessment,https://www.shl.com/products/product-catalog/v...,We recommend upgrading to a modern browser.,,Unknown
4,Apprentice 8.0 Job Focused Assessment,https://www.shl.com/products/product-catalog/v...,We recommend upgrading to a modern browser.,,Unknown


In [3]:
import pandas as pd

# Reload the crawl output, keep the first row for each URL, and discard rows
# that have no name.
df = (
    pd.read_csv('../data/catalog.csv')
    .drop_duplicates(subset=['url'])
    .dropna(subset=['name'])
)

# Rows without a parsed duration receive a fixed placeholder value of 60.
df = df.assign(duration_minutes=lambda frame: frame['duration_minutes'].fillna(60))

# Write the cleaned catalog next to the raw one and report how many rows remain.
df.to_csv('../data/catalog_clean.csv', index=False)
print(f"Clean catalog saved with {len(df)} items.")


Clean catalog saved with 24 items.
