# Web Scraping Job Advertisements

Data from: https://www.jobs.ch/de/stellenangebote/informatik-telekommunikation

## Libraries and settings

In [14]:
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime

## Web scraper

### Create URLs

In [15]:
# Define the number of pages to be scraped (listed are 24 job ads per page)
pages = 2

# Define the topic (for overview see occupational groups on https://www.jobs.ch)
# topic = 'informatik-telekommunikation'
topic = 'gastronomie-lebensmittel-tourismus'

# Define URLs
urls = [f'https://www.jobs.ch/de/stellenangebote/{topic}/?page={i}' for i in range(1, pages+1)]

# Check if the URLs are valid
urls

['https://www.jobs.ch/de/stellenangebote/gastronomie-lebensmittel-tourismus/?page=1',
 'https://www.jobs.ch/de/stellenangebote/gastronomie-lebensmittel-tourismus/?page=2']

## Run the web scraper

In [None]:
# List to store job links
job_links = []

# Get job links from the overview pages
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    for a_tag in soup.select("div[data-feat='searched_jobs']:nth-of-type(n+5) a"):
        link = a_tag.get('href')
        if link and not link.startswith('http'):
            link = 'https://www.jobs.ch' + link
        job_links.append(link)

# List to store job ad details
job_data_list = []

# Fetch and parse details from each job detail page
for job_url in job_links:
    job_response = requests.get(job_url)
    job_soup = BeautifulSoup(job_response.text, 'html.parser')

    title = job_soup.select_one('[data-cy="vacancy-title"]').get_text(strip=True) if job_soup.select_one('[data-cy="vacancy-title"]') else None

    description_section = job_soup.select_one('div[data-feat="vacancy_description"]')
    tasks, profile = None, None

    if description_section:

        # Find the headers and their following elements
        headers = description_section.select('h3.textStyle_h5')
        for header in headers:
            header_text = header.get_text(strip=True)

            # Extract 'Ihre Aufgaben'
            if 'Ihre Aufgaben' in header_text:
                tasks_list = header.find_next_sibling('ul')
                tasks = tasks_list.get_text(separator='\n', strip=True) if tasks_list else None

            # Extract 'Was Sie mitbringen'
            elif 'Was Sie mitbringen' in header_text:
                profile_paragraph = header.find_next_sibling('span')
                profile = profile_paragraph.get_text(separator='\n', strip=True) if profile_paragraph else None

    # Extract company contact
    contact = job_soup.select_one('[data-cy="company-name"]').get_text(strip=True) if job_soup.select_one('[data-cy="company-name"]') else None

    # Store the data in a dictionary
    job_data = {
        'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'link': job_url,
        'title': title,
        'tasks': tasks,
        'profile': profile,
        'contact': contact,
        'html': job_response.text
    }

    job_data_list.append(job_data)

# Store the data in a JSON file
with open('./data/jobs_ch.json', 'w') as f:
    json.dump(job_data_list, f, indent=2, ensure_ascii=False)

# Output JSON
print(json.dumps(job_data_list, indent=2, ensure_ascii=False))

[
  {
    "date": "2025-03-24 16:13:18",
    "link": "https://www.jobs.ch/de/stellenangebote/detail/af69bf6b-3056-455c-8654-b6b239237c29/",
    "title": "Hotelreiniger:In in Opfikon Zürich 40% - 80%",
    "tasks": null,
    "profile": null,
    "contact": "Enzler Hotel-Services AG",
    "html": "<!doctype html>\n<html  lang=\"de\">\n  <head>\n    <meta charset=\"utf-8\">\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n\n    <!-- No cache for index.html -->\n    \n\n    <title>Hotelreiniger:In in Opfikon Zürich 40% - 80% - Stellenangebot bei Enzler Hotel-Services AG - jobs.ch</title>\n\n    <!-- Preconnects : only first party origin and used everywhere! -->\n\n    <link rel=\"preconnect\" href=\"https://media.jobs.ch/\" />\n    \n\n    \n    <link rel=\"preconnect\" href=\"//c.jobs.ch\" />\n    \n\n    <!-- Fonts -->\n    \n      <link as=\"font\" type=\"font/woff2\" href=\"/public/fonts/03d2aa5e66b427f22f38.woff2\" fetchpriority=\"high\" crossorigin /

### Jupyter notebook --footer info--

In [17]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
NT
Windows | 10
Datetime: 2025-03-24 16:13:28
Python Version: 3.11.10
-----------------------------------
