In [1]:
pip install selenium undetected-chromedriver

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import hashlib
import json
import os
import random
import time
from urllib.parse import urlencode

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Search keywords used across runs (set KEYWORD below to one of them):
#{"Data Scientist", "Data Engineer", "Data Analyst", "ai ml engineer", "nlp engineer", "Data Architect", "MLOps Engineer", "Computer Vision Engineer"}

In [None]:
# --- Run configuration -------------------------------------------------------
# Search term sent to ZipRecruiter; it is also stored on every scraped entry.
KEYWORD = "Computer Vision Engineer"
# Location filter sent with the search.
LOCATION = "United States"
# JSON file holding the list of scraped job entries.
OUTPUT_FILE = "jobs.json"
# JSON file holding the description hashes used for duplicate detection.
HASH_FILE = "job_hashes.json"
# Number of result pages to request (pages 1..TOTAL_PAGES).
TOTAL_PAGES = 50
# Search URL without the page number; the scraper appends it.
# urlencode() escapes every reserved character (spaces become '+', and '&',
# '+', '#', ... are percent-encoded), so any keyword or location is safe.
BASE_URL = (
    "https://www.ziprecruiter.com/jobs-search?"
    + urlencode({"search": KEYWORD, "location": LOCATION})
    + "&page="
)

In [None]:
# Load the dataset (JSON list of job entries) written by previous runs, if any.
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        job_data = json.load(f)
else:
    job_data = []

# Load the description hashes used for duplicate detection, if any.
if os.path.exists(HASH_FILE):
    with open(HASH_FILE, "r", encoding="utf-8") as f:
        seen_hashes = set(json.load(f))
else:
    seen_hashes = set()

# Re-derive the hashes of every stored description and merge them in.
# This keeps duplicate detection correct when the hash file is missing or
# out of sync with the dataset (e.g. a run stopped between the two writes).
# The hash is computed the same way the scraper does: MD5 of the UTF-8 text.
seen_hashes.update(
    hashlib.md5(job["description"].encode("utf-8")).hexdigest()
    for job in job_data
    if isinstance(job, dict) and isinstance(job.get("description"), str)
)

# Browser setup: start Chrome through undetected-chromedriver with headless
# mode disabled, so a visible window is available for the manual login below.
options = uc.ChromeOptions()
options.headless = False
driver = uc.Chrome(options=options)
driver.set_window_size(1200, 800)

# Manual login and confirmation before starting the scraping process:
# open the first results page, then block until the user presses ENTER.
driver.get(BASE_URL + "1")
input(">> Log in to ZipRecruiter manually if required, then press ENTER to start scraping...\n")


# CSS selector matching the job-title links on a search results page.
JOB_LINK_SELECTOR = "a.break-words.text-primary"
# Headline fragments that identify an error / bot-check page instead of a job.
ERROR_PAGE_MARKERS = (
    "Web server is returning an unknown error",
    "Checking your browser",
)
# How many times a job page that came back as an error page is refreshed.
ERROR_PAGE_RETRIES = 2


def _write_json(path, payload, **dump_kwargs):
    """Write ``payload`` to ``path`` as JSON without risking the existing file.

    The data is written to ``<path>.tmp`` first and then moved over ``path``
    with ``os.replace``, so an interrupted write cannot truncate ``path``.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, **dump_kwargs)
    os.replace(tmp_path, path)


def _save_results():
    """Persist the current ``job_data`` and ``seen_hashes`` to their files."""
    _write_json(OUTPUT_FILE, job_data, indent=2, ensure_ascii=False)
    # Sorted so the hash file is stable between runs (sets have no order).
    _write_json(HASH_FILE, sorted(seen_hashes), indent=2)


# Main scraping loop. Everything runs inside try/finally so that an interrupt
# (e.g. stopping the kernel) or an unexpected crash still saves what was
# collected and closes the browser.
try:
    for page_count in range(1, TOTAL_PAGES + 1):
        print(f"\n>> Scraping Page {page_count}")
        jobs_before_page = len(job_data)

        # Step 1: collect the job URLs listed on this results page.
        try:
            driver.get(BASE_URL + str(page_count))
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, JOB_LINK_SELECTOR))
            )
            job_cards = driver.find_elements(By.CSS_SELECTOR, JOB_LINK_SELECTOR)
            hrefs = (card.get_attribute("href") for card in job_cards)
            # dict.fromkeys de-duplicates while keeping the on-page order.
            job_links = list(dict.fromkeys(href for href in hrefs if href))
            print(f">> Collected {len(job_links)} unique job URLs")
        except Exception as e:
            # Best effort: a page that fails to load is skipped, not fatal.
            print(f"!! Error collecting job links: {e}")
            continue

        # Step 2: visit every job page and extract title + description.
        for i, link in enumerate(job_links):
            try:
                print(f">> Visiting job #{i + 1}: {link}")
                driver.get(link)
                time.sleep(random.uniform(2.5, 4.5))

                # Retry logic for error pages: refresh up to ERROR_PAGE_RETRIES
                # times and re-read the title after every refresh, so an
                # error-page headline is never stored as a job title.
                for attempt in range(ERROR_PAGE_RETRIES + 1):
                    title_elem = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "h1"))
                    )
                    title = title_elem.text.strip()
                    if not any(marker in title for marker in ERROR_PAGE_MARKERS):
                        break
                    if attempt < ERROR_PAGE_RETRIES:
                        print(f"!! Server returned error page (Attempt {attempt + 1})")
                        time.sleep(5)
                        driver.refresh()
                else:
                    # Loop finished without `break`: still an error page.
                    print("!! Error page persisted after retries, skipping.")
                    continue

                desc_block = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.job_description"))
                )
                description = desc_block.text.strip()

                # Skip short / empty descriptions.
                if len(description) < 50:
                    print("!! Too short / empty description, skipping.")
                    continue

                # Skip jobs whose description was already stored (MD5 is used
                # only as a fingerprint here, not for security).
                job_hash = hashlib.md5(description.encode("utf-8")).hexdigest()
                if job_hash in seen_hashes:
                    print("!! Duplicate job based on description hash.")
                    continue

                seen_hashes.add(job_hash)
                job_entry = {
                    "title": title,
                    "description": description,
                    "keyword": KEYWORD
                }

                job_data.append(job_entry)
                print(f"[+] Scraped: {title} | Total: {len(job_data)}")

            except Exception as e:
                # Best effort: one broken job page must not stop the run.
                print(f"!! Error scraping job: {e}")
                continue

        # Checkpoint after every page that produced new entries, so a later
        # failure costs at most one page of work.
        if len(job_data) > jobs_before_page:
            _save_results()

        # Anti-rate-limit wait
        delay = random.uniform(3, 6)
        print(f">> Sleeping {delay:.2f} seconds before next page...")
        time.sleep(delay)
finally:
    # Save first, then close the browser; the nested finally guarantees the
    # browser is closed even if saving fails.
    try:
        _save_results()
        print(f"\n Total jobs scraped: {len(job_data)}")
    finally:
        driver.quit()


>> Scraping Page 1
>> Collected 20 unique job URLs
>> Visiting job #1: https://www.ziprecruiter.com/k/l/AAKv9lS3Ri68PYbcNVUa8Pxr8Xgu5ma2tJVohiPm9NjlI6RJVT2gMRjjk8Z0ufxgFCciokoEh94MWBeoTw4RewFVyr0IgG8pEKFzxhOSJsXda1QPhmnoHJTrqtc6fXllHZUQYVUWT3PvzyeBju1DungaatRC4iBn9gjXYZdry2sIF1QWnlQLBffUrJgX9GisEO8obVCDTWQPLoLAS6Bw57QtkgSmI3oCPDRxPmLVyB7Ir-gZNIpJh5wufPublmNPfTepMCMhOJq0q17nYUk2g5d6iOjj9v_CkjKRKj6_EbYVUbSYLoU3Y3LAhx28kUNd5JL-
[+] Scraped: Corinth Reman Engines, Cranks & Blocks Reclaim Machinist II (2nd shift) | Total: 5805
>> Visiting job #2: https://www.ziprecruiter.com/k/l/AALzA0jWbU87Y5q9AEoHU-zN2n6ZYfYXSCu0vrRqYsGZ960r0ucZwg9sodcUF0VHIW7jcUDQqaGHjMwvf9te-aBxBsLGzZ4n9BrGRoEoPkQ5k6uOETizrW7T2zHdPKJDBderIQvoaq6HbKh4CnM0Km9QlPFRTElQtiGHJH9ahqCvMUTVYznc2u_LGPMC2nTqU2OIYQuVyj0592nw5erxjI5r50_g1VwI99fI05c2cXZrGgtQ1JvEvmVZ-tG-S0InjarVSgTo2Jh1vFYs5FZnjtqnRPDtYU5B7_UWLnQPAi3WoNb9amv76c5ym37wQhYp6l7K
[+] Scraped: CNC Operator-3rd Shift (1881) | Total: 5806
>> Visiting job #3: https://www.zipr