In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd

# init setup driver 
def setup_driver(url):
    """Start a maximized Chrome session and navigate it to *url*.

    Args:
        url: Address to load in the newly opened browser.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.

    Raises:
        Any exception raised while starting Chrome or loading the page.
        If the page load fails, the browser is closed before re-raising.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--start-maximized')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except Exception:
        # On failure the caller never receives the driver, so nobody else
        # could close the browser process; shut it down here.
        driver.quit()
        raise
    return driver

# close popup window 
def close_popup(driver):
    """Click the popup's close button if one becomes clickable.

    Waits up to 5 seconds for an element with class ``CloseButton`` to be
    clickable and clicks it. A ``TimeoutException`` is ignored, so the
    function returns ``None`` whether or not a popup was closed.

    Args:
        driver: The Selenium driver for the page that may show a popup.
    """
    close_button_locator = (By.CLASS_NAME, "CloseButton")
    popup_wait = WebDriverWait(driver, 5)
    try:
        popup_wait.until(EC.element_to_be_clickable(close_button_locator)).click()
    except TimeoutException:
        # Best effort: a timeout here is treated as "nothing to dismiss".
        return

# click load more button 
def click_load_more(driver, max_retries=3):
    """Scroll to the "load more" button and click it.

    If the click is intercepted by another element, ``close_popup`` is
    called and the click is attempted again, at most ``max_retries`` more
    times. The retry is bounded because ``close_popup`` silently gives up
    when it finds no close button, so an overlay that cannot be dismissed
    would otherwise cause endless retries.

    Args:
        driver: The Selenium driver showing the job list.
        max_retries: How many extra attempts to make after an intercepted
            click before giving up.

    Returns:
        True if the button was clicked. False if the button was not found
        or not clickable within 5 seconds, or if every attempt was
        intercepted.
    """
    load_more_locator = (By.CSS_SELECTOR, "button[data-test='load-more']")
    for _ in range(max_retries + 1):
        try:
            load_more_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(load_more_locator)
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
            load_more_button.click()
            return True
        except ElementClickInterceptedException:
            # Something is covering the button; try to dismiss it and retry.
            close_popup(driver)
        except (NoSuchElementException, TimeoutException):
            return False
    return False

# extract data 
def extract_job_data(driver):
    """Parse the current page source and return the job cards it contains.

    Every ``<li>`` under the ``ul.JobsList_jobsList__Ey2Vo`` container is
    inspected. For each one the company name, job title, location and
    salary are read from their respective elements; a field whose element
    is missing is stored as ``None``. Items in which none of the four
    elements is present are skipped, since they carry no job information
    and would otherwise show up as empty records.

    Args:
        driver: The Selenium driver whose ``page_source`` is parsed.

    Returns:
        A list of dicts with the keys ``name-of-company``, ``name-of-job``,
        ``location`` and ``salary`` (in that order). The list is empty when
        the jobs container is not found.
    """
    # (output key, tag name, CSS class) for each extracted field.
    field_selectors = (
        ("name-of-company", "span", "EmployerProfile_employerName__Xemli"),
        ("name-of-job", "a", "JobCard_seoLink__WdqHZ"),
        ("location", "div", "JobCard_location__N_iYE"),
        ("salary", "div", "JobCard_salaryEstimate___m9kY"),
    )

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    all_jobs_container = soup.find("ul", {"class": "JobsList_jobsList__Ey2Vo"})
    if all_jobs_container is None:
        return []

    jobs_list = []
    for job in all_jobs_container.find_all("li"):
        details = {}
        for key, tag_name, css_class in field_selectors:
            # Look each element up once and reuse the result.
            node = job.find(tag_name, {"class": css_class})
            details[key] = node.get_text(strip=True) if node is not None else None
        if any(value is not None for value in details.values()):
            jobs_list.append(details)
    return jobs_list

# read the total job count from the results header (used to end the scraping loop)
def get_total_job_count(driver):
    """Read the job count from the search results header.

    Waits up to 10 seconds for ``h1.SearchResultsHeader_jobCount__12dWB``
    and parses the first whitespace-separated token of its text as an
    integer. Non-digit characters in that token (for example the comma in
    ``1,234``) are ignored.

    Args:
        driver: The Selenium driver showing the search results page.

    Returns:
        The parsed count, or 0 when the header does not appear in time, its
        text is empty, or its first token contains no digits.
    """
    try:
        job_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.SearchResultsHeader_jobCount__12dWB"))
        )
    except TimeoutException:
        return 0

    tokens = job_count_element.text.split()
    if not tokens:
        return 0
    # Keep only decimal digits so thousands separators do not break int().
    digits = "".join(ch for ch in tokens[0] if ch.isdecimal())
    return int(digits) if digits else 0
    

# set up the webdriver
target_url = "https://www.glassdoor.com/Job/new-zealand-ai-jobs-SRCH_IL.0,11_IN186_KO12,14.htm"
# Alternative search:
# target_url = "https://www.glassdoor.com/Job/christchurch-canterbury-new-zealand-data-engineer-jobs-SRCH_IL.0,35_IC3526586_KO36,49.htm"
driver = setup_driver(target_url)

# Main part: keep loading more results until the advertised number of jobs
# has been collected, the "load more" button is gone, or nothing new shows up.
job_items_selector = "ul.JobsList_jobsList__Ey2Vo li"
max_stalled_rounds = 3  # give up after this many rounds in a row with no new job

# collected_data de-duplicates jobs; ordered_jobs remembers first-seen order
# so the export is in page order and identical from run to run (iterating a
# set of string tuples depends on per-process string hashing).
collected_data = set()
ordered_jobs = []

try:
    total_job_count = get_total_job_count(driver)
    stalled_rounds = 0
    while True:
        found_new_job = False
        for job in extract_job_data(driver):
            job_tuple = tuple(job.items())
            if job_tuple not in collected_data:
                collected_data.add(job_tuple)
                ordered_jobs.append(job_tuple)
                found_new_job = True

        # A count of 0 means the header could not be read, so it must not be
        # used as a target (every length is >= 0 and the loop would stop
        # after the first page).
        if total_job_count and len(collected_data) >= total_job_count:
            break

        # Guard against looping forever when clicking keeps succeeding but
        # no new job ever appears.
        stalled_rounds = 0 if found_new_job else stalled_rounds + 1
        if stalled_rounds >= max_stalled_rounds:
            break

        items_before = len(driver.find_elements(By.CSS_SELECTOR, job_items_selector))

        if not click_load_more(driver):
            break

        close_popup(driver)

        # Wait for the list to actually grow; list items are already present
        # before the click, so a plain presence check returns immediately.
        try:
            WebDriverWait(driver, 5).until(
                lambda d: len(d.find_elements(By.CSS_SELECTOR, job_items_selector)) > items_before
            )
        except TimeoutException:
            # Nothing new within 5 seconds; the next round re-checks the page
            # and the stall counter decides whether to stop.
            pass

finally:
    # Always close the browser, even if scraping failed part-way.
    driver.quit()

# data export
unique_collected_data = [dict(job) for job in ordered_jobs]
total_jobs = len(unique_collected_data)
print(f"Total {total_jobs} jobs exported")

df = pd.DataFrame(unique_collected_data)
styled_df = df.style.set_properties(**{'text-align': 'left'})
styled_df.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

# `display` is provided by the notebook environment.
display(styled_df)

df.to_csv('jobs.csv', index=False, encoding='utf-8')


Total 382 jobs exported


Unnamed: 0,name-of-company,name-of-job,location,salary
0,Deloitte,Graduate Payroll Specialist,East Tamaki,NZ$76K - NZ$96K(Glassdoor est.)
1,Orion Health,Senior Infrastructure and Systems Engineer,Auckland,
2,Te Whatu Ora – Health New Zealand Nelson Marlborough,Emergency Medicine Specialist/ Rural Hospital Specialist - Blenheim,Blenheim,NZ$77K - NZ$121K(Glassdoor est.)
3,Deloitte,Business Development Specialist,East Tamaki,NZ$66K - NZ$77K(Glassdoor est.)
4,New Zealand Government,Service Manager,Palmerston North,NZ$87K - NZ$150K(Glassdoor est.)
5,Auckland Council,Lead Architect,Auckland,NZ$78K - NZ$90K(Glassdoor est.)
6,Te Whatu Ora – Health New Zealand Nelson Marlborough,Crisis Workers - Adult Community Mental Health,Blenheim,NZ$51K - NZ$79K(Glassdoor est.)
7,Fulton Hogan,Data Engineering Manager,Christchurch,NZ$72K - NZ$105K(Glassdoor est.)
8,Manukau Institute of Technology,TAO Health Workforce Academic Mentor,Manukau City,NZ$63K - NZ$80K(Glassdoor est.)
9,Deloitte,Receptionist,Wellington,NZ$55K - NZ$71K(Glassdoor est.)
