In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Setup driver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

base_url = 'https://www.loker.id/cari-lowongan-kerja'
driver.get(base_url)
time.sleep(3)

data = []
max_pages = 110

for i in range(max_pages):
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'card')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all('article', class_='card')

    for job in articles:
        job_id = job.get('data-id', '')
        title_tag = job.find('h3')
        title = title_tag.get_text(strip=True) if title_tag else ''
        company_tag = job.find('span', class_='text-sm text-secondary-500')
        company = company_tag.get_text(strip=True) if company_tag else ''
        loc_tag = job.find('svg', {'class': 'icon-tabler-map-pin'})
        location = loc_tag.find_next('span').get_text(strip=True) if loc_tag else ''
        salary_tag = job.find('svg', {'class': 'icon-tabler-cash'})
        salary = salary_tag.find_next('span').get_text(strip=True) if salary_tag else ''
        badge_tags = job.find_all('span', class_='badge')
        categories = ";".join(b.get_text(strip=True) for b in badge_tags)
        posted_tag = job.find('time')
        posted = posted_tag['datetime'] if posted_tag and posted_tag.has_attr('datetime') else ''
        link_tag = job.find('a', href=True)
        detail_link = 'https://www.loker.id' + link_tag['href'] if link_tag and link_tag['href'].endswith('.html') else ''

        function = job_type = education = level = description = ''

        if detail_link:
            driver.get(detail_link)
            time.sleep(2)
            detail_soup = BeautifulSoup(driver.page_source, 'html.parser')

            def get_field_value(label_keywords):
                for div in detail_soup.select('div.font-bold'):
                    label = div.get_text(strip=True)
                    if any(keyword in label for keyword in label_keywords):
                        a_tags = div.find_next_siblings('a')
                        if a_tags:
                            return ";".join(a.get_text(strip=True) for a in a_tags)
                        single_a = div.find_next('a')
                        if single_a:
                            return single_a.get_text(strip=True)
                return ''

            function = get_field_value(['Fungsi', 'Kategori'])
            job_type = get_field_value(['Tipe Pekerjaan'])
            education = get_field_value(['Pendidikan'])
            level = get_field_value(['Level Pekerjaan'])

            prose_div = detail_soup.find('div', class_='prose')
            description_parts = []

            if prose_div:
                for elem in prose_div.find_all(['div', 'h2'], recursive=False):
                    if elem.name == 'h2':
                        description_parts.append(f"\n{elem.get_text(strip=True)}")
                    elif elem.name == 'div':
                        content = elem.get_text(separator="\n", strip=True)
                        if content:
                            description_parts.append(content)

            description = "\n".join(description_parts).strip()

        data.append({
            'Job ID': job_id,
            'Title': title,
            'Company': company,
            'Location': location,
            'Salary': salary,
            'Category': categories,
            'Posted Date': posted,
            'Link': detail_link,
            'Role': function,
            'Type': job_type,
            'Education': education,
            'Level': level,
            'Description': description
        })

        driver.back()
        time.sleep(1)

    try:
        next_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, '//nav//button[@aria-label="Next" and not(@disabled)]'))
        )
        next_button.click()
        time.sleep(5)
    except:
        print("Tidak ada tombol Next atau sudah sampai halaman terakhir.")
        break

# Save to CSV format
df = pd.DataFrame(data)
df.to_csv('lokerid_data.csv', index=False, encoding='utf-8-sig')

driver.quit()