In [5]:
# imports
import time
from pathlib import Path
from random import randint
from time import sleep
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [6]:
# Chrome launch options; `option` is handed to webdriver.Chrome when the scraper starts.
option = webdriver.ChromeOptions()
for chrome_flag in ("--incognito",):
    option.add_argument(chrome_flag)
# To run without a visible browser window, also pass '--headless=chrome':
# option.add_argument('--headless=chrome')

In [7]:
# search specifications
# Type the search terms as plain text. quote_plus() URL-encodes them (spaces
# become '+', reserved characters such as '+' or '&' are escaped), so a term
# like 'C++ Developer' no longer corrupts the query string.
position = quote_plus('Software Engineer')
location = quote_plus('Kuala Lumpur')

within_radius = 15 # within 15 miles of location [15, 25]
prevent_dupe = 1 # filter set to 1 [0, 1]
days_ago = 7 # posted 7 days ago [1, 3, 7, 14]
start_page = 0 # start at page 0

# placeholders, in order: q (position), l (location), radius, filter, fromage, start
pagination_url = 'https://malaysia.indeed.com/jobs?q={}&l={}&radius={}&filter={}&sort=date&fromage={}&start={}'

## Run the scraper (set your query in the search specifications above)

In [None]:
# Start the stopwatch for the whole scraping run.
start = time.time()

job_list = [] # one row per listing: [title, link, id, company, location, date posted]
job_desc_href = [] # store job description link
job_descs = [] # store job description
job_salary = [] # store salary list
merged_array = [] # store merged array

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),options=option)

# Load the first results page only to read the total number of matching jobs.
driver.get(pagination_url.format(position, location, within_radius, prevent_dupe, days_ago, start_page))

# wait for a random seconds before continuing
sleep(randint(2, 6))

# find the number of jobs
try:
    job_count_text = driver.find_element(By.CLASS_NAME, 'jobsearch-JobCountAndSortPane-jobCount').text
except NoSuchElementException:
    # No counter on the page: treat it as zero results instead of crashing
    # with the browser still open. (The page layout may also have changed.)
    print("Job count element not found; assuming 0 jobs")
    job_count_text = ''

# The counter is expected to read like "1,234 jobs": drop the thousands
# separators and keep the leading number.
count_tokens = job_count_text.replace(',', '').split()
p = int(count_tokens[0]) if count_tokens else 0

# each page contains 15 listing, so the number of pages is p / 15 rounded UP;
# rounding down would skip the last, partially filled page
results_per_page = 15
max_iter_pgs = (p + results_per_page - 1) // results_per_page

# Rounding up already yields 1 page for 1-14 jobs and 0 pages for 0 jobs,
# so no special case is needed for a single page of listings.
maxRange = max_iter_pgs

# scraper
# Visit every results page and collect one row per job card into job_list,
# plus the matching salary (or None) into job_salary at the same index.
max_page_retries = 5 # attempts per results page before giving up on it

for start_page in range(0,maxRange):
    retries = 0
    while retries < max_page_retries:
        # Collect into page-local buffers so a failure halfway through a page
        # does not leave partial rows behind that the retry would duplicate.
        page_jobs = []
        page_salaries = []
        try:
            print("Current Page: ", start_page)
            driver.get(pagination_url.format(position, location, within_radius, prevent_dupe, days_ago, start_page * 10)) # 0 = page 1, 10 = page 2, etc
            sleep(randint(2, 4))

            job_page = driver.find_element(By.ID, "mosaic-jobResults")
            jobs = job_page.find_elements(By.CLASS_NAME, "job_seen_beacon")

            for jj in jobs:
                job_title = jj.find_element(By.CLASS_NAME, "jobTitle")
                title_text = job_title.text
                title_link = job_title.find_element(By.CSS_SELECTOR, "a")

                page_jobs.append([
                    title_text, # job title
                    title_link.get_attribute("href"), # job link
                    title_link.get_attribute("id"), # job id
                    jj.find_element(By.CLASS_NAME, "companyName").text, # company name
                    jj.find_element(By.CLASS_NAME, "companyLocation").text, # company location
                    jj.find_element(By.CLASS_NAME, "date").text # date posted
                ])

                # Not every card shows a salary; keep a None placeholder so
                # salaries stay index-aligned with the job rows.
                try:
                    salary_text = jj.find_element(By.CLASS_NAME, "salary-snippet-container").text
                except NoSuchElementException:
                    salary_text = None
                page_salaries.append(salary_text)

                # Dismiss the pop-up if it is present. The XPath starts with
                # '//' and targets an element id, so it is looked up from the
                # document root rather than inside the job card.
                try:
                    pop_up_button = driver.find_element(By.XPATH, '//*[@id="mosaic-desktopserpjapopup"]/div[1]/button')
                    pop_up_button.click()
                    print("pop-up closed")
                except NoSuchElementException:
                    pass
                print("Result: " + title_text + " completed!")

        except Exception as e:
            # Count the failed attempt (otherwise a persistent error would loop
            # forever) and show the real cause, which is not always the network.
            retries += 1
            print("Page {} failed (attempt {}/{}): {!r}".format(start_page, retries, max_page_retries, e))
            if retries < max_page_retries:
                time.sleep(60)
                print("retrying...")
        else:
            # The whole page was scraped: commit its rows and move on.
            job_list.extend(page_jobs)
            job_salary.extend(page_salaries)
            break
    else:
        print("Giving up on page {} after {} attempts".format(start_page, max_page_retries))

# extract job description links (column 1 of every job_list row)
job_desc_href.extend(row[1] for row in job_list)

# extract job description from each link
# Exactly one entry is appended to job_descs per URL, even when every attempt
# fails, so descriptions stay index-aligned with job_list and job_salary.
max_desc_retries = 5 # attempts per job page before giving up on it

for url in job_desc_href:
    description = None # placeholder used when the page cannot be scraped
    retries = 0
    while retries < max_desc_retries:
        try:
            driver.get(url)
            sleep(randint(3, 5))
            try:
                description = driver.find_element(By.ID,"jobDescriptionText").text
            except NoSuchElementException:
                # The page loaded but has no description element.
                description = None
            print("Page: {} completed!".format(url))
            break

        except Exception as e:
            retries += 1
            print("Error retrieving job desc for: {} ({!r})".format(url, e))
            if retries < max_desc_retries:
                time.sleep(60)
                print("retrying...")
    else:
        print("Giving up on {} after {} attempts".format(url, max_desc_retries))

    job_descs.append(description)


# Close the browser and stop the stopwatch.
driver.quit()
end = time.time()

# Pair every listing row with its salary and description (matched by position).
merged_array.extend(
    listing + [pay, description]
    for listing, pay, description in zip(job_list, job_salary, job_descs)
)

# print scraping details
elapsed = end - start
print(elapsed, 'seconds to complete action!')
print('---------------------')
for label, value in (
    ('Max Iterable Pages for this search:', max_iter_pgs),
    ('Job Count:', p),
    ('Extracted: ', len(job_list)),
):
    print(label, value)

## Build the pandas DataFrame & save it as CSV

In [None]:
# Column order mirrors the merged rows: the six scraped listing fields,
# followed by the salary and the job description.
column_names = [
    'Job Title', 'Job Link', 'Job ID',
    'Company Name', 'Location', 'Job Posting',
    'Salary', 'Job Description'
]
df = pd.DataFrame(merged_array, columns=column_names)

# change this path
output_path = Path('..//web_scraper//excel//results.csv')
# Create the target folder first, so a missing directory cannot make the
# write fail and throw away the results of a long scraping run.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)