# Scraping jobs on Jobstreet

In [1]:
import requests
from bs4 import BeautifulSoup
from csv import reader, writer
from time import sleep
import json

## Get the job URLs
First, we collect the job records (including each job's URL) from every search-results page.
Each search page embeds its job data as JSON, which is easier to navigate than the rendered HTML.

In [2]:
# Accumulator for the raw job records parsed from the search-result pages.
jobs = []
N_JOBS = 1109  # Number of available jobs on the portal. Change accordingly.
JOBS_PER_PAGE = 30  # Number of jobs listed on each search-result page.
# Ceiling division: `N_JOBS // 30 + 1` asked for one page too many whenever
# N_JOBS was an exact multiple of the page size.
PAGES = (N_JOBS + JOBS_PER_PAGE - 1) // JOBS_PER_PAGE
SEARCH_QUERY = 'telecommunication'  # Change accordingly; leave empty for the generic Singapore listing.

In [3]:
# URL template for the search-result pages; the `{}` placeholder is left
# unfilled here. With a query, the slug is the query words followed by
# "jobs in singapore", joined by hyphens; without one, the generic
# Singapore listing is used.
SEARCH_LINK = (
    (
        'https://www.jobstreet.com.sg/en/job-search/'
        + '-'.join((SEARCH_QUERY + ' ' + 'jobs in singapore').split())
        + '/{}/'
    )
    if SEARCH_QUERY
    else 'https://www.jobstreet.com.sg/en/job-search/singapore-jobs/{}/'
)

In [4]:
# Show the resolved URL template; `{}` is the page-number placeholder.
SEARCH_LINK

'https://www.jobstreet.com.sg/en/job-search/telecommunication-jobs-in-singapore/{}/'

In [5]:
# Start from a fresh list so that re-running this cell neither duplicates
# records nor fails silently if `jobs` has been rebound to another object.
jobs = []

# Search-result pages are numbered from 1.
for i in range(1, PAGES + 1):
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        search_page = requests.get(SEARCH_LINK.format(i), timeout=30)
        search_page.raise_for_status()
        search_page_soup = BeautifulSoup(search_page.text, 'html5lib')
        # The second <script> tag is expected to hold the page state in the
        # form `window.REDUX_STATE = {...};` -- strip the assignment prefix
        # and the trailing semicolon to leave plain JSON.
        state_script = search_page_soup.find_all("script")[1].text
        state = json.loads(state_script[len('window.REDUX_STATE = '):-len(";")])
        jobs.extend(state["result"]["jobs"])
    except (requests.RequestException, IndexError, KeyError, TypeError, ValueError) as err:
        # Best effort: a page that cannot be fetched or parsed is skipped,
        # but reported so that missing records are not a silent surprise.
        print("Skipping search page {}: {!r}".format(i, err))
        continue

In [6]:
# Number of job records collected from the search pages.
len(jobs)

1002

In [7]:
# Pull each attribute of interest out of the job records into its own list.
# All lists are index-aligned with `jobs`.
job_id = [record["id"] for record in jobs]
country_code = [record["sourceCountryCode"] for record in jobs]
company = [record["companyMeta"]["name"] for record in jobs]
qualification = [record["qualificationName"] for record in jobs]
work_experience = [record["workExperienceName"] for record in jobs]
career_level = [record["careerLevelName"] for record in jobs]
employment_term = [record["employmentTermName"] for record in jobs]
job_title = [record["jobTitle"] for record in jobs]
job_url = [record["jobUrl"] for record in jobs]
# Category names are flattened to the repr of the name list with the
# surrounding brackets removed, e.g. "'A', 'B'".
category = [
    str([entry["name"] for entry in record["categories"]])[1:-1]
    for record in jobs
]
posted_date = [record["postedAt"] for record in jobs]
salary = [record["salary"] for record in jobs]

In [8]:
def getPage(link, timeout=30):
    """Download `link` and parse the response body with the lxml parser.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup
        The parsed page, or an empty document when the request fails or the
        server answers with an HTTP error status, so callers always receive
        a soup object.
    """
    try:
        req = requests.get(link, timeout=timeout)
        # An HTTP error page is not a job page; treat it as a failed fetch.
        req.raise_for_status()
    except requests.RequestException as err:
        # Best effort: report the failure and hand back an empty document
        # instead of aborting the whole scrape.
        print("Could not fetch {}: {!r}".format(link, err))
        return BeautifulSoup('', 'lxml')
    return BeautifulSoup(req.text, 'lxml')

In [9]:
# Fetch and parse every job page, in the same order as `job_url`.
pages = [getPage(url) for url in job_url]

In [10]:
def getJobDesc(page):
    """Return the job-description text found in a parsed job page.

    Parameters
    ----------
    page : BeautifulSoup or None
        Parsed job page; any object exposing a compatible ``find`` works.

    Returns
    -------
    str
        Text of the ``<div data-automation="jobDescription">`` element, or
        an empty string when `page` is None or the element is absent.
    """
    if page is None:
        return ""
    node = page.find("div", {"data-automation": "jobDescription"})
    # `find` returns None when nothing matches (e.g. an empty document).
    if node is None:
        return ""
    return node.text
# Extract the description text from every downloaded job page, in order.
job_descriptions = [getJobDesc(job_page) for job_page in pages]

## Save as a DataFrame and export to CSV

In [11]:
import pandas as pd

In [12]:
# Column name -> list of values; the key order fixes the column order of
# the resulting table.
col = dict(
    job_title=job_title,
    category=category,
    job_descriptions=job_descriptions,
    job_id=job_id,
    country_code=country_code,
    company=company,
    qualification=qualification,
    work_experience=work_experience,
    career_level=career_level,
    employment_term=employment_term,
    job_url=job_url,
    posted_date=posted_date,
    salary=salary,
)

In [13]:
# Assemble the columns into a table. NOTE(review): this rebinds `jobs`,
# which held the list of raw job records up to this point.
jobs = pd.DataFrame(data=col)

In [14]:
# Output file name: the words of the prefix and of the search query joined
# by underscores, with a .csv extension.
PREFIX = "jobstreet results"
FILENAME = '_'.join(PREFIX.split() + SEARCH_QUERY.split()) + ".csv"

# Write the table without the DataFrame's index column.
jobs.to_csv(FILENAME, index=False)
print("DONE")

DONE
