# In [None]:
import urllib
import requests
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options 
import pandas as pd
import os, re, time
import numpy as np


# In [None]:
# HTTP headers passed to requests.get() by linksbyKey; the User-Agent string
# identifies the client as desktop Chrome on Linux.
header = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
        'Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Headers the original author left disabled:
    #   'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    #   'Accept-Encoding': 'none',
    #   'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

## Scraping from Indeed and Jobstreet

# In [None]:

    
##===============================================Function for Indeed==============================================##

def load_indeed_jobs_div(job_title, location):
    """Fetch an Indeed Singapore search page and return its results container.

    Args:
        job_title: Search keywords, sent as the ``q`` query parameter.
        location: Location filter, sent as the ``l`` query parameter.

    Returns:
        The element with ``id='resultsCol'`` from the parsed page, or ``None``
        when the page contains no such element.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not by
    # itself guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlencode

    query = {'q': job_title, 'l': location, 'fromage': 'last', 'sort': 'date', 'limit': 100}
    url = 'https://sg.indeed.com/jobs?' + urlencode(query)

    # NOTE(review): verify=False turns off TLS certificate verification. It is
    # kept because the original relies on it, but it should be removed unless
    # the network genuinely requires it.
    # The timeout stops an unresponsive server from hanging the scrape forever.
    page = requests.get(url, verify=False, timeout=30)

    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find(id='resultsCol')

def extract_job_information_indeed(job_soup, desired_characs):
    """Extract the requested fields from every job card in an Indeed results page.

    Args:
        job_soup: Parsed results container (as returned by
            ``load_indeed_jobs_div``), or ``None`` when the page had none.
        desired_characs: Collection of field names to extract. Recognised
            names are ``'titles'``, ``'companies'``, ``'links'``,
            ``'date_listed'`` and ``'salary'``; anything else is ignored.

    Returns:
        A tuple ``(jobs_list, num_listings)``. ``jobs_list`` maps each
        requested field name to a list with one value per job card, always in
        the fixed order listed above. ``num_listings`` is the number of job
        cards found.
    """
    # A missing results container means there is nothing to extract.
    if job_soup is None:
        return {}, 0

    job_elems = job_soup.find_all('div', class_='jobsearch-SerpJobCard')

    jobs_list = {}

    if 'titles' in desired_characs:
        jobs_list['titles'] = [extract_job_title_indeed(elem) for elem in job_elems]

    if 'companies' in desired_characs:
        jobs_list['companies'] = [extract_company_indeed(elem) for elem in job_elems]

    if 'links' in desired_characs:
        jobs_list['links'] = [extract_link_indeed(elem) for elem in job_elems]

    if 'date_listed' in desired_characs:
        jobs_list['date_listed'] = [extract_date_indeed(elem) for elem in job_elems]

    if 'salary' in desired_characs:
        jobs_list['salary'] = [extract_salary_indeed(elem) for elem in job_elems]

    # Count the cards themselves rather than the first extracted column, so
    # the count is still available when no recognised field was requested.
    return jobs_list, len(job_elems)



def extract_job_title_indeed(job_elem):
    """Return the text of the card's ``<h2 class="title">`` heading.

    The text is stripped of surrounding whitespace, and every literal
    ``'\\nnew'`` fragment is removed (presumably the "new" badge rendered
    inside the heading).
    """
    heading = job_elem.find('h2', class_='title')
    stripped = heading.text.strip()
    return stripped.replace('\nnew', '')

def extract_company_indeed(job_elem):
    """Return the whitespace-stripped text of the card's ``<span class="company">``."""
    return job_elem.find('span', class_='company').text.strip()

def extract_link_indeed(job_elem):
    """Return the absolute URL of the posting linked from an Indeed job card.

    The ``href`` of the card's ``<a class="jobtitle">`` anchor is resolved
    against ``https://sg.indeed.com``. A relative path therefore gains the
    scheme and host, while an ``href`` that is already absolute is returned
    unchanged.
    """
    from urllib.parse import urljoin

    href = job_elem.find('a', class_='jobtitle')['href']
    # The URL scheme is required for the result to be usable as a link.
    return urljoin('https://sg.indeed.com', href)

def extract_date_indeed(job_elem):
    """Return the whitespace-stripped text of the card's ``<span class="date">``."""
    return job_elem.find('span', class_='date').text.strip()

def extract_salary_indeed(job_elem):
    """Return the salary text of an Indeed job card, or ``'NA'`` if none is shown.

    Looks up the card's ``<span class="salary">`` element. Only the
    "element not present" case maps to ``'NA'``; any other error propagates
    to the caller instead of being silently swallowed.
    """
    salary_elem = job_elem.find('span', class_='salary')
    if salary_elem is None:
        return 'NA'
    return salary_elem.text.strip()


##===============================================Function for JobStreet==============================================##

""" get all position <a> tags for a single job role, triggered by linksByRoles function, results stored in a list """    

def linksbyKey(key):
    """Collect the job-posting ``<a>`` tags for one JobStreet search keyword.

    Result pages are requested one after another, starting at page 1, until a
    page yields no matching anchors.

    Args:
        key: Search keyword, sent as the ``key`` query parameter.

    Returns:
        A list of the matching ``<a>`` elements (each carrying an ``href``)
        from all loaded pages, in page order.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not by
    # itself guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlencode

    base_url = 'https://www.jobstreet.com.sg/en/job-search/job-vacancy.php?'
    pay_load = {'key': key, 'area': 1, 'option': 1, 'pg': None, 'classified': 1, 'src': 16, 'srcr': 12}

    position_links = []
    pn = 1
    while True:
        print('Loading page {}...'.format(pn))
        pay_load['pg'] = pn
        url = base_url + urlencode(pay_load)

        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the original relies on it, but it should be
        # removed unless the network genuinely requires it.
        # The timeout stops an unresponsive server from hanging the loop.
        page = requests.get(url, headers=header, verify=False, timeout=30)

        soup = BeautifulSoup(page.text, 'html.parser')
        links = soup.find_all('a', class_="DvvsL_0 _1p9OP", href=True)

        # A page without any posting anchors marks the end of the results.
        if not links:
            break

        position_links.extend(links)
        pn += 1

    return position_links
        

""" get all position links for the list of roles, results stored in a dict"""   
    
def linksByRoles(roles):
    """Look up the posting links for each job role.

    Args:
        roles: Iterable of job-role search keywords.

    Returns:
        A dict mapping each role to the list returned by ``linksbyKey`` for it.
    """
    found = {}
    # Roles are scraped one at a time, with progress printed before and after.
    for position in roles:
        print('Scraping position: ', position, ' ...')
        anchors = linksbyKey(position)
        found[position] = anchors
        print('{} {} positions found!'.format(len(anchors), position))
    return found


""" parse HTML strings for the list of roles"""
    
def parseLinks(links_dic):
    """Scrape the detail page of every collected link into one DataFrame.

    Args:
        links_dic: Dict mapping a search keyword to the list of posting
            anchors found for it (as produced by ``linksByRoles``).

    Returns:
        A tuple ``(result, num_listings)``. ``result`` is a
        ``pandas.DataFrame`` with one row per posting across all keywords,
        with the columns ``key_word``, ``company_name``, ``job_title``,
        ``company_industry``, ``qualification``, ``location``, ``salary``,
        ``date_posted`` and ``postedByHR``. ``num_listings`` is its row
        count. An empty ``links_dic`` gives an empty frame and a count of 0.
        Nothing is written to disk here.
    """
    columns = ['key_word', 'company_name', 'job_title', 'company_industry',
               'qualification', 'location', 'salary', 'date_posted']

    # Gather rows for every keyword before building the frame, so that all
    # roles are included and not only the first one.
    jobs = []
    for key, links in links_dic.items():
        for link in links:
            jobs.append([key] + getJobDetail(link))

    # Transform the rows into a pandas.DataFrame.
    result = pd.DataFrame(jobs, columns=columns)

    # Add a column denoting whether the position is posted by a recruiter
    # company, i.e. the industry text mentions "Human Resources".
    result['postedByHR'] = [
        isinstance(industry, str) and 'Human Resources' in industry
        for industry in result['company_industry']
    ]

    return result, len(result)
    

""" extract details from post detail page """
def getJobDetail(job_href):
    """Scrape one JobStreet posting page and return its details.

    Args:
        job_href: Anchor element of the posting. Its ``href`` attribute is
            appended to ``https://www.jobstreet.com.sg`` to form the page URL.

    Returns:
        A list ``[company_name, job_title, industry, qualification, location,
        salary, date_posted]``. A field is ``None`` when it could not be
        found on the page.
    """
    print('Scraping ', job_href, '...')
    url = 'https://www.jobstreet.com.sg' + job_href.get('href')

    r = requests.get(url, timeout=30)

    if r.status_code == 429:
        # Rate limited: wait, then retry once. Retry-After may be missing or
        # given as an HTTP date rather than seconds, so fall back to a short
        # fixed pause in those cases.
        retry_after = (r.headers.get('Retry-After') or '').strip()
        time.sleep(int(retry_after) if retry_after.isdigit() else 5)
        r = requests.get(url, timeout=30)

    soup = BeautifulSoup(r.content, 'html.parser')

    # Query each group of elements once instead of re-scanning the document.
    headline_divs = soup.find_all('div', {'class': 'FYwKg _6Gmbl_0'})
    label_divs = soup.find_all('div', {'class': 'FYwKg zoxBO_0'})
    meta_divs = soup.find_all('div', {'class': 'FYwKg _11hx2_0'})

    details = {}
    # Guard on the index actually used, not merely on "at least one match".
    details['company_name'] = headline_divs[1].text.strip() if len(headline_divs) > 1 else None
    details['job_title'] = headline_divs[2].text.strip() if len(headline_divs) > 2 else None

    # From index 4 onwards the divs are read as label/value pairs; a trailing
    # label with no value div is recorded as an empty string.
    for i in range(4, len(label_divs), 2):
        value = label_divs[i + 1].text if i + 1 < len(label_divs) else ''
        details[label_divs[i].text] = value

    details['location'] = meta_divs[0].text.strip() if meta_divs else None
    # The salary is only read when more than two meta divs are present.
    details['salary'] = meta_divs[1].text.strip().replace(u'\xa0', u'') if len(meta_divs) > 2 else None
    details['date_posted'] = meta_divs[-1].text.strip() if meta_divs else None

    return [
        details['company_name'],
        details['job_title'],
        details.get('Industry', None),
        details.get('Qualification', None),
        details.get('location', None),
        details.get('salary', None),
        details.get('date_posted', None),
    ]




##================================================Generic Function================================================##

def save_jobs_to_excel(jobs_list, filename):
    """Write the scraped jobs to an Excel file.

    Args:
        jobs_list: Either a ``pandas.DataFrame`` or a dict mapping column
            names to lists of values (converted with ``pandas.DataFrame``).
        filename: Output path passed to ``DataFrame.to_excel``.

    Raises:
        TypeError: If ``jobs_list`` is neither a dict nor a DataFrame. This
            input was previously ignored without writing any file.
    """
    if isinstance(jobs_list, pd.DataFrame):
        jobs = jobs_list
    elif isinstance(jobs_list, dict):
        jobs = pd.DataFrame(jobs_list)
    else:
        raise TypeError(
            'jobs_list must be a dict or a pandas.DataFrame, got {}'.format(
                type(jobs_list).__name__
            )
        )
    jobs.to_excel(filename)
 
    
##=================================================Main Function=================================================##

def find_jobs_from(website, job_title, location, desired_characs, filename='results.xlsx'):
    """Scrape job postings from one website and save them to an Excel file.

    Args:
        website: Which site to scrape, ``'Indeed'`` or ``'Jobstreet'``.
        job_title: For ``'Indeed'``, the search keywords. For ``'Jobstreet'``,
            a list of job roles to search; a single string is treated as a
            one-role list.
        location: Location filter. Used by the Indeed search only.
        desired_characs: List of characteristics to extract. Used by the
            Indeed extraction only.
        filename: Name (and therefore format) of the output file.

    Raises:
        ValueError: If ``website`` is not one of the supported sites.
    """
    if website == 'Indeed':
        job_soup = load_indeed_jobs_div(job_title, location)
        jobs_list, num_listings = extract_job_information_indeed(job_soup, desired_characs)
    elif website == 'Jobstreet':
        # Iterating a bare string would search one character at a time.
        roles = [job_title] if isinstance(job_title, str) else job_title
        links_dict = linksByRoles(roles)
        jobs_list, num_listings = parseLinks(links_dict)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "Unsupported website {!r}; expected 'Indeed' or 'Jobstreet'.".format(website)
        )

    save_jobs_to_excel(jobs_list, filename)

    print('{} new job postings retrieved from {}. Stored in {}.'.format(num_listings, website, filename))
    


# In [None]:
# Characteristics to request from the scraper.
desired_characs = [
    'titles',
    'companies',
    'links',
    'date_listed',
    'salary',
]

# Run the JobStreet scrape for one role and store the results in jobstreet.xlsx.
find_jobs_from(
    'Jobstreet',
    ['ite graduate'],
    'Singapore',
    desired_characs,
    filename='jobstreet.xlsx',
)

## End of Programme