In [1]:
#reference https://towardsdatascience.com/scraping-job-posting-data-from-indeed-using-selenium-and-beautifulsoup-dfc86230baac

import re
import json
from bs4 import BeautifulSoup
from time import sleep
import requests
import datetime

FETCH_DELAY_SECONDS = 1 # or whatever value you're comfortable with

def get_soup(url):
    """
    Given the url of a page, this function returns the soup object.
    
    Parameters:
        url: the link to get soup object for
    
    Returns:
        soup: soup object
    """
#     driver = webdriver.Firefox()
#     driver.get(url)
#     html = driver.page_source
#     soup = BeautifulSoup(html, 'html.parser')
#     driver.close()

    sleep(FETCH_DELAY_SECONDS)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser') 
    #soup = BeautifulSoup(req.text, "lxml", from_encoding="utf-8")
    return soup


def grab_job_locs(soup):
    locs = []
    
    # Loop thru all the posting links
    for span in soup.find_all('span', {'class': 'location'}):
        # Since sponsored job postings are represented by "a target" instead of "a href", no need to worry here
        loc = span.text
        locs.append(loc)
    
    return locs



def grab_job_links_v1(soup, baseurl):
    """
    Grab all non-sponsored job posting links from a Indeed search result page using the given soup object
    
    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20
    
    Returns:
        urls: a python list of job posting urls
    
    """
    urls = []
    
    # Loop thru all the posting links
    for link in soup.find_all('h2', {'class': 'jobtitle'}):
          
        # Since sponsored job postings are represented by "a target" instead of "a href", no need to worry here
        partial_url = link.a.get('href')
        # This is a partial url, we need to attach the prefix
        url = baseurl + partial_url
        # Make sure this is not a sponsored posting
        
        urls.append(url)
    
    for link in soup.find_all('div', {'class': 'title'}):
      
        # Since sponsored job postings are represented by "a target" instead of "a href", no need to worry here
        partial_url = link.a.get('href')
        # This is a partial url, we need to attach the prefix
        url = baseurl + partial_url
        # Make sure this is not a sponsored posting
        
        urls.append(url)
    
    
    #print("grab_job_links num_urls:", len(urls))
    return urls


def grab_job_links(soup, baseurl):
    """
    Grab all non-sponsored job posting links from a Indeed search result page using the given soup object
    
    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20
    
    Returns:
        urls: a python list of job posting urls
    
    """
    urls = []
    
    for div_row in soup.find_all(name="div", attrs={"class":"row"}): 
        
        #check sponsored   
        div_sponsor = div_row.find('span', {'class': 'sponsoredGray'})
        div_resultlink = div_row.find('span', {'class': 'result-link'})
        if ((div_sponsor is not None) and (div_resultlink is None)):
            sponsored_val = 'Sponsored'    
        else:
            sponsored_val = ''
        
        
        #salary
        div_salary = div_row.find('div', {'class':'salarySnippet'})
        if (div_salary is not None):
            salary = div_salary.get_text(' ')    
        else:
            salary = ''
        
        
        #summary
        div_summary = div_row.find('div', {'class':'summary'})
        span_summary = div_row.find('span', {'class':'summary'})
        if (div_summary is not None):
            summary = div_summary.get_text(' ')    
        elif (span_summary is not None):
            summary = span_summary.get_text(' ') 
        else:
            summary = ''
        
        
        url = ''
        
        h_title = div_row.find('h2', {'class':'jobtitle'})
        if (h_title is not None):
            partial_url = h_title.a.get('href')
            url = baseurl + partial_url
        
        div_title = div_row.find('div', {'class': 'title'})
        if (div_title is not None):
            partial_url = div_title.a.get('href')
            url = baseurl + partial_url
            
        if (url != ''):
            urls.append({'url':url,'sponsored_flg':sponsored_val, 'salary': salary, 'summary':summary})
            
    
#     # Loop thru all the posting links
#     for link in soup.find_all('h2', {'class': 'jobtitle'}):
          
#         # Since sponsored job postings are represented by "a target" instead of "a href", no need to worry here
#         partial_url = link.a.get('href')
#         # This is a partial url, we need to attach the prefix
#         url = baseurl + partial_url
#         # Make sure this is not a sponsored posting
        
#         urls.append(url)
    
#     for link in soup.find_all('div', {'class': 'title'}):
      
#         # Since sponsored job postings are represented by "a target" instead of "a href", no need to worry here
#         partial_url = link.a.get('href')
#         # This is a partial url, we need to attach the prefix
#         url = baseurl + partial_url
#         # Make sure this is not a sponsored posting
        
#         urls.append(url)
    
    
    #print("grab_job_links num_urls:", len(urls))
    return urls



def get_urls(query, num_pages, location, base_site):
    """
    Get all the job posting URLs resulted from a specific search.
    
    Parameters:
        query: job title to query
        num_pages: number of pages needed
        location: city to search in
    
    Returns:
        urls: a list of job posting URL's (when num_pages valid)
        max_pages: maximum number of pages allowed ((when num_pages invalid))
    """
    # We always need the first page
    base_url = '{}/jobs?q={}&l={}'.format(base_site, query, location)
    soup = get_soup(base_url)
    
    urls = []
    locs = []
    
    print("base url:",base_url)
    # Get the total number of postings found 
    posting_count_string = soup.find(name='div', attrs={'id':"searchCount"}).get_text()
    posting_count_string = posting_count_string[posting_count_string.find('of')+2:].strip().replace(",","")
    
    try:
        posting_count = int(posting_count_string)
    except ValueError: # deal with special case when parsed string is "360 jobs"
        posting_count = int(re.search('\d+', posting_count_string).group(0))
    
    # Limit nunmber of pages to get
    max_pages = round(posting_count / 10)
    if num_pages > max_pages:
        #print('returning max_pages!!')
        #return max_pages
        num_pages = max_pages
    
    print("posting count:{}  num pages:{}".format(posting_count, num_pages))
    # Additional work is needed when more than 1 page is requested

    for i in range(num_pages):
        num = (i) * 10
        base_url = '{}/jobs?q={}&l={}&start={}'.format(base_site,query, location, num)
        #print("base:", base_url)
        soup = get_soup(base_url)
        # We always combine the results back to the list
        urls += grab_job_links(soup, base_site)
        locs += grab_job_locs(soup)
        print("Grab Progress: {}/{}  {:2.0f}%".format(i+1, num_pages, 100*((i+1)/num_pages)), end='\r')

    # Check to ensure the number of urls gotten is correct
    #assert len(urls) == num_pages * 10, "There are missing job links, check code!"

    #print("urls:", len(urls))
    return {'urls': urls, 'locs': locs}     



def get_posting(url):
    """
    Get the text portion including both title and job description of the job posting from a given url
    
    Parameters:
        url: The job posting link
        
    Returns:
        title: the job title (if "data scientist" is in the title)
        posting: the job posting content    
    """
    # Get the url content as BS object
    soup = get_soup(url)
    
    # The job title is held in the h3 tag
    title = soup.find(name='h3').get_text()
    posting = soup.find(name='div', attrs={'class': "jobsearch-JobComponent-description"}).get_text(' ')
    
    div = soup.find('div', {'class': 'jobsearch-InlineCompanyRating'})
    company_divs = div.find_all('div')
    
    company = ''
    if (len(company_divs) > 0):
        company = company_divs[0].get_text(' ')
    
    loc = ''
    if (len(company_divs) > 1):
        loc = company_divs[-1].get_text(' ')
    
    return title, posting, company, loc

        
    #if 'data scientist' in title:  # We'll proceed to grab the job posting text if the title is correct
        # All the text info is contained in the div element with the below class, extract the text.
        #posting = soup.find(name='div', attrs={'class': "jobsearch-JobComponent"}).get_text()
        #return title, posting.lower()
    #else:
        #return False
    
        # Get rid of numbers and symbols other than given
        #text = re.sub("[^a-zA-Z'+#&]", " ", text)
        # Convert to lower case and split to list and then set
        #text = text.lower().strip()
    
        #return text



def get_data(query, num_pages, location='', base_site='https://ca.indeed.com', file_prefix = 'CA'):
    
    print("start:", datetime.datetime.now())
    """
    Get all the job posting data and save in a json file using below structure:
    
    {<count>: {'title': ..., 'posting':..., 'url':...}...}
    
    The json file name has this format: ""<query>.json"
    
    Parameters:
        query: Indeed query keyword such as 'Data Scientist'
        num_pages: Number of search results needed
        location: location to search for
    
    Returns:
        postings_dict: Python dict including all posting data
    
    """
    # Convert the queried title to Indeed format
    query = '+'.join(query.lower().split())
    
    postings_dict = {}
    
    results = get_urls(query, num_pages, location, base_site)
    # print(results)
    urls = results['urls']
    #locs =  results['locs'] 
    
    print()
    
    #  Continue only if the requested number of pages is valid (when invalid, a number is returned instead of list)
    if isinstance(urls, list):
        num_urls = len(urls)
        for i, url in enumerate(urls):
            try:
                title, posting, company, loc = get_posting(url['url'])
                postings_dict[i] = {}
                postings_dict[i]['title'] = title
                postings_dict[i]['loc'] = loc
                postings_dict[i]['posting'] =  posting
                postings_dict[i]['company'] = company 
                postings_dict[i]['summary'] = url['summary']
                postings_dict[i]['salary'] = url['salary']
                postings_dict[i]['sponsored_flg'] = url['sponsored_flg']
                postings_dict[i]['url'] = url['url']
            
            except: 
                continue
            
            percent = (i+1) / num_urls
            # Print the progress the "end" arg keeps the message in the same line 
            print("Progress: {}/{}  {:2.0f}%".format(i+1, num_urls, 100*percent), end='\r')

        # Save the dict as json file
        file_name = file_prefix + '_' + query.replace('+', '_').replace("(", "").replace(")", "").replace('"','') + "_" + location + "_" + datetime.datetime.now().strftime("%Y%m%d%H%M") + '.json'
        with open(file_name, 'w') as f:
            json.dump(postings_dict, f)
        
        print('All {} postings have been scraped and saved!'.format(num_urls))    
        #return postings_dict
    else:
        print("Due to similar results, maximum number of pages is only {}. Please try again!".format(urls))

    print("finished:", datetime.datetime.now())
    


In [None]:
#extract data
query = '("data scientist") or ("machine learning")'
num_pages = 2000
get_data(query, num_pages, location='', base_site='https://ca.indeed.com', file_prefix='CA')
get_data(query, num_pages, location='', base_site='https://au.indeed.com', file_prefix='AU')
get_data(query, num_pages, location='', base_site='https://www.indeed.com.sg', file_prefix='SG')

start: 2019-04-10 23:11:28.509776
base url: https://ca.indeed.com/jobs?q=("data+scientist")+or+("machine+learning")&l=
posting count:1785  num pages:178
Grab Progress: 178/178  100%
All 2903 postings have been scraped and saved!
finished: 2019-04-11 00:36:57.435256
start: 2019-04-11 00:36:57.441239
base url: https://au.indeed.com/jobs?q=("data+scientist")+or+("machine+learning")&l=
posting count:737  num pages:74
Grab Progress: 74/74  100%
All 736 postings have been scraped and saved!
finished: 2019-04-11 01:02:37.009836
start: 2019-04-11 01:02:37.011831
base url: https://www.indeed.com.sg/jobs?q=("data+scientist")+or+("machine+learning")&l=
posting count:1057  num pages:106
Grab Progress: 106/106  100%
Progress: 319/1051  30%

In [None]:
 soup = BeautifulSoup('<br>aaa<br>bbb', 'html.parser') 

In [None]:
soup.get_text(' ')

In [None]:
all_text = ''.join(soup.findAll(text=True))
all_text

In [None]:
url = 'https://ca.indeed.com/viewjob?jk=eae45c8c691026b7&from=serp&vjs=3'
soup = get_soup(url)


In [None]:
# The job title is held in the h3 tag
title = soup.find(name='h3').get_text()
posting = soup.find(name='div', attrs={'class': "jobsearch-JobComponent-description"}).get_text(' ')
    
divs = soup.find('div', {'class': 'jobsearch-InlineCompanyRating'})
divs.find_all('div')



In [None]:
div_child = div.find("div")
div_child