In [2]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

Below is function used to scrape UK and Canadian Job Listings. Glassdoor job listings for positions 'junior data scientist' and 'senior scientist'. The scraping function is adapted from [this helpful blog post by Ömer Sakarya](https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905).

In [3]:
def get_jobs(num_jobs,url,save_file,newData = True):
    
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''
    
    #Initializing the webdriver
    options = webdriver.ChromeOptions()
    
    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    options.add_argument('headless')
    
    #Change the path to where chromedriver is in your folder.
    driver = webdriver.Chrome(executable_path="../../../chromedriver", options=options)
    driver.set_window_size(1120, 1000)

    driver.get(url)
    (driver.page_source).encode('utf-8','ignore')
    
    
    columns =  ["Position",
                "Description",
                "Company",
                "Location",
                "Glassdoor Salary Estimate",
                "Rating",
                "Size",
                "Revenue",
                "Year Founded",
                "Industry",
                "Sector",
                "Type"]

    count = 0

    while count < num_jobs:  #If true, should be still looking for new jobs.
        
        jobs = []

        #Let the page load. Change this number based on your internet speed.
        #Or, wait until the webpage is loaded, instead of hardcoding it.
        time.sleep(4)

        #Test for the "Sign Up" prompt and get rid of it.
        try:
            driver.find_element_by_class_name("selected").click()
        except ElementClickInterceptedException:
            pass

        time.sleep(.1)

        try:
            driver.find_element_by_class_name("modal_closeIcon").click()   #clicking to the X.
        except NoSuchElementException:
            pass

        
        #Going through each job in this page
        job_buttons = driver.find_elements_by_class_name("jl")  #jl for Job Listing. These are the buttons we're going to click.
        for job_button in job_buttons:
            count +=1

            print("Progress: {}".format("" + str(count) + "/" + str(num_jobs)))
            if count >= num_jobs:
                break

            job_button.click()  #You might 
            time.sleep(1)
            collected_successfully = False
            
            while not collected_successfully:
                try:
                    company_name = driver.find_element_by_xpath('.//div[@class="employerName"]').text
                    location = driver.find_element_by_xpath('.//div[@class="location"]').text
                    job_title = driver.find_element_by_xpath('.//div[contains(@class, "title")]').text
                    job_description = driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text
                    collected_successfully = True
                except:
                    time.sleep(5)

            try:
                salary_estimate = driver.find_element_by_xpath('.//span[@class="gray small salary"]').text
            except NoSuchElementException:
                salary_estimate = None #You need to set a "not found value. It's important."
            
            try:
                rating = driver.find_element_by_xpath('.//span[@class="rating"]').text
            except NoSuchElementException:
                rating = None #You need to set a "not found value. It's important."

            #Going to the Company tab...
            #clicking on this:
            #<div class="tab" data-tab-type="overview"><span>Company</span></div>
            try:
                driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click()

                try:
                    size = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Size"]//following-sibling::*').text
                except NoSuchElementException:
                    size = None

                try:
                    founded = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Founded"]//following-sibling::*').text
                except NoSuchElementException:
                    founded = None

                try:
                    industry = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Industry"]//following-sibling::*').text
                except NoSuchElementException:
                    industry = None

                try:
                    sector = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Sector"]//following-sibling::*').text
                except NoSuchElementException:
                    sector = None

                try:
                    revenue = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Revenue"]//following-sibling::*').text
                except NoSuchElementException:
                    revenue = None
                    
                try:
                    type_of_ownership = driver.find_element_by_xpath('.//div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
                except NoSuchElementException:
                    type_of_ownership = None


            except NoSuchElementException:  #Rarely, some job postings do not have the "Company" tab.
                size = None
                founded = None
                industry = None
                sector = None
                revenue = None
                type_of_ownership = None

            jobs.append({"Position" : job_title,
            "Description" : job_description,
            "Company" : company_name.split('\n')[0],
            "Location" : location,
            "Glassdoor Salary Estimate" : salary_estimate,
            "Rating" : rating,
            "Size" : size,
            "Revenue" : revenue,
            "Year Founded" : founded,
            "Industry" : industry,
            "Sector" : sector,
            "Type" : type_of_ownership})

        newJobs = pd.DataFrame(jobs)
        if newData:
            newJobs.to_csv(save_file)
            newData = False
        else:
            oldJobs = pd.read_csv(save_file)
            allJobs = pd.concat([oldJobs,newJobs])
            allJobs.to_csv(save_file)
            
        #Clicking on the "next page" button
        try:
            driver.find_element_by_xpath('.//li[@class="next"]//a').click()
        except NoSuchElementException:
            print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            break
    return

In [5]:
url1 = 'https://www.glassdoor.ca/Job/canada-senior-data-scientist-jobs-SRCH_IL.0,6_IN3_KO7,28_IP1.htm'
dataFile1 = r'..\..\..\data\raw_data\jack_canada_senior.csv'
get_jobs(870,url1,dataFile1,newData = True)

Progress: 1/870
Progress: 2/870
Progress: 3/870
Progress: 4/870
Progress: 5/870
Progress: 6/870
Progress: 7/870
Progress: 8/870
Progress: 9/870
Progress: 10/870
Progress: 11/870
Progress: 12/870
Progress: 13/870
Progress: 14/870
Progress: 15/870
Progress: 16/870
Progress: 17/870
Progress: 18/870
Progress: 19/870
Progress: 20/870
Progress: 21/870
Progress: 22/870
Progress: 23/870
Progress: 24/870
Progress: 25/870
Progress: 26/870
Progress: 27/870
Progress: 28/870
Progress: 29/870
Progress: 30/870
Progress: 31/870
Progress: 32/870
Progress: 33/870
Progress: 34/870
Progress: 35/870
Progress: 36/870
Progress: 37/870
Progress: 38/870
Progress: 39/870
Progress: 40/870
Progress: 41/870
Progress: 42/870
Progress: 43/870
Progress: 44/870
Progress: 45/870
Progress: 46/870
Progress: 47/870
Progress: 48/870
Progress: 49/870
Progress: 50/870
Progress: 51/870
Progress: 52/870
Progress: 53/870
Progress: 54/870
Progress: 55/870
Progress: 56/870
Progress: 57/870
Progress: 58/870
Progress: 59/870
Progre

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Progress: 62/870
Progress: 63/870
Progress: 64/870
Progress: 65/870
Progress: 66/870
Progress: 67/870
Progress: 68/870
Progress: 69/870
Progress: 70/870
Progress: 71/870
Progress: 72/870
Progress: 73/870
Progress: 74/870
Progress: 75/870
Progress: 76/870
Progress: 77/870
Progress: 78/870
Progress: 79/870
Progress: 80/870
Progress: 81/870
Progress: 82/870
Progress: 83/870
Progress: 84/870
Progress: 85/870
Progress: 86/870
Progress: 87/870
Progress: 88/870
Progress: 89/870
Progress: 90/870
Progress: 91/870
Progress: 92/870
Progress: 93/870
Progress: 94/870
Progress: 95/870
Progress: 96/870
Progress: 97/870
Progress: 98/870
Progress: 99/870
Progress: 100/870
Progress: 101/870
Progress: 102/870
Progress: 103/870
Progress: 104/870
Progress: 105/870
Progress: 106/870
Progress: 107/870
Progress: 108/870
Progress: 109/870
Progress: 110/870
Progress: 111/870
Progress: 112/870
Progress: 113/870
Progress: 114/870
Progress: 115/870
Progress: 116/870
Progress: 117/870
Progress: 118/870
Progress: 11

Progress: 520/870
Progress: 521/870
Progress: 522/870
Progress: 523/870
Progress: 524/870
Progress: 525/870
Progress: 526/870
Progress: 527/870
Progress: 528/870
Progress: 529/870
Progress: 530/870
Progress: 531/870
Progress: 532/870
Progress: 533/870
Progress: 534/870
Progress: 535/870
Progress: 536/870
Progress: 537/870
Progress: 538/870
Progress: 539/870
Progress: 540/870
Progress: 541/870
Progress: 542/870
Progress: 543/870
Progress: 544/870
Progress: 545/870
Progress: 546/870
Progress: 547/870
Progress: 548/870
Progress: 549/870
Progress: 550/870
Progress: 551/870
Progress: 552/870
Progress: 553/870
Progress: 554/870
Progress: 555/870
Progress: 556/870
Progress: 557/870
Progress: 558/870
Progress: 559/870
Progress: 560/870
Progress: 561/870
Progress: 562/870
Progress: 563/870
Progress: 564/870
Progress: 565/870
Progress: 566/870
Progress: 567/870
Progress: 568/870
Progress: 569/870
Progress: 570/870
Progress: 571/870
Progress: 572/870
Progress: 573/870
Progress: 574/870
Progress: 

In [7]:
url2 = 'https://www.glassdoor.ca/Job/canada-junior-data-scientist-jobs-SRCH_IL.0,6_IN3_KO7,28_P13.htm'
dataFile2 = r'..\..\..\data\raw_data\jack_canada_junior.csv'
get_jobs(30,url2,dataFile2,newData=False)

Progress: 1/30
Progress: 2/30
Progress: 3/30
Progress: 4/30
Progress: 5/30
Progress: 6/30
Progress: 7/30
Progress: 8/30
Progress: 9/30
Progress: 10/30
Progress: 11/30
Progress: 12/30
Progress: 13/30
Progress: 14/30
Progress: 15/30
Progress: 16/30
Progress: 17/30
Progress: 18/30
Progress: 19/30
Progress: 20/30
Progress: 21/30
Progress: 22/30
Progress: 23/30
Progress: 24/30
Progress: 25/30
Progress: 26/30
Progress: 27/30
Progress: 28/30
Progress: 29/30
Progress: 30/30


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [9]:
url3 = 'https://www.glassdoor.ca/Job/uk-junior-data-scientist-jobs-SRCH_IL.0,2_IN2_KO3,24.htm'
dataFile3 = r'..\..\..\data\raw_data\jack_uk_junior.csv'
get_jobs(210,url3,dataFile3,newData=True)

Progress: 1/210
Progress: 2/210
Progress: 3/210
Progress: 4/210
Progress: 5/210
Progress: 6/210
Progress: 7/210
Progress: 8/210
Progress: 9/210
Progress: 10/210
Progress: 11/210
Progress: 12/210
Progress: 13/210
Progress: 14/210
Progress: 15/210
Progress: 16/210
Progress: 17/210
Progress: 18/210
Progress: 19/210
Progress: 20/210
Progress: 21/210
Progress: 22/210
Progress: 23/210
Progress: 24/210
Progress: 25/210
Progress: 26/210
Progress: 27/210
Progress: 28/210
Progress: 29/210
Progress: 30/210
Progress: 31/210
Progress: 32/210
Progress: 33/210
Progress: 34/210
Progress: 35/210
Progress: 36/210
Progress: 37/210
Progress: 38/210
Progress: 39/210
Progress: 40/210
Progress: 41/210
Progress: 42/210
Progress: 43/210
Progress: 44/210
Progress: 45/210
Progress: 46/210
Progress: 47/210
Progress: 48/210
Progress: 49/210
Progress: 50/210
Progress: 51/210
Progress: 52/210
Progress: 53/210
Progress: 54/210
Progress: 55/210
Progress: 56/210
Progress: 57/210
Progress: 58/210
Progress: 59/210
Progre

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Progress: 61/210
Progress: 62/210
Progress: 63/210
Progress: 64/210
Progress: 65/210
Progress: 66/210
Progress: 67/210
Progress: 68/210
Progress: 69/210
Progress: 70/210
Progress: 71/210
Progress: 72/210
Progress: 73/210
Progress: 74/210
Progress: 75/210
Progress: 76/210
Progress: 77/210
Progress: 78/210
Progress: 79/210
Progress: 80/210
Progress: 81/210
Progress: 82/210
Progress: 83/210
Progress: 84/210
Progress: 85/210
Progress: 86/210
Progress: 87/210
Progress: 88/210
Progress: 89/210
Progress: 90/210
Progress: 91/210
Progress: 92/210
Progress: 93/210
Progress: 94/210
Progress: 95/210
Progress: 96/210
Progress: 97/210
Progress: 98/210
Progress: 99/210
Progress: 100/210
Progress: 101/210
Progress: 102/210
Progress: 103/210
Progress: 104/210
Progress: 105/210
Progress: 106/210
Progress: 107/210
Progress: 108/210
Progress: 109/210
Progress: 110/210
Progress: 111/210
Progress: 112/210
Progress: 113/210
Progress: 114/210
Progress: 115/210
Progress: 116/210
Progress: 117/210
Progress: 118

In [13]:
url4 = 'https://www.glassdoor.ca/Job/uk-senior-data-scientist-jobs-SRCH_IL.0,2_IN2_KO3,24_P28.htm'
dataFile4 = r'..\..\..\data\raw_data\jack_uk_senior.csv'
get_jobs(90,url4,dataFile4,newData=False)

Progress: 1/90
Progress: 2/90
Progress: 3/90
Progress: 4/90
Progress: 5/90
Progress: 6/90
Progress: 7/90
Progress: 8/90
Progress: 9/90
Progress: 10/90
Progress: 11/90
Progress: 12/90
Progress: 13/90
Progress: 14/90
Progress: 15/90
Progress: 16/90
Progress: 17/90
Progress: 18/90
Progress: 19/90
Progress: 20/90
Progress: 21/90
Progress: 22/90
Progress: 23/90
Progress: 24/90
Progress: 25/90
Progress: 26/90
Progress: 27/90
Progress: 28/90
Progress: 29/90
Progress: 30/90


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Progress: 31/90
Progress: 32/90
Progress: 33/90
Progress: 34/90
Progress: 35/90
Progress: 36/90
Progress: 37/90
Progress: 38/90
Progress: 39/90
Progress: 40/90
Progress: 41/90
Progress: 42/90
Progress: 43/90
Progress: 44/90
Progress: 45/90
Progress: 46/90
Progress: 47/90
Progress: 48/90
Progress: 49/90
Progress: 50/90
Progress: 51/90
Progress: 52/90
Progress: 53/90
Progress: 54/90
Progress: 55/90
Progress: 56/90
Progress: 57/90
Progress: 58/90
Progress: 59/90
Progress: 60/90
Progress: 61/90
Progress: 62/90
Progress: 63/90
Progress: 64/90
Progress: 65/90
Progress: 66/90
Progress: 67/90
Progress: 68/90
Progress: 69/90
Progress: 70/90
Progress: 71/90
Progress: 72/90
Progress: 73/90
Progress: 74/90
Progress: 75/90
Progress: 76/90
Progress: 77/90
Progress: 78/90
Progress: 79/90
Progress: 80/90
Progress: 81/90
Progress: 82/90
Progress: 83/90
Progress: 84/90
Progress: 85/90
Progress: 86/90
Progress: 87/90
Progress: 88/90
Progress: 89/90
Progress: 90/90
