In [None]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

Below is the function used to scrape U.S. Glassdoor job listings for the positions 'junior data scientist' and 'senior data scientist'. The scraping function is adapted from [this helpful blog post by Ömer Sakarya](https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905).

The two data frames (one for junior data scientist, one for senior data scientist) are concatenated row-wise into a single data frame with 1805 job listings (rows) and 12 columns (attributes).

In [None]:
# Keys of every scraped record, in the column order of the returned DataFrame.
_LISTING_COLUMNS = [
    "Position",
    "Description",
    "Company",
    "Location",
    "Glassdoor Salary Estimate",
    "Rating",
    "Size",
    "Revenue",
    "Type",
    "Year Founded",
    "Industry",
    "Sector",
]

# How many times to look for a listing's key fields (3 s apart) before skipping it.
_MAX_KEY_INFO_ATTEMPTS = 10


def _text_or_nf(driver, xpath):
    '''Return the text of the first element matching `xpath`, or 'NF' if no such element exists.'''
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return 'NF'


def _company_info(driver, label):
    '''Return the value shown next to `label` in the company overview tab, or 'NF' if absent.'''
    xpath = './/div[@class="infoEntity"]//label[text()="{}"]//following-sibling::*'.format(label)
    return _text_or_nf(driver, xpath)


def _dismiss_popup(driver):
    '''Click the modal close icon if one is present; do nothing otherwise.'''
    try:
        driver.find_element_by_class_name("modal_closeIcon").click()
    except NoSuchElementException:
        pass


def _read_key_info(driver, attempts = _MAX_KEY_INFO_ATTEMPTS, wait = 3):
    '''
    Read the fields expected on every listing, retrying while the page loads.

    return - tuple (job_position, company, location, description), or None if the
             fields could not all be read within `attempts` tries
    '''
    for _ in range(attempts):
        try:
            job_position = driver.find_element_by_xpath('.//div[contains(@class, "title")]').text
            company = driver.find_element_by_xpath('.//div[@class="employerName"]').text
            location = driver.find_element_by_xpath('.//div[@class="location"]').text
            description = driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text
            return job_position, company, location, description
        # `Exception` rather than a bare `except:` so that interrupting the kernel
        # (KeyboardInterrupt) still stops the scrape
        except Exception:
            time.sleep(wait)
    return None


def get_listings(job_title, country, num_scrape, debug = True, driver_path = "../../../chromedriver"):
    '''
    Scrape Glassdoor job listings for one search and return them as a data frame.

    arguments
        - job_title - str, delimiter '-', job position searched e.g. 'junior-data-scientist'
        - country - str, country searched e.g. 'us'
        - num_scrape - int, maximum number of job listings to scrape
        - debug - bool, if True print every scraped attribute of every listing
        - driver_path - str, path to the local chromedriver executable

    return - pandas dataframe with one row per scraped listing (at most `num_scrape`)
             and the 12 columns in `_LISTING_COLUMNS`; attributes not found on a
             listing hold the string 'NF'. The frame is empty (columns only) when
             nothing was scraped.

    NOTE(review): the numeric offsets in the search URL ("IL.0,2", "KO3,24") look
    like they encode the lengths of `country` and `job_title` - confirm before
    calling this with values of a different length than 'us' / 21 characters.
    '''

    # nothing requested: no need to launch a browser at all
    if num_scrape <= 0:
        return pd.DataFrame(columns=_LISTING_COLUMNS)

    # initialize Chrome driver saved in local path
    driver = webdriver.Chrome(executable_path = driver_path)

    jobs = []

    try:
        # go to glassdoor search results
        url = 'https://www.glassdoor.ca/Job/' + country + '-' + job_title + '-jobs-SRCH_IL.0,2_IN1_KO3,24.htm'
        driver.get(url)

        # scrape listing until total of `num_scrape` reached
        while len(jobs) < num_scrape:

            time.sleep(3)
            try:
                driver.find_element_by_class_name("selected").click()
            except ElementClickInterceptedException:
                pass

            time.sleep(1)

            # close sign-in popup / Glassdoor salary popup
            _dismiss_popup(driver)

            time.sleep(0.7)

            # go through listings
            listings = driver.find_elements_by_class_name("jl")

            for listing in listings:

                # print scraping progress
                print("Scraped:{}/{} ".format(len(jobs), num_scrape))

                # terminate condition: if target <= current scraped
                if num_scrape <= len(jobs):
                    break

                # click a listing
                listing.click()
                time.sleep(1.1)

                # special case for handling popup: for this search a popup is
                # only checked for while 0, 1, 3 or 4 listings have been scraped
                popup_expected = (job_title == 'senior-data-scientist'
                                  and len(jobs) in (0, 1, 3, 4))
                if popup_expected:
                    _dismiss_popup(driver)

                ## start scraping
                # get the key info assumed to be on EVERY company's listing
                key_info = _read_key_info(driver)
                if key_info is None:
                    # give up on this listing instead of retrying forever
                    print("Skipped a listing: key info not found after {} attempts.".format(_MAX_KEY_INFO_ATTEMPTS))
                    continue
                job_position, company, location, description = key_info

                time.sleep(1.9)

                # get glassdoor salary estimate
                salary_est = _text_or_nf(driver, './/div[@class="salary"]/span[@class="css-1uyte9r css-hca4ks e1wijj242"]')

                # get company rating (out of 5)
                rate = _text_or_nf(driver, './/span[@class="rating"]')

                time.sleep(1.3)

                # debugging pt.1
                if debug:
                    print("Job Title: {}".format(job_position))
                    print("Salary Estimate: {}".format(salary_est))
                    print("Job Description: {}".format(description[:200]))
                    print("Rating: {}".format(rate))
                    print("Company Name: {}".format(company))
                    print("Location: {}".format(location))

                # special case for handling popup (same condition as above;
                # `jobs` has not grown since it was evaluated)
                if popup_expected:
                    _dismiss_popup(driver)

                # click company tab & scrape
                try:
                    driver.find_element_by_xpath('.//div[@class="tab" and @data-tab-type="overview"]').click() # click tab
                except NoSuchElementException:
                    # if no company tab exists
                    size = 'NF'
                    yr_found = 'NF'
                    company_type = 'NF'
                    industry = 'NF'
                    sector = 'NF'
                    revenue = 'NF'
                else:
                    time.sleep(0.6)
                    size = _company_info(driver, "Size")           # company size
                    yr_found = _company_info(driver, "Founded")    # year founded

                    time.sleep(1.6)

                    company_type = _company_info(driver, "Type")   # e.g. public, private
                    industry = _company_info(driver, "Industry")

                    time.sleep(0.9)

                    sector = _company_info(driver, "Sector")
                    revenue = _company_info(driver, "Revenue")     # annual revenue
                    time.sleep(2.3)

                # debugging pt. 2
                if debug:
                    print("Size: {}".format(size))
                    print("Founded: {}".format(yr_found))
                    print("Type of Ownership: {}".format(company_type))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                # append current listing
                jobs.append({"Position" : job_position,
                             "Description" : description,
                             "Company" : company.split('\n')[0],  # keep first line of the employer text only
                             "Location" : location,
                             "Glassdoor Salary Estimate" : salary_est,
                             "Rating" : rate,
                             "Size" : size,
                             "Revenue" : revenue,
                             "Type" : company_type,
                             "Year Founded" : yr_found,
                             "Industry" : industry,
                             "Sector" : sector,
                            })

                time.sleep(1.2) # pause before click next listing

            # target reached: no need to load another page of results
            if len(jobs) >= num_scrape:
                break

            # Click on `next page` after all listings on current page scraped
            try:
                driver.find_element_by_xpath('.//li[@class="next"]//a').click()
            except NoSuchElementException: # reach end of all listings available, but <`num_scrape`
                print("Stopped scraping. Target {} listings, actually scraped {} listings.".format(num_scrape, len(jobs)))
                break

    finally:
        # always terminate the webdriver, even if scraping failed part-way,
        # so repeated calls do not accumulate open browser windows
        driver.quit()

    # dataframe of listings (explicit columns so an empty result is still well-formed)
    return pd.DataFrame(jobs, columns=_LISTING_COLUMNS)



In [None]:
# Scrape U.S. junior data scientist listings.
# Cap of 960 = 30 result pages x 32 listings (pages hold 30 or 32 listings each,
# so 960 is the safe upper bound).
df1 = get_listings(
    job_title='junior-data-scientist',
    country='us',
    num_scrape=960,
    debug=False,
)

In [None]:
# Scrape U.S. senior data scientist listings (up to 960 of them).
df2 = get_listings(
    job_title='senior-data-scientist',
    country='us',
    num_scrape=960,
    debug=False,
)

In [None]:
# Stack the junior and senior listings row-wise and write them to csv (U.S.).
# ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its own
# 0-based index, so the combined frame (and the index column written to the csv)
# would contain duplicate labels.
df = pd.concat([df1, df2], axis = 0, ignore_index = True)
df.to_csv('elisa_us.csv')