## README

### Task

Scrape Data Science jobs in Barcelona from LinkedIn. 


### Approach

Used Selenium to open LinkedIn and log in, then used a predefined search link to access the job listings. Finally, scraped the data from the LinkedIn job pages.


### Requirements
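* Selenium (`pip install selenium`)
* pandas, NumPy, Beautiful Soup and requests, which are imported in the first cell (`pip install pandas numpy beautifulsoup4 requests`)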

ChromeDriver was used to drive the browser; you can download or update yours [here](https://chromedriver.chromium.org/downloads).

In [1]:
# Import Libraries
import re
import sys
import time
import requests
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchWindowException, StaleElementReferenceException, NoSuchElementException

from bs4 import BeautifulSoup as BS

import numpy as np
import pandas as pd

import traceback

In [2]:
# Log in to Linkedin
def login(driver, usr, pwd):
    """Sign in to LinkedIn through the home-page login form.

    Args:
        driver: Selenium WebDriver instance used to drive the browser.
        usr: Account e-mail typed into the ``session_key`` field.
        pwd: Account password typed into the ``session_password`` field.

    Returns:
        The same ``driver``, left on the page loaded after the sign-in click
        and configured with a 5-second implicit wait.

    Raises:
        selenium.common.exceptions.TimeoutException: If the login form does
            not appear, or the page does not navigate away after the sign-in
            click, within 30 seconds.
    """
    # Use HTTPS from the start so the login page is never requested in clear.
    driver.get('https://www.linkedin.com')

    # implicitly_wait() does not pause the script: it sets, for the whole
    # session, how long find_element() keeps retrying before giving up.
    # Setting it once is enough.
    driver.implicitly_wait(5)

    # Default poll interval (0.5 s); a multi-second interval only delays the
    # moment a ready page is noticed.
    wait = WebDriverWait(driver, 30)

    # Wait for the username field, then grab the password field next to it
    email = wait.until(EC.presence_of_element_located((By.ID, 'session_key')))
    password = driver.find_element(By.ID, 'session_password')

    # Enter email and password
    email.send_keys(usr)
    password.send_keys(pwd)

    # Click the sign-in button
    signin = driver.find_element(By.CLASS_NAME, 'sign-in-form__submit-button')
    ActionChains(driver).click(on_element=signin).perform()

    # The login form itself contains <input> elements, so waiting for "any
    # input" would succeed before the browser navigates. Wait for the old
    # button to go stale (the form page was replaced) first, and only then for
    # the new page to render an input.
    wait.until(EC.staleness_of(signin))
    wait.until(EC.presence_of_element_located((By.TAG_NAME, 'input')))

    return driver

In [3]:
def searchJobs(driver, url, scroll_timeout=30, scroll_step=250):
    """Open a LinkedIn job-search page and collect the job cards in the left pane.

    The results list is scrolled step by step so that lazily rendered cards
    are loaded before they are collected.

    Args:
        driver: Selenium WebDriver instance (expected to be logged in already).
        url: Job-search URL to open.
        scroll_timeout: Maximum number of seconds spent scrolling the results
            list while waiting for the page footer to show up.
        scroll_step: Number of pixels scrolled per step.

    Returns:
        A ``(driver, job_cards)`` tuple where ``job_cards`` is the list of
        elements with class ``jobs-search-results__list-item``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the results list does
            not appear within 30 seconds.
    """
    # Go to search page
    driver.get(url)

    # Wait for the results list to load (default 0.5 s poll interval)
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "jobs-search-results-list"))
    )

    # implicitly_wait() is a session setting, not a sleep: from here on every
    # failed lookup retries for up to 1 s, which also paces the loop below.
    driver.implicitly_wait(1)

    # Confirm that we're not on a company page and go back otherwise.
    # find_elements() returns an empty list instead of raising when absent.
    if driver.find_elements(By.CLASS_NAME, "org-top-card__primary-content"):
        driver.back()

    # Scroll the results list until the footer is present, giving up after
    # scroll_timeout seconds so a changed page layout cannot hang the scraper.
    deadline = time.time() + scroll_timeout
    while True:
        driver.execute_script(
            "document.querySelector('.jobs-search-results-list')"
            ".scrollBy(0, arguments[0]);",
            scroll_step,
        )

        if driver.find_elements(By.CLASS_NAME, "global-footer-compact__content"):
            break
        if time.time() > deadline:
            break

    # Get all the jobs listed in the left pane
    selenium_jobs_list = driver.find_elements(By.CLASS_NAME, "jobs-search-results__list-item")

    return driver, selenium_jobs_list

In [4]:
# Column order of the DataFrame returned by getData.
_JOB_COLUMNS = (
    'Job Title',
    'Company Name',
    'Location',
    'State',
    'Posting Date',
    'Offer URL',
    'Number of Applicants',
    'Promoted',
    'Workspace',
    'Seniority',
    'Employment Type',
    'Industry',
    'Python Required',
    'Application through Linkedin',
    'Number of Employees',
    'Followers',
)

# Maps the lower-cased hiring-status insight to the label stored in 'State'.
_STATE_LABELS = {
    'early applicant': 'Early Applications',
    'actively hiring': 'On-going',
    'actively recruiting': 'On-going',
}

# Class name of the right-hand pane that shows the selected job's details.
_DETAILS_PANE = 'jobs-search__job-details--container'


def _text_or_none(parent, class_name):
    """Return the text of the first element with *class_name* under *parent*, else None."""
    if parent is None:
        return None
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text
    except Exception:
        # Best effort: a missing or stale field becomes a missing value
        # instead of aborting the whole page.
        return None


def _first_int(text):
    """Return the first run of digits in *text* as an int, or None if there is none."""
    match = re.search(r'\d+', text or '')
    return int(match.group()) if match else None


def _join_digits(text):
    """Return all digits of *text* joined into one int ('1,234' -> 1234), or None."""
    digits = ''.join(re.findall(r'\d+', text or ''))
    return int(digits) if digits else None


def _read_card(job):
    """Read the fields available on a job card in the left pane."""
    try:
        location = job.find_element(By.CLASS_NAME, 'artdeco-entity-lockup__caption')\
                      .find_element(By.TAG_NAME, 'ul')\
                      .find_element(By.TAG_NAME, 'li')\
                      .text
    except Exception:
        location = None

    try:
        offer_url = job.find_element(By.CLASS_NAME, 'job-card-container__link').get_property('href')
    except Exception:
        offer_url = None

    footer = _text_or_none(job, 'job-card-list__footer-wrapper')

    return {
        'Job Title': _text_or_none(job, 'job-card-list__title'),
        'Company Name': _text_or_none(job, 'artdeco-entity-lockup__subtitle'),
        'Location': location,
        'Offer URL': offer_url,
        'Workspace': _text_or_none(job, 'job-card-container__metadata-item--workplace-type'),
        'Promoted': footer is not None and 'Promoted' in footer,
    }


def _open_details(driver, title_link):
    """Click a job title and return the details pane, or None if it never shows up."""
    title_link.click()
    try:
        return WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, _DETAILS_PANE))
        )
    except Exception:
        # Same recovery step as before; returning None (instead of leaving the
        # previous job's pane in scope) keeps stale data out of this row.
        driver.back()
        return None


def _read_role_insights(details):
    """Parse the job-insight lines of the details pane.

    Fields parsed before a failure are kept; the rest stay at their defaults.
    """
    parsed = {
        'Seniority': None,
        'Employment Type': None,
        'Industry': None,
        'Number of Employees': None,
        'State': 'Others',
    }
    if details is None:
        return parsed

    try:
        insights = details.find_elements(By.CLASS_NAME, 'jobs-unified-top-card__job-insight')

        # First insight: "<employment type> · <seniority>"
        senior_emp = insights[0].text.split('·')
        if len(senior_emp) > 1:
            parsed['Seniority'] = senior_emp[1].strip()
        parsed['Employment Type'] = senior_emp[0].strip()

        # Second insight: "<employee range> · <industry>"
        emp_industry = insights[1].text.split('·')
        if len(emp_industry) > 1:
            parsed['Industry'] = emp_industry[1].strip()
        # Keep the lower bound of a range such as "51-200 employees". When no
        # digits are present the value stays None rather than a raw string,
        # and parsing carries on to the hiring status.
        parsed['Number of Employees'] = _join_digits(emp_industry[0].strip().split('-')[0])

        # Last insight: hiring status
        status = insights[-1].text.strip().lower()
        parsed['State'] = _STATE_LABELS.get(status, 'Others')
    except Exception:
        pass

    return parsed


def _scroll_details_pane(driver, details, timeout=30, step=250):
    """Scroll the details pane until the company section is present.

    Returns the freshly located pane on success, or *details* unchanged when
    the company section does not show up within *timeout* seconds.
    """
    scroll_js = "document.querySelector(arguments[0]).scrollBy(0, arguments[1]);"
    selector = '.' + _DETAILS_PANE

    # Session setting, not a sleep: failed lookups below retry for up to 1 s.
    driver.implicitly_wait(1)

    deadline = time.time() + timeout
    while True:
        try:
            driver.execute_script(scroll_js, selector, step)

            # Both lookups raise while the company section is not yet present
            driver.find_element(By.CLASS_NAME, 'jobs-company__footer')
            driver.find_element(By.CLASS_NAME, 'jobs-company__box')

            driver.execute_script(scroll_js, selector, step)

            # Get details section of job from the right pane again
            return driver.find_element(By.CLASS_NAME, _DETAILS_PANE)
        except Exception:
            pass

        if time.time() > deadline:
            return details


def _read_followers(details):
    """Return the company's follower count from the details pane, or None."""
    if details is None:
        return None
    try:
        company_box = details.find_element(By.CLASS_NAME, 'jobs-company__box')
        followers = company_box.find_element(By.CLASS_NAME, 'artdeco-entity-lockup__subtitle').text
    except Exception:
        return None
    return _join_digits(followers)


def getData(driver, selenium_job_list):
    """Scrape every job card in *selenium_job_list* into a DataFrame.

    For each card the fields shown in the left pane are read, the card is
    clicked, and the remaining fields are read from the details pane.

    Args:
        driver: Selenium WebDriver instance showing the search results.
        selenium_job_list: Job-card elements, as returned by ``searchJobs``.

    Returns:
        pandas.DataFrame with one row per job and the columns listed in
        ``_JOB_COLUMNS``. Fields that could not be read are ``None``
        (``False`` for the boolean columns, ``'Others'`` for ``State``).

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a card has no
            title link to click.
    """
    records = []

    # Iterate the argument: the loop used to read a same-named global instead.
    for job in selenium_job_list:
        title_link = job.find_element(By.CLASS_NAME, 'job-card-list__title')

        record = _read_card(job)

        # Click on job on the page to get the job details
        details = _open_details(driver, title_link)

        record['Posting Date'] = _text_or_none(details, 'jobs-unified-top-card__posted-date')
        record['Number of Applicants'] = _first_int(
            _text_or_none(details, 'jobs-unified-top-card__applicant-count')
        )
        record.update(_read_role_insights(details))

        # Check if you can apply through linkedin
        apply_label = _text_or_none(details, 'jobs-apply-button') or ''
        record['Application through Linkedin'] = apply_label.lower() == 'easy apply'

        # Check if python is required for the job
        description = _text_or_none(details, 'jobs-description__content') or ''
        record['Python Required'] = 'python' in description.lower()

        # The company box sits at the bottom of the pane: scroll before reading
        details = _scroll_details_pane(driver, details)
        record['Followers'] = _read_followers(details)

        records.append(record)

    return pd.DataFrame({
        column: [record[column] for record in records] for column in _JOB_COLUMNS
    })

In [None]:
import os
from getpass import getpass

# Credentials are read from the environment (or prompted for) so that no
# secret is stored in the notebook.
usr = os.environ.get('LINKEDIN_USER') or input('LinkedIn email: ')
pwd = os.environ.get('LINKEDIN_PASSWORD') or getpass('LinkedIn password: ')

# NOTE(review): passing the driver path positionally looks deprecated in
# Selenium 4 in favour of a Service object - confirm against the installed version.
driver = webdriver.Chrome('./chromedriver')

url = 'https://www.linkedin.com/jobs/search/?keywords=data scientist, barcelona&location=Barcelona, Catalonia, Spain&refresh=true'
page_start = 25   # `start` offset of the NEXT results page (25 jobs per page)
max_retries = 3   # consecutive retryable failures tolerated for one page
failed_attempts = 0

df = pd.DataFrame()

start_extract = time.time()
print('Begin Extraction')
try:
    driver = login(driver, usr, pwd)

    while page_start <= 1000:
        print(f'Scrapping Data From Page: {page_start // 25}')
        try:
            start_time = time.time()
            driver, selenium_jobs_list = searchJobs(driver, url)
            print(f'Search time = {(time.time() - start_time)/60}mins')

            data_extract_start = time.time()
            data = getData(driver, selenium_jobs_list)
            print(f'Data Extraction time = {(time.time() - data_extract_start)/60}mins')
            df = pd.concat([df, data], ignore_index=True)

            # Build the next page's URL: drop any existing `&start=...` suffix
            # from the current URL and append the new offset.
            current_url = driver.current_url
            start = current_url.find('&start')
            if start <= 0:
                start = len(current_url)
            url = f'{current_url[:start]}&start={page_start}'

            duration = time.time() - start_time
            print('Extraction duration:', duration/60, 'mins')
            print(url)
            print(page_start)
            print(df.isna().sum())

            page_start += 25
            failed_attempts = 0
        except (NoSuchWindowException, StaleElementReferenceException, NoSuchElementException, ValueError):
            # Retry the same page, but not forever: a closed window, for
            # instance, will never recover.
            failed_attempts += 1
            if failed_attempts > max_retries:
                print('Quitting')
                traceback.print_exc()
                break
            print('Retrying')
            continue
        except Exception:
            print('Quitting')
            traceback.print_exc()  # show why the extraction stopped
            break
finally:
    # Always release the browser and the chromedriver process
    driver.quit()

print(f'Total Extraction time:{(time.time() - start_extract)/3600}hrs')

In [7]:
# Summarise the scraped DataFrame: per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970 entries, 0 to 969
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Job Title                     970 non-null    object 
 1   Company Name                  970 non-null    object 
 2   Location                      970 non-null    object 
 3   State                         970 non-null    object 
 4   Posting Date                  970 non-null    object 
 5   Offer URL                     970 non-null    object 
 6   Number of Applicants          880 non-null    float64
 7   Promoted                      970 non-null    bool   
 8   Workspace                     730 non-null    object 
 9   Seniority                     792 non-null    object 
 10  Employment Type               970 non-null    object 
 11  Industry                      927 non-null    object 
 12  Python Required               970 non-null    bool   
 13  Appli

In [None]:
# Count the missing values in each column
df.isna().sum()