In [1]:
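# One-time setup: uncomment and run these lines if the packages are not installed yet.
#!pip install selenium
#!pip install webdriver_manager
#!pip install openpyxl
#!pip install pandas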

In [2]:
import os  # imported here because this configuration cell runs before the notebook's import cell

# Credentials are read from the environment so they never get saved inside the
# notebook. Both fall back to an empty string when the variable is not set.
username_linkedin = os.environ.get("LINKEDIN_USERNAME", "")
password_linkedin = os.environ.get("LINKEDIN_PASSWORD", "")
keyword = "Data Engineer"  # search terms for the job search
location = "Greater Paris Metropolitan Region"  # location filter for the job search
post_ages = 1  # only keep jobs posted within the last `post_ages` day(s)
time_sleeps = 1  # pause, in seconds, between browser actions; also used as an explicit-wait timeout

In [3]:
import time
import urllib.parse
from typing import Union

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
def get_job_search_url(
    search_period: Union[int, None] = None,
    search_keyword: Union[str, None] = None,
    search_location: Union[str, None] = None,
):
    """Build the LinkedIn job-search URL for a keyword and a location.

    Args:
        search_period: Only include jobs posted within this many days. Any
            value that is not exactly an ``int`` (including ``None``) disables
            the date filter.
        search_keyword: Search terms. Defaults to the notebook-level ``keyword``.
        search_location: Location filter. Defaults to the notebook-level
            ``location``.

    Returns:
        The search URL as a string, with keyword and location percent-encoded.

    Raises:
        ValueError: If ``search_period`` is an int smaller than 1, which would
            produce a zero or negative ``f_TPR`` filter.
    """
    # Fall back to the notebook configuration only when the caller did not
    # pass explicit values, so existing one-argument calls behave as before.
    if search_keyword is None:
        search_keyword = keyword
    if search_location is None:
        search_location = location

    query_parts = []
    # Exact type check kept from the original: bool is a subclass of int and
    # must not be treated as a number of days.
    if type(search_period) is int:
        if search_period < 1:
            raise ValueError(f'search_period must be at least 1 day, got {search_period}')
        # The filter value is the period converted from days to seconds.
        query_parts.append(f'f_TPR=r{search_period * 86400}')
    query_parts.append(f'keywords={urllib.parse.quote(search_keyword)}')
    query_parts.append(f'location={urllib.parse.quote(search_location)}')
    return 'https://www.linkedin.com/jobs/search/?' + '&'.join(query_parts)

In [5]:
# Start Chrome using the driver binary that webdriver_manager installs.
s = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
# Exclude Chrome's 'enable-logging' switch (presumably to keep browser log noise out of the console).
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(service=s, options=options)
driver.maximize_window()
driver.get("https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin")

time.sleep(time_sleeps)

# An explicit wait returns as soon as its condition holds, so a generous upper
# bound costs nothing on a fast page but avoids spurious timeouts on a slow
# one. The pause setting is therefore no longer used as the timeout here.
login_wait = WebDriverWait(driver, max(10, time_sleeps))
username = login_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='session_key']")))
password = login_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='session_password']")))

username.clear()
username.send_keys(username_linkedin)
password.clear()
password.send_keys(password_linkedin)

time.sleep(time_sleeps)
# Remember where the login form lives so that leaving it can be detected below.
login_page_url = driver.current_url
login_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()

# Report success only once the browser has navigated away from the login form.
# This is a best-effort check: it confirms navigation, not that the session is
# authenticated.
login_wait.until(
    EC.url_changes(login_page_url),
    message="Still on the LinkedIn login page after submitting the form; check username_linkedin and password_linkedin.",
)
print("You are logged in!")


You are logged in!


In [6]:
# Build the search URL, restricted to jobs posted within the last `post_ages` day(s),
# and print it so the exact query can be checked.
url = get_job_search_url(post_ages)
print(f'Search url: {url}')

Search url: https://www.linkedin.com/jobs/search/?f_TPR=r86400&keywords=Data%20Engineer&location=Greater%20Paris%20Metropolitan%20Region


In [7]:
# Open the first page of search results, then pause for the configured delay
# (presumably to give the page time to render before it is queried).
driver.get(url)
time.sleep(time_sleeps)

In [8]:
# Wait until every <small> element on the page is visible; the first match is
# later parsed as the result counter.
total_results = WebDriverWait(driver, time_sleeps).until(
    EC.visibility_of_all_elements_located((By.XPATH, ".//small"))
)
# Echo the counter text only when the match is unambiguous.
if len(total_results) == 1:
    print(total_results[0].text)

51 results


In [9]:
# Total number of matches, parsed from the first token of the counter text
# with thousands separators removed (e.g. "1,234 results" -> 1234).
total_job_count = int(total_results[0].text.split()[0].replace(",", ""))
# `number_of_pages` counts the pages AFTER the first one: the pagination loop
# scrapes the first page on its own and then requests this many follow-ups,
# 25 postings apart. Subtracting 1 before dividing avoids requesting an empty
# trailing page when the total is an exact multiple of 25 (50 results need one
# follow-up page, not two).
number_of_pages = max(total_job_count - 1, 0) // 25
print(f'{number_of_pages} pages')

2 pages


In [10]:
# Parallel accumulators: position i in each list describes the same job posting.
# The generator yields five distinct empty lists, one per field.
job_ids, job_titles, company_names, job_ages, job_descriptions = ([] for _ in range(5))

In [11]:
def _visible_text(xpath, timeout):
    """Wait until all elements matching ``xpath`` are visible and return the first one's text."""
    matches = WebDriverWait(driver, timeout).until(
        EC.visibility_of_all_elements_located((By.XPATH, xpath))
    )
    return matches[0].text


def scrap_data(timeout: Union[float, None] = None):
    """Scrape every job card on the currently loaded search-results page.

    Each card is clicked to open its detail pane. The job id, description,
    title, company name and posting age are then appended to the notebook-level
    lists ``job_ids``, ``job_descriptions``, ``job_titles``, ``company_names``
    and ``job_ages``.

    Args:
        timeout: Maximum number of seconds to wait for each element to become
            visible. Defaults to the notebook-level ``time_sleeps``.

    Raises:
        selenium.common.exceptions.TimeoutException: If an expected element
            does not become visible in time. Postings scraped before the
            failure stay in the lists; the failing posting is not added to any
            of them, so the lists always keep equal lengths.
    """
    if timeout is None:
        timeout = time_sleeps

    search_results = WebDriverWait(driver, timeout).until(
        EC.visibility_of_all_elements_located((By.XPATH, './/ul[@itemtype="http://schema.org/ItemList"]'))
    )
    jobs = search_results[0].find_elements(By.XPATH, ".//li[@data-occludable-job-id]")

    for job in jobs:
        job_id = job.get_attribute('data-occludable-job-id')

        # Clicking the card opens the posting in the detail pane read below.
        job.click()

        # Collect every field first, in the same order as the original code.
        description = _visible_text("//div[contains(@class, 'jobs-description--reformatted')]", timeout)
        title = _visible_text("//h2[contains(@class, 'jobs-unified-top-card__job-title')]", timeout)
        company = _visible_text("//span[contains(@class, 'jobs-unified-top-card__company-name')]", timeout)
        age = _visible_text("//span[contains(@class, 'jobs-unified-top-card__posted-date')]", timeout)

        # Append only once the whole record is available, so a timeout on one
        # field can never leave the parallel lists with different lengths.
        job_ids.append(job_id)
        job_descriptions.append(description)
        job_titles.append(title)
        company_names.append(company)
        job_ages.append(age)

        time.sleep(time_sleeps)


In [12]:
# The first results page is already open in the browser, so scrape it directly.
scrap_data()
# Follow-up pages are addressed through the `start` offset, 25 postings apart.
for start_offset in range(25, 25 * number_of_pages + 1, 25):
    new_url = f'{url}&start={start_offset}'
    print(f'Search url: {new_url}')
    driver.get(new_url)
    time.sleep(time_sleeps)
    scrap_data()

Search url: https://www.linkedin.com/jobs/search/?f_TPR=r86400&keywords=Data%20Engineer&location=Greater%20Paris%20Metropolitan%20Region&start=25
Search url: https://www.linkedin.com/jobs/search/?f_TPR=r86400&keywords=Data%20Engineer&location=Greater%20Paris%20Metropolitan%20Region&start=50


In [13]:
# Assemble the parallel result lists into one table with one row per posting.
# pandas is imported in the notebook's import cell.
data = {
    'job_id': job_ids,
    'job_title': job_titles,
    'company_name': company_names,
    'job_age': job_ages,
    'job_description': job_descriptions
}
# Check the invariant explicitly so a misaligned scrape is reported with the
# per-column lengths rather than a generic pandas error.
column_lengths = {name: len(values) for name, values in data.items()}
assert len(set(column_lengths.values())) <= 1, f'Result lists have different lengths: {column_lengths}'
df = pd.DataFrame(data, columns=['job_id', 'job_title', 'company_name', 'job_age', 'job_description'])
df

Unnamed: 0,job_id,job_title,company_name,job_age,job_description
0,3183379237,Data Engineer II,American Express Global Business Travel,1 hour ago,As a member of Egencia’s Data Engineering grou...
1,3183364581,Senior Data Engineer - Reply France,Jobs via eFinancialCareers,2 hours ago,Senior Data Engineer (h/f)\n\nVos Missions:\n\...
2,2924917026,Sr. Data Visualisation Engineer (Remote),CrowdStrike,13 hours ago,#WeAreCrowdStrike and our mission is to stop b...
3,3185827333,Software Engineer at medGo,join.com,20 hours ago,medGo’s vision: change the daily life of 10 mi...
4,3186017521,System Engineer,DYGIMA,19 hours ago,Description du poste\nPour intervenir en tant ...
5,3183342587,Data engineer,Dynamis RH,5 hours ago,Notre client est une start-up évoluant dans le...
6,3122502372,Ingénieur Data - H/F - Bussy - CDI,Tech Data,14 hours ago,Are you ready to make a BIG impact with TD SYN...
7,3153399306,Data Engineer with Cloud,ITMAGINATION,13 hours ago,
8,3184156649,Backend Services . Software Engineer,Jobs via eFinancialCareers,9 hours ago,
9,3044688621,Software Development Engineer,Criteo,14 hours ago,"Who We Are\n\nAt Criteo, our culture is as uni..."


In [14]:
# Save the scraped postings to an Excel workbook named after the search keyword.
# The keyword is free text, so characters that are not allowed in file names
# (path separators and the characters Windows reserves) are replaced with "_".
# Keywords without such characters produce the same file name as before.
safe_keyword = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in keyword)
df.to_excel(f'Linkedin_Job_{safe_keyword}.xlsx', index = False)

In [15]:
# quit() ends the whole WebDriver session and closes every window it opened.
# close() would only close the current window and leave the session running.
driver.quit()
print("Your search is done!")

Your search is done!
