In [5]:
## Data analysis libraries
import pandas as pd

## Web Scraping libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import random

In [2]:
## Following: https://www.geeksforgeeks.org/scrape-linkedin-using-selenium-and-beautiful-soup-in-python/

In [51]:
# Start a Chrome browser session controlled by Selenium.
# Every later cell drives the browser through this `driver` object.
driver = webdriver.Chrome()

In [52]:
# Open LinkedIn's login page in the Selenium-controlled browser
driver.get("https://linkedin.com/uas/login")

# Fixed 5-second pause to give the page time to load before continuing
time.sleep(5)
 


### Login through the browser...

We could also enter our credentials programmatically, with something like the snippet below:

```python
username = driver.find_element(By.ID, "username")
```

In [54]:
# Landing page of LinkedIn's job search
jobs_url = "https://www.linkedin.com/jobs/"

driver.get(jobs_url)        # navigate the controlled browser to the jobs page

# Fixed 5-second pause to give the page time to load before continuing
time.sleep(5)

### Search for IDs in the page:

```python
ids = driver.find_elements(By.XPATH, '//*[@id]')

for ii in ids:
    print('Tag:  ' + ii.tag_name)
    print('ID:  ' + ii.get_attribute('id'))
    print('Name:  ' + str(ii.get_attribute('name')))
```

In [68]:
# Ask the browser only for elements whose id mentions the jobs search box,
# instead of fetching every element that has an id and filtering in Python
# (each attribute read on an element is a separate call to the browser).
ids = driver.find_elements(By.XPATH, '//*[contains(@id, "jobs-search-box")]')

# Reset before searching so a value left over from an earlier run of this cell
# is never reused. The ids end in a generated-looking suffix (e.g. "ember1190"),
# so they are presumably not stable between page loads.
search_keyword_id = None
location_keyword_id = None

for ii in ids:
    element_id = ii.get_attribute('id')  # read once, reused below
    print('Tag:  ' + ii.tag_name)
    print('ID:  ' + element_id)
    print('Name:  ' + str(ii.get_attribute('name')))
    if 'jobs-search-box-keyword' in element_id:
        search_keyword_id = element_id
    if 'jobs-search-box-location' in element_id:
        location_keyword_id = element_id

# Fail here with a clear message rather than later with a NameError or a
# lookup against an id that does not exist on this page.
if search_keyword_id is None or location_keyword_id is None:
    raise RuntimeError(
        "Could not locate the jobs search inputs on the current page "
        f"(keyword id: {search_keyword_id!r}, location id: {location_keyword_id!r}). "
        "Make sure you are logged in and the jobs page has finished loading."
    )

Tag:  input
ID:  jobs-search-box-keyword-id-ember1190
Name:  
Tag:  input
ID:  jobs-search-box-location-id-ember1190
Name:  


In [69]:
# Type the search keywords into the keyword input whose id was found above
search_term = driver.find_element(By.ID, search_keyword_id)
search_term.clear()
# NOTE(review): the underscore is sent literally — confirm that "data_scientist"
# (rather than "data scientist") is the intended query.
search_term.send_keys("data_scientist")
# The input is looked up a second time before submitting — presumably to avoid
# holding a stale element reference; TODO confirm it is needed.
search_term = driver.find_element(By.ID, search_keyword_id)
search_term.send_keys(Keys.RETURN)

# Fixed 5-second pause to give the results page time to load
time.sleep(5)

In [70]:
# Type the location into the location input whose id was found above and submit
location_term = driver.find_element(By.ID, location_keyword_id)
location_term.clear()
location_term.send_keys("New York, United States")
location_term.send_keys(Keys.RETURN)

# Wait for the location-filtered results to load, as the other navigation
# cells do; a later cell reads driver.page_source and would otherwise capture
# the page before the new results are rendered.
time.sleep(5)

### Parse the search results page

In [71]:
def _stripped_text(node):
    """Return ``node.text`` without surrounding whitespace, or None when ``node`` is None."""
    return node.text.strip() if node is not None else None


def _parse_salary(metadata_text):
    """Reduce a job-card metadata string to the salary figure(s) it contains.

    - No "$" in the text: returns "No_Salary_Listed".
    - A "$" range (text contains "-"): keeps the text before the second "/yr"
      with the "/yr" markers removed, e.g. "$139K/yr - $248K/yr ..." becomes
      "$139K - $248K".
    - A single "$" figure: keeps only the text before the first "/yr".
    - A "$" figure with no "/yr" at all (e.g. hourly pay) is returned as is.
    """
    if "$" not in metadata_text:
        return "No_Salary_Listed"
    # A range can carry two "/yr" markers, a single figure only one.
    max_markers = 2 if "-" in metadata_text else 1
    pieces = metadata_text.split("/yr", max_markers)
    if len(pieces) == 1:
        # No "/yr" present: nothing to trim.
        return metadata_text
    # Drop whatever trails the last "/yr" that was split on.
    return "".join(pieces[:-1])


def assemble_page_of_listings(soup):
    """Collect the job cards found in a parsed search-results page into a DataFrame.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a job search results page.

    Returns
    -------
    pandas.DataFrame
        One row per element carrying the ``job-card-list`` class, with columns
        ``Job_ID``, ``Title``, ``Company``, ``Location`` and ``Salary``.
        ``Salary`` is "No_Salary_Listed" when the card has no second metadata
        item or that item contains no "$". ``Title``, ``Company`` and
        ``Location`` are None when the corresponding element is missing.
    """
    columns = ['Job_ID', 'Title', 'Company', 'Location', 'Salary']
    rows = []

    for tag in soup.find_all(class_="job-card-list"):
        job_id = tag.get('data-job-id')

        # Resolve the card container once per listing instead of once per field;
        # fall back to the matched tag itself if no such <div> is found.
        card = None
        if job_id is not None:
            card = soup.find('div', {'data-job-id': job_id})
        if card is None:
            card = tag

        title = _stripped_text(card.find('a', {'class': 'job-card-list__title'}))
        ## The company for the job is under the job-card-container__primary-description
        company = _stripped_text(
            card.find('span', {'class': 'job-card-container__primary-description'}))

        metadata = [item.text.strip()
                    for item in card.find_all('li', {'class': 'job-card-container__metadata-item'})]
        ## The first item in the metadata is typically the job location
        location = metadata[0] if metadata else None
        ## The second item in the list, if present, may be the salary
        salary = _parse_salary(metadata[1]) if len(metadata) > 1 else "No_Salary_Listed"

        # Build one complete row per card so the columns can never drift out of
        # step with each other.
        rows.append((job_id, title, company, location, salary))

    return pd.DataFrame(rows, columns=columns)

In [72]:
# Snapshot the HTML currently rendered in the controlled browser
src = driver.page_source

# Parse the snapshot with Beautiful Soup using the "html.parser" backend
soup = BeautifulSoup(src, "html.parser")

# Build the table of job listings found in the snapshot
job_listings = assemble_page_of_listings(soup)

In [73]:
# Display the listings scraped from the current results page
job_listings

Unnamed: 0,Job_ID,Title,Company,Location,Salary
0,3730709243,Specialist Solutions Architect - Machine Learn...,Databricks,"New York, NY (Remote)",$139K - $248K
1,3725824958,"Software Engineer, Machine Learning - Monetiza...",Meta,"New York, NY",$172K - $241K
2,3726531301,Data Engineering Manager,HelloFresh,"New York, NY",$170K - $220K
3,3720387716,"Senior Applied Scientist, Sponsored Products",Amazon,"New York, NY",$136K - $260K
4,3719409171,AI/ML Data Scientist,Experfy,United States (Remote),No_Salary_Listed
5,3689653323,"Senior Product Manager, Generative Artificial ...",MongoDB,"New York, NY",No_Salary_Listed
6,3726772383,Site Reliability Manager - Oracle Health Data ...,Oracle,United States (Remote),No_Salary_Listed


### Click into an individual Item

```python
clickable = driver.find_element(By.ID, "clickable")
clickable.click()
```

## Scroll on the search results page and get the next set of results

In [None]:
# no_of_jobs = int(driver.find_element(By.CSS_SELECTOR, 'h1>span').get_attribute('innerText'))

In [None]:
# no_of_jobs

In [98]:
# NOTE(review): disabled sketch kept as a string literal — it is not runnable as written.
# It uses typographic quotes, inconsistent indentation, the old `find_element_by_xpath`
# call style, a `wd` driver that no visible cell defines, and `no_of_jobs`, which is only
# assigned in a commented-out cell.
'''
i = 2
while i <= int(no_of_jobs/25)+1: 
    wd.execute_script(“window.scrollTo(0, document.body.scrollHeight);”)
     i = i + 1
     try:
     wd.find_element_by_xpath
(‘/html/body/main/div/section/button’).click()
     time.sleep(5)
     except:
         pass
         time.sleep(5)
'''

'\ni = 2\nwhile i <= int(no_of_jobs/25)+1: \n    wd.execute_script(“window.scrollTo(0, document.body.scrollHeight);”)\n     i = i + 1\n     try:\n     wd.find_element_by_xpath\n(‘/html/body/main/div/section/button’).click()\n     time.sleep(5)\n     except:\n         pass\n         time.sleep(5)\n'

## Click the next page

## Go to each page and get the job text


In [93]:
def get_job_description(job_id = '3722171802'):
    """Open a single job posting in the browser and scrape its description text.

    Uses the notebook-level ``driver`` to navigate, so the browser is left on
    the job's page afterwards.

    Parameters
    ----------
    job_id : str
        LinkedIn job id, interpolated into the job's URL.

    Returns
    -------
    tuple
        ``(job_description, job_url)``. ``job_description`` is the stripped
        text of the ``jobs-box__html-content`` div, or None when the page
        contains no such div.
    """
    # open the job's URL
    job_url = f"https://www.linkedin.com/jobs/view/{job_id}/"
    driver.get(job_url)

    # Wait a random whole number of seconds between 4 and 7 (randint is
    # inclusive at both ends) for the page to load.
    time.sleep(random.randint(4, 7))

    # Expand the job description if the expand button is present.
    # find_elements returns an empty list instead of raising when nothing
    # matches, so one posting without the button does not abort a whole scrape.
    expand_buttons = driver.find_elements(By.CLASS_NAME, "jobs-description__footer-button")
    if expand_buttons:
        expand_buttons[0].click()

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Get the job description; soup.find returns None when nothing matches
    description_box = soup.find('div', {'class': "jobs-box__html-content"})
    job_description = description_box.text.strip() if description_box is not None else None

    return job_description, job_url

In [94]:
def append_job_description(df):
    """Scrape every job in ``df`` and attach the results as two new columns.

    Walks the index of ``df``, prints a progress line per row, and calls
    ``get_job_description`` with that row's ``Job_ID``. The collected URLs and
    description texts are written to ``df`` as ``Job_URL`` and
    ``Job_Description``; ``df`` is modified in place and also returned.
    """
    scraped = []  # (description, url) pairs, in row order
    for ind in df.index:
        current_id = df['Job_ID'][ind]
        current_title = df['Title'][ind]
        print(f'Index: {ind}, For Job ID: {current_id}, Title: {current_title}')
        scraped.append(get_job_description(current_id))

    # write both result columns back onto the frame that was passed in
    df['Job_URL'] = [url for _, url in scraped]
    df['Job_Description'] = [text for text, _ in scraped]

    return df

In [95]:
job_listings_with_description = append_job_description(job_listings)

Index: 0, For Job ID: 3730709243, Title: Specialist Solutions Architect - Machine Learning (Manufacturing)
Index: 1, For Job ID: 3725824958, Title: Software Engineer, Machine Learning - Monetization
Index: 2, For Job ID: 3726531301, Title: Data Engineering Manager
Index: 3, For Job ID: 3720387716, Title: Senior Applied Scientist, Sponsored Products
Index: 4, For Job ID: 3719409171, Title: AI/ML Data Scientist
Index: 5, For Job ID: 3689653323, Title: Senior Product Manager, Generative Artificial Intelligence (GenAI)
Index: 6, For Job ID: 3726772383, Title: Site Reliability Manager - Oracle Health Data & Intelligence Platform


In [96]:
# Display the listings together with the scraped Job_URL and Job_Description columns
job_listings_with_description

Unnamed: 0,Job_ID,Title,Company,Location,Salary,Job_Description,Job_URL
0,3730709243,Specialist Solutions Architect - Machine Learn...,Databricks,"New York, NY (Remote)",$139K - $248K,About the job\n \n\n \nFEQ124R39Thi...,https://www.linkedin.com/jobs/view/3730709243/
1,3725824958,"Software Engineer, Machine Learning - Monetiza...",Meta,"New York, NY",$172K - $241K,About the job\n \n\n \nFrom making ...,https://www.linkedin.com/jobs/view/3725824958/
2,3726531301,Data Engineering Manager,HelloFresh,"New York, NY",$170K - $220K,About the job\n \n\n \nThe roleWe a...,https://www.linkedin.com/jobs/view/3726531301/
3,3720387716,"Senior Applied Scientist, Sponsored Products",Amazon,"New York, NY",$136K - $260K,About the job\n \n\n \nDescriptionA...,https://www.linkedin.com/jobs/view/3720387716/
4,3719409171,AI/ML Data Scientist,Experfy,United States (Remote),No_Salary_Listed,About the job\n \n\n \nThis is a 10...,https://www.linkedin.com/jobs/view/3719409171/
5,3689653323,"Senior Product Manager, Generative Artificial ...",MongoDB,"New York, NY",No_Salary_Listed,About the job\n \n\n \nThe worldwid...,https://www.linkedin.com/jobs/view/3689653323/
6,3726772383,Site Reliability Manager - Oracle Health Data ...,Oracle,United States (Remote),No_Salary_Listed,About the job\n \n\n \nJob Descript...,https://www.linkedin.com/jobs/view/3726772383/


In [97]:
# Save the enriched listings to a CSV file in the working directory, without the index column
job_listings_with_description.to_csv('linkedin_job_listings_test.csv', index=False)