In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json


# CSS selectors tried in order for each plain-text field; the first one that
# matches an element wins. Dict order is the key order of the returned job dict.
_TEXT_FIELD_SELECTORS = {
    'title': (
        '.top-card-layout__title',
        'h1.topcard__title',
    ),
    'company': (
        '.topcard__org-name-link',
        '.top-card-layout__card .topcard__flavor-row span:not(.location)',
        '.topcard__org-name',
    ),
    'location': (
        '.topcard__flavor--bullet',
        '.top-card-layout__card .topcard__flavor-row .location',
        '.topcard__subline-location',
    ),
    'posted_date': (
        '.posted-time-ago__text',
        '.top-card-layout__card .topcard__flavor-row span.posted-time-ago__text',
        '.topcard__flavor--metadata',
    ),
    'description': (
        '.description__text',
        '.show-more-less-html__markup',
        'div[class*="description"]',  # generic last resort
    ),
}


def _is_linkedin_url(url):
    """Return True when the host of ``url`` is linkedin.com or one of its subdomains."""
    # Function-scope import so this block does not depend on the file's import cell.
    from urllib.parse import urlparse

    try:
        host = urlparse(url).hostname or ""
    except ValueError:
        # urlparse raises ValueError for malformed network locations.
        return False

    host = host.rstrip(".")  # tolerate a fully-qualified trailing dot
    return host == "linkedin.com" or host.endswith(".linkedin.com")


def _first_text(soup, selectors, default="Not found"):
    """Return the stripped text of the first selector that matches, else ``default``."""
    for selector in selectors:
        try:
            element = soup.select_one(selector)
        except Exception:
            # A selector that cannot be evaluated is skipped instead of
            # aborting the whole scrape (extraction is best-effort).
            continue
        if element is not None:
            return element.text.strip()
    return default


def _extract_criteria(soup):
    """Collect the job-criteria items into a ``{subheader text: value text}`` dict."""
    criteria = {}
    for item in soup.select('.description__job-criteria-item'):
        header = item.select_one('.description__job-criteria-subheader')
        value = item.select_one('.description__job-criteria-text')
        if header is None or value is None:
            continue  # incomplete item: skip it
        criteria[header.text.strip()] = value.text.strip()
    return criteria


def _extract_salary(soup, description):
    """Return salary text from the page, falling back to a snippet of ``description``."""
    salary = _first_text(
        soup,
        ('.compensation__salary', 'div[class*="salary"]'),
        default=None,
    )
    if salary is not None:
        return salary

    # Last resort: look for the phrase "salary range" in the description.
    # The search is case-insensitive, so the returned snippet is lower-cased.
    desc_text = description.lower()
    salary_idx = desc_text.find("salary range")
    if salary_idx > -1:
        # Up to 150 characters starting at the phrase itself.
        return desc_text[salary_idx:salary_idx + 150]
    return "Not found"


def _parse_job_page(soup):
    """Build the job-details dict from a parsed job page."""
    job_data = {
        field: _first_text(soup, selectors)
        for field, selectors in _TEXT_FIELD_SELECTORS.items()
    }
    job_data['criteria'] = _extract_criteria(soup)
    job_data['salary'] = _extract_salary(soup, job_data['description'])
    job_data['applicants'] = _first_text(
        soup,
        ('.num-applicants__caption', 'span[class*="applicant"]'),
    )
    return job_data


def scrape_linkedin_job(url):
    """
    Scrape job details from a LinkedIn job posting

    The page is loaded in headless Chrome, the "Show more" button is clicked
    when present, and the rendered HTML is parsed with BeautifulSoup.

    Args:
        url (str): URL of the LinkedIn job posting. "https://" is prepended
            when the value does not start with "http".

    Returns:
        dict: On success, the keys 'title', 'company', 'location',
            'posted_date', 'description', 'criteria' (dict), 'salary' and
            'applicants'; text fields that cannot be located hold "Not found".
            On failure, a dict with a single 'error' key. This function does
            not raise for invalid input; it returns the error dict instead.
    """
    # Validate URL (a non-string or blank value is reported, not raised)
    if not isinstance(url, str) or not url.strip():
        print("Warning: This doesn't appear to be a LinkedIn URL")
        return {"error": "Invalid LinkedIn URL"}

    url = url.strip()
    if not url.startswith("http"):
        url = "https://" + url

    # Check the actual host: a substring test would also accept URLs that only
    # mention linkedin.com in their path or query string.
    if not _is_linkedin_url(url):
        print("Warning: This doesn't appear to be a LinkedIn URL")
        return {"error": "Invalid LinkedIn URL"}

    # Setup Chrome options for headless browsing
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-gpu")
    # Chrome expects "width,height" here, not "widthxheight".
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Add a user agent to avoid detection
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Initialize the driver with error handling
    try:
        # Try the newer method first (for newer Selenium versions)
        try:
            from webdriver_manager.chrome import ChromeDriverManager
            from selenium.webdriver.chrome.service import Service
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
        except Exception:
            # Fall back to the older method
            driver = webdriver.Chrome(options=chrome_options)
    except Exception as e:
        print(f"Error initializing Chrome driver: {e}")
        print("Try updating your ChromeDriver to match your Chrome version")
        return {"error": "ChromeDriver initialization failed"}

    try:
        # Navigate to the job posting
        driver.get(url)

        # Wait for the page to load
        time.sleep(random.uniform(3, 5))

        # Try to click the "Show more" button to expand the job description if it exists
        try:
            show_more_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".show-more-less-html__button"))
            )
            show_more_button.click()
            time.sleep(1)  # Give time for the description to expand
        except Exception:
            # Button missing or not clickable: continue with the collapsed text.
            # `Exception` (not a bare except) lets Ctrl-C interrupt the wait.
            pass

        # Parse the page source after JavaScript execution
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return _parse_job_page(soup)

    except Exception as e:
        print(f"Error: {e}")
        return {"error": str(e)}

    finally:
        # Close the browser
        driver.quit()


# def save_to_json(job_data, filename="linkedin_job_data.json"):
#     """Save job data to a JSON file"""
#     with open(filename, 'w', encoding='utf-8') as f:
#         json.dump(job_data, f, indent=4, ensure_ascii=False)
#     print(f"Data saved to {filename}")

def save_to_json(job_data_list, filename="linkedin_jobs_data.json"):
    """
    Append job data entries to a JSON file holding a list of jobs

    Entries already stored in ``filename`` are kept and the new ones are added
    after them. A missing file, or one that does not contain valid JSON, is
    treated as empty and replaced by the new entries.

    Args:
        job_data_list (list): List of job data dictionaries; any non-list
            value is wrapped into a one-element list
        filename (str): Output JSON filename
    """
    # Normalise the input so a single job dict can be passed directly
    new_entries = job_data_list if isinstance(job_data_list, list) else [job_data_list]

    try:
        with open(filename, 'r', encoding='utf-8') as source:
            stored = json.load(source)
    except (FileNotFoundError, json.JSONDecodeError):
        # Nothing usable on disk: start from the new entries only
        merged = new_entries
    else:
        # A file holding a single JSON value is promoted to a list
        if not isinstance(stored, list):
            stored = [stored]
        merged = stored + new_entries

    # Rewrite the whole file with the merged list
    with open(filename, 'w', encoding='utf-8') as target:
        json.dump(merged, target, indent=4, ensure_ascii=False)

    print(f"Data saved to {filename}")

def scrape_multiple_jobs(job_urls, output_json="linkedin_jobs.json"):
    """
    Scrape a batch of job postings and write the successful results to JSON

    Each URL is passed to ``scrape_linkedin_job``. Results containing an
    'error' key are reported and left out. A random pause separates
    consecutive requests. The output file is overwritten with the collected
    list.

    Args:
        job_urls (list): List of LinkedIn job posting URLs
        output_json (str): Filename for JSON output
    """
    scraped = []

    for position, url in enumerate(job_urls, start=1):
        print(f"Scraping job {position}/{len(job_urls)}: {url}")
        job_data = scrape_linkedin_job(url)

        if "error" in job_data:
            print(f"Failed to scrape job: {job_data.get('error', 'Unknown error')}")
        else:
            scraped.append(job_data)
            print(f"Successfully scraped job data for: {job_data.get('title', 'Unknown Title')}")

        # Pause for a random interval before every request except after the last one
        if position < len(job_urls):
            delay = random.uniform(3, 8)
            print(f"Waiting {delay:.2f} seconds before next request...")
            time.sleep(delay)

    # Persist everything that was scraped successfully
    with open(output_json, 'w', encoding='utf-8') as out_file:
        json.dump(scraped, out_file, indent=4, ensure_ascii=False)
    print(f"All job data saved to {output_json}")


if __name__ == "__main__":
    try:
        # For single job scraping
        # job_url = f"https://www.linkedin.com/jobs/view/avp-digital-measurement-analytics-at-synchrony-4193700745/?position=39&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=bvNUgUaVN7N8AAhpAYIpwg%3D%3D"

        # parsed_links.json is iterated as a sequence of objects that each carry
        # a 'url' key. Read it as UTF-8 explicitly, matching the other file I/O here.
        with open("parsed_links.json", "r", encoding="utf-8") as file:
            data = json.load(file)

        for entry in data:
            job_url = entry.get('url') if isinstance(entry, dict) else None
            print(f"Scraping data from: {job_url}")

            # Validate URL before proceeding. A bad entry is skipped rather than
            # ending the run: exit() is an interactive-shell helper and would
            # also discard every remaining link.
            if not isinstance(job_url, str) or len(job_url) < 10:
                print("Error: Please enter a valid LinkedIn job posting URL")
                continue

            job_data = scrape_linkedin_job(job_url)

            # A failed scrape is reported and skipped so later links still run
            if "error" in job_data:
                print(f"Error occurred: {job_data['error']}")
                continue

            # Save after every job so an interrupted run keeps what it has so far
            save_to_json(job_data)
            print("Job data successfully saved to JSON")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

Scraping data from: https://www.linkedin.com/jobs/view/avp-digital-measurement-analytics-at-synchrony-4193700745?position=39&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=bvNUgUaVN7N8AAhpAYIpwg%3D%3D
Data saved to linkedin_jobs_data.json
Job data successfully saved to JSON
Scraping data from: https://www.linkedin.com/jobs/view/data-scientist-at-transunion-4193702494?position=49&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=%2FYy7oIZpscjM0m4wspkC1A%3D%3D


KeyboardInterrupt: 

In [None]:
# import json
# with open("parsed_links.json", "r") as file:
#     data = json.load(file)

In [None]:
# for url in data:
#     print(url['url'])


https://www.linkedin.com/jobs/view/avp-digital-measurement-analytics-at-synchrony-4193700745?position=39&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=bvNUgUaVN7N8AAhpAYIpwg%3D%3D
https://www.linkedin.com/jobs/view/data-scientist-at-transunion-4193702494?position=49&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=%2FYy7oIZpscjM0m4wspkC1A%3D%3D
https://www.linkedin.com/jobs/view/avp-digital-measurement-analytics-at-synchrony-4193700751?position=30&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=j5gAN37Nv%2B2dC87VHb6coA%3D%3D
https://www.linkedin.com/jobs/view/avp-digital-measurement-analytics-at-synchrony-4193299984?position=17&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=kOYPCY7Ov4QKoHrBCYgx4Q%3D%3D
https://www.linkedin.com/jobs/view/avp-digital-measurement-analytics-at-synchrony-4193704313?position=34&pageNum=0&refId=FCkmm70%2FC892Nn1Wbgg%2B8A%3D%3D&trackingId=9MWqvZFnY4UWD%2Bbh0ZWouw%3D%3D
https://www.linkedin.com/jobs/view/avp-dig