In [2]:
#import packages
#!pip install selenium

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Collecting websocket-client~=1.8
  Downloading websocket_client-1.8.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio~=0.17
  Downloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting sniffio>=1.3.0
  Using cached sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting exceptiongroup
  Downloading exceptiongroup-1.2.2-py3-none-any.whl (16 kB)
Collecting outcome
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collect

In [3]:
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2


In [1]:
import time
import pandas as pd
import os

In [4]:
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException, StaleElementReferenceException

In [5]:
# URL for Data Analyst jobs in Sweden
url = 'https://www.linkedin.com/jobs/search/?currentJobId=4034151343&geoId=105117694&keywords=Data%20Analyst&origin=JOB_SEARCH_PAGE_SEARCH_BUTTON&refresh=true&trk=public_jobs_jobs-search-bar_search-submit'

In [6]:
# Setup WebDriver with automatic ChromeDriver management
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()

In [7]:
# Uncomment the next line if you want to run the browser in headless mode
# options.add_argument('--headless')
options.add_argument('--disable-blink-features=AutomationControlled')  # Try to avoid detection
options.add_argument('--start-maximized')  # Start with maximized window
options.add_experimental_option('excludeSwitches', ['enable-automation'])  # Hide automation
options.add_experimental_option('useAutomationExtension', False)  # Disable automation extension


In [9]:
driver = webdriver.Chrome(service=service, options=options)
# Open LinkedIn jobs page
driver.get(url)
print("Waiting for page to load...")
time.sleep(5)  # Give time for the page to load

Waiting for page to load...


In [10]:
# Lists to store job data
job_titles = []
company_names = []
cities = []
posted_dates = []
applicant_counts = []
seniority_levels = []
employment_types = []
job_functions = []
industries_list = []

In [11]:
#Safely find an element with proper wait and error handling
def safe_find_element(parent, by, value, wait_time=3):
    try:
        element = WebDriverWait(parent, wait_time).until(
            EC.presence_of_element_located((by, value))
        )
        return element
    except (TimeoutException, NoSuchElementException):
        return None

In [12]:
#Safely get text from an element
def safe_get_text(element):
    if element:
        try:
            return element.text.strip()
        except:
            pass
    return "N/A"

In [13]:
#Collect basic job data from the listing card
def collect_basic_job_data(job_listing):
    try:
        title_element = safe_find_element(job_listing, By.CSS_SELECTOR, 'h3.base-search-card__title')
        company_element = safe_find_element(job_listing, By.CSS_SELECTOR, 'h4.base-search-card__subtitle')
        location_element = safe_find_element(job_listing, By.CSS_SELECTOR, 'span.job-search-card__location')
        
        title = safe_get_text(title_element)
        company = safe_get_text(company_element)
        location = safe_get_text(location_element)
        
        return {
            "title": title,
            "company": company, 
            "location": location
        }
    except Exception as e:
        print(f"Error collecting basic job data: {e}")
        return {"title": "N/A", "company": "N/A", "location": "N/A"}

In [14]:
#Extract detailed information for the currently selected job
def extract_job_details():
    time.sleep(2)  # Wait for job details to load
    
    # Get job details pane
    details_pane = safe_find_element(driver, By.CSS_SELECTOR, 'div.details-pane_content.details-pane_content--show')
    
    if not details_pane:
        print("Job details pane not found")
        return {
            "posted_date": "N/A",
            "applicant_count": "N/A",
            "seniority_level": "N/A",
            "employment_type": "N/A",
            "job_function": "N/A",
            "industries": "N/A"
        }
    
    # Extract posted date
    try:
        posted_date_element = safe_find_element(
            details_pane, 
            By.CSS_SELECTOR, 
            'span.posted-time-ago_text.topcard_flavor--metadata'
        )
        posted_date = safe_get_text(posted_date_element)
    except:
        posted_date = "N/A"
        
    # Extract applicant count
    try:
        applicant_element = safe_find_element(
            details_pane, 
            By.CSS_SELECTOR, 
            'figure.num-applicants_figure.topcardflavor--metadata.topcard_flavor--bullet figcaption'
        )
        applicant_count = safe_get_text(applicant_element)
    except:
        applicant_count = "N/A"
    
    # Find the job criteria section
    job_criteria_list = safe_find_element(
        details_pane, 
        By.CSS_SELECTOR, 
        'ul.description__job-criteria-list'
    )
    
    # Initialize job criteria variables
    seniority_level = "N/A"
    employment_type = "N/A"
    job_function = "N/A"
    industries = "N/A"
    
    # Extract job criteria if section exists
    if job_criteria_list:
        criteria_items = job_criteria_list.find_elements(By.CSS_SELECTOR, 'li.description__job-criteria-item')
        
        for item in criteria_items:
            # Get the criteria item label
            label_element = safe_find_element(item, By.CSS_SELECTOR, 'h3.description__job-criteria-subheader')
            criteria_text = safe_find_element(item, By.CSS_SELECTOR, 'span.description__job-criteria-text')
            
            label = safe_get_text(label_element)
            value = safe_get_text(criteria_text)
            
            if "seniority" in label.lower():
                seniority_level = value
            elif "employment" in label.lower():
                employment_type = value
            elif "function" in label.lower():
                job_function = value
            elif "industr" in label.lower():
                industries = value
    
    return {
        "posted_date": posted_date,
        "applicant_count": applicant_count,
        "seniority_level": seniority_level,
        "employment_type": employment_type,
        "job_function": job_function,
        "industries": industries
    }

In [15]:
#Scroll element into view
def scroll_to_element(element):
    try:
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(0.5)
    except:
        pass

In [16]:
#Process job listings and extract detailed information
def process_job_listings():
    try:
        # Find the job results list
        results_list = safe_find_element(
            driver, 
            By.CSS_SELECTOR, 
            'ul.jobs-search__results-list'
        )
        
        if not results_list:
            print("Results list not found")
            return False
        
        # Find all job listings
        job_listings = results_list.find_elements(By.TAG_NAME, 'li')
        print(f"Found {len(job_listings)} job listings")
        
        # Process each job listing in the current view
        for job_index, job in enumerate(job_listings):
            # Skip jobs we've already processed
            if job_index < len(job_titles):
                continue
                
            try:
                # First collect the basic data
                basic_data = collect_basic_job_data(job)
                
                # Find the job card link (this is the full card link that's clickable)
                job_card_link = safe_find_element(job, By.CSS_SELECTOR, 'a.base-card__full-link')
                
                if not job_card_link:
                    print(f"Job card link not found for job #{job_index + 1}")
                    continue
                
                # Scroll to make the link visible
                scroll_to_element(job_card_link)
                
                # Using JavaScript to click the link (more reliable than selenium click)
                try:
                    driver.execute_script("arguments[0].click();", job_card_link)
                    print(f"Clicked on job {job_index + 1} using JavaScript")
                except Exception as e:
                    print(f"JavaScript click failed: {e}")
                    continue
                
                # Extract detailed job information
                details = extract_job_details()
                
                # Add all data to our lists
                job_titles.append(basic_data["title"])
                company_names.append(basic_data["company"])
                cities.append(basic_data["location"])
                posted_dates.append(details["posted_date"])
                applicant_counts.append(details["applicant_count"])
                seniority_levels.append(details["seniority_level"])
                employment_types.append(details["employment_type"])
                job_functions.append(details["job_function"])
                industries_list.append(details["industries"])
                
                print(f"Processed job {job_index + 1}: {basic_data['title']} | {basic_data['company']} | {basic_data['location']}")
                
                # Wait a moment before processing the next job
                time.sleep(2)
                
            except StaleElementReferenceException:
                # If elements become stale, break and try again with fresh elements
                print("Stale element reference, refreshing job list")
                break
                
            except Exception as e:
                print(f"Error processing job listing #{job_index + 1}: {e}")
                continue
                
        return True
    
    except Exception as e:
        print(f"Error in process_job_listings: {e}")
        return False

In [17]:
#Click on the 'See more jobs' button if available
def click_see_more_jobs(): 
    try:
        # First scroll to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        # Look for the 'See more jobs' button
        see_more_button = safe_find_element(
            driver, 
            By.CSS_SELECTOR, 
            'button.infinite-scroller_show-more-button.infinite-scroller_show-more-button--visible'
        )
        
        if see_more_button and see_more_button.is_displayed():
            scroll_to_element(see_more_button)
            
            # Try to click with JavaScript (more reliable)
            try:
                driver.execute_script("arguments[0].click();", see_more_button)
                print("Clicked 'See more jobs' button with JavaScript")
                time.sleep(3)  # Wait for more jobs to load
                return True
            except Exception as e:
                print(f"JavaScript click failed for 'See more jobs' button: {e}")
                return False
        else:
            print("'See more jobs' button not found or not visible")
            return False
            
    except Exception as e:
        print(f"Error in click_see_more_jobs: {e}")
        return False

In [18]:
# Main scraping loop
max_iterations = 5  # Maximum number of iterations
no_new_jobs_count = 0  # Counter for iterations with no new jobs
max_no_new_jobs = 2  # Maximum allowed iterations with no new jobs

for iteration in range(max_iterations):
    print(f"\nIteration {iteration + 1}/{max_iterations}")
    
    # Track current job count
    current_count = len(job_titles)
    
    # Process job listings
    process_job_listings()
    
    # Check if we got new jobs
    if len(job_titles) == current_count:
        no_new_jobs_count += 1
        print(f"No new jobs found in this iteration. Count: {no_new_jobs_count}/{max_no_new_jobs}")
    else:
        no_new_jobs_count = 0
        print(f"Found {len(job_titles) - current_count} new jobs")
    
    # If we had several iterations with no new jobs, we're probably at the end
    if no_new_jobs_count >= max_no_new_jobs:
        print("No new jobs found for several iterations, ending scraping")
        break
    
    # Try to click "See more jobs" button
    clicked = click_see_more_jobs()
    
    # If we couldn't click the button and we've processed all visible jobs, we're done
    if not clicked and len(job_titles) == current_count:
        print("No more jobs to load, ending scraping")
        break
        
    print(f"Total jobs collected so far: {len(job_titles)}")

# Create DataFrame with the collected data
print(f"\nCreating DataFrame with {len(job_titles)} jobs")
jobs_data = {
    'Title': job_titles,
    'Company': company_names,
    'City': cities,
    'Posted Date': posted_dates,
    'Applicant Count': applicant_counts,
    'Seniority Level': seniority_levels,
    'Employment Type': employment_types,
    'Job Function': job_functions,
    'Industries': industries_list
}

df = pd.DataFrame(jobs_data)

# Save to CSV
csv_filename = 'linkedin_data_analyst_jobs_scraped.csv'
df.to_csv(csv_filename, index=False)
print(f"Data saved to {csv_filename}")

# Display first few rows of data
print("\nPreview of collected data:")
print(df.head())

# Display data collection statistics
print(f"\nTotal jobs collected: {len(df)}")
print(f"Unique companies: {df['Company'].nunique()}")
print(f"Unique cities: {df['City'].nunique()}")
print(f"Unique seniority levels: {df['Seniority Level'].nunique()}")
print(f"Unique employment types: {df['Employment Type'].nunique()}")


Iteration 1/5
Found 58 job listings
Clicked on job 1 using JavaScript
Job details pane not found
Processed job 1: Data Analyst | LTIMindtree | Stockholm County, Sweden
Clicked on job 2 using JavaScript
Job details pane not found
Processed job 2: Business & Data Analyst | Karo Healthcare | Stockholm, Stockholm County, Sweden
Clicked on job 3 using JavaScript
Job details pane not found
Processed job 3: Data Analyst | Avy | Stockholm, Stockholm County, Sweden
Clicked on job 4 using JavaScript
Job details pane not found
Processed job 4: Data Analyst | RED Global | Stockholm, Stockholm County, Sweden
Clicked on job 5 using JavaScript
Job details pane not found
Processed job 5: Data Analyst | Capgemini | Stockholm, Stockholm County, Sweden
Clicked on job 6 using JavaScript
Job details pane not found
Processed job 6: Senior Data Analyst | CloudArt Solutions | Gothenburg, Västra Götaland County, Sweden
Clicked on job 7 using JavaScript
Job details pane not found
Processed job 7: Data Analyst 

Job details pane not found
Processed job 53: Data Engineer | Parallel Consulting | Stockholm, Stockholm County, Sweden
Clicked on job 54 using JavaScript
Job details pane not found
Processed job 54: Data Analyst inom hållbarhet på medarbetarägt konsultbolag | TechSeed | Gothenburg, Västra Götaland County, Sweden
Clicked on job 55 using JavaScript
Job details pane not found
Processed job 55: Data Engineer | Apoteket AB | Solna, Stockholm County, Sweden
Clicked on job 56 using JavaScript
Job details pane not found
Processed job 56: Junior Analytics Engineer | IPercept | Stockholm, Stockholm County, Sweden
Clicked on job 57 using JavaScript
Job details pane not found
Processed job 57: Data Modeller | Capgemini | Stockholm, Stockholm County, Sweden
Clicked on job 58 using JavaScript
Job details pane not found
Processed job 58: Commercial and Business Data Analyst | CSV Rating AB | Stockholm, Stockholm County, Sweden
Found 58 new jobs
'See more jobs' button not found or not visible
Total jo

Clicked on job 104 using JavaScript
Job details pane not found
Processed job 104: Business Intelligence Konsult | Nexer Group | Gothenburg, Västra Götaland County, Sweden
Clicked on job 105 using JavaScript
Job details pane not found
Processed job 105: Data Engineer | SSC - Swedish Space Corporation | Solna, Stockholm County, Sweden
Clicked on job 106 using JavaScript
Job details pane not found
Processed job 106: Application Specialist | Arctic Business | Luleå, Norrbotten County, Sweden
Clicked on job 107 using JavaScript
Job details pane not found
Processed job 107: Data Engineer | CGI | Karlstad, Värmland County, Sweden
Clicked on job 108 using JavaScript
Job details pane not found
Processed job 108: Senior Data Engineer/Analyst | Xenit AB | Malmo, Skåne County, Sweden
Clicked on job 109 using JavaScript
Job details pane not found
Processed job 109: Data Engineer | B3 Indes | Stockholm, Stockholm County, Sweden
Clicked on job 110 using JavaScript
Job details pane not found
Processed

In [19]:
# Close the browser
driver.quit()

In [22]:
data = pd.read_csv("linkedin_data.csv")
data.head(10)

Unnamed: 0,Title,Company,City,Posted Date,Applicant Count,Seniority Level,Employment Type,Job Function,Industries
0,Data Analyst,LTIMindtree,"Stockholm County, Sweden",2 weeks ago,98 applicants,Executive,Full-time,"Consulting, Information Technology, and Other","IT Services and IT Consulting, Information Ser..."
1,Business & Data Analyst,Karo Healthcare,"Stockholm, Stockholm County, Sweden",1 week ago,Over 200 applicants,Mid-Senior level,Full-time,Analyst,Non-profit Organizations and Primary and Secon...
2,Data Analyst,Avy,"Stockholm, Stockholm County, Sweden",1 month ago,Over 200 applicants,Entry level,Full-time,Information Technology,Real Estate
3,Data Analyst,RED Global,"Stockholm, Stockholm County, Sweden",1 week ago,99 applicants,Not Applicable,Contract,Information Technology,Staffing and Recruiting
4,Data Analyst,Capgemini,"Stockholm, Stockholm County, Sweden",6 days ago,Over 200 applicants,Entry level,Full-time,Information Technology,IT Services and IT Consulting
5,Data Analyst,Deploja,"Solna, Stockholm County, Sweden",1 week ago,57 applicants,Entry level,Full-time,Information Technology,IT Services and IT Consulting
6,Data Analyst,Deploja,"Malmo, Sk√•ne County, Sweden",5 days ago,68 applicants,Entry level,Full-time,Information Technology,IT Services and IT Consulting
7,Stanowisko ds. zarzƒÖdzania obszarem BI i HD-s...,ZUS,Sweden,4 days ago,Be among the first 25 applicants,Not Applicable,Full-time,Other,Insurance
8,Data Analyst,Stegra,"Stockholm, Stockholm County, Sweden",2 days ago,141 applicants,Entry level,Full-time,Information Technology,Primary Metal Manufacturing
9,Senior Data Analyst,CloudArt Solutions,"Gothenburg, V√§stra G√∂taland County, Sweden",1 week ago,51 applicants,Mid-Senior level,Full-time,"Engineering, Information Technology, and Analyst",IT Services and IT Consulting
