In [4]:
import csv
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from threading import Lock
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [5]:
def get_url(job_title, city, state, page=0):
    """Build the Indeed search URL for a job title and location.

    Args:
        job_title: Free-text job title, e.g. "machine learning engineer".
        city: City name; surrounding whitespace is ignored.
        state: State name or abbreviation; surrounding whitespace is ignored.
        page: Zero-based results page. Values <= 0 yield the first page
            (no ``start`` parameter).

    Returns:
        The search URL as a string. Query values are URL-encoded, with
        spaces rendered as ``+``.
    """
    query = quote_plus(job_title.strip())
    # Join only the non-empty parts so a blank city or state does not leave
    # a dangling "+" in the location value.
    location_parts = [part for part in (city.strip(), state.strip()) if part]
    location = quote_plus(" ".join(location_parts))

    url = f"https://www.indeed.com/jobs?q={query}&l={location}"
    if page > 0:
        # Indeed paginates by result offset: 10 -> 2nd page, 20 -> 3rd page.
        # The parameter needs its own "&" separator, otherwise it is glued
        # onto the end of the location value.
        url += f"&start={page * 10}"
    return url

In [6]:
# Shared lock guarding stdout: whichever thread holds it is the only one printing
print_lock = Lock()


def safe_print(message):
    """Print ``message`` while holding ``print_lock``.

    The lock is released even if ``print`` raises.
    """
    print_lock.acquire()
    try:
        print(message)
    finally:
        print_lock.release()

In [7]:
def create_driver():
    """Create, configure and return a new Chrome webdriver instance.

    The browser is launched headless with a set of anti-detection options,
    and its user agent is overridden through a CDP command before the
    driver is handed back to the caller.
    """
    chrome_options = webdriver.ChromeOptions()

    command_line_flags = (
        # Launch Chrome maximized so the window never has to be resized
        "--start-maximized",
        # Anti-detection: tell Chrome not to reveal it is controlled by automation
        "--disable-blink-features=AutomationControlled",
        # Optional: remove this flag only if you want to see the browser windows
        "--headless",
    )
    for flag in command_line_flags:
        chrome_options.add_argument(flag)

    # Anti-detection: drop the enable-automation switch and Selenium's
    # default automation extension
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    browser = webdriver.Chrome(options=chrome_options)

    # User agent string copied from a Chrome console (navigator.userAgent)
    user_agent_override = {
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
    }
    browser.execute_cdp_cmd('Network.setUserAgentOverride', user_agent_override)

    return browser

In [8]:
def get_job_basic_info(post):
    """Extract the headline fields from a single job card element.

    Args:
        post: Element exposing ``find_element``; each field is looked up
            beneath it by CSS selector.

    Returns:
        A ``(title, company, location, salary, job_url)`` tuple, where
        ``salary`` is ``""`` when the salary element is not present.
        Returns ``None`` (after printing the error) if any other lookup fails.
    """
    def _lookup(selector):
        # Every field is located relative to this job card by CSS selector
        return post.find_element(By.CSS_SELECTOR, selector)

    try:
        title = _lookup("h2.jobTitle").text
        company = _lookup("span[data-testid='company-name']").text
        location = _lookup("div[data-testid='text-location']").text
        job_url = _lookup("h2.jobTitle a").get_attribute("href")

        # Salary is optional: a missing element becomes an empty string
        try:
            salary = _lookup("h2.mosaic-provider-jobcards-5vqdjd").text
        except NoSuchElementException:
            salary = ""
    except Exception as e:
        print(f"Error extracting basic info: {e}")
        return None

    return (title, company, location, salary, job_url)

In [9]:
def get_job_description(job_url):
    """Fetch the description text of one job posting.

    A fresh browser instance is opened for every call (for anti-detection
    purposes) and is always quit before returning.

    Args:
        job_url: URL of the job posting page.

    Returns:
        The text of the element with id ``jobDescriptionText``, or the
        string ``"None"`` when it cannot be found in time or any other
        error occurs (other errors are reported through ``safe_print``).
    """
    browser = create_driver()
    # Placeholder that is returned unless the description is read successfully
    job_description = "None"

    try:
        browser.get(job_url)
        # Pause 2-4 seconds to simulate human browsing and let the page load fully
        time.sleep(random.randint(2, 4))

        description_locator = (By.ID, "jobDescriptionText")
        # Wait up to 10 seconds for the description element to appear
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(description_locator)
        )
        job_description = browser.find_element(*description_locator).text
    except (NoSuchElementException, TimeoutException):
        # Expected failure modes: keep the placeholder without logging
        pass
    except Exception as e:
        safe_print(f"Error getting job description: {e}")
    finally:
        browser.quit()

    return job_description

In [None]:
def process_job_with_description(job_data, index, total):
    """Worker task: extend one job's basic info with its description.

    Args:
        job_data: ``(title, company, location, salary, job_url)`` tuple.
        index: Zero-based position of this job, used only for progress output.
        total: Number of jobs in the batch, used only for progress output.

    Returns:
        ``(title, company, location, salary, job_url, job_description)``.
    """
    title, company, location, salary, job_url = job_data
    progress = f"{index + 1}/{total}"

    safe_print(f"[Thread] Processing job {progress}: {title} at {company}")

    # Small random delay so the worker threads do not all fire at once
    time.sleep(random.uniform(0.5, 1.5))

    # Each description is fetched in its own fresh browser (anti-detection)
    description = get_job_description(job_url)

    safe_print(f"[Thread] Completed job {progress}: {title}")
    return (title, company, location, salary, job_url, description)

In [None]:
def _prompt_int(prompt, default):
    """Print ``prompt``, read one line from stdin and parse it as an int.

    Blank input returns ``default``. Non-numeric input prints a notice and
    also returns ``default`` instead of raising ``ValueError``.
    """
    print(prompt, end="")
    raw = input().strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"Invalid number {raw!r}; using default {default}")
        return default


def main():
    """Interactively scrape Indeed job listings and their descriptions.

    Prompts for job title, city, state, starting page, page count and
    thread count, then for every page:

    1. loads the listing page and collects the basic info of each job card,
    2. remembers the "next page" link and closes the listing browser,
    3. fetches all job descriptions in parallel, one fresh browser per job.

    Pagination stops early when no "next page" link is found or when no
    job cards appear on a page within the wait timeout; records collected
    so far are kept in both cases.

    Returns:
        A list of ``(title, company, location, salary, job_url,
        job_description)`` tuples, in completion order.
    """
    # User input (each free-text answer is echoed back with print)
    print("Enter job title: ", end="")
    job_title = input()
    print(job_title)

    print("Enter city: ", end="")
    city = input()
    print(city)

    print("Enter state: ", end="")
    state = input()
    print(state)

    # A negative page makes no sense; treat it as the first page
    start_page = max(0, _prompt_int(
        "Enter starting page (0 for first page, 1 for second page, etc.): ", 0))
    print(f"Starting from page {start_page + 1}")

    num_pages = _prompt_int("Enter number of pages to scrape (default 1): ", 1)
    print(f"Will scrape {num_pages} pages")

    max_workers = _prompt_int(
        "Enter number of parallel threads (default 3, 5 recommended, max 15 ): ", 3)
    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to the range 1..15
    max_workers = max(1, min(max_workers, 15))
    print(f"Using {max_workers} parallel threads")

    # Generate URL
    url = get_url(job_title, city, state, start_page)
    print(f"Search URL: {url}")

    # Setup Selenium WebDriver; `driver` is reset to None whenever no
    # browser is open so the cleanup in `finally` quits it at most once.
    driver = create_driver()
    records = []
    next_page_url = None

    try:
        for page_num in range(num_pages):
            current_page = start_page + page_num
            is_last_page = page_num >= num_pages - 1
            print(f"\nScraping page {current_page + 1}...")

            # The first page uses the generated search URL; later pages are
            # only reached when a "next page" link was saved on the previous one.
            driver.get(url if page_num == 0 else next_page_url)

            # Wait for job listings to load
            time.sleep(random.randint(3, 5))
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "job_seen_beacon"))
                )
            except TimeoutException:
                # Keep what was collected on earlier pages instead of
                # losing it to an unhandled exception.
                print(f"No job listings appeared on page {current_page + 1}; stopping pagination")
                break

            # Find all job posts
            posts = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
            print(f"Found {len(posts)} jobs on page {current_page + 1}")

            # First pass: collect basic info and URLs
            job_basics = []
            for i, post in enumerate(posts):
                print(f"Collecting basic info for job {i + 1}/{len(posts)}...")
                basic_info = get_job_basic_info(post)
                if basic_info:
                    job_basics.append(basic_info)

            print(f"Collected {len(job_basics)} job listings from page {current_page + 1}")

            # Save the next page URL before closing browser
            next_page_url = None
            if not is_last_page:
                try:
                    next_button = driver.find_element(
                        By.CSS_SELECTOR, "a[data-testid='pagination-page-next']")
                    next_page_url = next_button.get_attribute("href")
                    print(f"Next page URL saved: {next_page_url}")
                except NoSuchElementException:
                    print("No next page button found")

            # Close the listing page browser
            driver.quit()
            driver = None
            print("Closed listing page browser to avoid detection")

            # Second pass: get job descriptions in parallel with multiple threads
            print(f"\nFetching job descriptions using {max_workers} parallel threads...")
            start_time = time.time()

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit all jobs to the thread pool
                futures = [
                    executor.submit(process_job_with_description, job_data, i, len(job_basics))
                    for i, job_data in enumerate(job_basics)
                ]

                # Collect results as they complete
                for future in as_completed(futures):
                    try:
                        records.append(future.result())
                    except Exception as e:
                        safe_print(f"Error processing job: {e}")

            elapsed_time = time.time() - start_time
            print(f"\nCompleted {len(job_basics)} jobs in {elapsed_time:.2f} seconds")
            # Every card on the page may have failed to parse; skip the
            # average rather than dividing by zero.
            if job_basics:
                print(f"Average time per job: {elapsed_time/len(job_basics):.2f} seconds")

            print(f"Completed page {current_page + 1}. Total jobs collected: {len(records)}")

            # Create new driver for next page if needed
            if not is_last_page:
                if not next_page_url:
                    print("No next page available, stopping pagination")
                    break
                print("\nCreating new browser for next page...")
                driver = create_driver()
                time.sleep(random.randint(2, 4))

        # Display results
        print(f"\n{'='*80}")
        print(f"SCRAPING COMPLETE - Collected {len(records)} jobs")
        print(f"{'='*80}\n")

        for i, record in enumerate(records, 1):
            print(f"\nJob {i}:")
            print(f"Title: {record[0]}")
            print(f"Company: {record[1]}")
            print(f"Location: {record[2]}")
            print(f"Salary: {record[3]}")
            print(f"URL: {record[4]}")
            print(f"Description: {record[5][:200]}..." if len(record[5]) > 200 else f"Description: {record[5]}")
            print("-" * 80)

    finally:
        # Make sure any browser that is still open gets closed; cleanup
        # stays best-effort but failures are reported instead of hidden.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error closing browser: {e}")

    return records

# Run the scraper when this code executes as the main module. The result is
# bound to the module-level name `records`, which the later
# save_to_csv(records, ...) call reads.
if __name__ == "__main__":
    records = main()

Enter job title: machine learning engineer
Enter city: Pittsburgh
Enter state: PA
Enter starting page (0 for first page, 1 for second page, etc.): Starting from page 1
Enter number of pages to scrape (default 1): Will scrape 1 pages
Enter number of parallel threads (default 3, 5 recommended, max 15 ): Using 15 parallel threads
Search URL: https://www.indeed.com/jobs?q=machine+learning+engineer&l=Pittsburgh+PA

Scraping page 1...
Found 16 jobs on page 1
Collecting basic info for job 1/16...
Collecting basic info for job 2/16...
Collecting basic info for job 3/16...
Collecting basic info for job 4/16...
Collecting basic info for job 5/16...
Collecting basic info for job 6/16...
Collecting basic info for job 7/16...
Collecting basic info for job 8/16...
Collecting basic info for job 9/16...
Collecting basic info for job 10/16...
Collecting basic info for job 11/16...
Collecting basic info for job 12/16...
Collecting basic info for job 13/16...
Collecting basic info for job 14/16...
Collec

In [106]:
def save_to_csv(records, filename="jobs.csv"):
    """Write job records to a UTF-8 CSV file with a header row.

    Args:
        records: Iterable of rows; each row is written as one CSV line
            under the columns Title, Company, Location, Salary, URL,
            Description.
        filename: Destination path; an existing file is overwritten.
    """
    column_names = ["Title", "Company", "Location", "Salary", "URL", "Description"]

    # newline="" hands line-ending control to the csv module
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(column_names)
        for row in records:
            csv_writer.writerow(row)

    print(f"\nData saved to {filename}")

# Write the module-level `records` (assigned from main()) to indeed_jobs.csv
save_to_csv(records, "indeed_jobs.csv")    


Data saved to indeed_jobs.csv
