# Main Scraper
Execute the complete scraping workflow.

## Load All Dependencies

In [86]:
%run config.ipynb
%run utils.ipynb
%run scraping_functions.ipynb

✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 60s
✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 60s
✓ Test URL: https://www.indeed.com/jobs?q=data+analyst&l=New+York+NY
✓ Utility functions loaded successfully
✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 60s
✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 60s
✓ Test URL: https://www.indeed.com/jobs?q=data+analyst&l=New+York+NY
✓ Utility functions loaded successfully
✓ Scraping functions loaded successfully
  - get_job_basic_info()
  - get_job_description()
  - process_job_with_description()


## User Input Section


In [87]:
# Job search parameters
# Surrounding whitespace is stripped so that stray spaces typed at the prompt
# are not passed on to get_url() when the search URL is built.
# City and state may be left blank.
job_title = input("Enter job title: ").strip()
city = input("Enter city: ").strip()
state = input("Enter state: ").strip()

In [88]:
# Pagination settings
# A blank answer selects the default. A non-numeric answer also falls back to
# the default (with a warning) instead of aborting the cell with ValueError.
start_page_input = input("Enter starting page (0 for first page, 1 for second page, etc.): ")
try:
    start_page = int(start_page_input) if start_page_input.strip() else 0
except ValueError:
    print(f"⚠ Invalid starting page {start_page_input!r}, using 0")
    start_page = 0
# Page indices are zero-based, so a negative index is clamped to the first page.
start_page = max(start_page, 0)
print(f"Starting from page {start_page + 1}")

pages_input = input("Enter number of pages to scrape (default 1): ")
try:
    num_pages = int(pages_input) if pages_input.strip() else 1
except ValueError:
    print(f"⚠ Invalid page count {pages_input!r}, using 1")
    num_pages = 1
# The scraping loop iterates range(num_pages); anything below 1 would scrape nothing.
num_pages = max(num_pages, 1)
print(f"Will scrape {num_pages} pages")


Starting from page 1
Will scrape 3 pages


In [89]:
# Threading settings
# A blank answer selects DEFAULT_THREADS. A non-numeric answer also falls back
# to DEFAULT_THREADS (with a warning) instead of aborting the cell.
threads_input = input(f"Enter number of parallel threads (default {DEFAULT_THREADS}, max {MAX_THREADS}): ")
try:
    max_workers = int(threads_input) if threads_input.strip() else DEFAULT_THREADS
except ValueError:
    print(f"⚠ Invalid thread count {threads_input!r}, using {DEFAULT_THREADS}")
    max_workers = DEFAULT_THREADS
# Clamp to 1..MAX_THREADS: ThreadPoolExecutor raises ValueError when
# max_workers is zero or negative, so the lower bound matters as much as the cap.
max_workers = max(1, min(max_workers, MAX_THREADS))
print(f"Using {max_workers} parallel threads")

Using 15 parallel threads


## Main Scraping Loop

In [90]:
# Build the search URL for the first requested results page and echo it
# so the query can be checked before the browser is launched.
url = get_url(job_title, city, state, start_page)
print("\nSearch URL:", url)


Search URL: https://www.indeed.com/jobs?q=Data+Scientist&l=+


In [91]:
# Setup WebDriver
# create_driver() is not defined in this notebook; presumably it comes from one
# of the notebooks loaded with %run at the top — TODO confirm.
# The main scraping loop uses this initial `driver` to open the first results page.
driver = create_driver()

In [92]:
# Scrape the requested result pages one at a time and accumulate the results.
#
# Each page is handled in two phases:
#   1. Read the basic fields of every job card on the listing page.
#   2. Close the listing browser, then fetch the full descriptions in parallel
#      through process_job_with_description().
#
# `records` collects whatever process_job_with_description() returns, one entry
# per successfully processed job across all pages.
records = []
next_page_url = None
# True while `driver` refers to a browser this cell has not quit yet. It is
# assumed open on entry (created by the WebDriver setup cell). The flag keeps
# the final cleanup from calling quit() on a session that was already closed.
browser_open = True

try:
    for page_num in range(num_pages):
        current_page = start_page + page_num
        print(f"\n{'='*80}")
        print(f"SCRAPING PAGE {current_page + 1}")
        print(f"{'='*80}")

        # Navigate to URL. The first page uses the generated search URL; every
        # later page uses the "next" link saved from the previous page. The
        # loop stops below when no link was found, so next_page_url is always
        # set when page_num > 0.
        driver.get(url if page_num == 0 else next_page_url)

        # Wait for page to load: a randomized pause first, then block until at
        # least one job card is present (or WEBDRIVER_TIMEOUT expires).
        time.sleep(random.randint(PAGE_LOAD_MIN, PAGE_LOAD_MAX))
        WebDriverWait(driver, WEBDRIVER_TIMEOUT).until(
            EC.presence_of_element_located((By.CLASS_NAME, "job_seen_beacon"))
        )

        # Find all job postings
        posts = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
        print(f"Found {len(posts)} jobs on page {current_page + 1}")

        # Phase 1: Collect basic info quickly
        print("\nPhase 1: Collecting basic job information...")
        job_basics = []
        for i, post in enumerate(posts):
            print(f"  Collecting job {i + 1}/{len(posts)}...", end="\r")
            basic_info = get_job_basic_info(post)
            # Falsy results are skipped, so job_basics may be shorter than posts.
            if basic_info:
                job_basics.append(basic_info)

        print(f"\n✓ Collected {len(job_basics)} job listings")

        # Save next page URL before closing; it must be read while the listing
        # page is still open. It is only needed when another page will follow.
        next_page_url = None
        if page_num < num_pages - 1:
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, "a[data-testid='pagination-page-next']")
                next_page_url = next_button.get_attribute("href")
                print(f"✓ Next page URL saved")
            except NoSuchElementException:
                print("⚠ No next page button found")

        # Close browser to avoid detection
        driver.quit()
        browser_open = False
        print("✓ Closed listing page browser")

        # Phase 2: Fetch descriptions in parallel
        print(f"\nPhase 2: Fetching job descriptions ({max_workers} parallel threads)...")
        start_time = time.time()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_job = {
                executor.submit(process_job_with_description, job_data, i, len(job_basics)): job_data
                for i, job_data in enumerate(job_basics)
            }

            # Results are appended in completion order, not listing order.
            for future in as_completed(future_to_job):
                try:
                    record = future.result()
                    records.append(record)
                except Exception as e:
                    # One failed job must not abort the rest of the page.
                    safe_print(f"Error processing job: {e}")

        elapsed_time = time.time() - start_time
        print(f"\n✓ Completed {len(job_basics)} jobs in {elapsed_time:.2f} seconds")
        # Skip the average when nothing was collected to avoid dividing by zero.
        if job_basics:
            print(f"  Average: {elapsed_time/len(job_basics):.2f} seconds per job")
        print(f"\n✓ Total jobs collected so far: {len(records)}")

        # Create new driver for next page, or stop when there is nowhere to go.
        if page_num < num_pages - 1:
            if not next_page_url:
                print("⚠ No next page available, stopping pagination")
                break
            print("\nPreparing for next page...")
            driver = create_driver()
            browser_open = True
            time.sleep(random.randint(PAGE_SWITCH_MIN, PAGE_SWITCH_MAX))

finally:
    # Best-effort cleanup of a browser that is still open, e.g. after an error
    # in the middle of a page. Only Exception is caught, so KeyboardInterrupt
    # and SystemExit still propagate.
    if browser_open:
        try:
            driver.quit()
        except Exception as quit_error:
            print(f"⚠ Browser cleanup failed: {type(quit_error).__name__}")


SCRAPING PAGE 1
Found 16 jobs on page 1

Phase 1: Collecting basic job information...
  Collecting job 16/16...
✓ Collected 16 job listings
✓ Next page URL saved
✓ Closed listing page browser

Phase 2: Fetching job descriptions (15 parallel threads)...
[Thread] Processing job 1/16: Data Engineer/ Python/ ETL Developer at Combined Computer Resources
[Thread] Processing job 2/16: Data Scientist at FanDuel
[Thread] Processing job 3/16: Data Scientist II at AccuWeather Careers
[Thread] Processing job 4/16: Data Scientist at Boosted.ai
[Thread] Processing job 5/16: Data Scientist at Fulcrum Analytics
[Thread] Processing job 6/16: Analytics Engineer at Virtuoso, Ltd.
[Thread] Processing job 7/16: Applied Scientist – Research Products at Thomson Reuters
[Thread] Processing job 8/16: Bioinformatics/Data Scientist at Axle
[Thread] Processing job 9/16: Principal, Software Engineering at NIKE
[Thread] Processing job 10/16: Data Scientist Advanced Development Program at Vanguard
[Thread] Processi

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=141.0.7390.65)
Stacktrace:
0   chromedriver                        0x0000000102bcb598 cxxbridge1$str$ptr + 2894960
1   chromedriver                        0x0000000102bc34d4 cxxbridge1$str$ptr + 2861996
2   chromedriver                        0x00000001026e95ec _RNvCs47EqcsrPRmA_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 74324
3   chromedriver                        0x00000001026c3198 chromedriver + 143768
4   chromedriver                        0x00000001027593fc _RNvCs47EqcsrPRmA_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 532580
5   chromedriver                        0x0000000102771fb8 _RNvCs47EqcsrPRmA_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 633888
6   chromedriver                        0x0000000102725178 _RNvCs47EqcsrPRmA_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 318944
7   chromedriver                        0x0000000102b8f2e4 cxxbridge1$str$ptr + 2648508
8   chromedriver                        0x0000000102b928c0 cxxbridge1$str$ptr + 2662296
9   chromedriver                        0x0000000102b6fd64 cxxbridge1$str$ptr + 2520124
10  chromedriver                        0x0000000102b931a8 cxxbridge1$str$ptr + 2664576
11  chromedriver                        0x0000000102b614d4 cxxbridge1$str$ptr + 2460588
12  chromedriver                        0x0000000102bb2b94 cxxbridge1$str$ptr + 2794092
13  chromedriver                        0x0000000102bb2d18 cxxbridge1$str$ptr + 2794480
14  chromedriver                        0x0000000102bc3120 cxxbridge1$str$ptr + 2861048
15  libsystem_pthread.dylib             0x0000000188308c08 _pthread_start + 136
16  libsystem_pthread.dylib             0x0000000188303ba8 thread_start + 8


## Display Results Summary

In [None]:
# Print a closing banner, the total number of collected records and a short
# preview of the first three of them.
divider = "=" * 80
print(f"\n{divider}")
print("SCRAPING COMPLETE")
print(divider)
print(f"Total jobs collected: {len(records)}")
print("\nFirst 3 jobs preview:")

preview = records[:3]
for position, job in enumerate(preview, start=1):
    # Fields are read by position: 0=title, 1=company, 2=location, 3=salary,
    # 5=description (index 4 is not shown in the preview).
    print(f"\n--- Job {position} ---")
    print(f"Title: {job[0]}")
    print(f"Company: {job[1]}")
    print(f"Location: {job[2]}")
    print(f"Salary: {job[3]}")
    # Long descriptions are cut to 150 characters and marked with an ellipsis.
    if len(job[5]) > 150:
        print(f"Description: {job[5][:150]}...")
    else:
        print(f"Description: {job[5]}")

remaining = len(records) - 3
if remaining > 0:
    print(f"\n... and {remaining} more jobs")

print(f"\n{divider}")
print("Next step: Run save_data.ipynb to export your data")
print(divider)


SCRAPING COMPLETE
Total jobs collected: 48

First 3 jobs preview:

--- Job 1 ---
Title: Data Analyst
Company: CGI Group, Inc.
Location: Arlington, VA 22201 
(Bluemont area)
Salary: $50,800 - $119,200 a year
Description: Position Description:
This is an exciting full-time opportunity to work in a fast-paced environment with a team of passionate technologists. We take a...

--- Job 2 ---
Title: Data Governance Foundation Senior Analyst, AVP (Hybrid)
Company: Citi
Location: Tampa, FL 33601
Salary: $87,280 - $130,920 a year
Description: The Data Governance Foundation Senior Analyst is responsible for contributing to compliance of Citi Data Governance Policy with a focus on Data Concer...

--- Job 3 ---
Title: Associate eCommerce Data Analyst
Company: Uline
Location: Pleasant Prairie, WI 53158
Salary: Full-time
Description: Associate eCommerce Data Analyst
Corporate Headquarters
12575 Uline Drive, Pleasant Prairie, WI 53158
As one of the largest e-commerce websites in the...

... and 45 mo

## Store Records for Later Use
This saves the `records` variable so you can access it in other notebooks.

In [None]:
# Persist `records` with IPython's %store magic so that another notebook can
# restore it (the message below points to `%store -r records` in save_data.ipynb).
%store records
# Report how many records were stored and how to load them again.
print(f"\n✓ Stored {len(records)} records in Jupyter storage")
print("You can now access this in save_data.ipynb using: %store -r records")

Stored 'records' (list)

✓ Stored 16 records in Jupyter storage
You can now access this in save_data.ipynb using: %store -r records
