# Scraping Functions
Core functions for extracting job data from Indeed.

## Load Dependencies

In [1]:
%run config.ipynb
%run utils.ipynb

✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 10s
✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 10s
✓ Test URL: https://www.indeed.com/jobs?q=data+analyst&l=New+York+NY
✓ Utility functions loaded successfully


## Extract Basic Job Information

In [2]:
def get_job_basic_info(post):
    """
    Pull the headline fields out of one job posting card.

    Title, company, location and link are looked up first and are all
    required; the salary snippet is looked up last and is optional.

    Args:
        post: Selenium WebElement of job card

    Returns:
        Tuple (title, company, location, salary, job_url), where salary is ""
        when the card has no salary snippet. None if anything else fails
        (the error is printed).
    """
    def card_text(selector):
        # Visible text of the first element inside the card matching selector
        return post.find_element(By.CSS_SELECTOR, selector).text

    try:
        title = card_text("h2.jobTitle")
        company = card_text("span[data-testid='company-name']")
        location = card_text("div[data-testid='text-location']")
        link_element = post.find_element(By.CSS_SELECTOR, "h2.jobTitle a")
        job_url = link_element.get_attribute("href")

        # Salary is optional: a missing snippet is not treated as an error
        try:
            salary = card_text("div[data-testid='attribute_snippet_testid']")
        except NoSuchElementException:
            salary = ""
    except Exception as e:
        print(f"Error extracting basic info: {e}")
        return None

    return (title, company, location, salary, job_url)

## Extract Full Job Description

In [3]:
def get_job_description(job_url):
    """
    Get full job description by opening the job URL.

    A driver is obtained from create_driver() for this call only and is
    always quit before returning, whether or not the page loaded.

    Args:
        job_url: URL of the job posting

    Returns:
        Job description text, or the string "None" (not the None object)
        if the description could not be retrieved
    """
    driver = create_driver()

    try:
        driver.get(job_url)
        # Simulate human browsing behavior
        time.sleep(random.randint(JOB_LOAD_MIN, JOB_LOAD_MAX))

        # until() returns the element found by the expected condition, so
        # read its text directly rather than querying the DOM a second time
        # (which costs another round-trip and can race with page updates).
        description_element = WebDriverWait(driver, WEBDRIVER_TIMEOUT).until(
            EC.presence_of_element_located((By.ID, "jobDescriptionText"))
        )
        job_description = description_element.text

    except (NoSuchElementException, TimeoutException):
        # Description block never appeared: report the "None" placeholder quietly
        job_description = "None"
    except Exception as e:
        safe_print(f"Error getting job description: {e}")
        job_description = "None"
    finally:
        # Release the browser even when loading or waiting failed
        driver.quit()

    return job_description

## Process Job with Threading

In [4]:
def process_job_with_description(job_data, index, total):
    """
    Turn one basic job tuple into a full record by adding its description.
    Designed for parallel execution with ThreadPoolExecutor.

    Args:
        job_data: 5-item tuple (title, company, location, salary, job_url)
        index: Zero-based job index, shown 1-based in the progress messages
        total: Total number of jobs, shown in the progress messages

    Returns:
        Tuple (title, company, location, salary, job_url, job_description)
    """
    job_title, employer, place, pay, link = job_data
    progress = f"{index + 1}/{total}"

    safe_print(f"[Thread] Processing job {progress}: {job_title} at {employer}")

    # Random pause before the request to stagger the workers
    pause_seconds = random.uniform(THREAD_DELAY_MIN, THREAD_DELAY_MAX)
    time.sleep(pause_seconds)

    # Fetch the full description from the job page
    full_text = get_job_description(link)

    safe_print(f"[Thread] Completed job {progress}: {job_title}")
    return (job_title, employer, place, pay, link, full_text)

In [5]:
# Confirm the cell ran and list the functions this notebook defines
print("✓ Scraping functions loaded successfully")
for loaded_function in (
    "get_job_basic_info",
    "get_job_description",
    "process_job_with_description",
):
    print(f"  - {loaded_function}()")

✓ Scraping functions loaded successfully
  - get_job_basic_info()
  - get_job_description()
  - process_job_with_description()
