# # Scraping Functions
# Core functions for extracting job data from Indeed

## Load Dependencies

In [9]:
%run config.ipynb
%run utils.ipynb

✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 10s
✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 10s
✓ Test URL: https://www.indeed.com/jobs?q=data+analyst&l=New+York+NY
✓ Utility functions loaded successfully


## Extract Basic Job Information

In [10]:
def get_job_basic_info(post):
    """
    Read the headline fields from a single job card.

    Args:
        post: Selenium WebElement of job card

    Returns:
        Tuple: (title, company, location, job_url), or None if any lookup
        fails (the error is printed).
    """
    try:
        # The three text fields are looked up in this order: title, company, location.
        text_selectors = (
            "h2.jobTitle",
            "span[data-testid='company-name']",
            "div[data-testid='text-location']",
        )
        title, company, location = [
            post.find_element(By.CSS_SELECTOR, selector).text
            for selector in text_selectors
        ]

        # The posting URL is the href of the link nested in the title heading.
        link = post.find_element(By.CSS_SELECTOR, "h2.jobTitle a")
        job_url = link.get_attribute("href")
    except Exception as err:
        print(f"Error extracting basic info: {err}")
        return None

    return (title, company, location, job_url)

## Extract Full Job Description

In [11]:
def _extract_salary(driver):
    """
    Read the salary text from an already-loaded job detail page.

    Lookups are tried in order and the first one that finds an element wins:
      1. The first ``span.css-1oc7tea`` anywhere in the document.
      2. The first line of the ``div#salaryInfoAndJobType`` container.

    Args:
        driver: WebDriver currently showing the job detail page

    Returns:
        Salary text, or empty string when nothing usable is found
    """
    try:
        return driver.find_element(By.CSS_SELECTOR, "span.css-1oc7tea").text.strip()
    except NoSuchElementException:
        # No such span anywhere in the document, so a second search for the
        # same span scoped to #salaryInfoAndJobType could not succeed either;
        # go straight to the container's own text.
        pass

    try:
        container_text = driver.find_element(
            By.CSS_SELECTOR, "div#salaryInfoAndJobType"
        ).text
    except NoSuchElementException:
        return ""

    # Use the container only when there is non-blank text before the first
    # '-'; the value reported is the container's first line.
    if container_text.split('-')[0].strip():
        return container_text.split('\n')[0].strip()
    return ""


def get_job_description(job_url):
    """
    Get full job description and salary by opening the job URL.

    A new driver is created with create_driver() for every call and is
    always quit before returning.

    Args:
        job_url: URL of the job posting

    Returns:
        Tuple: (salary, job_description). salary is an empty string when it
        could not be read; job_description is the string "None" when the
        description could not be loaded.
    """
    driver = create_driver()
    salary = ""
    job_description = "None"

    try:
        driver.get(job_url)
        time.sleep(random.randint(JOB_LOAD_MIN, JOB_LOAD_MAX))

        # until() returns the located element, so no second lookup is needed.
        description_element = WebDriverWait(driver, WEBDRIVER_TIMEOUT).until(
            EC.presence_of_element_located((By.ID, "jobDescriptionText"))
        )
        job_description = description_element.text

        # Salary is optional: a failure here must not discard the description
        # that was just fetched, so it is handled separately.
        try:
            salary = _extract_salary(driver)
        except Exception as e:
            safe_print(f"Error getting salary: {e}")
    except (NoSuchElementException, TimeoutException):
        # Description never appeared; return the defaults set above.
        pass
    except Exception as e:
        safe_print(f"Error getting job details: {e}")
    finally:
        driver.quit()

    return (salary, job_description)

## Process Job with Threading

In [13]:
def process_job_with_description(job_data, index, total):
    """
    Process a single job: fetch description and create record
    Designed for parallel execution with ThreadPoolExecutor

    Args:
        job_data: Tuple of basic job info (title, company, location, job_url)
        index: Zero-based job index (for progress tracking)
        total: Total number of jobs

    Returns:
        Complete job record tuple:
        (title, company, location, salary, job_url, job_description)
    """
    # The docstring must be the first statement in the function; otherwise it
    # is only a discarded string expression and __doc__ stays None.
    title, company, location, job_url = job_data

    safe_print(f"[Thread] Processing job {index + 1}/{total}: {title} at {company}")
    # Random pause before the fetch, bounded by THREAD_DELAY_MIN/THREAD_DELAY_MAX.
    time.sleep(random.uniform(THREAD_DELAY_MIN, THREAD_DELAY_MAX))

    salary, job_description = get_job_description(job_url)

    record = (title, company, location, salary, job_url, job_description)
    safe_print(f"[Thread] Completed job {index + 1}/{total}: {title} | Salary: {salary if salary else 'Not listed'}")
    return record

In [14]:
# Confirmation banner listing the functions this notebook defines.
print(
    "✓ Scraping functions loaded successfully",
    "  - get_job_basic_info()",
    "  - get_job_description()",
    "  - process_job_with_description()",
    sep="\n",
)

✓ Scraping functions loaded successfully
  - get_job_basic_info()
  - get_job_description()
  - process_job_with_description()
