In [None]:
import csv
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Matches one monetary amount: either a comma-grouped number ("50,000",
# "1,234.56") or a plain run of digits ("50000", "17.5"), each with an
# optional decimal part. The comma-grouped alternative is tried first so that
# "50,000" is captured whole instead of as "50" followed by "000".
_SALARY_NUMBER_RE = re.compile(r'\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?')


def process_salary(salary_text):
    """Convert a free-text salary snippet into an annual salary figure.

    Every number in ``salary_text`` is extracted. If the text contains the
    phrase "an hour" (case-insensitive) the numbers are treated as hourly
    wages and scaled to a yearly figure assuming 40 hours/week and
    52 weeks/year. Otherwise the text is treated as an annual salary, and
    numbers below 100 are ignored. Note that any non-hourly pay period is
    taken at face value as an annual amount.

    Args:
        salary_text: Salary text, e.g. "$50,000 - $70,000 a year" or
            "$25 an hour".

    Returns:
        An ``int`` annual salary: the single amount found, or the average of
        the two amounts when a range is given. Returns the string "N/A" when
        the text yields zero, or more than two, usable amounts.
    """
    amounts = [
        float(token.replace(",", ""))
        for token in _SALARY_NUMBER_RE.findall(salary_text)
    ]

    if "an hour" in salary_text.lower():
        # Two values indicate a range; use its midpoint.
        if len(amounts) == 2:
            hourly_wage = sum(amounts) / 2
        elif len(amounts) == 1:
            hourly_wage = amounts[0]
        else:
            return "N/A"
        # Annual equivalent based on 40 hours/week, 52 weeks/year.
        return int(hourly_wage * 40 * 52)

    # Values under 100 are not considered annual salaries and are dropped.
    annual_salaries = [int(amount) for amount in amounts if amount >= 100]

    if len(annual_salaries) == 2:
        return sum(annual_salaries) // 2
    if len(annual_salaries) == 1:
        return annual_salaries[0]
    return "N/A"

# Set up the WebDriver using WebDriver Manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Base URL for Indeed job search; '{}' receives the result offset ('start').
# Alternative (Canadian site):
#   "https://ca.indeed.com/jobs?q=data%20analyst&l=Canada&start={}"
base_url = "https://www.indeed.com/jobs?q=data+analyst&from=searchOnDesktopSerp&start={}"

# Everything that uses the browser runs inside try/finally so the browser is
# shut down even if the crawl is interrupted or raises an uncaught error.
try:
    # One waiter is enough for the whole crawl: wait up to 30 seconds.
    wait = WebDriverWait(driver, 30)

    # Prepare CSV file for writing
    with open('indeed_UPDATED-CONDITION.csv', mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Job Title', 'Company', 'Location', 'Annual Salary Equivalent'])  # Write the header row

        # Loop through pages, adjusting the 'start' parameter
        for page in range(0, 15000, 10):  # Adjust as needed
            driver.get(base_url.format(page))

            # Wait for job listings to load on each page; stop crawling on failure
            try:
                job_listings = wait.until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, 'job_seen_beacon'))
                )
            except Exception as e:
                print(f"Error retrieving job listings: {e}")
                break

            # Break if no more job listings found
            if not job_listings:
                break

            # Extract job listings and save them to the CSV file
            for listing in job_listings:
                try:
                    # Use XPaths to locate elements. find_element raises when
                    # nothing matches, so a failure skips this listing via
                    # the handler below.
                    job_title_element = listing.find_element(By.XPATH, './/h2[contains(@class, "jobTitle")]')
                    company_element = listing.find_element(By.XPATH, './/div[contains(@class, "company_location")]//span[@data-testid="company-name"]')
                    address_element = listing.find_element(By.XPATH, './/div[@data-testid="text-location"]')

                    # Salary is optional: any failure locating or parsing it
                    # is recorded as "N/A" instead of dropping the listing.
                    try:
                        salary_element = listing.find_element(By.CLASS_NAME, 'salary-snippet-container')
                        annual_salary = process_salary(salary_element.text.strip())
                    except Exception:
                        annual_salary = "N/A"  # Default if salary not found

                    # Write to CSV with surrounding whitespace stripped
                    writer.writerow([
                        job_title_element.text.strip(),
                        company_element.text.strip(),
                        address_element.text.strip(),
                        annual_salary,
                    ])

                except Exception as e:
                    print(f"Error extracting job details: {e}")
finally:
    # Close the browser
    driver.quit()
