In [None]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import traceback

def setup_driver():
    """Create a Chrome WebDriver configured with the scraper's launch flags.

    The chromedriver binary is resolved through ``ChromeDriverManager``.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    # Headless mode is off by default; add "--headless" to this tuple to enable it.
    launch_flags = (
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=opts)

def extract_job_details_from_list(job):
    """Extract summary fields for one job card on the search-results page.

    Args:
        job: A parsed job card exposing ``select_one`` (e.g. a BeautifulSoup
            element matched by ``div.job_seen_beacon``).

    Returns:
        A dict with the keys ``job_link``, ``job_title``, ``company_name``,
        ``company_location`` and ``salary``. Company, location and salary
        fall back to ``'Not available'`` when their element is missing.
        Returns ``None`` (after printing the error) when the link or title
        cannot be read.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    def text_or_default(selector):
        """Return the stripped text for *selector*, or 'Not available'."""
        element = job.select_one(selector)
        return element.get_text(strip=True) if element else 'Not available'

    try:
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs intact; plain concatenation would corrupt
        # absolute links and hrefs lacking a leading slash.
        job_link = urljoin("https://in.indeed.com", job.select_one('h2 a')['href'])
        job_title = job.select_one('h2 a span').get_text(strip=True)

        company_name = text_or_default('span[data-testid="company-name"]')
        company_location = text_or_default(
            'div.company_location div[data-testid="text-location"]'
        )

        # Salary is best-effort: any lookup failure degrades to the default
        # instead of discarding the whole card. Exception (not a bare except)
        # so Ctrl-C and SystemExit still propagate.
        try:
            salary = text_or_default(
                'div.metadata.salary-snippet-container .css-18z4q2i'
            )
        except Exception:
            salary = 'Not available'

        return {
            'job_link': job_link,
            'job_title': job_title,
            'company_name': company_name,
            'company_location': company_location,
            'salary': salary,
        }
    except Exception as e:
        print(f"Error extracting job details from list: {e}")
        return None

def _first_element_text(driver, selectors, wait_seconds=None):
    """Return the text of the first element matched by any CSS selector.

    Selectors are tried in order. With ``wait_seconds`` set, each selector is
    awaited for up to that many seconds via ``WebDriverWait``; otherwise a
    single immediate ``find_element`` lookup is made. Returns ``None`` when
    no selector yields an element.
    """
    for selector in selectors:
        try:
            if wait_seconds is None:
                element = driver.find_element(By.CSS_SELECTOR, selector)
            else:
                element = WebDriverWait(driver, wait_seconds).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
            return element.text
        except Exception:
            # Exception rather than a bare except: a failed lookup moves on
            # to the next selector, but KeyboardInterrupt/SystemExit still
            # propagate so the run can be stopped during the waits.
            continue
    return None


def scrape_detailed_job_info(driver, job_url):
    """Scrape detailed job information from an individual job page.

    Args:
        driver: The WebDriver used to load and query the page.
        job_url: URL of the job page to open.

    Returns:
        A dict with the keys ``Job Title``, ``Company``, ``Location``,
        ``Job Description``, ``Job Type``, ``Apply Link`` and
        ``Job Requirements``; any value that could not be found is
        ``'Not available'``. Returns ``None`` (after printing the error and
        traceback) if loading or scraping fails unexpectedly.
    """
    try:
        driver.get(job_url)

        # The title lookup doubles as the page-load wait, so it is the only
        # lookup given a timeout; the remaining fields are read immediately.
        job_title = _first_element_text(
            driver,
            [
                'h1.jobsearch-JobInfoHeader-title',
                'h1[data-jobsearch-header="true"]',
                'div.jobsearch-JobComponent-title',
                'h2[data-e2e="job-title"]',
            ],
            wait_seconds=10,
        )

        company_name = _first_element_text(
            driver,
            [
                'div.jobsearch-JobInfoHeader-companyNameLink',
                'div[data-company-name="true"]',
                'div.jobsearch-CompanyInfoContainer a',
                'span[data-testid="company-name"]',
            ],
        )

        location = _first_element_text(
            driver,
            [
                'div[data-testid="inlineHeader-companyLocation"]',
                'div.jobsearch-JobInfoHeader-subtitle div',
                'div.jobsearch-CompanyInfoContainer span',
            ],
        )

        job_description = _first_element_text(
            driver,
            [
                'div.jobsearch-JobComponent-description',
                'div#jobDescriptionText',
                'div[data-jobsearch-description="true"]',
            ],
        )

        job_type = 'Not available'
        try:
            # Locate the section that follows the 'Job type' heading and
            # join every listed type into one comma-separated string.
            job_type_section = driver.find_element(
                By.XPATH,
                '//h3[contains(text(), "Job type")]/following-sibling::div',
            )
            job_type_elements = job_type_section.find_elements(By.CSS_SELECTOR, 'ul li')
            if job_type_elements:
                job_type = ', '.join(
                    element.text.strip() for element in job_type_elements
                )
        except Exception as e:
            print(f"Error extracting job type: {e}")

        # Apply link extraction
        apply_link = 'Not available'
        try:
            apply_buttons = driver.find_elements(By.CSS_SELECTOR, 'div.css-kyg8or button')
            if apply_buttons:
                # NOTE(review): this reads 'href' from a <button>; if the
                # attribute is absent the value is None and the fallback
                # below applies. Confirm against the live page markup.
                apply_link = apply_buttons[0].get_attribute('href')
        except Exception as e:
            print(f"Error finding apply link: {e}")

        # Compile job details
        job_details = {
            'Job Title': job_title or 'Not available',
            'Company': company_name or 'Not available',
            'Location': location or 'Not available',
            'Job Description': job_description or 'Not available',
            'Job Type': job_type or 'Not available',
            'Apply Link': apply_link or 'Not available',
            # Requirements are not scraped separately; this mirrors the
            # full description text.
            'Job Requirements': job_description or 'Not available',
        }

        print(job_details)

        return job_details

    except Exception as e:
        print(f"Detailed error scraping {job_url}: {e}")
        print(traceback.format_exc())
        return None

def main():
    """Scrape Indeed search results and write them to a CSV file.

    For every combination of the search parameters defined below, loads the
    results page(s), extracts each job card, visits the job's detail page and
    appends one row to ``indeed_comprehensive_jobs.csv``. The browser is
    closed when the run ends, including on errors or Ctrl-C.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from itertools import product

    # Set up the driver
    driver = setup_driver()

    # try/finally guarantees the Chrome process is shut down even when the
    # run is interrupted or an unexpected exception escapes.
    try:
        # Parameters for job search
        job_search_keywords = ['Software+Engineer']
        location_search_keywords = ['Hyderabad']
        job_type_keywords = ['internship']  # Job types: internship, new graduates
        education_levels = ['attr%28HFDVW%29']  # Include bachelors or exclude

        # Pagination URL template
        pagination_url = "https://in.indeed.com/jobs?q={}&l={}&sc=0kf%3A{}jt%28{}%29%3B&start={}"

        # Column order of the CSV and the matching keys in the job dict.
        headers = [
            'Job Title', 'Company', 'Location', 'Job Description',
            'Job Type', 'Salary', 'Apply Link', 'Job Requirements',
            'Original Job Link'
        ]
        row_keys = [
            'Job Title', 'Company', 'Location', 'Job Description',
            'Job Type', 'salary', 'Apply Link', 'Job Requirements',
            'Original Job Link'
        ]

        # Open CSV file for writing
        with open('indeed_comprehensive_jobs.csv', 'w', newline='', encoding='utf-8') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(headers)

            # product() walks the combinations in the same order as four
            # nested for-loops would.
            search_combinations = product(
                job_search_keywords,
                location_search_keywords,
                job_type_keywords,
                education_levels,
            )
            for job_keyword, location_keyword, job_type, education_level in search_combinations:
                all_jobs = []

                # The 'start' offset advances in steps of 10. This range
                # yields only offset 0 (the first results page); raise the
                # stop value to fetch more pages.
                for page_no in range(0, 10, 10):
                    url = pagination_url.format(job_keyword, location_keyword, education_level, job_type, page_no)
                    print(f"Scraping page: {url}")

                    # Get page DOM
                    driver.get(url)
                    page_dom = BeautifulSoup(driver.page_source, 'html.parser')
                    all_jobs.extend(page_dom.select('div.job_seen_beacon'))

                # Process each job
                for job in all_jobs:
                    try:
                        # Extract basic job info from list
                        list_job_info = extract_job_details_from_list(job)
                        if not list_job_info:
                            continue
                        print(list_job_info['salary'])

                        # Scrape detailed job info
                        detailed_job_info = scrape_detailed_job_info(driver, list_job_info['job_link'])

                        if detailed_job_info:
                            # Carry the list-page link and salary over to
                            # the detail record before writing it out.
                            detailed_job_info['Original Job Link'] = list_job_info['job_link']
                            detailed_job_info['salary'] = list_job_info['salary']
                            print(list_job_info['salary'])

                            csv_writer.writerow([
                                detailed_job_info.get(key, 'Not available')
                                for key in row_keys
                            ])

                            print(f"Saved job: {detailed_job_info.get('Job Title', 'Unknown')}")

                        # Add a small delay to avoid overwhelming the server
                        time.sleep(1)

                    except Exception as e:
                        print(f"Error processing job: {e}")
                        print(traceback.format_exc())
    finally:
        # Close the driver
        driver.quit()

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Scraping page: https://in.indeed.com/jobs?l=Hyderabad&sc=0kf%3Aattr%28HFDVW%29jt%28internship%29%3B&start=0
₹1,426 an hour
{'Job Title': 'Applied Mathematician (non - US)', 'Company': 'Outlier Ai', 'Location': 'Hyderabad, Telangana', 'Job Description': "Job details\nHere’s how the job details align with your profile\n.\nPay\n₹1,426 an hour\nJob type\nInternship\nFreelance\n&nbsp;\nLocation\nHyderabad, Telangana\n&nbsp;\nFull job description\nOutlier helps the world’s most innovative companies improve their AI models by providing human feedback. Are you an experienced Mathematics expert who would like to lend your expertise to train AI models?\n\nAbout the opportunity:\nOutlier is looking for talented Mathematics expert to help train generative artificial intelligence models\nThis freelance opportunity is remote and hours are flexible, so you can work whenever is best for you\nYou may contribute your expertise by…\nAssessing the factuality and relevance of domain-specific text produced 


KeyboardInterrupt

