In [0]:
!pip install openpyxl

In [0]:
import os
import csv
import pandas as pd
import random
import time
import requests
from bs4 import BeautifulSoup

# Placeholder inserted into the input/output spreadsheet file names.
NAME = '<Your_Name_Here>'

# ScraperAPI credential and endpoint; target URLs are passed to it as a query parameter.
API_KEY = '<Your_SCRAPER_API_KEY_Here>'
SCRAPER_API_URL = 'https://api.scraperapi.com/'

# Browser-like headers sent with every HTTP request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_job_data(job_url, timeout=70):
    """Fetch a job posting through ScraperAPI and extract its fields.

    Args:
        job_url: URL of the job posting to scrape.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled connection would block the whole
            run; the default is generous because the request is proxied.

    Returns:
        A dict that always contains 'title', 'company', 'location' and
        'job_description' ('N/A' when the element is missing). For each
        insight section found on the page it also contains one of
        'salary', 'job_type', 'work_setting', 'shift_and_schedule',
        'medical_specialty' or 'other_info'. Returns None if the request
        or the parsing fails; the error is printed.
    """
    value_selector = 'span.js-match-insights-provider-4pmm6z'
    # (heading substring, output key), checked in this order.
    known_sections = (
        ("Pay", 'salary'),
        ("Job type", 'job_type'),
        ("Work setting", 'work_setting'),
        ("Shift and schedule", 'shift_and_schedule'),
        ("Medical specialty", 'medical_specialty'),
    )

    try:
        payload = {
            'api_key': API_KEY,
            'url': job_url
        }
        response = requests.get(
            SCRAPER_API_URL, params=payload, headers=HEADERS, timeout=timeout
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        job_data = {}

        # Header fields: one CSS selector each, 'N/A' when absent.
        header_fields = (
            ('title', 'h2[data-testid="simpler-jobTitle"]'),
            ('company', 'a.jobsearch-JobInfoHeader-companyNameLink'),
            ('location', 'div[data-testid="jobsearch-JobInfoHeader-companyLocation"]'),
            ('job_description', 'div.jobsearch-JobComponent-description'),
        )
        for key, selector in header_fields:
            elem = soup.select_one(selector)
            job_data[key] = elem.text.strip() if elem is not None else 'N/A'

        # Values from sections whose heading matches none of the known ones.
        # They are accumulated so a later section does not overwrite an
        # earlier one.
        other_values = []

        for insight in soup.select('div.js-match-insights-provider-16m282m.e37uo190'):
            heading_elem = insight.select_one('h3.js-match-insights-provider-11n8e9a')
            heading = heading_elem.get_text(strip=True) if heading_elem is not None else ""

            key = next(
                (name for needle, name in known_sections if needle in heading),
                None,
            )

            if key == 'job_type':
                # Job type is a list of entries; join them into one string.
                type_elems = insight.select(
                    'ul.js-match-insights-provider-h884c4 li ' + value_selector
                )
                job_data['job_type'] = (
                    ', '.join(elem.get_text(strip=True) for elem in type_elems)
                    if type_elems else "N/A"
                )
                continue

            value_elem = insight.select_one(value_selector)
            if key is not None:
                job_data[key] = (
                    value_elem.get_text(strip=True) if value_elem is not None else "N/A"
                )
            else:
                if value_elem is not None:
                    other_values.append(value_elem.get_text(strip=True))
                job_data['other_info'] = ', '.join(other_values) if other_values else "N/A"

        return job_data

    except Exception as e:
        # Best-effort per URL: report the failure and let the caller move on.
        print(f"Error scraping {job_url}: {e}")
        return None

def main():
    """Scrape every not-yet-scraped job URL and checkpoint results to Excel.

    Reads job URLs from 'job_links_{NAME}.xlsx' (column 'Job URL'), drops
    'pagead' links and duplicates, and skips URLs already present in the
    'job_url' column of either 'jobs_data_{NAME}_scraperapi.xlsx' or
    'jobs_data_{NAME}.xlsx' when those files exist. After each successful
    scrape the full result set is rewritten to the ScraperAPI output file,
    so an interrupted run can be resumed.
    """
    job_links_file = f'job_links_{NAME}.xlsx'
    output_file = f'jobs_data_{NAME}_scraperapi.xlsx'
    other_output_file = f'jobs_data_{NAME}.xlsx'

    if not os.path.exists(job_links_file):
        print(f"Error: {job_links_file} not found.")
        return

    raw_urls = pd.read_excel(job_links_file)['Job URL'].tolist()
    # dict.fromkeys de-duplicates while keeping spreadsheet order. The
    # isinstance check skips non-string cells (blank cells come back as NaN),
    # which would otherwise raise TypeError on the substring test.
    job_urls = list(dict.fromkeys(
        link for link in raw_urls
        if isinstance(link, str) and 'pagead' not in link
    ))

    # Load existing data if available so this run appends to it.
    all_jobs = []
    already_scraped = set()
    if os.path.exists(output_file):
        existing_data = pd.read_excel(output_file)
        all_jobs = existing_data.to_dict(orient='records')
        already_scraped.update(existing_data['job_url'].tolist())

    # The second results file is optional; only consult it when present.
    if os.path.exists(other_output_file):
        other_data = pd.read_excel(other_output_file)
        already_scraped.update(other_data['job_url'].tolist())

    job_urls = [url for url in job_urls if url not in already_scraped]
    print("Total job URLs to scrape:", len(job_urls))

    for i, job_url in enumerate(job_urls):
        print(f"Scraping job {i + 1}/{len(job_urls)}: {job_url}")
        job_data = get_job_data(job_url)
        if job_data:
            job_data['job_url'] = job_url
            all_jobs.append(job_data)
            # Rewrite the whole file each time so progress survives a crash.
            new_data = pd.DataFrame(all_jobs)
            new_data.to_excel(output_file, index=False)
            print(f"Saved {len(all_jobs)} records to {output_file}.")
        # Wait to avoid rate-limiting
        time.sleep(random.uniform(1, 4))

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
