# Fetching Data Using Web Scraping

In [1]:
# Import the libraries
import requests
from bs4 import BeautifulSoup
import time
import random

# Search endpoint plus the query-string arguments sent with every request
base_url = "https://www.indeed.com/jobs"
params = dict(
    q="Python developer",  # keyword to search for
    start=0,  # pagination offset of the first result (each page advances it by 10)
)


# Define a function to scrape one page of results
def scrape_page(url, params, headers, timeout=30):
    """Fetch one page of search results and print every job card found on it.

    Args:
        url: Address of the search endpoint.
        params: Query-string parameters sent with the request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Returns:
        None. For a 200 response the title, company, location and link of each
        job card are printed; for any other status only the status code is
        printed. A field whose element is missing from a card is printed as
        ``None`` instead of raising ``AttributeError``.
    """
    # Send a GET request with the URL, parameters, and headers
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Anything other than 200 (OK) is only reported, not parsed
    if response.status_code != 200:
        print(f"Response status code: {response.status_code}")
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    # Loop through each job card in the HTML
    for job_card in soup.find_all("div", class_="jobsearch-SerpJobCard"):
        # Look each element up once; any of them may be absent from a card
        heading = job_card.find("h2", class_="title")
        anchor = heading.a if heading is not None else None
        company_tag = job_card.find("span", class_="company")
        location_tag = job_card.find("div", class_="location")

        job_title = anchor.get("title") if anchor is not None else None
        href = anchor.get("href") if anchor is not None else None
        company = company_tag.text.strip() if company_tag is not None else None
        location = location_tag.text.strip() if location_tag is not None else None
        # The href is site-relative, so prefix the host to get a usable link
        link = "https://www.indeed.com" + href if href else None

        # Print the extracted data
        print(f"Job title: {job_title}")
        print(f"Company: {company}")
        print(f"Location: {location}")
        print(f"Link: {link}")
        print("-" * 80)

# Define the number of pages to scrape (10 results per page)
num_pages = 3

# The user-agent header is identical for every request, so build it once
headers = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'}

# Loop through each page
for page in range(num_pages):
    # Print the current page number
    print(f"Scraping page {page + 1}")

    # Derive the offset from the page index instead of mutating the shared
    # `params` dict, so re-running this cell (or running a later cell that
    # reuses `params`) always starts from the configured first page
    page_params = {**params, "start": params["start"] + page * 10}

    # Scrape the current page
    scrape_page(base_url, page_params, headers)

    # Wait a random 1-3 seconds between requests; there is nothing to wait
    # for after the last page
    if page < num_pages - 1:
        time.sleep(random.uniform(1, 3))
    
    


Scraping page 1
Response status code: 403
Scraping page 2
Response status code: 403
Scraping page 3
Response status code: 403


In [4]:
!pip install cloudscraper

Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
     ---------------------------------------- 0.0/99.7 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/99.7 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/99.7 kB ? eta -:--:--
     ----------- -------------------------- 30.7/99.7 kB 330.3 kB/s eta 0:00:01
     ----------------------------------- -- 92.2/99.7 kB 585.1 kB/s eta 0:00:01
     -------------------------------------- 99.7/99.7 kB 519.8 kB/s eta 0:00:00
Collecting requests-toolbelt>=0.9.1 (from cloudscraper)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
     ---------------------------------------- 0.0/54.5 kB ? eta -:--:--
     ---------------------------------------- 54.5/54.5 kB 2.8 MB/s eta 0:00:00
Installing collected packages: requests-toolbelt, cloudscraper
Successfully installed cloudscraper-1.2.71 requests-toolbelt-1.0.0


In [5]:
import cloudscraper

# Build one Cloudscraper client up front; scrape_page below sends its GET requests through it
scraper = cloudscraper.create_scraper()

def scrape_page(url, params, headers, timeout=30):
    """Fetch one page of search results through `scraper` and print its job cards.

    Args:
        url: Address of the search endpoint.
        params: Query-string parameters sent with the request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Returns:
        None. For a 200 response the title, company, location and link of each
        job card are printed; for any other status only the status code is
        printed. A field whose element is missing from a card is printed as
        ``None`` instead of raising ``AttributeError``.
    """
    response = scraper.get(url, params=params, headers=headers, timeout=timeout)
    # Anything other than 200 (OK) is only reported, not parsed
    if response.status_code != 200:
        print(f"Response status code: {response.status_code}")
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    # Loop through each job card in the HTML
    for job_card in soup.find_all("div", class_="jobsearch-SerpJobCard"):
        # Look each element up once; any of them may be absent from a card
        heading = job_card.find("h2", class_="title")
        anchor = heading.a if heading is not None else None
        company_tag = job_card.find("span", class_="company")
        location_tag = job_card.find("div", class_="location")

        job_title = anchor.get("title") if anchor is not None else None
        href = anchor.get("href") if anchor is not None else None
        company = company_tag.text.strip() if company_tag is not None else None
        location = location_tag.text.strip() if location_tag is not None else None
        # The href is site-relative, so prefix the host to get a usable link
        link = "https://www.indeed.com" + href if href else None

        # Print the extracted data
        print(f"Job title: {job_title}")
        print(f"Company: {company}")
        print(f"Location: {location}")
        print(f"Link: {link}")
        print("-" * 80)

# Define the number of pages to scrape (10 results per page)
num_pages = 3

# The user-agent header is identical for every request, so build it once
headers = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'}

# Loop through each page
for page in range(num_pages):
    # Print the current page number
    print(f"Scraping page {page + 1}")

    # Derive the offset from the page index instead of mutating the shared
    # `params` dict, so this cell requests the same pages no matter how many
    # times it (or any other cell using `params`) has been run before
    page_params = {**params, "start": params["start"] + page * 10}

    # Scrape the current page
    scrape_page(base_url, page_params, headers)

    # Wait a random 1-3 seconds between requests; there is nothing to wait
    # for after the last page
    if page < num_pages - 1:
        time.sleep(random.uniform(1, 3))

Scraping page 1
Response status code: 403
Scraping page 2
Response status code: 403
Scraping page 3
Response status code: 403


In [8]:
!pip install playwright

Collecting playwright
  Downloading playwright-1.40.0-py3-none-win_amd64.whl.metadata (3.6 kB)
Collecting greenlet==3.0.1 (from playwright)
  Downloading greenlet-3.0.1-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Collecting pyee==11.0.1 (from playwright)
  Downloading pyee-11.0.1-py3-none-any.whl.metadata (2.7 kB)
Downloading playwright-1.40.0-py3-none-win_amd64.whl (29.3 MB)
   ---------------------------------------- 0.0/29.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/29.3 MB 1.4 MB/s eta 0:00:22
   ---------------------------------------- 0.2/29.3 MB 2.9 MB/s eta 0:00:11
    --------------------------------------- 0.5/29.3 MB 3.5 MB/s eta 0:00:09
    --------------------------------------- 0.6/29.3 MB 3.2 MB/s eta 0:00:09
    --------------------------------------- 0.6/29.3 MB 3.2 MB/s eta 0:00:09
   - -------------------------------------- 0.9/29.3 MB 3.4 MB/s eta 0:00:09
   - -------------------------------------- 1.1/29.3 MB 3.5 MB/s eta 0:00:08
   - -------

In [12]:
from playwright.chromium import async_playwright

# Create an async function to initialize the Playwright browser
async def init():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()
        # Now you can use 'page' for scraping

# Run the initialization function
init()


ModuleNotFoundError: No module named 'playwright.chromium'

In [9]:
from playwright.chromium import playwright

def scrape_page(url, params, headers):
    with playwright.chromium.launch() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)

        # Wait for job cards to load dynamically
        job_cards = page.wait_for_selector(".jobsearch-SerpJobCard")

        for job_card in job_cards:
            # Extract data using Playwright's methods
            job_title = job_card.text_content(".title")
            company = job_card.text_content(".company")
            location = job_card.text_content(".location")
            link = job_card.get_attribute("href")
            # Print the extracted data
            print(f"Job title: {job_title}")
            print(f"Company: {company}")
            print(f"Location: {location}")
            print(f"Link: {link}")
            print("-" * 80)
    
        print(f"Response status code: {response.status_code}")
# Define the number of pages to scrape (10 results per page)
num_pages = 3
    # Loop through each page
for page in range(num_pages):
    
    # Define the headers with the user-agent
    headers = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'}
    
    # Print the current page number
    print(f"Scraping page {page + 1}")
    # Scrape the current page
    scrape_page(base_url, params, headers)
    
    time.sleep(random.uniform(1, 3))
    # Increment the start parameter by 10 for the next page
    params["start"] += 10
    # Wait for 2 seconds before scraping the next page
    time.sleep(random.uniform(1, 3)) 
browser.close()


ModuleNotFoundError: No module named 'playwright.chromium'