## Scrape Page HTML Contents

In [4]:
import requests
from bs4 import BeautifulSoup
import json
import time
from loguru import logger


# Register scraper.log as an additional log sink (level INFO), one
# "<time> - <level> - <message>" line per record.
logger.add("scraper.log", level="INFO", format="{time} - {level} - {message}")

# Define the base URL and headers
# The page number is appended to base_url to build each listing-page URL.
base_url = "https://jobinja.ir/companies?page="
# Browser-like User-Agent sent with every listing-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# List of {'page_number': ..., 'html_content': ...} dicts, one per
# successfully scraped page (filled in by scrape_page).
page_contents = []

# Function to scrape a single page
def scrape_page(page_number, timeout=10):
    """Fetch one listing page and record its HTML in ``page_contents``.

    Args:
        page_number: Page index; it is appended to ``base_url`` to build the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        None. On success a ``{'page_number', 'html_content'}`` dict is appended
        to the module-level ``page_contents`` list. On any failure the error is
        logged and nothing is appended, so the caller's loop keeps going.
    """
    url = base_url + str(page_number)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        page_contents.append({
            'page_number': page_number,
            'html_content': response.text
        })
        logger.info(f"Successfully scraped page {page_number}")
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred while scraping page {page_number}: {http_err}")
    except Exception as err:
        # Deliberately broad: one bad page (timeout, connection reset, ...)
        # must not abort a multi-page scrape before results are saved.
        logger.error(f"An error occurred while scraping page {page_number}: {err}")

# Function to save results to a JSON file
def save_results(filename='page_contents.json'):
    """Write the module-level ``page_contents`` list to *filename* as JSON.

    The file is opened for writing as UTF-8; non-ASCII characters are kept
    as-is and the output is indented by four spaces. Any exception raised
    while writing is logged instead of being propagated.
    """
    try:
        with open(filename, mode='w', encoding='utf-8') as out_file:
            json.dump(page_contents, out_file, indent=4, ensure_ascii=False)
        logger.info(f"Scraping completed and data saved to {filename}")
    except Exception as save_err:
        logger.error(f"An error occurred while saving the data to file: {save_err}")

# Scrape pages
def scrape_pages(start_page, end_page, delay=2):
    """Scrape every page from *start_page* to *end_page*, both inclusive.

    Args:
        start_page: First page number to scrape.
        end_page: Last page number to scrape.
        delay: Seconds to pause between consecutive requests.

    Each page is handled by ``scrape_page``. Nothing is returned.
    """
    for page_number in range(start_page, end_page + 1):
        scrape_page(page_number)
        # Throttle only between requests: sleeping after the final page
        # would just delay the caller without sparing the server anything.
        if page_number < end_page:
            time.sleep(delay)

# Scrape pages 1 through 120 (inclusive), using the default delay
scrape_pages(start_page=1, end_page=120)

# Save the results to a JSON file
save_results("page_number_html_contents_1_120.json")


## Extract URLs From All Pages:

### Read page content data

In [33]:
# Load JSON data from file
# (same filename that save_results was called with earlier; each entry there
# is a dict with 'page_number' and 'html_content' keys)
with open('page_number_html_contents_1_120.json', 'r', encoding='utf-8') as file:
    page_number_html_contents = json.load(file)

### Function to extract links from HTML content

In [36]:
def extract_links(html_content):
    """Return the ``href`` values of the company-overview anchors in a page.

    The HTML is parsed with the built-in ``html.parser`` backend, and only
    ``<a>`` tags that carry an ``href`` attribute and the CSS class
    ``c-companyOverview`` are considered. Hrefs are returned in the order
    the tags are found.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')
    company_anchors = parsed_page.find_all(
        'a',
        href=True,
        attrs={"class": "c-companyOverview"},
    )
    return [anchor['href'] for anchor in company_anchors]


In [6]:
# Process each page's HTML content
# url_results collects one {"page_number", "url_list"} dict per loaded page.
url_results = []
for item in page_number_html_contents:
    page_number = item['page_number']
    html_content = item['html_content']
    # Pull the company-overview hrefs out of this page's HTML.
    links = extract_links(html_content)
    
    url_results.append(
        {
            "page_number":page_number,
            "url_list": links
        }
    )
    # Progress marker only: the links themselves are not printed.
    print(f"Links on page {page_number}")

In [41]:
# Save the results to a JSON file
# (UTF-8, non-ASCII characters kept as-is, 4-space indentation)
with open('url_list_page_number_1_120.json', 'w', encoding='utf-8') as file:
    json.dump(url_results, file, ensure_ascii=False, indent=4)

### Function to fetch HTML content from a URL

In [42]:
def fetch_html(url, timeout=10):
    """Return the response body of *url* as text, or ``None`` on failure.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the caller forever.

    Returns:
        The response text when the server answers with status 200; otherwise
        ``None``. A non-200 status or any raised exception is reported with
        ``print`` rather than propagated.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url} with status code {response.status_code}")
            return None
        return response.text
    except Exception as e:
        # Deliberately broad: callers rely on None, not an exception, to
        # signal that a URL could not be fetched.
        print(f"An error occurred while fetching {url}: {e}")
        return None