In [8]:
import requests
import time
import threading
from loguru import logger
from pymongo import MongoClient

# Configure logging
def configure_logging():
    logger.add("scraper.log", level="INFO", format="{time} - {level} - {message}")

# Set up MongoDB client and database
def setup_mongo_client():
    client = MongoClient("mongodb://localhost:27017/")
    db = client["scraper_db"]
    collection = db["page_contents"]
    return collection

# Define base URL and headers
base_url = "https://jobinja.ir/companies?page="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# Function to scrape a single page
def scrape_page(page_number, collection):
    url = base_url + str(page_number)
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        page_data = {
            'page_number': page_number,
            'html_content': response.text
        }
        collection.insert_one(page_data)
        logger.info(f"Successfully scraped and stored page {page_number}")
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error occurred while scraping page {page_number}: {http_err}")
    except Exception as err:
        logger.error(f"An error occurred while scraping page {page_number}: {err}")

# Function to scrape pages using threading
def scrape_pages_threaded(start_page, end_page, collection, num_threads=10, delay=2):
    def thread_scrape(thread_id, page_range):
        for page_number in page_range:
            scrape_page(page_number, collection)
            time.sleep(delay)  # Sleep to avoid overwhelming the server

    page_ranges = [
        range(start_page + i * (end_page - start_page + 1) // num_threads, 
              start_page + (i + 1) * (end_page - start_page + 1) // num_threads)
        for i in range(num_threads)
    ]

    threads = []
    for i, page_range in enumerate(page_ranges):
        thread = threading.Thread(target=thread_scrape, args=(i, page_range))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

# Main function to run the scraper
def run_scraper(start_page, end_page, num_threads=10, delay=2):
    configure_logging()
    collection = setup_mongo_client()
    scrape_pages_threaded(start_page, end_page, collection, num_threads, delay)

# Run the scraper
run_scraper(start_page=1, end_page=10, num_threads=10)


[32m2024-09-21 05:57:35.951[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_page[0m:[36m35[0m - [1mSuccessfully scraped and stored page 4[0m
[32m2024-09-21 05:57:35.959[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_page[0m:[36m35[0m - [1mSuccessfully scraped and stored page 5[0m
[32m2024-09-21 05:57:35.962[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_page[0m:[36m35[0m - [1mSuccessfully scraped and stored page 7[0m
[32m2024-09-21 05:57:35.963[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_page[0m:[36m35[0m - [1mSuccessfully scraped and stored page 9[0m
[32m2024-09-21 05:57:35.966[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_page[0m:[36m35[0m - [1mSuccessfully scraped and stored page 3[0m
[32m2024-09-21 05:57:35.987[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_page[0m:[36m35[0m - [1mSuccessfully scraped and stored page 1[0m
[32m2024-09-21 05:57:36.007[0m | [1mINFO    [0m | [36m__main__[0m:[36

In [9]:
from bs4 import BeautifulSoup
from loguru import logger
from pymongo import MongoClient

# Configure logging
def configure_logging():
    logger.add("scraper.log", level="INFO", format="{time} - {level} - {message}")

# Set up MongoDB client and databases
def setup_mongo_client():
    client = MongoClient("mongodb://localhost:27017/")
    db = client["scraper_db"]
    return db

# Function to extract links from HTML content
def extract_links(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True, attrs={"class": "c-companyOverview"})]
    return links

# Function to process HTML content and save extracted links to MongoDB
def process_html_content(db):
    collection_html = db["page_contents"]
    collection_links = db["extracted_links"]

    url_results = []
    cursor = collection_html.find()

    for item in cursor:
        page_number = item['page_number']
        html_content = item['html_content']
        links = extract_links(html_content)
        
        url_results.append({
            "page_number": page_number,
            "url_list": links
        })
        logger.info(f"Extracted links on page {page_number}")
        
    # Insert the results into the new collection
    collection_links.insert_many(url_results)
    logger.info(f"All links have been extracted and saved to the database")

# Main function to run the extraction process
def run_extractor():
    configure_logging()
    db = setup_mongo_client()
    process_html_content(db)

# Run the extractor
if __name__ == "__main__":
    run_extractor()


[32m2024-09-21 05:57:44.076[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0m:[36m38[0m - [1mExtracted links on page 4[0m
[32m2024-09-21 05:57:44.161[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0m:[36m38[0m - [1mExtracted links on page 5[0m
[32m2024-09-21 05:57:44.181[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0m:[36m38[0m - [1mExtracted links on page 9[0m
[32m2024-09-21 05:57:44.204[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0m:[36m38[0m - [1mExtracted links on page 7[0m
[32m2024-09-21 05:57:44.224[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0m:[36m38[0m - [1mExtracted links on page 3[0m
[32m2024-09-21 05:57:44.244[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0m:[36m38[0m - [1mExtracted links on page 1[0m
[32m2024-09-21 05:57:44.263[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_html_content[0

In [10]:
import requests
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from loguru import logger
from pymongo import MongoClient

# Configure Loguru to write logs to a file
logger.add("script_logs.log", rotation="500 MB")

# Set up MongoDB client and Databases
def setup_mongo_client():
    client = MongoClient("mongodb://localhost:27017/")
    db = client["scraper_db"]
    return db['extracted_links'], db['detailed_pages']

# Function to fetch HTML content
def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, allow_redirects=True)
        if response.status_code == 200:
            if response.url.endswith('/jobs'):
                logger.info(f"{url} redirects to /jobs")
            return response.text, response.url
        else:
            logger.error(f"Failed to fetch {url} with status code {response.status_code}")
            return None, url
    except Exception as e:
        logger.error(f"An error occurred while fetching {url}: {e}")
        return None, url

# Function to scrape URL detail pages
def scrape_url_detail_pages(url_list):
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(fetch_html, url): url for url in url_list}

        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                main_html, actual_url = future.result()
            except Exception as exc:
                logger.error(f"Error fetching {url}: {exc}")
                main_html, actual_url = None, url

            if main_html is None:
                results.append({"url": url, "main_page": {"html_content": ""}, "job_page": {"html_content": ""}, "about_page": {"html_content": ""}})
                continue

            result = {
                "url": url,
                "main_page": {"html_content": main_html if not actual_url.endswith('/jobs') else ""},
                "job_page": {"html_content": main_html if actual_url.endswith('/jobs') else ""},
                "about_page": {"html_content": ""}
            }

            if not actual_url.endswith('/jobs'):
                about_page_url = url + "/about"
                job_page_url = url + "/jobs"

                # Fetch about and jobs pages
                about_html, _ = fetch_html(about_page_url)
                jobs_html, _ = fetch_html(job_page_url)

                result["about_page"]["html_content"] = about_html
                result["job_page"]["html_content"] = jobs_html

            results.append(result)

    logger.info("Scraping completed for the given list of URLs")
    return results

# Function to store results in MongoDB
def store_results_in_mongo(url_detail_pages, collection):
    collection.insert_many(url_detail_pages)
    logger.info(f"Data has been stored in MongoDB")

# Main function to run the extraction process and store data in MongoDB
def run_extractor():
    configure_logging()
    collection_links, collection_pages = setup_mongo_client()

    # Process each document in the collection_links
    cursor = collection_links.find()
    for item in cursor:
        url_list = item['url_list']
        url_detail_pages = scrape_url_detail_pages(url_list)

        store_results_in_mongo(url_detail_pages, collection_pages)

        logger.success(f"Links on page {item['page_number']} have been processed and stored in MongoDB.")

# Function to configure logging
def configure_logging():
    logger.add("scraper.log", level="INFO", format="{time} - {level} - {message}")

# Run the extractor
if __name__ == "__main__":
    run_extractor()


[32m2024-09-21 05:57:51.418[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_html[0m:[36m25[0m - [1mhttps://jobinja.ir/companies/autokhatib/jobs redirects to /jobs[0m
[32m2024-09-21 05:57:49.728[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_html[0m:[36m25[0m - [1mhttps://jobinja.ir/companies/Dade%20Negar/jobs redirects to /jobs[0m
[32m2024-09-21 05:57:50.234[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_html[0m:[36m25[0m - [1mhttps://jobinja.ir/companies/ketabemarja/jobs redirects to /jobs[0m
[32m2024-09-21 05:57:50.747[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_html[0m:[36m25[0m - [1mhttps://jobinja.ir/companies/inoup/jobs redirects to /jobs[0m
[32m2024-09-21 05:57:51.330[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_html[0m:[36m25[0m - [1mhttps://jobinja.ir/companies/hmpco/jobs redirects to /jobs[0m
[32m2024-09-21 05:57:51.897[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_html[0m:[36m25[0m - [1mhttps:/